model for classification

lina · April 2011

Hi to everyone! I'm still working on opinion mining and classification into positive and negative comments. My process is this:

 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.1 process: builds a term-occurrence word vector from two
  directories of text files (labelled "positive" and "negative"), estimates
  the performance of a kernel Naive Bayes classifier with X-Validation, and
  writes the resulting model to disk.
-->
<process version="5.1.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.001" expanded="true" name="Process">
    <process expanded="true" height="396" width="687">
      <!-- Step 1: read the documents; each entry of text_directories maps a class label to a folder. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.1.001" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="positive" value="C:\Users\Fotis-Linaki\Desktop\diplwmatikh\rapidminer\opinion mining\pos"/>
          <parameter key="negative" value="C:\Users\Fotis-Linaki\Desktop\diplwmatikh\rapidminer\opinion mining\neg"/>
        </list>
        <parameter key="encoding" value="UTF-8"/>
        <parameter key="vector_creation" value="Term Occurrences"/>
        <!-- Per-document pipeline: case transformation, tokenizing, dropping tokens shorter than 3 characters, stemming. -->
        <process expanded="true" height="396" width="705">
          <operator activated="true" class="text:transform_cases" compatibility="5.1.001" expanded="true" height="60" name="Transform Cases" width="90" x="45" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.1.001" expanded="true" height="60" name="Tokenize" width="90" x="179" y="120"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.1.001" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="380" y="120">
            <parameter key="min_chars" value="3"/>
          </operator>
          <!-- NOTE(review): the stemmer is set to Spanish; confirm this matches the language of the documents. -->
          <operator activated="true" class="text:stem_snowball" compatibility="5.1.001" expanded="true" height="60" name="Stem (Snowball)" width="90" x="514" y="30">
            <parameter key="language" value="Spanish"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="18"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Step 2: cross-validate the classifier on the word vector. -->
      <operator activated="true" class="x_validation" compatibility="5.1.001" expanded="true" height="130" name="Validation" width="90" x="246" y="75">
        <!--
          Changed from "linear sampling" to "stratified sampling".
          Linear sampling builds folds from contiguous slices of the example set.
          The examples come from one directory per class, so they are presumably
          ordered by label (verify in the example set view); contiguous folds can
          then test on a class that is missing or badly under-represented in the
          training part, which drives accuracy toward 0%. Stratified sampling keeps
          the positive/negative ratio in every fold.
        -->
        <parameter key="sampling_type" value="stratified sampling"/>
        <!-- Training subprocess: learn a kernel Naive Bayes model on the training folds. -->
        <process expanded="true" height="396" width="327">
          <operator activated="true" class="naive_bayes_kernel" compatibility="5.1.001" expanded="true" height="76" name="Naive Bayes (Kernel)" width="90" x="119" y="94"/>
          <connect from_port="training" to_op="Naive Bayes (Kernel)" to_port="training set"/>
          <connect from_op="Naive Bayes (Kernel)" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="18"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="36"/>
        </process>
        <!-- Testing subprocess: apply the model to the held-out fold and measure performance. -->
        <process expanded="true" height="396" width="327">
          <operator activated="true" class="apply_model" compatibility="5.1.001" expanded="true" height="76" name="Apply Model" width="90" x="45" y="75">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.1.001" expanded="true" height="76" name="Performance" width="90" x="179" y="120"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="36"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
          <portSpacing port="sink_averagable 3" spacing="0"/>
        </process>
      </operator>
      <!-- Step 3: persist the model delivered by the Validation operator. -->
      <operator activated="true" class="write_model" compatibility="5.1.001" expanded="true" height="60" name="Write Model" width="90" x="405" y="182">
        <parameter key="model_file" value="C:\Users\Fotis-Linaki\Desktop\diplwmatikh\rapidminer\opinion mining\double1.mod"/>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_op="Write Model" to_port="input"/>
      <connect from_op="Validation" from_port="training" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <connect from_op="Validation" from_port="averagable 2" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="18"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="72"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>

Could someone tell me which is the appropriate model for classification to choose? I'm using naive Bayes, as you can see from the code, but the accuracy in the results is 0%!
What's wrong with it? I have also tried other available models as well.
thank you in advance!

lina · April 2011

Sorry for the 2nd post! I fixed something and now the accuracy is 50% +/- 16%: for negative about 66% and for positive about 33%.
To be honest, what exactly does it mean? It is an estimate, isn't it? But could somebody explain to me in more detail what it is about and what it means for my process? Is it good? I'm afraid not! I'm sorry if I sound silly :-[ !

land · April 2011

Hi,
We cannot explain the basics of data analytics to you here in this forum. Sorry for that. I would suggest either referring to a good book about it or participating in one of the basic training courses we offer.

With kind regards,
Sebastian Land

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

model for classification

Answers