Options

Problems with the classification of comments

josejose Member Posts: 16 Contributor II
Hello,
I want to read a text containing positive and negative comments and rank, that is, identify which comments are positive and which are negative.

I created two procesoss.

The first reads and negative comments posistivos are in different files. Here I apply Naive-Bayes for classification and building the model.

The second process reads the comments I want to classify (all in different files) and apply the model generated above.

My problem is that not consistent classification. I classified as negative comments, comments which are really positive.

Anyone have any idea of ​​why this happens?

The xml of the processes are:

Process 1:
 
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.000" expanded="true" name="Process">
    <process expanded="true" height="235" width="480">
      <operator activated="true" class="text:process_document_from_file" compatibility="5.2.001" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="positivos" value="C:\Users\yop\Desktop\archivos de tesis\comentarios positivos"/>
          <parameter key="negativos" value="C:\Users\yop\Desktop\archivos de tesis\comentarios negativos"/>
        </list>
        <parameter key="encoding" value="UTF-8"/>
        <parameter key="vector_creation" value="Term Frequency"/>
        <process expanded="true" height="352" width="586">
          <operator activated="true" class="text:transform_cases" compatibility="5.2.001" expanded="true" height="60" name="Transform Cases" width="90" x="45" y="75"/>
          <operator activated="true" class="text:tokenize" compatibility="5.2.001" expanded="true" height="60" name="Tokenize" width="90" x="179" y="75"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.001" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="313" y="75"/>
          <operator activated="true" class="text:stem_snowball" compatibility="5.2.001" expanded="true" height="60" name="Stem (Snowball)" width="90" x="447" y="75">
            <parameter key="language" value="Spanish"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="x_validation" compatibility="5.2.000" expanded="true" height="130" name="Validation" width="90" x="179" y="30">
        <parameter key="number_of_validations" value="2"/>
        <process expanded="true" height="352" width="268">
          <operator activated="true" class="naive_bayes" compatibility="5.2.000" expanded="true" height="76" name="Naive Bayes" width="90" x="99" y="59">
            <parameter key="laplace_correction" value="false"/>
          </operator>
          <connect from_port="training" to_op="Naive Bayes" to_port="training set"/>
          <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true" height="352" width="279">
          <operator activated="true" class="apply_model" compatibility="5.2.000" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.2.000" expanded="true" height="76" name="Performance" width="90" x="112" y="165"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
          <portSpacing port="sink_averagable 3" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="write_model" compatibility="5.2.000" expanded="true" height="60" name="Write Model" width="90" x="380" y="165">
        <parameter key="model_file" value="C:\Users\yop\Desktop\archivos de tesis\model\model"/>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_op="Write Model" to_port="input"/>
      <connect from_op="Validation" from_port="training" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <connect from_op="Validation" from_port="averagable 2" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>


and process 2:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.000" expanded="true" name="Process">
    <process expanded="true" height="325" width="547">
      <operator activated="true" class="read_model" compatibility="5.2.000" expanded="true" height="60" name="Read Model" width="90" x="41" y="44">
        <parameter key="model_file" value="C:\Users\yop\Desktop\archivos de tesis\model\model"/>
      </operator>
      <operator activated="true" class="text:process_document_from_file" compatibility="5.2.001" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="165">
        <list key="text_directories">
          <parameter key="comentarios" value="C:\Users\yop\Desktop\archivos de tesis\comentarios"/>
        </list>
        <process expanded="true" height="352" width="586">
          <operator activated="true" class="text:transform_cases" compatibility="5.2.001" expanded="true" height="60" name="Transform Cases" width="90" x="37" y="40"/>
          <operator activated="true" class="text:tokenize" compatibility="5.2.001" expanded="true" height="60" name="Tokenize" width="90" x="179" y="30"/>
          <operator activated="true" class="text:stem_snowball" compatibility="5.2.001" expanded="true" height="60" name="Stem (Snowball)" width="90" x="380" y="30">
            <parameter key="language" value="Spanish"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="apply_model" compatibility="5.2.000" expanded="true" height="76" name="Apply Model" width="90" x="179" y="75">
        <list key="application_parameters"/>
        <parameter key="create_view" value="true"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="5.2.000" expanded="true" height="76" name="Select Attributes" width="90" x="313" y="75">
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <operator activated="true" class="write_excel" compatibility="5.2.000" expanded="true" height="60" name="Write Excel" width="90" x="447" y="120">
        <parameter key="excel_file" value="C:\Users\yop\Desktop\archivos de tesis\aml\exit.xls"/>
      </operator>
      <connect from_op="Read Model" from_port="output" to_op="Apply Model" to_port="model"/>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Apply Model" from_port="labelled data" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Write Excel" to_port="input"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
    </process>
  </operator>
</process>

Thanks

Answers

  • Options
    MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    Hi,

    I have to admit that I did not look yet at your processes, but data mining is not perfect, and sometimes your algorithms don't get some concepts obvious to the human reader and make mistakes. Data mining is always a matter of trial and error and a lot of optimization, there are no general rules of thumb on how to classify something (even though the experienced data miner can of course gain from his experience).

    Best, Marius
Sign In or Register to comment.