[SOLVED] Low Training Accuracy using Naive Bayes

yvncruzyvncruz Member Posts: 8 Contributor II
edited September 2019 in Help
Hello Everyone!

I have a small data set (101 examples) which I am using to train a Naive Bayes classifier with three classes: Correct, Wrong, and Uncertain. The results I'm getting show low class recall for the "Wrong" class, even though it has the largest number of examples (55/101).

The process I am using:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process: reads labelled text from an Excel sheet, converts the text column into
     token-based attributes, and cross-validates a Naive Bayes classifier on the result. -->
<process version="5.3.013">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
   <parameter key="logverbosity" value="off"/>
   <process expanded="true">
     <!-- Step 1: import cells A1:B112 of the workbook. The meta data list names column 0 "Text"
          and column 1 "Label"; the annotation on row 0 is "Name", presumably marking the header row.
          NOTE(review): A1:B112 covers more rows than the 101 examples mentioned in the post;
          confirm the range does not pull in blank or unrelated rows. -->
     <operator activated="true" class="read_excel" compatibility="5.3.013" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
       <parameter key="excel_file" value="C:\Users\user1.user\Desktop\Latest Training.xlsx"/>
       <parameter key="imported_cell_range" value="A1:B112"/>
       <list key="annotations">
         <parameter key="0" value="Name"/>
       </list>
       <list key="data_set_meta_data_information">
         <parameter key="0" value="Text.true.text.attribute"/>
         <parameter key="1" value="Label.true.text.label"/>
       </list>
       <parameter key="read_not_matching_values_as_missings" value="false"/>
     </operator>
     <!-- Step 2: attribute selection; the only parameter set here is include_special_attributes. -->
     <operator activated="true" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="30">
       <parameter key="include_special_attributes" value="true"/>
     </operator>
     <!-- Step 3: give the "Label" attribute the role "label" so that it is the prediction target. -->
     <operator activated="true" class="set_role" compatibility="5.3.013" expanded="true" height="76" name="Set Role" width="90" x="313" y="30">
       <parameter key="attribute_name" value="Label"/>
       <parameter key="target_role" value="label"/>
       <list key="set_additional_roles"/>
     </operator>
     <!-- Step 4: text processing. The original text is kept (keep_text = true).
          NOTE(review): absolute pruning thresholds (2 and 9999) are set, but no prune_method
          parameter appears in this block; confirm that the thresholds are actually in effect. -->
     <operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="447" y="30">
       <parameter key="keep_text" value="true"/>
       <parameter key="prune_below_absolute" value="2"/>
       <parameter key="prune_above_absolute" value="9999"/>
       <list key="specify_weights">
         <parameter key="Text" value="1.0"/>
       </list>
       <process expanded="true">
         <!-- Token pipeline, in connection order: Tokenize, Transform Cases,
              Filter Tokens (by Length) keeping tokens of 2 to 999 characters, Stem (Snowball).
              Tokenize, Transform Cases and Stem set no parameters here, so operator defaults apply. -->
         <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="112" y="75"/>
         <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="165"/>
         <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="246" y="75">
           <parameter key="min_chars" value="2"/>
           <parameter key="max_chars" value="999"/>
         </operator>
         <operator activated="true" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (Snowball)" width="90" x="246" y="165"/>
         <connect from_port="document" to_op="Tokenize" to_port="document"/>
         <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
         <connect from_op="Transform Cases" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
         <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
         <connect from_op="Stem (Snowball)" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <!-- Step 5: cross-validation with a local random seed. The first subprocess trains the
          learner, the second applies the model to the test set and measures performance. -->
     <operator activated="true" class="x_validation" compatibility="5.3.013" expanded="true" height="112" name="Validation" width="90" x="581" y="30">
       <parameter key="use_local_random_seed" value="true"/>
       <process expanded="true">
         <!-- Training side: only "Naive Bayes" is activated and connected.
              W-NaiveBayes and k-NN are present but deactivated (activated="false"). -->
         <operator activated="false" class="weka:W-NaiveBayes" compatibility="5.3.001" expanded="true" height="76" name="W-NaiveBayes" width="90" x="112" y="75"/>
         <operator activated="true" class="naive_bayes" compatibility="5.3.013" expanded="true" height="76" name="Naive Bayes" width="90" x="112" y="165"/>
         <operator activated="false" class="k_nn" compatibility="5.3.013" expanded="true" height="76" name="k-NN" width="90" x="112" y="255">
           <parameter key="measure_types" value="NumericalMeasures"/>
         </operator>
         <connect from_port="training" to_op="Naive Bayes" to_port="training set"/>
         <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
         <portSpacing port="source_training" spacing="0"/>
         <portSpacing port="sink_model" spacing="0"/>
         <portSpacing port="sink_through 1" spacing="0"/>
       </process>
       <process expanded="true">
         <!-- Testing side: Apply Model labels the test set, Performance reports accuracy
              (main criterion), classification error and kappa. -->
         <operator activated="true" class="apply_model" compatibility="5.3.013" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
           <list key="application_parameters"/>
         </operator>
         <operator activated="true" class="performance_classification" compatibility="5.3.013" expanded="true" height="76" name="Performance" width="90" x="276" y="30">
           <parameter key="main_criterion" value="accuracy"/>
           <parameter key="classification_error" value="true"/>
           <parameter key="kappa" value="true"/>
           <list key="class_weights"/>
         </operator>
         <!-- Store writes the model passed through Apply Model to the repository.
              NOTE(review): this operator sits inside the testing subprocess, so it presumably runs
              for every validation iteration and overwrites the same entry each time; confirm that
              this is intended rather than storing the model delivered by the Validation operator. -->
         <operator activated="true" class="store" compatibility="5.3.013" expanded="true" height="60" name="Store" width="90" x="180" y="120">
           <parameter key="repository_entry" value="../data/AsthmaDiabetesModel"/>
         </operator>
         <connect from_port="model" to_op="Apply Model" to_port="model"/>
         <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
         <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
         <connect from_op="Apply Model" from_port="model" to_op="Store" to_port="input"/>
         <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
         <portSpacing port="source_model" spacing="0"/>
         <portSpacing port="source_test set" spacing="0"/>
         <portSpacing port="source_through 1" spacing="0"/>
         <portSpacing port="sink_averagable 1" spacing="0"/>
         <portSpacing port="sink_averagable 2" spacing="0"/>
       </process>
     </operator>
     <!-- Main flow: Read Excel, Select Attributes, Set Role, Process Documents from Data, Validation.
          Process results: 1 = model, 2 = training example set, 3 = averaged performance. -->
     <connect from_op="Read Excel" from_port="output" to_op="Select Attributes" to_port="example set input"/>
     <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
     <connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
     <connect from_op="Process Documents from Data" from_port="example set" to_op="Validation" to_port="training"/>
     <connect from_op="Validation" from_port="model" to_port="result 1"/>
     <connect from_op="Validation" from_port="training" to_port="result 2"/>
     <connect from_op="Validation" from_port="averagable 1" to_port="result 3"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
     <portSpacing port="sink_result 3" spacing="0"/>
     <portSpacing port="sink_result 4" spacing="0"/>
   </process>
 </operator>
</process>
Is there a problem with how I am applying Naive Bayes, or is it a problem with my data? I also tried it with only two classes (true/false), but it shows the same result. Sometimes the "Correct" class yields a low class recall. However, it did give proper results when I used an SVM (with two classes).


Thanks in advance!
Regards, Yvan

Answers

  • awchisholmawchisholm RapidMiner Certified Expert, Member Posts: 458 Unicorn
    Hello

    Difficult to know without the data but the output from the "Process Documents from Data" operator includes the original text so this might interfere with the classifier. Try clearing the "keep text" check box.

    regards

    Andrew
  • yvncruzyvncruz Member Posts: 8 Contributor II
    Hello Andrew!

    Thank you for the reply! Unfortunately, unticking "Keep Text" in the Process Documents from Data operator didn't work. It seems to me that either I didn't do the pre-processing right or my data set isn't that good.

    Here is a link to my data set: https://www.dropbox.com/s/ial79zraxsn5svh/Training%20Set.xlsx?dl=0
    (If it's not allowed to link this, please notify me.)

    Is there a procedure which I have to follow in order for naive bayes to work properly?

    Best Regards, Yvan
  • yvncruzyvncruz Member Posts: 8 Contributor II
    Update.

    I tried using different Naive Bayes operators from Weka, specifically "W-NaiveBayesUpdateable". I also changed the pre-processing by adding the "Filter Stopwords" operator.

    The results now show high class recall for the "Uncertain" and "Wrong" classes but low class recall for the "Correct" class.

    Here is the data set I am using: https://www.dropbox.com/s/ial79zraxsn5svh/Training%20Set.xlsx?dl=0

    The process I am currently running:
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- RapidMiner process: reads labelled text from an Excel sheet, converts the text column into
         token-based attributes (with stop word filtering and stemming), and cross-validates a
         Weka Naive Bayes learner on the result. -->
    <process version="5.3.013">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
        <parameter key="logverbosity" value="off"/>
        <process expanded="true">
          <!-- Step 1: import cells A1:B1274 of the workbook. The meta data list names column 0 "Text"
               and column 1 "Label"; the annotation on row 0 is "Name", presumably marking the header row. -->
          <operator activated="true" class="read_excel" compatibility="5.3.013" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
            <parameter key="excel_file" value="C:\Users\user1.user\Desktop\Training Set.xlsx"/>
            <parameter key="imported_cell_range" value="A1:B1274"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="Text.true.text.attribute"/>
              <parameter key="1" value="Label.true.text.label"/>
            </list>
            <parameter key="read_not_matching_values_as_missings" value="false"/>
          </operator>
          <!-- Step 2: attribute selection; the only parameter set here is include_special_attributes. -->
          <operator activated="true" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="30">
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Step 3: give the "Label" attribute the role "label" so that it is the prediction target. -->
          <operator activated="true" class="set_role" compatibility="5.3.013" expanded="true" height="76" name="Set Role" width="90" x="313" y="30">
            <parameter key="attribute_name" value="Label"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Step 4: text processing. The original text is kept (keep_text = true).
               NOTE(review): absolute pruning thresholds (2 and 9999) are set, but no prune_method
               parameter appears in this block; confirm that the thresholds are actually in effect. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="447" y="30">
            <parameter key="keep_text" value="true"/>
            <parameter key="prune_below_absolute" value="2"/>
            <parameter key="prune_above_absolute" value="9999"/>
            <list key="specify_weights">
              <parameter key="Text" value="1.0"/>
            </list>
            <process expanded="true">
              <!-- Token pipeline, in connection order: Tokenize, Transform Cases,
                   Filter Stopwords (English), Filter Tokens (by Length), Stem (Snowball).
                   Stop words are filtered BEFORE stemming so the filter sees whole words: a stemmed
                   token (for example a stem such as "becaus" for "because") may no longer equal its
                   entry in the stop word list and would then pass through unfiltered. -->
              <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="112" y="75"/>
              <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="165"/>
              <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="246" y="75"/>
              <!-- Length filter: min_chars is 1 and no max_chars is set in this block. -->
              <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="246" y="165">
                <parameter key="min_chars" value="1"/>
              </operator>
              <operator activated="true" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (Snowball)" width="90" x="380" y="165"/>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
              <connect from_op="Stem (Snowball)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Step 5: cross-validation with a local random seed. The first subprocess trains the
               learner, the second applies the model to the test set and measures performance. -->
          <operator activated="true" class="x_validation" compatibility="5.3.013" expanded="true" height="112" name="Validation" width="90" x="514" y="120">
            <parameter key="use_local_random_seed" value="true"/>
            <process expanded="true">
              <!-- Training side: only "W-NaiveBayesUpdateable" is activated and connected.
                   Naive Bayes, W-NaiveBayes, Decision Tree and W-ComplementNaiveBayes are kept
                   in the process but deactivated (activated="false"). -->
              <operator activated="false" class="naive_bayes" compatibility="5.3.013" expanded="true" height="76" name="Naive Bayes" width="90" x="45" y="30"/>
              <operator activated="false" class="weka:W-NaiveBayes" compatibility="5.3.001" expanded="true" height="76" name="W-NaiveBayes" width="90" x="179" y="120">
                <parameter key="S" value="4.0"/>
              </operator>
              <operator activated="false" class="decision_tree" compatibility="5.3.013" expanded="true" height="76" name="Decision Tree" width="90" x="45" y="120">
                <parameter key="criterion" value="information_gain"/>
                <parameter key="minimal_gain" value="0.25"/>
                <parameter key="maximal_depth" value="-1"/>
                <parameter key="confidence" value="0.14"/>
              </operator>
              <operator activated="false" class="weka:W-ComplementNaiveBayes" compatibility="5.3.001" expanded="true" height="76" name="W-ComplementNaiveBayes" width="90" x="112" y="210">
                <parameter key="S" value="4.5"/>
              </operator>
              <operator activated="true" class="weka:W-NaiveBayesUpdateable" compatibility="5.3.001" expanded="true" height="76" name="W-NaiveBayesUpdateable" width="90" x="179" y="30">
                <parameter key="K" value="true"/>
                <parameter key="O" value="true"/>
              </operator>
              <connect from_port="training" to_op="W-NaiveBayesUpdateable" to_port="training set"/>
              <connect from_op="W-NaiveBayesUpdateable" from_port="model" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <process expanded="true">
              <!-- Testing side: Apply Model labels the test set, Performance reports accuracy
                   (main criterion), classification error and kappa. -->
              <operator activated="true" class="apply_model" compatibility="5.3.013" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <operator activated="true" class="performance_classification" compatibility="5.3.013" expanded="true" height="76" name="Performance" width="90" x="208" y="30">
                <parameter key="main_criterion" value="accuracy"/>
                <parameter key="classification_error" value="true"/>
                <parameter key="kappa" value="true"/>
                <list key="class_weights"/>
              </operator>
              <!-- Store writes the model passed through Apply Model to the repository.
                   NOTE(review): this operator sits inside the testing subprocess, so it presumably runs
                   for every validation iteration and overwrites the same entry each time; confirm that
                   this is intended rather than storing the model delivered by the Validation operator. -->
              <operator activated="true" class="store" compatibility="5.3.013" expanded="true" height="60" name="Store" width="90" x="45" y="120">
                <parameter key="repository_entry" value="../data/AsthmaDiabetesModel"/>
              </operator>
              <connect from_port="model" to_op="Apply Model" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
              <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Apply Model" from_port="model" to_op="Store" to_port="input"/>
              <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <!-- Deactivated and not connected to the main flow: a grid search over the N and S
               parameters of "W-NaiveBayes" whose subprocess currently contains only a Log operator. -->
          <operator activated="false" class="optimize_parameters_grid" compatibility="5.3.013" expanded="true" height="94" name="Optimize Parameters (Grid)" width="90" x="246" y="165">
            <list key="parameters">
              <parameter key="W-NaiveBayes.N" value="true,false"/>
              <parameter key="W-NaiveBayes.S" value="[1.0;10.0;30;linear]"/>
            </list>
            <process expanded="true">
              <operator activated="true" class="log" compatibility="5.3.013" expanded="true" height="76" name="Log" width="90" x="313" y="75">
                <list key="log">
                  <parameter key="Performance" value="operator.Performance.value.accuracy"/>
                  <parameter key="S Value" value="operator.W-NaiveBayes.parameter.S"/>
                  <parameter key="N Value" value="operator.W-NaiveBayes.parameter.N"/>
                </list>
              </operator>
              <connect from_op="Log" from_port="through 1" to_port="performance"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="sink_performance" spacing="0"/>
              <portSpacing port="sink_result 1" spacing="0"/>
            </process>
          </operator>
          <!-- Main flow: Read Excel, Select Attributes, Set Role, Process Documents from Data, Validation.
               Process results: 1 = model, 2 = training example set, 3 = averaged performance. -->
          <connect from_op="Read Excel" from_port="output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Validation" to_port="training"/>
          <connect from_op="Validation" from_port="model" to_port="result 1"/>
          <connect from_op="Validation" from_port="training" to_port="result 2"/>
          <connect from_op="Validation" from_port="averagable 1" to_port="result 3"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
        </process>
      </operator>
    </process>
    Any help would be appreciated! Thank you!

    Best Regards,

    Yvan
  • awchisholmawchisholm RapidMiner Certified Expert, Member Posts: 458 Unicorn
    Hello

    My mistake, the inclusion of the text doesn't make a difference, because it is a special attribute and so is ignored later.

    The process is fine - the issue you are facing may be that the data doesn't support good classification performance with Naive Bayes.

    You could try generating term n-grams and adding a suitable selection operator.

    regards

    Andrew
  • yvncruzyvncruz Member Posts: 8 Contributor II
    Hello Andrew

    It seems that my data set is at fault here. The accuracy increased when I applied different classifiers such as Decision Tree or K-NN.

    Thank you for your inputs!

    Best Regards,
    Yvan
Sign In or Register to comment.