[SOLVED] Training a classifier to have an else statement

yvncruzyvncruz Member Posts: 8 Contributor I
edited November 2019 in Help
Hi everyone!

I'm trying to train a classifier so that it can classify data as true, false or "unclassified" depending on the data set. If the data falls within, or is related to, the data set, it should be classified accordingly. If not, it should be classified as "unclassified".

Currently I only have a process that can classify true/false. Is there a process in RapidMiner in which I can include the unclassified class, or do I need to add "unclassified" data to my data set for it to work?

Our data is about health beliefs involving asthma and diabetes. Whenever it is not included in our data set, it should be unclassified.

Sample data:
Tweet                                              Category
diabetes causes heart problems false
Diabetes causes shingles         false
Dirt treats Asthma                       false
oatmeal medicates diabetes           true
Obesity causes Asthma        true
Obesity causes Diabetes        true
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Training process for the true/false text classifier.

  Main flow: Read Excel, Select Attributes, Set Role (Category as label),
  Process Documents from Data, Validation (x_validation around an SVM), Store.
  The document subprocess runs Tokenize, Transform Cases,
  Filter Tokens (by Length) with 3 to 999 characters, and Stem (Snowball).

  Results: result 1 = model, result 2 = performance vector, result 3 = training data.
-->
<process version="5.3.013">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
   <process expanded="true">
     <operator activated="true" class="read_excel" compatibility="5.3.013" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
       <parameter key="excel_file" value="C:\Users\user1.user\Desktop\Training Set.xlsx"/>
       <parameter key="imported_cell_range" value="A1:B77"/>
       <list key="annotations">
         <parameter key="0" value="Name"/>
       </list>
       <list key="data_set_meta_data_information">
         <parameter key="0" value="Text.true.text.attribute"/>
         <parameter key="1" value="Category.true.binominal.label"/>
       </list>
       <parameter key="read_not_matching_values_as_missings" value="false"/>
     </operator>
     <operator activated="true" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="30">
       <parameter key="include_special_attributes" value="true"/>
     </operator>
     <operator activated="true" class="set_role" compatibility="5.3.013" expanded="true" height="76" name="Set Role" width="90" x="313" y="30">
       <parameter key="attribute_name" value="Category"/>
       <parameter key="target_role" value="label"/>
       <list key="set_additional_roles"/>
     </operator>
     <operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="447" y="30">
       <parameter key="keep_text" value="true"/>
       <parameter key="prune_below_absolute" value="2"/>
       <parameter key="prune_above_absolute" value="9999"/>
       <list key="specify_weights">
         <parameter key="Text" value="1.0"/>
       </list>
       <process expanded="true">
         <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="112" y="75"/>
         <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="165"/>
         <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="246" y="75">
           <parameter key="min_chars" value="3"/>
           <parameter key="max_chars" value="999"/>
         </operator>
         <operator activated="true" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (Snowball)" width="90" x="246" y="165"/>
         <connect from_port="document" to_op="Tokenize" to_port="document"/>
         <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
         <connect from_op="Transform Cases" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
         <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
         <connect from_op="Stem (Snowball)" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <operator activated="true" class="x_validation" compatibility="5.3.013" expanded="true" height="112" name="Validation" width="90" x="581" y="30">
       <!-- Training subprocess: only the SVM is active; the other learners are kept deactivated for comparison runs. -->
       <process expanded="true">
         <operator activated="true" class="support_vector_machine" compatibility="5.3.013" expanded="true" height="112" name="SVM" width="90" x="45" y="30">
           <parameter key="kernel_cache" value="80"/>
         </operator>
         <operator activated="false" class="naive_bayes" compatibility="5.3.013" expanded="true" height="76" name="Naive Bayes" width="90" x="45" y="165"/>
         <operator activated="false" class="decision_tree" compatibility="5.3.013" expanded="true" height="76" name="Decision Tree" width="90" x="179" y="165"/>
         <operator activated="false" class="weka:W-J48" compatibility="5.3.001" expanded="true" height="76" name="W-J48" width="90" x="45" y="255"/>
         <connect from_port="training" to_op="SVM" to_port="training set"/>
         <connect from_op="SVM" from_port="model" to_port="model"/>
         <portSpacing port="source_training" spacing="0"/>
         <portSpacing port="sink_model" spacing="0"/>
         <portSpacing port="sink_through 1" spacing="0"/>
       </process>
       <!-- Testing subprocess: applies the fold model and measures accuracy, classification error and kappa. -->
       <process expanded="true">
         <operator activated="true" class="apply_model" compatibility="5.3.013" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
           <list key="application_parameters"/>
         </operator>
         <operator activated="true" class="performance_classification" compatibility="5.3.013" expanded="true" height="76" name="Performance" width="90" x="213" y="30">
           <parameter key="main_criterion" value="accuracy"/>
           <parameter key="classification_error" value="true"/>
           <parameter key="kappa" value="true"/>
           <list key="class_weights"/>
         </operator>
         <connect from_port="model" to_op="Apply Model" to_port="model"/>
         <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
         <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
         <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
         <portSpacing port="source_model" spacing="0"/>
         <portSpacing port="source_test set" spacing="0"/>
         <portSpacing port="source_through 1" spacing="0"/>
         <portSpacing port="sink_averagable 1" spacing="0"/>
         <portSpacing port="sink_averagable 2" spacing="0"/>
       </process>
     </operator>
     <!--
       Store takes the model from Validation's model output port instead of sitting inside
       the testing subprocess. Inside the testing subprocess it ran once per fold and
       overwrote the entry each time, so the saved model was a fold model trained on a subset.
     -->
     <operator activated="true" class="store" compatibility="5.3.013" expanded="true" height="60" name="Store" width="90" x="715" y="30">
       <parameter key="repository_entry" value="../data/AsthmaDiabetesModel"/>
     </operator>
     <connect from_op="Read Excel" from_port="output" to_op="Select Attributes" to_port="example set input"/>
     <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
     <connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
     <connect from_op="Process Documents from Data" from_port="example set" to_op="Validation" to_port="training"/>
     <connect from_op="Validation" from_port="model" to_op="Store" to_port="input"/>
     <connect from_op="Store" from_port="through" to_port="result 1"/>
     <connect from_op="Validation" from_port="training" to_port="result 3"/>
     <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
     <portSpacing port="sink_result 3" spacing="0"/>
     <portSpacing port="sink_result 4" spacing="0"/>
   </process>
 </operator>
</process>

Thanks in advance!
Yvan
Tagged:

Answers

  • mschmitzmschmitz Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,249 RM Data Scientist
    Hello,

    what you could do is relabel everything with a confidence between 0.4 and 0.6 as "unknown". The effect would be that you call everything where the classifier is unsure "unknown". Is this what you want?

    The Drop Uncertain Predictions operator may help you in this case.
    - Head of Data Science Services at RapidMiner -
    Dortmund, Germany
  • yvncruzyvncruz Member Posts: 8 Contributor I
    Yes! That is what I'd like to happen. The model should classify unsure data as "unknown". Thank you for your input! I will try it out and see how it works. Thanks!
  • mschmitzmschmitz Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,249 RM Data Scientist
    I've done something similar in this thread: http://rapid-i.com/rapidforum/index.php/topic,8416.0.html

    But in that case the third class was already available. It might help you.

    If you need further help, just post here, then I'll create an example process.
    - Head of Data Science Services at RapidMiner -
    Dortmund, Germany
  • yvncruzyvncruz Member Posts: 8 Contributor I
    Hello,

    Thank you for your help. I've read through the link you gave me. I might be able to use Generate Attributes with expressions to handle this problem.

    The problem I'm facing right now is which values I should use as the minimum and maximum confidence. The data set I'm trying to use is diverse compared to the data I'm trying to categorize (predictions with confidence values like 0.550 are still correct).

    Also, with regard to having three classes: I'm not able to use three classes, since our thesis is focused on three classifiers (Naive Bayes, SVM). I tried using Naive Bayes, but it gives confidence values of 1 for almost all of the examples.

    Thank you for your help! We're really new to RapidMiner, so your help is greatly appreciated.

    Best,
    Yvan
  • mschmitzmschmitz Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,249 RM Data Scientist
    I would go for Drop Uncertain in a cross validation.

    You can then have a loop or Optimize operator around and simply try out every cut on the confidences (and your SVM parameters).

    Attached is a process doing it on Sonar. It delivers the best performance and logs all performances for the different values at Drop Uncertain Predictions.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      Example process: grid search over the confidence cut of Drop Uncertain Predictions.

      Retrieve Sonar feeds Optimize Parameters (Grid). For every grid value a cross-validation
      trains Naive Bayes, applies the model, drops predictions below the current
      min_confidence and measures performance on what is left. The Log operator records
      the cut together with the resulting performance; the optimizer's performance
      output is delivered at result 1.
    -->
    <process version="6.1.001-SNAPSHOT">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.1.001-SNAPSHOT" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="retrieve" compatibility="6.1.001-SNAPSHOT" expanded="true" height="60" name="Retrieve Sonar" width="90" x="45" y="30">
            <parameter key="repository_entry" value="//Samples/data/Sonar"/>
          </operator>
          <operator activated="true" class="optimize_parameters_grid" compatibility="6.1.001-SNAPSHOT" expanded="true" height="94" name="Optimize Parameters (Grid)" width="90" x="179" y="30">
            <list key="parameters">
              <!--
                The grid starts at 0.5 rather than 0.3. Assuming a two-class label, the predicted
                class always has a confidence of at least 0.5, so any cut below 0.5 drops nothing
                and only repeats the no-drop baseline. TODO confirm the range for other data.
              -->
              <parameter key="Drop Uncertain Predictions.min_confidence" value="[0.5;0.7;10;linear]"/>
            </list>
            <process expanded="true">
              <operator activated="true" class="x_validation" compatibility="5.0.000" expanded="true" height="112" name="Validation" width="90" x="45" y="30">
                <description>A cross-validation evaluating a Naive Bayes model after dropping uncertain predictions.</description>
                <process expanded="true">
                  <operator activated="true" class="naive_bayes" compatibility="6.1.001-SNAPSHOT" expanded="true" height="76" name="Naive Bayes" width="90" x="45" y="30"/>
                  <connect from_port="training" to_op="Naive Bayes" to_port="training set"/>
                  <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
                  <portSpacing port="source_training" spacing="0"/>
                  <portSpacing port="sink_model" spacing="0"/>
                  <portSpacing port="sink_through 1" spacing="0"/>
                </process>
                <process expanded="true">
                  <operator activated="true" class="apply_model" compatibility="5.0.000" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
                    <list key="application_parameters"/>
                  </operator>
                  <!-- The min_confidence written here is only a placeholder; the grid above sets it on every run. -->
                  <operator activated="true" class="drop_uncertain_predictions" compatibility="6.1.001-SNAPSHOT" expanded="true" height="76" name="Drop Uncertain Predictions" width="90" x="179" y="30">
                    <parameter key="min_confidence" value="0.7"/>
                    <list key="min_confidences"/>
                  </operator>
                  <operator activated="true" class="performance" compatibility="5.0.000" expanded="true" height="76" name="Performance" width="90" x="380" y="30"/>
                  <connect from_port="model" to_op="Apply Model" to_port="model"/>
                  <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
                  <connect from_op="Apply Model" from_port="labelled data" to_op="Drop Uncertain Predictions" to_port="example set input"/>
                  <connect from_op="Drop Uncertain Predictions" from_port="example set output" to_op="Performance" to_port="labelled data"/>
                  <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
                  <portSpacing port="source_model" spacing="0"/>
                  <portSpacing port="source_test set" spacing="0"/>
                  <portSpacing port="source_through 1" spacing="0"/>
                  <portSpacing port="sink_averagable 1" spacing="0"/>
                  <portSpacing port="sink_averagable 2" spacing="0"/>
                </process>
              </operator>
              <!-- One log row per grid point: the confidence cut in use and the performance value of Validation. -->
              <operator activated="true" class="log" compatibility="6.1.001-SNAPSHOT" expanded="true" height="76" name="Log" width="90" x="246" y="75">
                <list key="log">
                  <parameter key="Confidence Cut" value="operator.Drop Uncertain Predictions.parameter.min_confidence"/>
                  <parameter key="Performance" value="operator.Validation.value.performance"/>
                </list>
              </operator>
              <connect from_port="input 1" to_op="Validation" to_port="training"/>
              <connect from_op="Validation" from_port="averagable 1" to_op="Log" to_port="through 1"/>
              <connect from_op="Log" from_port="through 1" to_port="performance"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_performance" spacing="0"/>
              <portSpacing port="sink_result 1" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Retrieve Sonar" from_port="output" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
          <connect from_op="Optimize Parameters (Grid)" from_port="performance" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    - Head of Data Science Services at RapidMiner -
    Dortmund, Germany
  • yvncruzyvncruz Member Posts: 8 Contributor I
    Hello!

    Again, thank you for the help! With regard to applying the model (i.e. testing it with categorized data), will it still be able to drop uncertain data, or do we have to add the same operator?

    Here is the process we use to test the model that we created.
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      Scoring process: retrieves the stored AsthmaDiabetesModel, reads the texts to classify
      from Classify.xlsx (column A), turns them into word vectors, applies the model and
      then drops uncertain predictions. The result is delivered at result 1.

      NOTE(review): the document subprocess filters English stopwords and the word list
      input of Process Documents from Data is not connected, so the vocabulary is built
      from the scored file itself. Confirm that the resulting attributes match the ones
      the stored model was trained on.
    -->
    <process version="5.3.013">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
        <parameter key="logverbosity" value="off"/>
        <process expanded="true">
          <operator activated="true" class="retrieve" compatibility="5.3.013" expanded="true" height="60" name="Retrieve" width="90" x="179" y="30">
            <parameter key="repository_entry" value="//Local Repository/data/AsthmaDiabetesModel"/>
          </operator>
          <operator activated="true" class="read_excel" compatibility="5.3.013" expanded="true" height="60" name="Read Excel" width="90" x="45" y="120">
            <parameter key="excel_file" value="C:\Users\user1.user\Desktop\Classify.xlsx"/>
            <parameter key="imported_cell_range" value="A1:A90000"/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="TEXT.true.text.attribute"/>
            </list>
          </operator>
          <operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="179" y="120">
            <parameter key="keep_text" value="true"/>
            <list key="specify_weights">
              <parameter key="Text" value="1.0"/>
              <parameter key="Category" value="1.0"/>
            </list>
            <process expanded="true">
              <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (2)" width="90" x="246" y="210"/>
              <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="246" y="120"/>
              <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="380" y="120">
                <parameter key="min_chars" value="3"/>
                <parameter key="max_chars" value="999"/>
              </operator>
              <operator activated="true" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (2)" width="90" x="380" y="210"/>
              <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="514" y="210"/>
              <connect from_port="document" to_op="Transform Cases (2)" to_port="document"/>
              <connect from_op="Transform Cases (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
              <connect from_op="Filter Tokens (2)" from_port="document" to_op="Stem (2)" to_port="document"/>
              <connect from_op="Stem (2)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="apply_model" compatibility="5.3.013" expanded="true" height="76" name="Apply Model" width="90" x="313" y="30">
            <list key="application_parameters"/>
            <parameter key="create_view" value="true"/>
          </operator>
          <!--
            The confidence cut is a separate operator and is not part of the stored model, so it
            has to be repeated here after Apply Model. 0.6 is a starting value for a two-class
            label (it treats confidences between 0.4 and 0.6 as uncertain); replace it with the
            cut chosen during validation.
          -->
          <operator activated="true" class="drop_uncertain_predictions" compatibility="5.3.013" expanded="true" height="76" name="Drop Uncertain Predictions" width="90" x="447" y="30">
            <parameter key="min_confidence" value="0.6"/>
            <list key="min_confidences"/>
          </operator>
          <operator activated="false" class="performance_classification" compatibility="5.3.013" expanded="true" height="76" name="Performance" width="90" x="380" y="255">
            <list key="class_weights"/>
          </operator>
          <operator activated="false" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes" width="90" x="514" y="255">
            <parameter key="attribute_filter_type" value="value_type"/>
            <parameter key="value_type" value="binominal"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <connect from_op="Retrieve" from_port="output" to_op="Apply Model" to_port="model"/>
          <connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Drop Uncertain Predictions" to_port="example set input"/>
          <connect from_op="Drop Uncertain Predictions" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    Again, thank you! We will be using the example process you gave as a template to test different values for our parameters. Thank you!

    Best Regards,
    Yvan
Sign In or Register to comment.