Options

"[SOLVED]Using PCA with Test Set Getting Error"

NonaNona Member Posts: 15 Contributor II
edited June 2019 in Help
Hello,
I am working on a dataset with a large number of attributes (590), and the dataset also has a class imbalance problem. It is a binomial classification problem. I am trying to do PCA first and then use the reduced dataset for cross-validation with a decision tree. Then I grouped these two models and applied them to the test set. But RapidMiner gives an error at the Apply Model operator saying: "The setup does not seem to contain any obvious errors, but you should check the log messages or activate the debug mode in the settings dialog in order to get more information about this problem." When I checked the log, the last line was:
SEVERE: java.lang.NullPointerException. Please help me with that  ::) . Here is my XML code:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process as posted by the question author.
  Flow (see the <connect> elements at the end of the main process):
    1. Two training example sets are retrieved and appended.
    2. PCA is fitted on the appended data; its output feeds a cross-validation
       that trains and evaluates a Decision Tree.
    3. The PCA preprocessing model and the model from the validation are grouped
       and applied to a separately retrieved test set, whose performance is
       reported as the second result.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- Test-set branch: retrieved and materialized, later used as the "unlabelled data" input of applyontestdata. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve semiconductortesting" width="90" x="45" y="255">
        <parameter key="repository_entry" value="../data/semiconductor/semiconductortesting"/>
      </operator>
      <operator activated="true" class="materialize_data" compatibility="5.3.015" expanded="true" height="76" name="Materialize Data (2)" width="90" x="246" y="255"/>
      <!-- Training branch: the two repository entries below are combined by Append and then materialized. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve kmedoid_undersampled_data" width="90" x="45" y="30">
        <parameter key="repository_entry" value="../data/semiconductor/kmedoid_undersampled_data"/>
      </operator>
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve SMOTE_oversampled_data" width="90" x="45" y="120">
        <parameter key="repository_entry" value="../data/semiconductor/SMOTE_oversampled_data"/>
      </operator>
      <operator activated="true" class="append" compatibility="5.3.015" expanded="true" height="94" name="Append" width="90" x="179" y="30"/>
      <operator activated="true" class="materialize_data" compatibility="5.3.015" expanded="true" height="76" name="Materialize Data" width="90" x="313" y="30"/>
      <!-- Shuffle is deactivated and has no connections in this process. -->
      <operator activated="false" class="shuffle" compatibility="5.3.015" expanded="true" height="76" name="Shuffle" width="90" x="983" y="210">
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="532"/>
      </operator>
      <!--
        PCA on the appended training data. Its "preprocessing model" output is what
        later gets grouped with the tree model.
        NOTE(review): no dimensionality_reduction parameter is set in this XML, so it
        stays at its default; confirm that number_of_components=250 is actually
        honoured under that default.
      -->
      <operator activated="true" class="principal_component_analysis" compatibility="5.3.015" expanded="true" height="94" name="PCA" width="90" x="447" y="30">
        <parameter key="number_of_components" value="250"/>
      </operator>
      <!-- Cross-validation with number_of_validations=2 and shuffled sampling, run on the PCA output. -->
      <operator activated="true" class="x_validation" compatibility="5.3.015" expanded="true" height="112" name="Validation" width="90" x="581" y="30">
        <parameter key="number_of_validations" value="2"/>
        <parameter key="sampling_type" value="shuffled sampling"/>
        <!-- First subprocess: trains a Decision Tree (no parameters set in this XML) on the training port. -->
        <process expanded="true">
          <operator activated="true" class="decision_tree" compatibility="5.3.015" expanded="true" height="76" name="Decision Tree" width="90" x="133" y="30"/>
          <connect from_port="training" to_op="Decision Tree" to_port="training set"/>
          <connect from_op="Decision Tree" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!--
          Second subprocess: applies the tree to the "test set" port and measures performance.
          Only "trainingperformance (3)" is wired up; the other three performance
          operators are deactivated and have no connections.
        -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="ApplyDEcisiontree" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.3.015" expanded="true" height="76" name="trainingperformance (3)" width="90" x="179" y="30"/>
          <operator activated="false" class="performance_binominal_classification" compatibility="5.3.015" expanded="true" height="76" name="trainingperformance (2)" width="90" x="112" y="165">
            <parameter key="AUC" value="true"/>
            <parameter key="f_measure" value="true"/>
          </operator>
          <operator activated="false" class="performance_costs" compatibility="5.3.015" expanded="true" height="76" name="trainingcostperformance" width="90" x="246" y="165">
            <parameter key="cost_matrix" value="[0.0 4.0;1.0 0.0]"/>
            <enumeration key="class_order_definition">
              <parameter key="class_name" value="-1.0"/>
              <parameter key="class_name" value="1.0"/>
            </enumeration>
          </operator>
          <operator activated="false" breakpoints="after" class="performance_classification" compatibility="5.3.015" expanded="true" height="76" name="trainingperformance" width="90" x="380" y="255">
            <parameter key="main_criterion" value="accuracy"/>
            <parameter key="classification_error" value="true"/>
            <parameter key="weighted_mean_recall" value="true"/>
            <parameter key="weighted_mean_precision" value="true"/>
            <parameter key="root_mean_squared_error" value="true"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="ApplyDEcisiontree" to_port="model"/>
          <connect from_port="test set" to_op="ApplyDEcisiontree" to_port="unlabelled data"/>
          <connect from_op="ApplyDEcisiontree" from_port="labelled data" to_op="trainingperformance (3)" to_port="labelled data"/>
          <connect from_op="trainingperformance (3)" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- Group Models receives the PCA preprocessing model on "models in 1" and the Validation model on "models in 2". -->
      <operator activated="true" class="group_models" compatibility="5.3.015" expanded="true" height="94" name="Group Models" width="90" x="313" y="165"/>
      <!-- Applies the grouped model to the materialized test set; the thread identifies this operator as the one raising the error. -->
      <operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="applyontestdata" width="90" x="447" y="255">
        <list key="application_parameters"/>
      </operator>
      <operator activated="true" class="performance" compatibility="5.3.015" expanded="true" height="76" name="PerformanceTestdata" width="90" x="581" y="210"/>
      <!-- Main-process wiring. "result 1" is the cross-validation performance, "result 2" the test-set performance. -->
      <connect from_op="Retrieve semiconductortesting" from_port="output" to_op="Materialize Data (2)" to_port="example set input"/>
      <connect from_op="Materialize Data (2)" from_port="example set output" to_op="applyontestdata" to_port="unlabelled data"/>
      <connect from_op="Retrieve kmedoid_undersampled_data" from_port="output" to_op="Append" to_port="example set 1"/>
      <connect from_op="Retrieve SMOTE_oversampled_data" from_port="output" to_op="Append" to_port="example set 2"/>
      <connect from_op="Append" from_port="merged set" to_op="Materialize Data" to_port="example set input"/>
      <connect from_op="Materialize Data" from_port="example set output" to_op="PCA" to_port="example set input"/>
      <connect from_op="PCA" from_port="example set output" to_op="Validation" to_port="training"/>
      <connect from_op="PCA" from_port="preprocessing model" to_op="Group Models" to_port="models in 1"/>
      <connect from_op="Validation" from_port="model" to_op="Group Models" to_port="models in 2"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
      <connect from_op="Group Models" from_port="model out" to_op="applyontestdata" to_port="model"/>
      <connect from_op="applyontestdata" from_port="labelled data" to_op="PerformanceTestdata" to_port="labelled data"/>
      <connect from_op="PerformanceTestdata" from_port="performance" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Answers

  • Options
    JEdwardJEdward RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 578 Unicorn
    Activate debug mode by right clicking in your process window and selecting All Breakpoints (debug) mode.  This will help you to identify which operator is causing the problem.  Is it before the PCA or after the PCA where the error occurs (for example). 

  • Options
    NonaNona Member Posts: 15 Contributor II
    It is the Apply Model operator that I used to apply the grouped models to the test set.
  • Options
    JEdwardJEdward RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 578 Unicorn
    Thanks for that. 

    What are the differences in attributes between your training & test datasets? 
    When I generated some sample data to test your process, it worked without an issue for me, but I know that if the features aren't the same for the training & test sets then it might cause problems when trying to apply the models. 

    To test this, try connecting your original (training) data to the Apply Model operator to see if it still causes errors.  If it doesn't, then you know you need to match the features on both sides.  You could use an operator like 'Superset' to do this, but this will only add the attributes; it's better to delve in and have a look at the two datasets. 
    Likely it's something really simple like a field stored as polynominal in the training data is numerical in the test data. 
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      Diagnostic variant of the original process.
      The test-set Retrieve is deactivated and unconnected; instead a Multiply
      operator sends the appended training data both to the PCA / validation
      branch and to applyontestdata, so the grouped model is applied to the same
      data it was built from.
    -->
    <process version="6.4.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
        <process expanded="true">
          <!-- Deactivated: the separate test set is not used in this variant (no connection starts from this operator). -->
          <operator activated="false" class="retrieve" compatibility="6.4.000" expanded="true" height="60" name="Retrieve semiconductortesting" width="90" x="45" y="255">
            <parameter key="repository_entry" value="../data/semiconductor/semiconductortesting"/>
          </operator>
          <!-- The two training repository entries that Append combines. -->
          <operator activated="true" class="retrieve" compatibility="6.4.000" expanded="true" height="60" name="Retrieve kmedoid_undersampled_data" width="90" x="45" y="30">
            <parameter key="repository_entry" value="../data/semiconductor/kmedoid_undersampled_data"/>
          </operator>
          <operator activated="true" class="retrieve" compatibility="6.4.000" expanded="true" height="60" name="Retrieve SMOTE_oversampled_data" width="90" x="45" y="120">
            <parameter key="repository_entry" value="../data/semiconductor/SMOTE_oversampled_data"/>
          </operator>
          <!-- Shuffle is deactivated and has no connections in this process. -->
          <operator activated="false" class="shuffle" compatibility="6.4.000" expanded="true" height="76" name="Shuffle" width="90" x="983" y="210">
            <parameter key="use_local_random_seed" value="true"/>
            <parameter key="local_random_seed" value="532"/>
          </operator>
          <operator activated="true" class="append" compatibility="6.4.000" expanded="true" height="94" name="Append" width="90" x="179" y="30"/>
          <!-- Multiply: "output 1" goes to the training branch (Materialize Data, then PCA), "output 2" to the apply branch (Materialize Data (2), then applyontestdata). -->
          <operator activated="true" class="multiply" compatibility="6.4.000" expanded="true" height="94" name="Multiply" width="90" x="45" y="210">
            <description align="center" color="transparent" colored="false" width="126">This multiply forces sends the training data to be tested rather than the test data.</description>
          </operator>
          <operator activated="true" class="materialize_data" compatibility="6.4.000" expanded="true" height="76" name="Materialize Data (2)" width="90" x="179" y="300"/>
          <operator activated="true" class="materialize_data" compatibility="6.4.000" expanded="true" height="76" name="Materialize Data" width="90" x="313" y="30"/>
          <!--
            PCA on the training copy; its "preprocessing model" output is grouped with the tree model.
            NOTE(review): no dimensionality_reduction parameter is set in this XML, so it
            stays at its default; confirm that number_of_components=250 is actually
            honoured under that default.
          -->
          <operator activated="true" class="principal_component_analysis" compatibility="6.4.000" expanded="true" height="94" name="PCA" width="90" x="447" y="30">
            <parameter key="number_of_components" value="250"/>
          </operator>
          <!-- Cross-validation with number_of_validations=2 and shuffled sampling, run on the PCA output. -->
          <operator activated="true" class="x_validation" compatibility="6.4.000" expanded="true" height="112" name="Validation" width="90" x="581" y="30">
            <parameter key="number_of_validations" value="2"/>
            <parameter key="sampling_type" value="shuffled sampling"/>
            <!-- First subprocess: trains a Decision Tree (no parameters set in this XML) on the training port. -->
            <process expanded="true">
              <operator activated="true" class="decision_tree" compatibility="6.4.000" expanded="true" height="76" name="Decision Tree" width="90" x="133" y="30"/>
              <connect from_port="training" to_op="Decision Tree" to_port="training set"/>
              <connect from_op="Decision Tree" from_port="model" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <!--
              Second subprocess: applies the tree to the "test set" port and measures performance.
              Only "trainingperformance (3)" is wired up; the other three performance
              operators are deactivated and have no connections.
            -->
            <process expanded="true">
              <operator activated="true" class="apply_model" compatibility="6.4.000" expanded="true" height="76" name="ApplyDEcisiontree" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <operator activated="true" class="performance" compatibility="6.4.000" expanded="true" height="76" name="trainingperformance (3)" width="90" x="179" y="30"/>
              <operator activated="false" class="performance_binominal_classification" compatibility="6.4.000" expanded="true" height="76" name="trainingperformance (2)" width="90" x="112" y="165">
                <parameter key="AUC" value="true"/>
                <parameter key="f_measure" value="true"/>
              </operator>
              <operator activated="false" class="performance_costs" compatibility="6.4.000" expanded="true" height="76" name="trainingcostperformance" width="90" x="246" y="165">
                <parameter key="cost_matrix" value="[0.0 4.0;1.0 0.0]"/>
                <enumeration key="class_order_definition">
                  <parameter key="class_name" value="-1.0"/>
                  <parameter key="class_name" value="1.0"/>
                </enumeration>
              </operator>
              <operator activated="false" breakpoints="after" class="performance_classification" compatibility="6.4.000" expanded="true" height="76" name="trainingperformance" width="90" x="380" y="255">
                <parameter key="main_criterion" value="accuracy"/>
                <parameter key="classification_error" value="true"/>
                <parameter key="weighted_mean_recall" value="true"/>
                <parameter key="weighted_mean_precision" value="true"/>
                <parameter key="root_mean_squared_error" value="true"/>
                <list key="class_weights"/>
              </operator>
              <connect from_port="model" to_op="ApplyDEcisiontree" to_port="model"/>
              <connect from_port="test set" to_op="ApplyDEcisiontree" to_port="unlabelled data"/>
              <connect from_op="ApplyDEcisiontree" from_port="labelled data" to_op="trainingperformance (3)" to_port="labelled data"/>
              <connect from_op="trainingperformance (3)" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <!-- Group Models receives the PCA preprocessing model on "models in 1" and the Validation model on "models in 2". -->
          <operator activated="true" class="group_models" compatibility="6.4.000" expanded="true" height="94" name="Group Models" width="90" x="313" y="165"/>
          <!-- Applies the grouped model to the second copy of the training data coming from Multiply. -->
          <operator activated="true" class="apply_model" compatibility="6.4.000" expanded="true" height="76" name="applyontestdata" width="90" x="447" y="255">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="6.4.000" expanded="true" height="76" name="PerformanceTestdata" width="90" x="581" y="210"/>
          <!-- Main-process wiring. "result 1" is the cross-validation performance, "result 2" the performance on the re-applied training data. -->
          <connect from_op="Retrieve kmedoid_undersampled_data" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Retrieve SMOTE_oversampled_data" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Append" from_port="merged set" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Materialize Data" to_port="example set input"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Materialize Data (2)" to_port="example set input"/>
          <connect from_op="Materialize Data (2)" from_port="example set output" to_op="applyontestdata" to_port="unlabelled data"/>
          <connect from_op="Materialize Data" from_port="example set output" to_op="PCA" to_port="example set input"/>
          <connect from_op="PCA" from_port="example set output" to_op="Validation" to_port="training"/>
          <connect from_op="PCA" from_port="preprocessing model" to_op="Group Models" to_port="models in 1"/>
          <connect from_op="Validation" from_port="model" to_op="Group Models" to_port="models in 2"/>
          <connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
          <connect from_op="Group Models" from_port="model out" to_op="applyontestdata" to_port="model"/>
          <connect from_op="applyontestdata" from_port="labelled data" to_op="PerformanceTestdata" to_port="labelled data"/>
          <connect from_op="PerformanceTestdata" from_port="performance" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>
  • Options
    NonaNona Member Posts: 15 Contributor II
    Thanks! Yes, this is the problem. I ran it in RapidMiner 6.4 now, and it showed me that some attributes in the training and test sets mismatch (in type). My dataset has a large number of attributes: some are set to numerical, some to integer, some to real, and one is the nominal label. Please suggest how I can match the attribute types in both sets, i.e. the training and test sets. For example, I want all attributes to be of type real (except for the label). If I load the dataset into RapidMiner again, do I have to set each attribute manually? There are a lot of them... there must be some other way. Sorry, I am new to this.
  • Options
    MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,507 RM Data Scientist
    Hi Nona,

    All the XX to XX operators (like Nominal to Numerical) have an attribute selector. There you can specify which attributes should be converted. This includes regular expressions (e.g. all attributes which start with count_) or block types like "all numerical".

    Cheers,
    Martin
    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany
Sign In or Register to comment.