[SOLVED] PCA problem

marcin_blachnikmarcin_blachnik Member Posts: 56  Maven
edited April 2019 in Help
Hallo

I have a problem with PCA used as a preprocessing operator. In the second iteration of cross-validation (in non-parallel X_validation) the PCA operator freezes. If I use parallel X_validation, then it freezes when a given thread executes this operator for the second time.

Thanks for any answers and help
Marcin B.

Below is the code:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.1.017">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.1.017" expanded="true" name="Process">
   <parameter key="logverbosity" value="init"/>
   <parameter key="random_seed" value="2001"/>
   <parameter key="send_mail" value="never"/>
   <parameter key="notification_email" value=""/>
   <parameter key="process_duration_for_mail" value="30"/>
   <parameter key="encoding" value="SYSTEM"/>
   <parameter key="parallelize_main_process" value="false"/>
   <process expanded="true" height="408" width="472">
     <operator activated="true" class="retrieve" compatibility="5.1.017" expanded="true" height="60" name="Retrieve" width="90" x="41" y="66">
       <parameter key="repository_entry" value="//Samples/data/Iris"/>
     </operator>
     <operator activated="true" class="replace_missing_values" compatibility="5.1.017" expanded="true" height="94" name="Replace Missing Values" width="90" x="64" y="186">
       <parameter key="return_preprocessing_model" value="false"/>
       <parameter key="create_view" value="false"/>
       <parameter key="attribute_filter_type" value="all"/>
       <parameter key="attribute" value=""/>
       <parameter key="attributes" value=""/>
       <parameter key="use_except_expression" value="false"/>
       <parameter key="value_type" value="attribute_value"/>
       <parameter key="use_value_type_exception" value="false"/>
       <parameter key="except_value_type" value="time"/>
       <parameter key="block_type" value="attribute_block"/>
       <parameter key="use_block_type_exception" value="false"/>
       <parameter key="except_block_type" value="value_matrix_row_start"/>
       <parameter key="invert_selection" value="false"/>
       <parameter key="include_special_attributes" value="false"/>
       <parameter key="default" value="zero"/>
       <list key="columns"/>
     </operator>
     <operator activated="true" class="x_validation" compatibility="5.1.017" expanded="true" height="112" name="Validation" width="90" x="247" y="30">
       <parameter key="create_complete_model" value="false"/>
       <parameter key="average_performances_only" value="true"/>
       <parameter key="leave_one_out" value="false"/>
       <parameter key="number_of_validations" value="10"/>
       <parameter key="sampling_type" value="stratified sampling"/>
       <parameter key="use_local_random_seed" value="false"/>
       <parameter key="local_random_seed" value="1992"/>
       <parameter key="parallelize_training" value="false"/>
       <parameter key="parallelize_testing" value="false"/>
       <process expanded="true">
         <operator activated="true" class="multiply" compatibility="5.1.017" expanded="true" height="94" name="Multiply" width="90" x="40" y="153"/>
         <operator activated="true" class="optimize_parameters_grid" compatibility="5.1.017" expanded="true" height="94" name="Optimize Parameters (Grid)" width="90" x="145" y="23">
           <list key="parameters">
             <parameter key="SVM Opti.C" value="[0.01;10000;5;logarithmic]"/>
             <parameter key="SVM Opti.gamma" value="[0.01;1;5;linear]"/>
             <parameter key="PCA Opti.variance_threshold" value="[0.95;0.99;5;linear]"/>
           </list>
           <parameter key="parallelize_optimization_process" value="true"/>
           <process expanded="true" height="362" width="487">
             <operator activated="true" class="x_validation" compatibility="5.1.017" expanded="true" height="112" name="Validation Opti" width="90" x="45" y="30">
               <parameter key="create_complete_model" value="false"/>
               <parameter key="average_performances_only" value="true"/>
               <parameter key="leave_one_out" value="false"/>
               <parameter key="number_of_validations" value="10"/>
               <parameter key="sampling_type" value="stratified sampling"/>
               <parameter key="use_local_random_seed" value="false"/>
               <parameter key="local_random_seed" value="1992"/>
               <parameter key="parallelize_training" value="false"/>
               <parameter key="parallelize_testing" value="false"/>
               <process expanded="true">
                  <!-- PCA preprocessing step of the inner cross-validation: its example set output feeds "SVM Opti" and its preprocessing model goes to "Group Models". -->
                  <operator activated="true" class="principal_component_analysis" compatibility="5.1.017" expanded="true" height="94" name="PCA Opti" width="90" x="44" y="101">
                    <!-- NOTE(review): the enclosing grid search varies "PCA Opti.variance_threshold", yet the mode here is "fixed number" while "PCA Final" uses "keep variance". Presumably the threshold has no effect in this mode; confirm against the PCA operator documentation. -->
                    <parameter key="dimensionality_reduction" value="fixed number"/>
                    <parameter key="variance_threshold" value="0.95"/>
                    <parameter key="number_of_components" value="1"/>
                  </operator>
                 <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.1.017" expanded="true" height="76" name="SVM Opti" width="90" x="249" y="73">
                   <parameter key="svm_type" value="C-SVC"/>
                   <parameter key="kernel_type" value="rbf"/>
                   <parameter key="degree" value="3"/>
                   <parameter key="gamma" value="0.01"/>
                   <parameter key="coef0" value="0.0"/>
                   <parameter key="C" value="0.01"/>
                   <parameter key="nu" value="0.5"/>
                   <parameter key="cache_size" value="80"/>
                   <parameter key="epsilon" value="0.0010"/>
                   <parameter key="p" value="0.1"/>
                   <list key="class_weights"/>
                   <parameter key="shrinking" value="true"/>
                   <parameter key="calculate_confidences" value="false"/>
                   <parameter key="confidence_for_multiclass" value="true"/>
                 </operator>
                 <operator activated="true" class="group_models" compatibility="5.1.017" expanded="true" height="94" name="Group Models" width="90" x="190" y="243"/>
                 <connect from_port="training" to_op="PCA Opti" to_port="example set input"/>
                 <connect from_op="PCA Opti" from_port="example set output" to_op="SVM Opti" to_port="training set"/>
                 <connect from_op="PCA Opti" from_port="preprocessing model" to_op="Group Models" to_port="models in 1"/>
                 <connect from_op="SVM Opti" from_port="model" to_op="Group Models" to_port="models in 2"/>
                 <connect from_op="Group Models" from_port="model out" to_port="model"/>
                 <portSpacing port="source_training" spacing="0"/>
                 <portSpacing port="sink_model" spacing="0"/>
                 <portSpacing port="sink_through 1" spacing="0"/>
               </process>
               <process expanded="true">
                 <operator activated="true" class="apply_model" compatibility="5.1.017" expanded="true" height="76" name="Apply Opti" width="90" x="45" y="30">
                   <list key="application_parameters"/>
                   <parameter key="create_view" value="false"/>
                 </operator>
                 <operator activated="true" class="performance" compatibility="5.1.017" expanded="true" height="76" name="Performance Opti" width="90" x="201" y="30">
                   <parameter key="use_example_weights" value="true"/>
                 </operator>
                 <connect from_port="model" to_op="Apply Opti" to_port="model"/>
                 <connect from_port="test set" to_op="Apply Opti" to_port="unlabelled data"/>
                 <connect from_op="Apply Opti" from_port="labelled data" to_op="Performance Opti" to_port="labelled data"/>
                 <connect from_op="Performance Opti" from_port="performance" to_port="averagable 1"/>
                 <portSpacing port="source_model" spacing="0"/>
                 <portSpacing port="source_test set" spacing="0"/>
                 <portSpacing port="source_through 1" spacing="0"/>
                 <portSpacing port="sink_averagable 1" spacing="0"/>
                 <portSpacing port="sink_averagable 2" spacing="0"/>
               </process>
             </operator>
             <operator activated="true" class="log" compatibility="5.1.017" expanded="true" height="76" name="Log Opti" width="90" x="387" y="96">
               <parameter key="filename" value="C:\Users\Administrator\Desktop\SVM_PCA_Opti_Sub"/>
               <list key="log">
                 <parameter key="C" value="operator.SVM Opti.parameter.C"/>
                 <parameter key="gamma" value="operator.SVM Opti.parameter.gamma"/>
                 <parameter key="Acc" value="operator.Validation Opti.value.performance"/>
                 <parameter key="Std" value="operator.Validation Opti.value.deviation"/>
                 <parameter key="PCA_th" value="operator.PCA Opti.parameter.variance_threshold"/>
               </list>
               <parameter key="sorting_type" value="none"/>
               <parameter key="sorting_k" value="100"/>
               <parameter key="persistent" value="false"/>
             </operator>
             <connect from_port="input 1" to_op="Validation Opti" to_port="training"/>
             <connect from_op="Validation Opti" from_port="averagable 1" to_op="Log Opti" to_port="through 1"/>
             <connect from_op="Log Opti" from_port="through 1" to_port="performance"/>
             <portSpacing port="source_input 1" spacing="0"/>
             <portSpacing port="source_input 2" spacing="0"/>
             <portSpacing port="sink_performance" spacing="0"/>
             <portSpacing port="sink_result 1" spacing="0"/>
           </process>
         </operator>
         <operator activated="true" class="set_parameters" compatibility="5.1.017" expanded="true" height="60" name="Set Parameters" width="90" x="272" y="37">
           <list key="name_map">
             <parameter key="SVM Opti" value="SVM Final"/>
             <parameter key="PCA Opti" value="PCA Final"/>
           </list>
         </operator>
         <operator activated="true" class="principal_component_analysis" compatibility="5.1.017" expanded="true" height="94" name="PCA Final" width="90" x="83" y="314">
           <parameter key="dimensionality_reduction" value="keep variance"/>
           <parameter key="variance_threshold" value="0.95"/>
           <parameter key="number_of_components" value="1"/>
         </operator>
         <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.1.017" expanded="true" height="76" name="SVM Final" width="90" x="318" y="199">
           <parameter key="svm_type" value="C-SVC"/>
           <parameter key="kernel_type" value="rbf"/>
           <parameter key="degree" value="3"/>
           <parameter key="gamma" value="0.802"/>
           <parameter key="coef0" value="0.0"/>
           <parameter key="C" value="630.9573444801937"/>
           <parameter key="nu" value="0.5"/>
           <parameter key="cache_size" value="80"/>
           <parameter key="epsilon" value="0.0010"/>
           <parameter key="p" value="0.1"/>
           <list key="class_weights"/>
           <parameter key="shrinking" value="true"/>
           <parameter key="calculate_confidences" value="false"/>
           <parameter key="confidence_for_multiclass" value="true"/>
         </operator>
         <operator activated="true" class="group_models" compatibility="5.1.017" expanded="true" height="94" name="Group Models (2)" width="90" x="442" y="291"/>
         <connect from_port="training" to_op="Multiply" to_port="input"/>
         <connect from_op="Multiply" from_port="output 1" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
         <connect from_op="Multiply" from_port="output 2" to_op="PCA Final" to_port="example set input"/>
         <connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_op="Set Parameters" to_port="parameter set"/>
         <connect from_op="PCA Final" from_port="example set output" to_op="SVM Final" to_port="training set"/>
         <connect from_op="PCA Final" from_port="preprocessing model" to_op="Group Models (2)" to_port="models in 1"/>
         <connect from_op="SVM Final" from_port="model" to_op="Group Models (2)" to_port="models in 2"/>
         <connect from_op="Group Models (2)" from_port="model out" to_port="model"/>
         <portSpacing port="source_training" spacing="0"/>
         <portSpacing port="sink_model" spacing="0"/>
         <portSpacing port="sink_through 1" spacing="0"/>
       </process>
       <process expanded="true">
         <operator activated="true" class="apply_model" compatibility="5.1.017" expanded="true" height="76" name="Apply Final" width="90" x="66" y="37">
           <list key="application_parameters"/>
           <parameter key="create_view" value="false"/>
         </operator>
         <operator activated="true" class="performance" compatibility="5.1.017" expanded="true" height="76" name="Performance Final" width="90" x="198" y="36">
           <parameter key="use_example_weights" value="true"/>
         </operator>
         <operator activated="true" class="log" compatibility="5.1.017" expanded="true" height="76" name="Log Final" width="90" x="320" y="37">
           <parameter key="filename" value="C:\Users\Administrator\Desktop\svm_log_2"/>
           <list key="log">
             <parameter key="C" value="operator.SVM Final.parameter.C"/>
             <parameter key="Gamma" value="operator.SVM Final.parameter.gamma"/>
             <parameter key="Acc" value="operator.Performance Final.value.performance"/>
             <parameter key="PCA_th" value="operator.PCA Final.parameter.variance_threshold"/>
           </list>
           <parameter key="sorting_type" value="none"/>
           <parameter key="sorting_k" value="100"/>
           <parameter key="persistent" value="false"/>
         </operator>
         <operator activated="true" class="free_memory" compatibility="5.1.017" expanded="true" height="76" name="Free Memory" width="90" x="326" y="180"/>
         <connect from_port="model" to_op="Apply Final" to_port="model"/>
         <connect from_port="test set" to_op="Apply Final" to_port="unlabelled data"/>
         <connect from_op="Apply Final" from_port="labelled data" to_op="Performance Final" to_port="labelled data"/>
         <connect from_op="Performance Final" from_port="performance" to_op="Log Final" to_port="through 1"/>
         <connect from_op="Log Final" from_port="through 1" to_op="Free Memory" to_port="through 1"/>
         <connect from_op="Free Memory" from_port="through 1" to_port="averagable 1"/>
         <portSpacing port="source_model" spacing="0"/>
         <portSpacing port="source_test set" spacing="0"/>
         <portSpacing port="source_through 1" spacing="0"/>
         <portSpacing port="sink_averagable 1" spacing="0"/>
         <portSpacing port="sink_averagable 2" spacing="0"/>
       </process>
     </operator>
     <operator activated="true" class="log" compatibility="5.1.017" expanded="true" height="76" name="Log ACC" width="90" x="372" y="166">
       <parameter key="filename" value="C:\Users\Administrator\Desktop\svm_pca.res"/>
       <list key="log">
         <parameter key="acc" value="operator.Performance Final.value.performance"/>
       </list>
       <parameter key="sorting_type" value="none"/>
       <parameter key="sorting_k" value="100"/>
       <parameter key="persistent" value="false"/>
     </operator>
     <connect from_op="Retrieve" from_port="output" to_op="Replace Missing Values" to_port="example set input"/>
     <connect from_op="Replace Missing Values" from_port="example set output" to_op="Validation" to_port="training"/>
     <connect from_op="Validation" from_port="averagable 1" to_op="Log ACC" to_port="through 1"/>
     <connect from_op="Log ACC" from_port="through 1" to_port="result 1"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
   </process>
 </operator>
</process>

Answers

  • MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869   Unicorn
    Hi Marcin,

    the PCA adds attributes to the data set, and adding attributes in an X-Validation is sometimes a bit dangerous. Please add a Materialize Data operator on the testing side of each X-Validation, as in the process below.

    Best,
    Marius
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="5.1.017">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.1.017" expanded="true" name="Process">
        <process expanded="true" height="408" width="472">
          <operator activated="true" class="retrieve" compatibility="5.1.017" expanded="true" height="60" name="Retrieve" width="90" x="41" y="66">
            <parameter key="repository_entry" value="//Samples/data/Iris"/>
          </operator>
          <operator activated="true" class="replace_missing_values" compatibility="5.1.017" expanded="true" height="94" name="Replace Missing Values" width="90" x="64" y="186">
            <parameter key="default" value="zero"/>
            <list key="columns"/>
          </operator>
          <operator activated="true" class="x_validation" compatibility="5.1.017" expanded="true" height="112" name="Validation" width="90" x="247" y="30">
            <process expanded="true" height="615" width="681">
              <operator activated="false" class="materialize_data" compatibility="5.1.017" expanded="true" height="76" name="Materialize Data" width="90" x="45" y="165"/>
              <operator activated="true" class="multiply" compatibility="5.1.017" expanded="true" height="94" name="Multiply" width="90" x="179" y="210"/>
              <operator activated="true" class="optimize_parameters_grid" compatibility="5.1.017" expanded="true" height="94" name="Optimize Parameters (Grid)" width="90" x="145" y="23">
                <list key="parameters">
                  <parameter key="SVM Opti.C" value="[0.01;10000;5;logarithmic]"/>
                  <parameter key="SVM Opti.gamma" value="[0.01;1;5;linear]"/>
                  <parameter key="PCA Opti.variance_threshold" value="[0.95;0.99;5;linear]"/>
                </list>
                <parameter key="parallelize_optimization_process" value="true"/>
                <process expanded="true" height="362" width="487">
                  <operator activated="true" class="x_validation" compatibility="5.1.017" expanded="true" height="112" name="Validation Opti" width="90" x="45" y="30">
                    <process expanded="true" height="633" width="346">
                      <operator activated="false" class="materialize_data" compatibility="5.1.017" expanded="true" height="76" name="Materialize Data (2)" width="90" x="45" y="210"/>
                      <operator activated="true" class="principal_component_analysis" compatibility="5.1.017" expanded="true" height="94" name="PCA Opti" width="90" x="45" y="75">
                        <parameter key="dimensionality_reduction" value="fixed number"/>
                        <parameter key="variance_threshold" value="0.966"/>
                      </operator>
                      <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.1.017" expanded="true" height="76" name="SVM Opti" width="90" x="246" y="75">
                        <parameter key="gamma" value="1.0"/>
                        <parameter key="C" value="10000.0"/>
                        <list key="class_weights"/>
                      </operator>
                      <operator activated="true" class="group_models" compatibility="5.1.017" expanded="true" height="94" name="Group Models" width="90" x="246" y="255"/>
                      <connect from_port="training" to_op="PCA Opti" to_port="example set input"/>
                      <connect from_op="PCA Opti" from_port="example set output" to_op="SVM Opti" to_port="training set"/>
                      <connect from_op="PCA Opti" from_port="preprocessing model" to_op="Group Models" to_port="models in 1"/>
                      <connect from_op="SVM Opti" from_port="model" to_op="Group Models" to_port="models in 2"/>
                      <connect from_op="Group Models" from_port="model out" to_port="model"/>
                      <portSpacing port="source_training" spacing="0"/>
                      <portSpacing port="sink_model" spacing="0"/>
                      <portSpacing port="sink_through 1" spacing="0"/>
                    </process>
                    <!-- Testing subprocess of "Validation Opti": the test set passes through "Materialize Data (3)" before the grouped model is applied, and the resulting performance is delivered on "averagable 1". -->
                    <process expanded="true" height="633" width="346">
                      <!-- This is the Materialize Data operator the accompanying answer asks for on the testing side of the X-Validation. -->
                      <operator activated="true" class="materialize_data" compatibility="5.1.017" expanded="true" height="76" name="Materialize Data (3)" width="90" x="45" y="210"/>
                      <operator activated="true" class="apply_model" compatibility="5.1.017" expanded="true" height="76" name="Apply Opti" width="90" x="45" y="30">
                        <list key="application_parameters"/>
                      </operator>
                      <operator activated="true" class="performance" compatibility="5.1.017" expanded="true" height="76" name="Performance Opti" width="90" x="201" y="30"/>
                      <connect from_port="model" to_op="Apply Opti" to_port="model"/>
                      <connect from_port="test set" to_op="Materialize Data (3)" to_port="example set input"/>
                      <connect from_op="Materialize Data (3)" from_port="example set output" to_op="Apply Opti" to_port="unlabelled data"/>
                      <connect from_op="Apply Opti" from_port="labelled data" to_op="Performance Opti" to_port="labelled data"/>
                      <connect from_op="Performance Opti" from_port="performance" to_port="averagable 1"/>
                      <portSpacing port="source_model" spacing="0"/>
                      <portSpacing port="source_test set" spacing="0"/>
                      <portSpacing port="source_through 1" spacing="0"/>
                      <portSpacing port="sink_averagable 1" spacing="0"/>
                      <portSpacing port="sink_averagable 2" spacing="0"/>
                    </process>
                  </operator>
                  <operator activated="true" class="log" compatibility="5.1.017" expanded="true" height="76" name="Log Opti" width="90" x="387" y="96">
                    <parameter key="filename" value="C:\Users\Administrator\Desktop\SVM_PCA_Opti_Sub"/>
                    <list key="log">
                      <parameter key="C" value="operator.SVM Opti.parameter.C"/>
                      <parameter key="gamma" value="operator.SVM Opti.parameter.gamma"/>
                      <parameter key="Acc" value="operator.Validation Opti.value.performance"/>
                      <parameter key="Std" value="operator.Validation Opti.value.deviation"/>
                      <parameter key="PCA_th" value="operator.PCA Opti.parameter.variance_threshold"/>
                    </list>
                  </operator>
                  <connect from_port="input 1" to_op="Validation Opti" to_port="training"/>
                  <connect from_op="Validation Opti" from_port="averagable 1" to_op="Log Opti" to_port="through 1"/>
                  <connect from_op="Log Opti" from_port="through 1" to_port="performance"/>
                  <portSpacing port="source_input 1" spacing="0"/>
                  <portSpacing port="source_input 2" spacing="0"/>
                  <portSpacing port="sink_performance" spacing="0"/>
                  <portSpacing port="sink_result 1" spacing="0"/>
                </process>
              </operator>
              <operator activated="true" class="set_parameters" compatibility="5.1.017" expanded="true" height="60" name="Set Parameters" width="90" x="272" y="37">
                <list key="name_map">
                  <parameter key="SVM Opti" value="SVM Final"/>
                  <parameter key="PCA Opti" value="PCA Final"/>
                </list>
              </operator>
              <operator activated="true" class="principal_component_analysis" compatibility="5.1.017" expanded="true" height="94" name="PCA Final" width="90" x="179" y="345">
                <parameter key="variance_threshold" value="0.974"/>
              </operator>
              <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.1.017" expanded="true" height="76" name="SVM Final" width="90" x="380" y="210">
                <parameter key="gamma" value="0.604"/>
                <parameter key="C" value="2.511886431509581"/>
                <list key="class_weights"/>
              </operator>
              <operator activated="true" class="group_models" compatibility="5.1.017" expanded="true" height="94" name="Group Models (2)" width="90" x="514" y="300"/>
              <connect from_port="training" to_op="Multiply" to_port="input"/>
              <connect from_op="Multiply" from_port="output 1" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
              <connect from_op="Multiply" from_port="output 2" to_op="PCA Final" to_port="example set input"/>
              <connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_op="Set Parameters" to_port="parameter set"/>
              <connect from_op="PCA Final" from_port="example set output" to_op="SVM Final" to_port="training set"/>
              <connect from_op="PCA Final" from_port="preprocessing model" to_op="Group Models (2)" to_port="models in 1"/>
              <connect from_op="SVM Final" from_port="model" to_op="Group Models (2)" to_port="models in 2"/>
              <connect from_op="Group Models (2)" from_port="model out" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <process expanded="true" height="615" width="446">
              <operator activated="false" class="free_memory" compatibility="5.1.017" expanded="true" height="60" name="Free Memory" width="90" x="326" y="180"/>
              <operator activated="true" class="materialize_data" compatibility="5.1.017" expanded="true" height="76" name="Materialize Data (4)" width="90" x="45" y="300"/>
              <operator activated="true" class="apply_model" compatibility="5.1.017" expanded="true" height="76" name="Apply Final" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <operator activated="true" class="performance" compatibility="5.1.017" expanded="true" height="76" name="Performance Final" width="90" x="179" y="30"/>
              <operator activated="true" class="log" compatibility="5.1.017" expanded="true" height="76" name="Log Final" width="90" x="313" y="30">
                <parameter key="filename" value="C:\Users\Administrator\Desktop\svm_log_2"/>
                <list key="log">
                  <parameter key="C" value="operator.SVM Final.parameter.C"/>
                  <parameter key="Gamma" value="operator.SVM Final.parameter.gamma"/>
                  <parameter key="Acc" value="operator.Performance Final.value.performance"/>
                  <parameter key="PCA_th" value="operator.PCA Final.parameter.variance_threshold"/>
                </list>
              </operator>
              <connect from_port="model" to_op="Apply Final" to_port="model"/>
              <connect from_port="test set" to_op="Materialize Data (4)" to_port="example set input"/>
              <connect from_op="Materialize Data (4)" from_port="example set output" to_op="Apply Final" to_port="unlabelled data"/>
              <connect from_op="Apply Final" from_port="labelled data" to_op="Performance Final" to_port="labelled data"/>
              <connect from_op="Performance Final" from_port="performance" to_op="Log Final" to_port="through 1"/>
              <connect from_op="Log Final" from_port="through 1" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="log" compatibility="5.1.017" expanded="true" height="76" name="Log ACC" width="90" x="372" y="166">
            <parameter key="filename" value="C:\Users\Administrator\Desktop\svm_pca.res"/>
            <list key="log">
              <parameter key="acc" value="operator.Performance Final.value.performance"/>
            </list>
          </operator>
          <connect from_op="Retrieve" from_port="output" to_op="Replace Missing Values" to_port="example set input"/>
          <connect from_op="Replace Missing Values" from_port="example set output" to_op="Validation" to_port="training"/>
          <connect from_op="Validation" from_port="averagable 1" to_op="Log ACC" to_port="through 1"/>
          <connect from_op="Log ACC" from_port="through 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • marcin_blachnikmarcin_blachnik Member Posts: 56  Maven
    Thanks a lot
    Now it works :).
    I have a question about that. I thought that PCA creates a completely new example set with a new data table, but as I understand it, it just adds new attributes and hides the old ones. If I'm right, what is the reason for this design instead of creating a new data set?

    Many thanks
    Marcin
  • MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869   Unicorn
    Well, as long as you are just hiding old attributes, it saves you a lot of memory in comparison to creating a new data table each time, if you have large data tables.

    In this case the problem is the creation of new attributes, which are obviously created on the original data and leak into the second iteration of the X-Validation, where they cause problems. Materialize Data solves this problem by explicitly creating a new copy of the data.

    Best,
    Marius
  • marcin_blachnikmarcin_blachnik Member Posts: 56  Maven
    Thanks a lot for your explanation.

    Marcin
  • marcin_blachnikmarcin_blachnik Member Posts: 56  Maven
    Hi  Sal.

    You forgot to put your email, so I'll answer here.
    If you have any problems with PCA, please let me know what the problem is and I will try to help you. For simplicity, I attach below a simple example of how to use PCA embedded in cross-validation.

    I hope it will help


    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="5.2.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.001" expanded="true" name="Process">
        <process expanded="true" height="197" width="346">
          <operator activated="true" class="retrieve" compatibility="5.2.001" expanded="true" height="60" name="Retrieve" width="90" x="97" y="68">
            <parameter key="repository_entry" value="//Samples/data/Iris"/>
          </operator>
          <operator activated="true" class="x_validation" compatibility="5.2.001" expanded="true" height="112" name="Validation" width="90" x="246" y="75">
            <!-- Training subprocess: "PCA" receives the training data, "k-NN" is trained on the PCA example set output, and the PCA preprocessing model plus the k-NN model are combined by "Group Models" into the single model passed to the testing side. -->
            <process expanded="true" height="510" width="467">
              <operator activated="true" class="principal_component_analysis" compatibility="5.2.001" expanded="true" height="94" name="PCA" width="90" x="45" y="120">
                <parameter key="dimensionality_reduction" value="fixed number"/>
                <parameter key="number_of_components" value="2"/>
              </operator>
              <operator activated="true" class="k_nn" compatibility="5.2.001" expanded="true" height="76" name="k-NN" width="90" x="179" y="75"/>
              <operator activated="true" class="group_models" compatibility="5.2.001" expanded="true" height="94" name="Group Models" width="90" x="313" y="165"/>
              <connect from_port="training" to_op="PCA" to_port="example set input"/>
              <connect from_op="PCA" from_port="example set output" to_op="k-NN" to_port="training set"/>
              <!-- Model order matters for reading the wiring: the PCA model is input 1, the k-NN model is input 2. -->
              <connect from_op="PCA" from_port="preprocessing model" to_op="Group Models" to_port="models in 1"/>
              <connect from_op="k-NN" from_port="model" to_op="Group Models" to_port="models in 2"/>
              <connect from_op="Group Models" from_port="model out" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <!-- Testing subprocess: the test set is routed through "Materialize Data", the grouped model is applied to the result, and "Performance" delivers its output on "averagable 1". -->
            <process expanded="true" height="510" width="467">
              <!-- Per the earlier replies in this thread, Materialize Data creates a new copy of the data so that attributes added by PCA do not carry over into the next X-Validation iteration. -->
              <operator activated="true" class="materialize_data" compatibility="5.2.001" expanded="true" height="76" name="Materialize Data" width="90" x="45" y="75"/>
              <operator activated="true" class="apply_model" compatibility="5.2.001" expanded="true" height="76" name="Apply Model" width="90" x="185" y="21">
                <list key="application_parameters"/>
              </operator>
              <operator activated="true" class="performance" compatibility="5.2.001" expanded="true" height="76" name="Performance" width="90" x="300" y="20"/>
              <connect from_port="model" to_op="Apply Model" to_port="model"/>
              <connect from_port="test set" to_op="Materialize Data" to_port="example set input"/>
              <connect from_op="Materialize Data" from_port="example set output" to_op="Apply Model" to_port="unlabelled data"/>
              <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Retrieve" from_port="output" to_op="Validation" to_port="training"/>
          <connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

    Best

    Marcin
Sign In or Register to comment.