"Problem with Loop and Average"

overfitteroverfitter Member Posts: 4 Contributor I
edited May 2019 in Help
The performance value provided by the 'Loop and Average' operator for logging seems to differ from the returned Performance Vector.
Here is my Process:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Process posted with the question: a weight-guided feature selection (attribute weights from PCA
     on the Polynomial sample data) whose inner process runs 'Loop and Average' for 100 iterations
     around a shuffled X-Validation of a k-NN learner (k = 5), and then logs the 'performance' value
     of 'Loop and Average'. -->
<process version="5.1.004">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.004" expanded="true" name="Root">
    <description>The WeightGuidedFeatureSelection operator uses given input AttributeWeights to determine the order of attribute adding. In this process we use a 10-fold cross validation of a learning scheme as performance evaluation (the inner operator) and combine attribute filtering with a wrapper approach.</description>
    <process expanded="true" height="604" width="480">
      <operator activated="true" class="retrieve" compatibility="5.1.004" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Samples/data/Polynomial"/>
      </operator>
      <operator activated="true" class="weight_by_pca" compatibility="5.1.004" expanded="true" height="76" name="PCAWeighting" width="90" x="180" y="30">
        <parameter key="normalize_weights" value="false"/>
      </operator>
      <operator activated="true" class="optimize_selection_weight_guided" compatibility="5.1.004" expanded="true" height="94" name="WeightGuidedFeatureSelection" width="90" x="380" y="30">
        <process expanded="true" height="604" width="413">
          <!-- Runs the inner X-Validation 100 times; its output port 'averagable 1' feeds the Log operator below. -->
          <operator activated="true" class="loop_and_average" compatibility="5.1.004" expanded="true" height="76" name="Loop and Average" width="90" x="45" y="30">
            <parameter key="iterations" value="100"/>
            <process expanded="true" height="551" width="697">
              <operator activated="true" class="x_validation" compatibility="5.1.004" expanded="true" height="112" name="XValidation" width="90" x="112" y="30">
                <parameter key="sampling_type" value="shuffled sampling"/>
                <process expanded="true">
                  <operator activated="true" class="k_nn" compatibility="5.1.004" expanded="true" name="NearestNeighbors">
                    <parameter key="k" value="5"/>
                  </operator>
                  <connect from_port="training" to_op="NearestNeighbors" to_port="training set"/>
                  <connect from_op="NearestNeighbors" from_port="model" to_port="model"/>
                  <portSpacing port="source_training" spacing="0"/>
                  <portSpacing port="sink_model" spacing="0"/>
                  <portSpacing port="sink_through 1" spacing="0"/>
                </process>
                <process expanded="true">
                  <operator activated="true" class="apply_model" compatibility="5.1.004" expanded="true" name="ModelApplier">
                    <list key="application_parameters"/>
                  </operator>
                  <operator activated="true" class="performance" compatibility="5.1.004" expanded="true" name="Performance"/>
                  <connect from_port="model" to_op="ModelApplier" to_port="model"/>
                  <connect from_port="test set" to_op="ModelApplier" to_port="unlabelled data"/>
                  <connect from_op="ModelApplier" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
                  <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
                  <portSpacing port="source_model" spacing="0"/>
                  <portSpacing port="source_test set" spacing="0"/>
                  <portSpacing port="source_through 1" spacing="0"/>
                  <portSpacing port="sink_averagable 1" spacing="0"/>
                  <portSpacing port="sink_averagable 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="in 1" to_op="XValidation" to_port="training"/>
              <connect from_op="XValidation" from_port="averagable 1" to_port="averagable 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <!-- Logs the 'performance' value exposed by 'Loop and Average'. The thread reports that this
               logged value differs from the performance vector that is passed through to the
               'performance' port of the feature selection. -->
          <operator activated="true" class="log" compatibility="5.1.004" expanded="true" height="76" name="Log" width="90" x="313" y="75">
            <list key="log">
              <parameter key="performance" value="operator.Loop and Average.value.performance"/>
            </list>
          </operator>
          <connect from_port="example set" to_op="Loop and Average" to_port="in 1"/>
          <connect from_op="Loop and Average" from_port="averagable 1" to_op="Log" to_port="through 1"/>
          <connect from_op="Log" from_port="through 1" to_port="performance"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_performance" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Retrieve" from_port="output" to_op="PCAWeighting" to_port="example set"/>
      <connect from_op="PCAWeighting" from_port="weights" to_op="WeightGuidedFeatureSelection" to_port="attribute weights in"/>
      <connect from_op="PCAWeighting" from_port="example set" to_op="WeightGuidedFeatureSelection" to_port="example set in"/>
      <connect from_op="WeightGuidedFeatureSelection" from_port="example set out" to_port="result 1"/>
      <connect from_op="WeightGuidedFeatureSelection" from_port="weights" to_port="result 2"/>
      <connect from_op="WeightGuidedFeatureSelection" from_port="performance" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>

Answers

  • haddockhaddock Member Posts: 849 Maven
    And your question is?
  • overfitteroverfitter Member Posts: 4 Contributor I
    My understanding is that the performance value of the 'Loop and Average' operator should be the same as the value of the main criterion of the performance vector. This is not the case in this process.
  • haddockhaddock Member Posts: 849 Maven
    Each iteration of the loop adds an attribute, so the logged 'Loop and Average' performance will not resemble the output performance vector, whereas the last logged performance should be close to it. This becomes visible if we also log the number of attributes, like this:
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Variant posted in reply: 'Loop and Average' runs a single iteration, and the number of
         attributes of the current example set is extracted into the macro 'Atts' and logged next
         to the 'performance' value of 'Loop and Average'. -->
    <process version="5.1.003">
     <context>
       <input/>
       <output/>
       <macros/>
     </context>
     <operator activated="true" class="process" compatibility="5.1.003" expanded="true" name="Root">
       <description>The WeightGuidedFeatureSelection operator uses given input AttributeWeights to determine the order of attribute adding. In this process we use a 10-fold cross validation of a learning scheme as performance evaluation (the inner operator) and combine attribute filtering with a wrapper approach.</description>
       <process expanded="true" height="604" width="480">
         <operator activated="true" class="retrieve" compatibility="5.1.003" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
           <parameter key="repository_entry" value="//Samples/data/Polynomial"/>
         </operator>
         <operator activated="true" class="weight_by_pca" compatibility="5.1.003" expanded="true" height="76" name="PCAWeighting" width="90" x="180" y="30">
           <parameter key="normalize_weights" value="false"/>
         </operator>
         <operator activated="true" class="optimize_selection_weight_guided" compatibility="5.1.003" expanded="true" height="94" name="WeightGuidedFeatureSelection" width="90" x="380" y="30">
           <process expanded="true" height="604" width="547">
             <operator activated="true" class="loop_and_average" compatibility="5.1.003" expanded="true" height="76" name="Loop and Average" width="90" x="112" y="30">
               <parameter key="iterations" value="1"/>
               <process expanded="true" height="551" width="697">
                 <!-- Stores the number of attributes of the incoming example set in the macro 'Atts'. -->
                 <operator activated="true" class="extract_macro" compatibility="5.1.003" expanded="true" height="60" name="Extract Macro" width="90" x="112" y="75">
                   <parameter key="macro" value="Atts"/>
                   <parameter key="macro_type" value="number_of_attributes"/>
                 </operator>
                 <operator activated="true" class="x_validation" compatibility="5.1.003" expanded="true" height="112" name="XValidation" width="90" x="313" y="30">
                   <parameter key="sampling_type" value="shuffled sampling"/>
                   <process expanded="true" height="418" width="414">
                     <operator activated="true" class="k_nn" compatibility="5.1.003" expanded="true" height="76" name="NearestNeighbors" width="90" x="166" y="30">
                       <parameter key="k" value="5"/>
                     </operator>
                     <connect from_port="training" to_op="NearestNeighbors" to_port="training set"/>
                     <connect from_op="NearestNeighbors" from_port="model" to_port="model"/>
                     <portSpacing port="source_training" spacing="0"/>
                     <portSpacing port="sink_model" spacing="0"/>
                     <portSpacing port="sink_through 1" spacing="0"/>
                   </process>
                   <process expanded="true" height="418" width="414">
                     <operator activated="true" class="apply_model" compatibility="5.1.003" expanded="true" height="76" name="ModelApplier" width="90" x="45" y="30">
                       <list key="application_parameters"/>
                     </operator>
                     <operator activated="true" class="performance" compatibility="5.1.003" expanded="true" height="76" name="Performance" width="90" x="234" y="30"/>
                     <connect from_port="model" to_op="ModelApplier" to_port="model"/>
                     <connect from_port="test set" to_op="ModelApplier" to_port="unlabelled data"/>
                     <connect from_op="ModelApplier" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
                     <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
                     <portSpacing port="source_model" spacing="0"/>
                     <portSpacing port="source_test set" spacing="0"/>
                     <portSpacing port="source_through 1" spacing="0"/>
                     <portSpacing port="sink_averagable 1" spacing="0"/>
                     <portSpacing port="sink_averagable 2" spacing="0"/>
                   </process>
                 </operator>
                 <connect from_port="in 1" to_op="Extract Macro" to_port="example set"/>
                 <connect from_op="Extract Macro" from_port="example set" to_op="XValidation" to_port="training"/>
                 <connect from_op="XValidation" from_port="averagable 1" to_port="averagable 1"/>
                 <portSpacing port="source_in 1" spacing="0"/>
                 <portSpacing port="source_in 2" spacing="0"/>
                 <portSpacing port="sink_averagable 1" spacing="0"/>
                 <portSpacing port="sink_averagable 2" spacing="0"/>
               </process>
             </operator>
             <!-- Makes the macro 'Atts' available as the value 'macro_value', which the Log operator below reads. -->
             <operator activated="true" class="provide_macro_as_log_value" compatibility="5.1.003" expanded="true" height="76" name="Provide Macro as Log Value" width="90" x="313" y="30">
               <parameter key="macro_name" value="Atts"/>
             </operator>
             <!-- Logs two columns: the 'performance' value of 'Loop and Average' and the attribute count. -->
             <operator activated="true" class="log" compatibility="5.1.003" expanded="true" height="76" name="Log" width="90" x="447" y="30">
               <list key="log">
                 <parameter key="performance" value="operator.Loop and Average.value.performance"/>
                 <parameter key="Attributes" value="operator.Provide Macro as Log Value.value.macro_value"/>
               </list>
             </operator>
             <connect from_port="example set" to_op="Loop and Average" to_port="in 1"/>
             <connect from_op="Loop and Average" from_port="averagable 1" to_op="Provide Macro as Log Value" to_port="through 1"/>
             <connect from_op="Provide Macro as Log Value" from_port="through 1" to_op="Log" to_port="through 1"/>
             <connect from_op="Log" from_port="through 1" to_port="performance"/>
             <portSpacing port="source_example set" spacing="0"/>
             <portSpacing port="source_through 1" spacing="0"/>
             <portSpacing port="sink_performance" spacing="0"/>
           </process>
         </operator>
         <connect from_op="Retrieve" from_port="output" to_op="PCAWeighting" to_port="example set"/>
         <connect from_op="PCAWeighting" from_port="weights" to_op="WeightGuidedFeatureSelection" to_port="attribute weights in"/>
         <connect from_op="PCAWeighting" from_port="example set" to_op="WeightGuidedFeatureSelection" to_port="example set in"/>
         <connect from_op="WeightGuidedFeatureSelection" from_port="example set out" to_port="result 1"/>
         <connect from_op="WeightGuidedFeatureSelection" from_port="weights" to_port="result 2"/>
         <connect from_op="WeightGuidedFeatureSelection" from_port="performance" to_port="result 3"/>
         <portSpacing port="source_input 1" spacing="0"/>
         <portSpacing port="sink_result 1" spacing="0"/>
         <portSpacing port="sink_result 2" spacing="0"/>
         <portSpacing port="sink_result 3" spacing="0"/>
         <portSpacing port="sink_result 4" spacing="0"/>
       </process>
     </operator>
    </process>

  • overfitteroverfitter Member Posts: 4 Contributor I
    Here is a simplified process. It averages some random numbers. The value in the performance vector is still different from the logged performance value.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Simplified reproduction: inside 'Loop and Average' an attribute 'value' is generated with
         rand(), and the data value of that attribute at example index 1 is extracted as the
         performance. The outer 'Log' operator then logs the 'performance' value of
         'Loop and Average'. The 'iterations' parameter is not set here, so the operator's own
         default applies. -->
    <process version="5.1.004">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.1.004" expanded="true" name="Process">
        <process expanded="true" height="161" width="614">
          <operator activated="true" class="generate_data_user_specification" compatibility="5.1.004" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="45" y="30">
            <list key="attribute_values">
              <parameter key="att" value="0"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="loop_and_average" compatibility="5.1.004" expanded="true" height="76" name="Loop and Average" width="90" x="246" y="30">
            <process expanded="true" height="569" width="715">
              <!-- Adds the attribute 'value', computed by the expression rand(). -->
              <operator activated="true" class="generate_attributes" compatibility="5.1.004" expanded="true" height="76" name="Generate Attributes" width="90" x="112" y="30">
                <list key="function_descriptions">
                  <parameter key="value" value="rand()"/>
                </list>
              </operator>
              <!-- Turns the data value of attribute 'value' at example index 1 into a performance. -->
              <operator activated="true" class="extract_performance" compatibility="5.1.004" expanded="true" height="76" name="Performance" width="90" x="313" y="30">
                <parameter key="performance_type" value="data_value"/>
                <parameter key="attribute_name" value="value"/>
                <parameter key="example_index" value="1"/>
              </operator>
              <connect from_port="in 1" to_op="Generate Attributes" to_port="example set input"/>
              <connect from_op="Generate Attributes" from_port="example set output" to_op="Performance" to_port="example set"/>
              <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="log" compatibility="5.1.004" expanded="true" height="76" name="Log" width="90" x="447" y="30">
            <list key="log">
              <parameter key="performance" value="operator.Loop and Average.value.performance"/>
            </list>
          </operator>
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Loop and Average" to_port="in 1"/>
          <connect from_op="Loop and Average" from_port="averagable 1" to_op="Log" to_port="through 1"/>
          <connect from_op="Log" from_port="through 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • haddockhaddock Member Posts: 849 Maven
    I agree, it looks like 'Loop and Average' takes the last performance value instead of computing the average. In the previous example this error was masked, but this example makes it plain, especially if you also log inside the loop, like this:
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Same reproduction with an additional 'Log (2)' operator inside the loop. It logs the
         'performance' value of the inner 'Performance' operator on every iteration, so the
         per-iteration values can be compared with the value logged for 'Loop and Average'.
         This variant also sets average_performances_only to false and the optimization
         direction of the extracted performance to minimize. -->
    <process version="5.1.004">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.1.004" expanded="true" name="Process">
        <process expanded="true" height="161" width="614">
          <operator activated="true" class="generate_data_user_specification" compatibility="5.1.004" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="45" y="30">
            <list key="attribute_values">
              <parameter key="att" value="0"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="loop_and_average" compatibility="5.1.004" expanded="true" height="76" name="Loop and Average" width="90" x="246" y="30">
            <parameter key="average_performances_only" value="false"/>
            <process expanded="true" height="569" width="715">
              <operator activated="true" class="generate_attributes" compatibility="5.1.004" expanded="true" height="76" name="Generate Attributes" width="90" x="112" y="30">
                <list key="function_descriptions">
                  <parameter key="value" value="rand()"/>
                </list>
              </operator>
              <operator activated="true" class="extract_performance" compatibility="5.1.004" expanded="true" height="76" name="Performance" width="90" x="313" y="30">
                <parameter key="performance_type" value="data_value"/>
                <parameter key="attribute_name" value="value"/>
                <parameter key="example_index" value="1"/>
                <parameter key="optimization_direction" value="minimize"/>
              </operator>
              <!-- Per-iteration log: records the value of the inner 'Performance' operator in the column 'In loop'. -->
              <operator activated="true" class="log" compatibility="5.1.004" expanded="true" height="76" name="Log (2)" width="90" x="539" y="29">
                <list key="log">
                  <parameter key="In loop" value="operator.Performance.value.performance"/>
                </list>
              </operator>
              <connect from_port="in 1" to_op="Generate Attributes" to_port="example set input"/>
              <connect from_op="Generate Attributes" from_port="example set output" to_op="Performance" to_port="example set"/>
              <connect from_op="Performance" from_port="performance" to_op="Log (2)" to_port="through 1"/>
              <connect from_op="Log (2)" from_port="through 1" to_port="averagable 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <!-- Outer log: records the 'performance' value of 'Loop and Average' after the loop has finished. -->
          <operator activated="true" class="log" compatibility="5.1.004" expanded="true" height="76" name="Log" width="90" x="447" y="30">
            <list key="log">
              <parameter key="performance" value="operator.Loop and Average.value.performance"/>
            </list>
          </operator>
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Loop and Average" to_port="in 1"/>
          <connect from_op="Loop and Average" from_port="averagable 1" to_op="Log" to_port="through 1"/>
          <connect from_op="Log" from_port="through 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • landland RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 2,531 Unicorn
    Hi folks,

    You are right. This behavior seems a little bit surprising. Could anybody add a few lines to the documentation?

    Greetings,
      Sebastian
Sign In or Register to comment.