"Feature selection inside validation loop"

wesselwessel Member Posts: 537  Guru
edited May 23 in Help
Dear All,

Is it possible to run feature selection inside cross validation?

I created the following process (pasted below) which I believe does exactly this.
But I'm afraid that the Remember and Recall operators might return the feature weights stored during the previous fold rather than the current one.

Best regards,

Wessel







<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process: validation operator "k-fold" (class x_validation) on the
  Sonar sample data. Attribute weighting and top-k attribute selection are done
  inside the first (training) subprocess; the resulting weights are handed to
  the second (testing) subprocess through a Remember/Recall pair named "f".
-->
<process version="5.1.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
    <process expanded="true" height="409" width="678">
      <!-- Load the example set from the repository entry //Samples/data/Sonar. -->
      <operator activated="true" class="retrieve" compatibility="5.1.006" expanded="true" height="60" name="Retrieve" width="90" x="112" y="75">
        <parameter key="repository_entry" value="//Samples/data/Sonar"/>
      </operator>
      <!-- Validation operator with two subprocesses: the first receives the "training" port, the second the "test set" port. -->
      <operator activated="true" class="x_validation" compatibility="5.1.006" expanded="true" height="112" name="k-fold" width="90" x="246" y="75">
        <!-- NOTE(review): 144 validations is an unusually large value; confirm it is intended. -->
        <parameter key="number_of_validations" value="144"/>
        <!-- First subprocess (training side): weight attributes, keep the top 5, store the weights, train Naive Bayes. -->
        <process expanded="true" height="391" width="300">
          <!-- Produces attribute weights from the training data (Weka ReliefF attribute evaluator). -->
          <operator activated="true" class="weka:W-ReliefFAttributeEval" compatibility="5.1.000" expanded="true" height="76" name="W-ReliefFAttributeEval (2)" width="90" x="45" y="30"/>
          <!-- Reduces the training example set using weight_relation "top k" with k = 5. -->
          <operator activated="true" class="select_by_weights" compatibility="5.1.006" expanded="true" height="94" name="Select by Weights (2)" width="90" x="180" y="30">
            <parameter key="weight_relation" value="top k"/>
            <parameter key="k" value="5"/>
          </operator>
          <!-- Stores the AttributeWeights object under the name "f"; the Recall operator in the second subprocess reads the same name. -->
          <operator activated="true" class="remember" compatibility="5.1.006" expanded="true" height="60" name="Remember" width="90" x="45" y="120">
            <parameter key="name" value="f"/>
            <parameter key="io_object" value="AttributeWeights"/>
          </operator>
          <!-- Learner trained on the reduced example set; its model is delivered to the subprocess "model" port. -->
          <operator activated="true" class="naive_bayes" compatibility="5.1.006" expanded="true" height="76" name="Naive Bayes" width="90" x="179" y="120"/>
          <connect from_port="training" to_op="W-ReliefFAttributeEval (2)" to_port="example set"/>
          <connect from_op="W-ReliefFAttributeEval (2)" from_port="weights" to_op="Select by Weights (2)" to_port="weights"/>
          <connect from_op="W-ReliefFAttributeEval (2)" from_port="example set" to_op="Select by Weights (2)" to_port="example set input"/>
          <connect from_op="Select by Weights (2)" from_port="example set output" to_op="Naive Bayes" to_port="training set"/>
          <!-- The weights passed through Select by Weights are what gets stored by Remember. -->
          <connect from_op="Select by Weights (2)" from_port="weights" to_op="Remember" to_port="store"/>
          <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Second subprocess (testing side): recall the stored weights, apply the same top-5 selection to the test set, apply the model, measure performance. -->
        <process expanded="true" height="391" width="300">
          <!-- Reads the AttributeWeights object stored under "f". -->
          <!-- NOTE(review): this relies on the first subprocess having run Remember earlier in the same iteration; confirm against the operator documentation. -->
          <operator activated="true" class="recall" compatibility="5.1.006" expanded="true" height="60" name="Recall" width="90" x="45" y="30">
            <parameter key="name" value="f"/>
            <parameter key="io_object" value="AttributeWeights"/>
          </operator>
          <!-- Same selection settings as on the training side ("top k", k = 5), here applied to the test set with the recalled weights. -->
          <operator activated="true" class="select_by_weights" compatibility="5.1.006" expanded="true" height="94" name="Select by Weights (3)" width="90" x="180" y="30">
            <parameter key="weight_relation" value="top k"/>
            <parameter key="k" value="5"/>
          </operator>
          <!-- Applies the model from the "model" port to the reduced test set. -->
          <operator activated="true" class="apply_model" compatibility="5.1.006" expanded="true" height="76" name="-1 Folds Model" width="90" x="45" y="120">
            <list key="application_parameters"/>
          </operator>
          <!-- Computes classification performance on the labelled test data. -->
          <operator activated="true" class="performance_classification" compatibility="5.1.006" expanded="true" height="76" name="Accuracy Fold" width="90" x="180" y="120">
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="-1 Folds Model" to_port="model"/>
          <connect from_port="test set" to_op="Select by Weights (3)" to_port="example set input"/>
          <connect from_op="Recall" from_port="result" to_op="Select by Weights (3)" to_port="weights"/>
          <connect from_op="Select by Weights (3)" from_port="example set output" to_op="-1 Folds Model" to_port="unlabelled data"/>
          <connect from_op="-1 Folds Model" from_port="labelled data" to_op="Accuracy Fold" to_port="labelled data"/>
          <connect from_op="Accuracy Fold" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- Top level wiring: Sonar data into the validation operator, its "averagable 1" output to the process result. -->
      <connect from_op="Retrieve" from_port="output" to_op="k-fold" to_port="training"/>
      <connect from_op="k-fold" from_port="averagable 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="72"/>
      <portSpacing port="sink_result 2" spacing="18"/>
    </process>
  </operator>
</process>

Answers

  • IngoRMIngoRM Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, Community Manager, RMResearcher, Member, University Professor Posts: 1,643  RM Founder
    Hi Wessel,

    sure, this is possible. In principle it works with Remember and Recall, but you do not actually need them: you can pass the weights from the training side to the testing side via the "Through" port, as in the process below.

    Cheers,
    Ingo

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      RapidMiner process: same validation setup on the Sonar sample data, but the
      attribute weights are passed from the first (training) subprocess to the
      second (testing) subprocess over the "through 1" port instead of a
      Remember/Recall pair.
    -->
    <process version="5.1.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.1.008" expanded="true" name="Process">
        <process expanded="true" height="409" width="678">
          <!-- Load the example set from the repository entry //Samples/data/Sonar. -->
          <operator activated="true" class="retrieve" compatibility="5.1.008" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
            <parameter key="repository_entry" value="//Samples/data/Sonar"/>
          </operator>
          <!-- Validation operator; no number_of_validations parameter is set here, so presumably the operator default applies. -->
          <operator activated="true" class="x_validation" compatibility="5.1.008" expanded="true" height="112" name="k-fold" width="90" x="179" y="30">
            <!-- First subprocess (training side): weight attributes, keep the top 5, train Naive Bayes, and forward the weights on "through 1". -->
            <process expanded="true" height="541" width="433">
              <!-- Produces attribute weights from the training data (Weka ReliefF attribute evaluator). -->
              <operator activated="true" class="weka:W-ReliefFAttributeEval" compatibility="5.1.000" expanded="true" height="76" name="W-ReliefFAttributeEval (2)" width="90" x="45" y="30"/>
              <!-- Reduces the training example set using weight_relation "top k" with k = 5. -->
              <operator activated="true" class="select_by_weights" compatibility="5.1.008" expanded="true" height="94" name="Select by Weights (2)" width="90" x="180" y="30">
                <parameter key="weight_relation" value="top k"/>
                <parameter key="k" value="5"/>
              </operator>
              <!-- Learner trained on the reduced example set; its model is delivered to the subprocess "model" port. -->
              <operator activated="true" class="naive_bayes" compatibility="5.1.008" expanded="true" height="76" name="Naive Bayes" width="90" x="313" y="30"/>
              <connect from_port="training" to_op="W-ReliefFAttributeEval (2)" to_port="example set"/>
              <connect from_op="W-ReliefFAttributeEval (2)" from_port="weights" to_op="Select by Weights (2)" to_port="weights"/>
              <connect from_op="W-ReliefFAttributeEval (2)" from_port="example set" to_op="Select by Weights (2)" to_port="example set input"/>
              <connect from_op="Select by Weights (2)" from_port="example set output" to_op="Naive Bayes" to_port="training set"/>
              <!-- Hand the weights to the second subprocess via the "through 1" port (replaces a Remember operator). -->
              <connect from_op="Select by Weights (2)" from_port="weights" to_port="through 1"/>
              <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
              <portSpacing port="sink_through 2" spacing="0"/>
            </process>
            <!-- Second subprocess (testing side): apply the same top-5 selection to the test set with the forwarded weights, apply the model, measure performance. -->
            <process expanded="true" height="541" width="433">
              <!-- Same selection settings as on the training side ("top k", k = 5); weights come from the "through 1" source port. -->
              <operator activated="true" class="select_by_weights" compatibility="5.1.008" expanded="true" height="94" name="Select by Weights (3)" width="90" x="45" y="75">
                <parameter key="weight_relation" value="top k"/>
                <parameter key="k" value="5"/>
              </operator>
              <!-- Applies the model from the "model" port to the reduced test set. -->
              <operator activated="true" class="apply_model" compatibility="5.1.008" expanded="true" height="76" name="-1 Folds Model" width="90" x="179" y="30">
                <list key="application_parameters"/>
              </operator>
              <!-- Computes classification performance on the labelled test data. -->
              <operator activated="true" class="performance_classification" compatibility="5.1.008" expanded="true" height="76" name="Accuracy Fold" width="90" x="313" y="30">
                <list key="class_weights"/>
              </operator>
              <connect from_port="model" to_op="-1 Folds Model" to_port="model"/>
              <connect from_port="test set" to_op="Select by Weights (3)" to_port="example set input"/>
              <!-- Receive the weights forwarded by the first subprocess (replaces a Recall operator). -->
              <connect from_port="through 1" to_op="Select by Weights (3)" to_port="weights"/>
              <connect from_op="Select by Weights (3)" from_port="example set output" to_op="-1 Folds Model" to_port="unlabelled data"/>
              <connect from_op="-1 Folds Model" from_port="labelled data" to_op="Accuracy Fold" to_port="labelled data"/>
              <connect from_op="Accuracy Fold" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="source_through 2" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <!-- Top level wiring: Sonar data into the validation operator, its "averagable 1" output to the process result. -->
          <connect from_op="Retrieve" from_port="output" to_op="k-fold" to_port="training"/>
          <connect from_op="k-fold" from_port="averagable 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="72"/>
          <portSpacing port="sink_result 2" spacing="18"/>
        </process>
      </operator>
    </process>
Sign In or Register to comment.