RapidMiner

a month ago

This article briefly describes how to automatically select the best of several optimized models built with RapidMiner operators.

 

As shown in the screenshot below, several machine learning models are trained, validated and/or optimized on the input dataset inside a Select Subprocess operator.

This Select Subprocess operator is wrapped inside an Optimize Parameters (Grid) operator, with "Select Subprocess.select_which" as the parameter to be optimized.

Process workflow

 

 

Let's say we want to optimize 4 different models in the Select Subprocess operator.

The Optimize Parameters operator runs 4 iterations (one for each of the 4 subprocesses inside Select Subprocess, since we are optimizing the parameter "select_which"), then picks and outputs the subprocess (model/learner) that achieves the best performance for the given input dataset.

 

Optimize Parameters settings

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
  <!--
    Automatic model selection on the Sonar sample data set.

    Layout:
      Retrieve Sonar -> Optimize Parameters (Grid) -> process results 1 to 4

    The outer "Optimize Parameters (Grid)" varies only
    "Select Subprocess.select_which" over the grid [1.0;4;3;linear], so each
    of the four subprocesses of "Select Subprocess" is tried in turn.
    The subprocesses appear in this order:
      1. Gradient Boosted Trees inside a cross validation (no inner grid search)
      2. Decision Tree   (inner grid search + Set Parameters + final learner)
      3. Random Forest   (inner grid search + Set Parameters + final learner)
      4. Rule Induction  (inner grid search + Set Parameters + final learner)

    Port convention shared by all four subprocesses:
      output 1 = performance vector (this is what the outer optimizer compares)
      output 2 = model
      output 3 = parameter set (subprocesses 2 to 4 only)
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve Sonar" width="90" x="380" y="136">
        <parameter key="repository_entry" value="//Samples/data/Sonar"/>
        <description align="center" color="transparent" colored="false" width="126">Input data set to build a classification learner/model</description>
      </operator>
      <!-- Outer optimizer: the only optimized parameter is which subprocess of "Select Subprocess" is executed. -->
      <operator activated="true" class="optimize_parameters_grid" compatibility="7.6.001" expanded="true" height="145" name="Optimize Parameters (Grid)" width="90" x="581" y="136">
        <list key="parameters">
          <parameter key="Select Subprocess.select_which" value="[1.0;4;3;linear]"/>
        </list>
        <process expanded="true">
          <operator activated="true" class="select_subprocess" compatibility="7.6.001" expanded="true" height="124" name="Select Subprocess" width="90" x="514" y="34">
            <!-- Stored value only; the outer grid above sets select_which on each iteration. -->
            <parameter key="select_which" value="4"/>
            <!-- Subprocess 1: Gradient Boosted Trees, cross-validated, no inner parameter search. -->
            <process expanded="true">
              <operator activated="true" class="concurrency:cross_validation" compatibility="7.6.001" expanded="true" height="145" name="Cross Validation (4)" width="90" x="45" y="34">
                <process expanded="true">
                  <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="7.6.001" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="112" y="136">
                    <list key="expert_parameters"/>
                  </operator>
                  <connect from_port="training set" to_op="Gradient Boosted Trees" to_port="training set"/>
                  <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/>
                  <portSpacing port="source_training set" spacing="0"/>
                  <portSpacing port="sink_model" spacing="0"/>
                  <portSpacing port="sink_through 1" spacing="0"/>
                </process>
                <process expanded="true">
                  <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model (4)" width="90" x="112" y="34">
                    <list key="application_parameters"/>
                  </operator>
                  <operator activated="true" class="performance" compatibility="7.6.001" expanded="true" height="82" name="Performance (4)" width="90" x="313" y="34"/>
                  <connect from_port="model" to_op="Apply Model (4)" to_port="model"/>
                  <connect from_port="test set" to_op="Apply Model (4)" to_port="unlabelled data"/>
                  <connect from_op="Apply Model (4)" from_port="labelled data" to_op="Performance (4)" to_port="labelled data"/>
                  <connect from_op="Performance (4)" from_port="performance" to_port="performance 1"/>
                  <connect from_op="Performance (4)" from_port="example set" to_port="test set results"/>
                  <portSpacing port="source_model" spacing="0"/>
                  <portSpacing port="source_test set" spacing="0"/>
                  <portSpacing port="source_through 1" spacing="0"/>
                  <portSpacing port="sink_test set results" spacing="0"/>
                  <portSpacing port="sink_performance 1" spacing="0"/>
                  <portSpacing port="sink_performance 2" spacing="0"/>
                </process>
                <description align="center" color="transparent" colored="false" width="126">Cross validation subprocess to build learner model and validate its performance</description>
              </operator>
              <connect from_port="input 1" to_op="Cross Validation (4)" to_port="example set"/>
              <connect from_op="Cross Validation (4)" from_port="model" to_port="output 2"/>
              <connect from_op="Cross Validation (4)" from_port="performance 1" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
              <portSpacing port="sink_output 3" spacing="0"/>
              <portSpacing port="sink_output 4" spacing="0"/>
            </process>
            <!--
              Subprocess 2: Decision Tree.
              "Multiply (2)" copies the input: one copy feeds the inner grid search,
              the other trains the final "Decision Tree (2)" learner.
            -->
            <process expanded="true">
              <operator activated="true" class="multiply" compatibility="7.6.001" expanded="true" height="103" name="Multiply (2)" width="90" x="45" y="340"/>
              <operator activated="true" class="optimize_parameters_grid" compatibility="7.6.001" expanded="true" height="103" name="Optimize Parameters DT" width="90" x="112" y="34">
                <list key="parameters">
                  <parameter key="Decision Tree.criterion" value="gain_ratio,information_gain,gini_index,accuracy"/>
                  <parameter key="Decision Tree.minimal_gain" value="[0.01;1;100;linear]"/>
                </list>
                <process expanded="true">
                  <operator activated="true" class="concurrency:cross_validation" compatibility="7.6.001" expanded="true" height="145" name="Cross Validation" width="90" x="514" y="34">
                    <process expanded="true">
                      <!--
                        NOTE(review): apply_prepruning is "false" here while minimal_gain is
                        being grid-searched above. If minimal_gain is only honoured when
                        prepruning is enabled (see the Decision Tree operator documentation),
                        that search dimension has no effect. "Decision Tree (2)" below does
                        not carry these two pruning overrides either. Confirm the intent.
                      -->
                      <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.6.001" expanded="true" height="82" name="Decision Tree" width="90" x="179" y="34">
                        <parameter key="criterion" value="information_gain"/>
                        <parameter key="apply_pruning" value="false"/>
                        <parameter key="apply_prepruning" value="false"/>
                        <parameter key="minimal_gain" value="0.604"/>
                      </operator>
                      <connect from_port="training set" to_op="Decision Tree" to_port="training set"/>
                      <connect from_op="Decision Tree" from_port="model" to_port="model"/>
                      <portSpacing port="source_training set" spacing="0"/>
                      <portSpacing port="sink_model" spacing="0"/>
                      <portSpacing port="sink_through 1" spacing="0"/>
                    </process>
                    <process expanded="true">
                      <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model" width="90" x="112" y="34">
                        <list key="application_parameters"/>
                      </operator>
                      <operator activated="true" class="performance" compatibility="7.6.001" expanded="true" height="82" name="Performance" width="90" x="313" y="34"/>
                      <connect from_port="model" to_op="Apply Model" to_port="model"/>
                      <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
                      <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
                      <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
                      <connect from_op="Performance" from_port="example set" to_port="test set results"/>
                      <portSpacing port="source_model" spacing="0"/>
                      <portSpacing port="source_test set" spacing="0"/>
                      <portSpacing port="source_through 1" spacing="0"/>
                      <portSpacing port="sink_test set results" spacing="0"/>
                      <portSpacing port="sink_performance 1" spacing="0"/>
                      <portSpacing port="sink_performance 2" spacing="0"/>
                    </process>
                  </operator>
                  <connect from_port="input 1" to_op="Cross Validation" to_port="example set"/>
                  <connect from_op="Cross Validation" from_port="performance 1" to_port="performance"/>
                  <portSpacing port="source_input 1" spacing="0"/>
                  <portSpacing port="source_input 2" spacing="0"/>
                  <portSpacing port="sink_performance" spacing="0"/>
                  <portSpacing port="sink_result 1" spacing="0"/>
                </process>
                <description align="center" color="transparent" colored="false" width="126">Optimize the parameters of the model and performance parameters</description>
              </operator>
              <!--
                The name_map key must equal the name of the operator that was optimized
                ("Decision Tree", exactly as used in the grid keys above) so that the
                parameter set is redirected to "Decision Tree (2)". The key previously
                had a trailing space and therefore did not match that name.
              -->
              <operator activated="true" class="set_parameters" compatibility="7.6.001" expanded="true" height="82" name="Set Parameters (4)" width="90" x="179" y="340">
                <list key="name_map">
                  <parameter key="Decision Tree" value="Decision Tree (2)"/>
                </list>
                <description align="center" color="transparent" colored="false" width="126">Picks up the optimized parameters and applies a set of parameters to the specified operators</description>
              </operator>
              <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.6.001" expanded="true" height="82" name="Decision Tree (2)" width="90" x="112" y="595"/>
              <connect from_port="input 1" to_op="Multiply (2)" to_port="input"/>
              <connect from_op="Multiply (2)" from_port="output 1" to_op="Optimize Parameters DT" to_port="input 1"/>
              <connect from_op="Multiply (2)" from_port="output 2" to_op="Decision Tree (2)" to_port="training set"/>
              <connect from_op="Optimize Parameters DT" from_port="performance" to_port="output 1"/>
              <connect from_op="Optimize Parameters DT" from_port="parameter" to_op="Set Parameters (4)" to_port="parameter set"/>
              <connect from_op="Set Parameters (4)" from_port="parameter set" to_port="output 3"/>
              <connect from_op="Decision Tree (2)" from_port="model" to_port="output 2"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
              <portSpacing port="sink_output 3" spacing="0"/>
              <portSpacing port="sink_output 4" spacing="0"/>
            </process>
            <!--
              Subprocess 3: Random Forest.
              Same wiring as subprocess 2: grid search over number_of_trees on one copy
              of the input, final "Random Forest (3)" learner trained on the other copy.
            -->
            <process expanded="true">
              <operator activated="true" class="multiply" compatibility="7.6.001" expanded="true" height="103" name="Multiply (4)" width="90" x="45" y="34"/>
              <operator activated="true" class="optimize_parameters_grid" compatibility="7.6.001" expanded="true" height="103" name="Optimize Parameters RF" width="90" x="179" y="34">
                <list key="parameters">
                  <parameter key="Random Forest (2).number_of_trees" value="[1.0;10;10;linear]"/>
                </list>
                <process expanded="true">
                  <operator activated="true" class="concurrency:cross_validation" compatibility="7.6.001" expanded="true" height="145" name="Cross Validation (2)" width="90" x="514" y="34">
                    <process expanded="true">
                      <operator activated="true" class="concurrency:parallel_random_forest" compatibility="7.6.001" expanded="true" height="82" name="Random Forest (2)" width="90" x="246" y="34"/>
                      <connect from_port="training set" to_op="Random Forest (2)" to_port="training set"/>
                      <connect from_op="Random Forest (2)" from_port="model" to_port="model"/>
                      <portSpacing port="source_training set" spacing="0"/>
                      <portSpacing port="sink_model" spacing="0"/>
                      <portSpacing port="sink_through 1" spacing="0"/>
                    </process>
                    <process expanded="true">
                      <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="112" y="34">
                        <list key="application_parameters"/>
                      </operator>
                      <operator activated="true" class="performance" compatibility="7.6.001" expanded="true" height="82" name="Performance (2)" width="90" x="313" y="34"/>
                      <connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
                      <connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/>
                      <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
                      <connect from_op="Performance (2)" from_port="performance" to_port="performance 1"/>
                      <connect from_op="Performance (2)" from_port="example set" to_port="test set results"/>
                      <portSpacing port="source_model" spacing="0"/>
                      <portSpacing port="source_test set" spacing="0"/>
                      <portSpacing port="source_through 1" spacing="0"/>
                      <portSpacing port="sink_test set results" spacing="0"/>
                      <portSpacing port="sink_performance 1" spacing="0"/>
                      <portSpacing port="sink_performance 2" spacing="0"/>
                    </process>
                  </operator>
                  <connect from_port="input 1" to_op="Cross Validation (2)" to_port="example set"/>
                  <connect from_op="Cross Validation (2)" from_port="performance 1" to_port="performance"/>
                  <portSpacing port="source_input 1" spacing="0"/>
                  <portSpacing port="source_input 2" spacing="0"/>
                  <portSpacing port="sink_performance" spacing="0"/>
                  <portSpacing port="sink_result 1" spacing="0"/>
                </process>
              </operator>
              <!-- Redirects the parameter set found for "Random Forest (2)" to the final learner "Random Forest (3)". -->
              <operator activated="true" class="set_parameters" compatibility="7.6.001" expanded="true" height="82" name="Set Parameters (5)" width="90" x="179" y="238">
                <list key="name_map">
                  <parameter key="Random Forest (2)" value="Random Forest (3)"/>
                </list>
              </operator>
              <operator activated="true" class="concurrency:parallel_random_forest" compatibility="7.6.001" expanded="true" height="82" name="Random Forest (3)" width="90" x="112" y="595"/>
              <connect from_port="input 1" to_op="Multiply (4)" to_port="input"/>
              <connect from_op="Multiply (4)" from_port="output 1" to_op="Optimize Parameters RF" to_port="input 1"/>
              <connect from_op="Multiply (4)" from_port="output 2" to_op="Random Forest (3)" to_port="training set"/>
              <connect from_op="Optimize Parameters RF" from_port="performance" to_port="output 1"/>
              <connect from_op="Optimize Parameters RF" from_port="parameter" to_op="Set Parameters (5)" to_port="parameter set"/>
              <connect from_op="Set Parameters (5)" from_port="parameter set" to_port="output 3"/>
              <connect from_op="Random Forest (3)" from_port="model" to_port="output 2"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
              <portSpacing port="sink_output 3" spacing="0"/>
              <portSpacing port="sink_output 4" spacing="0"/>
            </process>
            <!--
              Subprocess 4: Rule Induction.
              Same wiring as subprocesses 2 and 3: grid search over minimal_prune_benefit,
              final "Rule Induction (3)" learner trained on the second copy of the input.
            -->
            <process expanded="true">
              <operator activated="true" class="multiply" compatibility="7.6.001" expanded="true" height="103" name="Multiply (3)" width="90" x="45" y="289"/>
              <operator activated="true" class="optimize_parameters_grid" compatibility="7.6.001" expanded="true" height="103" name="Optimize Parameters RI" width="90" x="112" y="34">
                <list key="parameters">
                  <parameter key="Rule Induction (2).minimal_prune_benefit" value="[0.0;1.0;10;linear]"/>
                </list>
                <process expanded="true">
                  <operator activated="true" class="concurrency:cross_validation" compatibility="7.6.001" expanded="true" height="145" name="Cross Validation (3)" width="90" x="514" y="34">
                    <process expanded="true">
                      <operator activated="true" class="rule_induction" compatibility="7.6.001" expanded="true" height="82" name="Rule Induction (2)" width="90" x="246" y="34">
                        <parameter key="minimal_prune_benefit" value="1.0"/>
                      </operator>
                      <connect from_port="training set" to_op="Rule Induction (2)" to_port="training set"/>
                      <connect from_op="Rule Induction (2)" from_port="model" to_port="model"/>
                      <portSpacing port="source_training set" spacing="0"/>
                      <portSpacing port="sink_model" spacing="0"/>
                      <portSpacing port="sink_through 1" spacing="0"/>
                    </process>
                    <process expanded="true">
                      <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model (3)" width="90" x="112" y="34">
                        <list key="application_parameters"/>
                      </operator>
                      <operator activated="true" class="performance" compatibility="7.6.001" expanded="true" height="82" name="Performance (3)" width="90" x="313" y="34"/>
                      <connect from_port="model" to_op="Apply Model (3)" to_port="model"/>
                      <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/>
                      <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Performance (3)" to_port="labelled data"/>
                      <connect from_op="Performance (3)" from_port="performance" to_port="performance 1"/>
                      <connect from_op="Performance (3)" from_port="example set" to_port="test set results"/>
                      <portSpacing port="source_model" spacing="0"/>
                      <portSpacing port="source_test set" spacing="0"/>
                      <portSpacing port="source_through 1" spacing="0"/>
                      <portSpacing port="sink_test set results" spacing="0"/>
                      <portSpacing port="sink_performance 1" spacing="0"/>
                      <portSpacing port="sink_performance 2" spacing="0"/>
                    </process>
                  </operator>
                  <connect from_port="input 1" to_op="Cross Validation (3)" to_port="example set"/>
                  <connect from_op="Cross Validation (3)" from_port="performance 1" to_port="performance"/>
                  <portSpacing port="source_input 1" spacing="0"/>
                  <portSpacing port="source_input 2" spacing="0"/>
                  <portSpacing port="sink_performance" spacing="0"/>
                  <portSpacing port="sink_result 1" spacing="0"/>
                </process>
              </operator>
              <!-- Redirects the parameter set found for "Rule Induction (2)" to the final learner "Rule Induction (3)". -->
              <operator activated="true" class="set_parameters" compatibility="7.6.001" expanded="true" height="82" name="Set Parameters (6)" width="90" x="179" y="187">
                <list key="name_map">
                  <parameter key="Rule Induction (2)" value="Rule Induction (3)"/>
                </list>
              </operator>
              <operator activated="true" class="rule_induction" compatibility="7.6.001" expanded="true" height="82" name="Rule Induction (3)" width="90" x="179" y="442">
                <parameter key="minimal_prune_benefit" value="0.8"/>
              </operator>
              <connect from_port="input 1" to_op="Multiply (3)" to_port="input"/>
              <connect from_op="Multiply (3)" from_port="output 1" to_op="Optimize Parameters RI" to_port="input 1"/>
              <connect from_op="Multiply (3)" from_port="output 2" to_op="Rule Induction (3)" to_port="training set"/>
              <connect from_op="Optimize Parameters RI" from_port="performance" to_port="output 1"/>
              <connect from_op="Optimize Parameters RI" from_port="parameter" to_op="Set Parameters (6)" to_port="parameter set"/>
              <connect from_op="Set Parameters (6)" from_port="parameter set" to_port="output 3"/>
              <connect from_op="Rule Induction (3)" from_port="model" to_port="output 2"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
              <portSpacing port="sink_output 3" spacing="0"/>
              <portSpacing port="sink_output 4" spacing="0"/>
            </process>
            <description align="center" color="transparent" colored="false" width="126">Subprocess to Optimize number of models and its performance</description>
          </operator>
          <!-- Select Subprocess outputs: 1 = performance (optimization target), 2 = model, 3 = parameter set. -->
          <connect from_port="input 1" to_op="Select Subprocess" to_port="input 1"/>
          <connect from_op="Select Subprocess" from_port="output 1" to_port="performance"/>
          <connect from_op="Select Subprocess" from_port="output 2" to_port="result 1"/>
          <connect from_op="Select Subprocess" from_port="output 3" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_performance" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">Automatically picks the process which produces the optimized model</description>
      </operator>
      <!--
        Process results: 1 = performance, 2 = optimizer "result 1" (the model),
        3 = optimizer "result 2" (the inner parameter set), 4 = outer parameter set.
      -->
      <connect from_op="Retrieve Sonar" from_port="output" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
      <connect from_op="Optimize Parameters (Grid)" from_port="performance" to_port="result 1"/>
      <connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_port="result 4"/>
      <connect from_op="Optimize Parameters (Grid)" from_port="result 1" to_port="result 2"/>
      <connect from_op="Optimize Parameters (Grid)" from_port="result 2" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
      <description align="center" color="yellow" colored="false" height="74" resized="true" width="764" x="165" y="10">This process automatically picks the optimized model out of the number of models built inside Select subprocess operator&lt;br/&gt;The outer optimize operator, optimizes on the Select subprocess parameter to pick a process (inside select subprocess operator) which has optimized model results for the given input data</description>
    </process>
  </operator>
</process>

 Cheers,

 

Pavithra Rao