Do Not Use Certain Attributes in Prediction

michaelgloven RapidMiner Certified Analyst, Member Posts: 46 Guru
edited December 2018 in Help

Hi, I have attributes that are useful as information in a training and scored example set, but I do not want them to be used in my learning model. How do I set an attribute so it comes along for the ride as training or scored data in the examples, but is not used for learning? I don't see a target role that allows data to be consumed like a "comment" or such.

 

thanks

Answers

  • FBT Member Posts: 106 Unicorn

    You would most likely need to split your data set prior to training and then join it back together afterwards, like this:

     

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Sketch of the "split, train, join back" approach:
        1. Multiply duplicates the incoming example set.
        2. Copy 1 goes through "Select Attributes" into the cross-validation.
        3. Copy 2 goes through "Select Attributes (2)" and is left-joined back
           onto the cross-validation's test result set.
      NOTE(review): neither Select Attributes operator has a filter configured
      in this XML. Presumably "Select Attributes" should keep only the
      attributes used for learning and "Select Attributes (2)" the
      informational ones; configure both before use.
    -->
    <process version="7.6.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
        <process expanded="true">
          <!-- Fan the input out into a modelling branch and a pass-through branch. -->
          <operator activated="true" class="multiply" compatibility="7.6.001" expanded="true" height="103" name="Multiply" width="90" x="112" y="85"/>
          <!-- Pass-through branch: attributes that should only ride along. -->
          <operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="246" y="238"/>
          <!-- Modelling branch: attributes the learner is allowed to see. -->
          <operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true" height="82" name="Select Attributes" width="90" x="246" y="34"/>
          <operator activated="true" class="concurrency:cross_validation" compatibility="7.6.001" expanded="true" height="145" name="Validation" width="90" x="380" y="34">
            <parameter key="sampling_type" value="stratified sampling"/>
            <!-- Training subprocess: fit a decision tree on the training fold. -->
            <process expanded="true">
              <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.6.001" expanded="true" height="82" name="Decision Tree (2)" width="90" x="45" y="34"/>
              <connect from_port="training set" to_op="Decision Tree (2)" to_port="training set"/>
              <connect from_op="Decision Tree (2)" from_port="model" to_port="model"/>
              <portSpacing port="source_training set" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
              <description align="left" color="green" colored="true" height="80" resized="true" width="248" x="37" y="137">In the training phase, a model is built on the current training data set. (90 % of data by default, 10 times)</description>
            </process>
            <!-- Testing subprocess: score the test fold and measure performance. -->
            <process expanded="true">
              <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="45" y="34">
                <list key="application_parameters"/>
              </operator>
              <operator activated="true" class="performance" compatibility="7.6.001" expanded="true" height="82" name="Performance" width="90" x="179" y="34"/>
              <connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/>
              <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
              <connect from_op="Performance" from_port="example set" to_port="test set results"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_test set results" spacing="0"/>
              <portSpacing port="sink_performance 1" spacing="0"/>
              <portSpacing port="sink_performance 2" spacing="0"/>
              <description align="left" color="blue" colored="true" height="103" resized="true" width="315" x="38" y="137">The model created in the Training step is applied to the current test set (10 %).&lt;br/&gt;The performance is evaluated and sent to the operator results.</description>
            </process>
            <description align="center" color="transparent" colored="false" width="126">A cross-validation evaluating a decision tree model.</description>
          </operator>
          <!--
            Left join: scored examples on the left, ride-along attributes on the
            right. The key_attributes list is empty.
            NOTE(review): presumably the join then matches on the ID attribute,
            so the data needs an id role that survives both branches; confirm.
          -->
          <operator activated="true" class="join" compatibility="7.6.001" expanded="true" height="82" name="Join" width="90" x="648" y="136">
            <parameter key="join_type" value="left"/>
            <list key="key_attributes"/>
          </operator>
          <!-- Data enters at process input 1; swap in a Retrieve operator if preferred. -->
          <connect from_port="input 1" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Join" to_port="right"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Validation" to_port="example set"/>
          <connect from_op="Validation" from_port="test result set" to_op="Join" to_port="left"/>
          <connect from_op="Validation" from_port="performance 1" to_port="result 1"/>
          <!-- Deliver the re-joined, scored example set as the second result. -->
          <connect from_op="Join" from_port="join" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>
  • JEdward RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 578 Unicorn

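    The Set Role operator isn't limited to the roles listed in its drop-down. You can type any role name you like into the target role field, and the attribute becomes special: it stays in the example set but is not used as a regular attribute for learning.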

     

    Here's an example. 

     

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Demonstrates free-text roles in Set Role on the Golf sample data:
        result 1: example set with Outlook and Humidity given custom roles
        result 2: the same data after Loop Attributes runs Set Role on each of
                  the two attributes again
    -->
    <process version="7.6.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve Golf" width="90" x="179" y="136">
            <parameter key="repository_entry" value="//Samples/data/Golf"/>
          </operator>
          <!-- Custom role names: Outlook gets "comment1", Humidity gets "comment2". -->
          <operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Set Role" width="90" x="380" y="136">
            <parameter key="attribute_name" value="Outlook"/>
            <parameter key="target_role" value="comment1"/>
            <list key="set_additional_roles">
              <parameter key="Humidity" value="comment2"/>
            </list>
            <description align="center" color="transparent" colored="false" width="126">Set Roles of Outlook &amp;amp; Humidity as special</description>
          </operator>
          <!-- One copy goes straight to the results, the other into the loop. -->
          <operator activated="true" class="multiply" compatibility="7.6.001" expanded="true" height="103" name="Multiply" width="90" x="514" y="161"/>
          <!--
            Iterates over the subset Outlook|Humidity with special attributes
            included. The inner Set Role names the current loop attribute but
            sets no target_role.
            NOTE(review): presumably that falls back to the default regular
            role, as the operator description suggests; confirm.
          -->
          <operator activated="true" class="concurrency:loop_attributes" compatibility="7.6.001" expanded="true" height="82" name="Loop Attributes" width="90" x="648" y="238">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="Outlook|Humidity"/>
            <parameter key="include_special_attributes" value="true"/>
            <parameter key="reuse_results" value="true"/>
            <process expanded="true">
              <operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Return Role" width="90" x="179" y="136">
                <parameter key="attribute_name" value="%{loop_attribute}"/>
                <list key="set_additional_roles"/>
              </operator>
              <connect from_port="input 1" to_op="Return Role" to_port="example set input"/>
              <connect from_op="Return Role" from_port="example set output" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
            <description align="center" color="transparent" colored="false" width="126">Return Outlook &amp;amp; Humidity to normal.</description>
          </operator>
          <connect from_op="Retrieve Golf" from_port="output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Loop Attributes" to_port="input 1"/>
          <connect from_op="Loop Attributes" from_port="output 1" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>
  • FBT Member Posts: 106 Unicorn

    @JEdward's proposed solution is far more elegant than mine and is the way to go in this kind of situation.

Sign In or Register to comment.