RapidMiner 9.7 is Now Available

Lots of amazing new improvements including true version control! Learn more about what's new here.

CLICK HERE TO DOWNLOAD

Profiling Dataset

JEdwardJEdward RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 570   Unicorn
edited November 2018 in Help
This is a really simple problem, but for some reason I just can't get it.  

I'd like to loop through the attributes of a dataset and, for each attribute, count the examples per value against the label.  (These counts can later become percentages or an index.)
If an attribute is numeric, it should be discretized first; otherwise its existing groups should be shown.  
So for example using the golf dataset I'd get something like:
Temperature | Play: Yes | Play: No
range1 [-∞ - 73.500] | 5 | 3
range2 [73.500 - ∞] | 4 | 2
Outlook | Play: Yes | Play: No
overcast | 4 | 0
rain | 3 | 2
sunny | 2 | 3
For some reason I just can't work out a way of extracting this data in an automated way... can anyone help my ignorance?  

Thanks,
JEdward

Here's my current XML: I have tried using Loop Attributes to step through each variable, but this seems to cause errors and so to get the values I need to manually change the value in Select Attributes & Pivot individually (Aggregate ignores values it can't count so I put all possible attributes in there). 
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (format version 5.3.015) that profiles ONE attribute of the
  Golf sample data against the attribute Play: it produces one row per value of
  the chosen attribute with a count column per Play value.
  The attribute to profile is hard-coded (Humidity) in Select Attributes and in
  Pivot; both must be edited by hand to profile a different attribute.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load the Golf sample data from the repository. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Golf" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Samples/data/Golf"/>
      </operator>
      <!-- Subprocess wrapping the discretization step: frequency-based binning into 5 bins.
           No attribute filter is set here, so the operator defaults decide which attributes
           are binned (presumably all numeric ones; confirm in the operator help). -->
      <operator activated="true" class="subprocess" compatibility="5.3.015" expanded="true" height="76" name="Numeric" width="90" x="112" y="165">
        <process expanded="true">
          <operator activated="true" class="discretize_by_frequency" compatibility="5.3.015" expanded="true" height="94" name="Discretize" width="90" x="246" y="30">
            <parameter key="number_of_bins" value="5"/>
          </operator>
          <connect from_port="in 1" to_op="Discretize" to_port="example set input"/>
          <connect from_op="Discretize" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Keep only the attribute being profiled. Change "Humidity" here AND in Pivot below
           to profile another attribute.
           NOTE(review): Play is still used by Aggregate and Pivot afterwards, so it has to
           survive this filter; presumably it does because it is a special (label) attribute. -->
      <operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Humidity"/>
      </operator>
      <!-- Count Play per group. Every Golf attribute is listed in group_by_attributes on
           purpose: per the accompanying post, names that are no longer present are ignored,
           so this operator does not need editing when the selected attribute changes. -->
      <operator activated="true" class="aggregate" compatibility="5.3.015" expanded="true" height="76" name="Aggregate" width="90" x="246" y="165">
        <parameter key="use_default_aggregation" value="true"/>
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Play"/>
        <parameter key="default_aggregation_function" value="count"/>
        <list key="aggregation_attributes">
          <parameter key="Play" value="count"/>
        </list>
        <parameter key="group_by_attributes" value="Play|Outlook|Humidity|Temperature|Wind"/>
      </operator>
      <!-- Turn the Play values into columns: one row per Humidity value, one count column
           per Play value (referenced below as count(Play)_yes and count(Play)_no). -->
      <operator activated="true" class="pivot" compatibility="5.3.015" expanded="true" height="76" name="Pivot" width="90" x="380" y="165">
        <parameter key="group_attribute" value="Humidity"/>
        <parameter key="index_attribute" value="Play"/>
      </operator>
      <!-- Value/label combinations that never occur have no count after pivoting;
           write zero into both count columns instead of leaving them missing. -->
      <operator activated="true" class="replace_missing_values" compatibility="5.3.015" expanded="true" height="94" name="Replace Missing Values" width="90" x="380" y="30">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|count(Play)_no|count(Play)_yes"/>
        <parameter key="default" value="zero"/>
        <list key="columns"/>
      </operator>
      <!-- Fix the column order of the two count columns; attributes not named in
           attribute_ordering are handled with the "prepend" setting. -->
      <operator activated="true" class="order_attributes" compatibility="5.3.015" expanded="true" height="76" name="Reorder Attributes" width="90" x="506" y="30">
        <parameter key="sort_direction" value="descending"/>
        <parameter key="attribute_ordering" value="count(Play)_yes|count(Play)_no"/>
        <parameter key="handle_unmatched" value="prepend"/>
      </operator>
      <!-- Wiring: a single linear chain from Golf through to the process result port. -->
      <connect from_op="Golf" from_port="output" to_op="Numeric" to_port="in 1"/>
      <connect from_op="Numeric" from_port="out 1" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Pivot" to_port="example set input"/>
      <connect from_op="Pivot" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
      <connect from_op="Replace Missing Values" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
      <connect from_op="Reorder Attributes" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Answers

  • homburghomburg Moderator, Employee, Member Posts: 114  RM Data Scientist
    Hi JEdward,

    the idea of using Loop Attributes is a good one. You only have to keep in mind that a change made during one iteration (such as filtering attributes) will affect the input of all following iterations, so the original example set must remain invariant. Collecting the results in a collection gives you all the individual tables. To avoid using a collection, you may also rename your newly created attributes and add them to the example set. Here is how to use loops, macros and collections:
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      RapidMiner process (format version 6.0.008) that profiles every attribute the
      Loop Attributes operator iterates over against the attribute Play. For each
      attribute it builds a table with one row per attribute value and one count
      column per Play value; the per-attribute tables are delivered as a collection.
      The inner operators refer to the current attribute through the macro
      %{loop_attribute}.
    -->
    <process version="6.0.008">
     <context>
       <input/>
       <output/>
       <macros/>
     </context>
     <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
       <process expanded="true">
         <!-- Load the Golf sample data from the repository. -->
         <operator activated="true" class="retrieve" compatibility="6.0.008" expanded="true" height="60" name="Golf" width="90" x="45" y="30">
           <parameter key="repository_entry" value="//Samples/data/Golf"/>
         </operator>
         <!-- Subprocess wrapping the discretization step: frequency-based binning into 5 bins.
              No attribute filter is set, so the operator defaults decide which attributes are
              binned (presumably all numeric ones; confirm in the operator help). -->
         <operator activated="true" class="subprocess" compatibility="6.0.008" expanded="true" height="76" name="Numeric" width="90" x="246" y="30">
           <process expanded="true">
             <operator activated="true" class="discretize_by_frequency" compatibility="6.0.008" expanded="true" height="94" name="Discretize" width="90" x="246" y="30">
               <parameter key="number_of_bins" value="5"/>
             </operator>
             <connect from_port="in 1" to_op="Discretize" to_port="example set input"/>
             <connect from_op="Discretize" from_port="example set output" to_port="out 1"/>
             <portSpacing port="source_in 1" spacing="0"/>
             <portSpacing port="source_in 2" spacing="0"/>
             <portSpacing port="sink_out 1" spacing="0"/>
             <portSpacing port="sink_out 2" spacing="0"/>
           </process>
         </operator>
         <!-- Runs the inner process once per attribute; the current attribute name is
              available inside as %{loop_attribute}. -->
         <operator activated="true" class="loop_attributes" compatibility="6.0.008" expanded="true" height="94" name="Loop Attributes" width="90" x="447" y="30">
           <process expanded="true">
             <!-- Keep only the attribute of the current iteration.
                  NOTE(review): Play is still used by Aggregate and Pivot afterwards, so it has
                  to survive this filter; presumably it does because it is a special (label)
                  attribute. -->
             <operator activated="true" class="select_attributes" compatibility="6.0.008" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="30">
               <parameter key="attribute_filter_type" value="single"/>
               <parameter key="attribute" value="%{loop_attribute}"/>
             </operator>
             <!-- Count Play per (Play, current attribute) group. The grouping uses the loop
                  macro, like Select Attributes and Pivot do, instead of a hard-coded list of
                  the Golf column names, so the loop is not tied to the Golf attributes and
                  does not depend on absent group-by names being ignored. -->
             <operator activated="true" class="aggregate" compatibility="6.0.006" expanded="true" height="76" name="Aggregate" width="90" x="179" y="120">
               <parameter key="use_default_aggregation" value="true"/>
               <parameter key="attribute_filter_type" value="single"/>
               <parameter key="attribute" value="Play"/>
               <parameter key="default_aggregation_function" value="count"/>
               <list key="aggregation_attributes">
                 <parameter key="Play" value="count"/>
               </list>
               <parameter key="group_by_attributes" value="Play|%{loop_attribute}"/>
             </operator>
             <!-- Turn the Play values into columns: one row per value of the current attribute,
                  one count column per Play value (referenced below as count(Play)_yes and
                  count(Play)_no). -->
             <operator activated="true" class="pivot" compatibility="6.0.008" expanded="true" height="76" name="Pivot" width="90" x="313" y="120">
               <parameter key="group_attribute" value="%{loop_attribute}"/>
               <parameter key="index_attribute" value="Play"/>
             </operator>
             <!-- Value/label combinations that never occur have no count after pivoting;
                  write zero into both count columns instead of leaving them missing. -->
             <operator activated="true" class="replace_missing_values" compatibility="6.0.008" expanded="true" height="94" name="Replace Missing Values" width="90" x="447" y="120">
               <parameter key="attribute_filter_type" value="subset"/>
               <parameter key="attributes" value="|count(Play)_no|count(Play)_yes"/>
               <parameter key="default" value="zero"/>
               <list key="columns"/>
             </operator>
             <!-- Fix the column order of the two count columns; attributes not named in
                  attribute_ordering are handled with the "prepend" setting. -->
             <operator activated="true" class="order_attributes" compatibility="6.0.008" expanded="true" height="76" name="Reorder Attributes" width="90" x="581" y="120">
               <parameter key="sort_direction" value="descending"/>
               <parameter key="attribute_ordering" value="count(Play)_yes|count(Play)_no"/>
               <parameter key="handle_unmatched" value="prepend"/>
             </operator>
             <!-- Wrap this iteration's table in a collection before handing it to the loop output. -->
             <operator activated="true" class="collect" compatibility="6.0.008" expanded="true" height="76" name="Collect" width="90" x="715" y="120"/>
             <connect from_port="example set" to_op="Select Attributes" to_port="example set input"/>
             <connect from_op="Select Attributes" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
             <!-- Hand the UNFILTERED input back to the loop: a change made inside one iteration
                  would otherwise become the input of all following iterations. -->
             <connect from_op="Select Attributes" from_port="original" to_port="example set"/>
             <connect from_op="Aggregate" from_port="example set output" to_op="Pivot" to_port="example set input"/>
             <connect from_op="Pivot" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
             <connect from_op="Replace Missing Values" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
             <connect from_op="Reorder Attributes" from_port="example set output" to_op="Collect" to_port="input 1"/>
             <connect from_op="Collect" from_port="collection" to_port="result 1"/>
             <portSpacing port="source_example set" spacing="0"/>
             <portSpacing port="sink_example set" spacing="0"/>
             <portSpacing port="sink_result 1" spacing="0"/>
             <portSpacing port="sink_result 2" spacing="0"/>
           </process>
         </operator>
         <connect from_op="Golf" from_port="output" to_op="Numeric" to_port="in 1"/>
         <connect from_op="Numeric" from_port="out 1" to_op="Loop Attributes" to_port="example set"/>
         <!-- result 1: the example set passed through the loop; result 2: the profiling tables. -->
         <connect from_op="Loop Attributes" from_port="example set" to_port="result 1"/>
         <connect from_op="Loop Attributes" from_port="result 1" to_port="result 2"/>
         <portSpacing port="source_input 1" spacing="0"/>
         <portSpacing port="sink_result 1" spacing="0"/>
         <portSpacing port="sink_result 2" spacing="0"/>
         <portSpacing port="sink_result 3" spacing="0"/>
       </process>
     </operator>
    </process>
    Happy looping!
Sign In or Register to comment.