Options

Replacing missing values with subgroup averages not total column averages

parisaveparisave Member Posts: 1 Contributor I
edited June 2019 in Help

Hi all,

I would like to replace the missing values of attributes with their respective group average instead of  

their column average, eg. :                     averages

state          region       widgets    north     south    east    west

florida         south           5            8.5         3           9        8

california      west         10

new york      north        15

delaware      east           8

georgia        south          1

texas           south          ?   

nevada        west            6

arizona        west            ?

maryland     east          10

mass.          east            ?

maine          north          ?

vermont      north           2

 

 

Tagged:

Answers

  • Options
    AndrewAndrew RapidMiner Certified Expert, RapidMiner Certified Master, Member Posts: 47 Guru

    Hello parisave,

     

    You can aggregate by region and use the result to join back to the original where there are missing values.

     

    Relatively complex - here's an example

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="true" class="subprocess" compatibility="7.6.001" expanded="true" height="82" name="Subprocess" width="90" x="112" y="85">
    <process expanded="true">
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="34">
    <list key="attribute_values">
    <parameter key="state" value="&quot;florida&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="5"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="136">
    <list key="attribute_values">
    <parameter key="state" value="&quot;california&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="10"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (3)" width="90" x="45" y="238">
    <list key="attribute_values">
    <parameter key="state" value="&quot;new york&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="15"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (4)" width="90" x="45" y="340">
    <list key="attribute_values">
    <parameter key="state" value="&quot;delaware&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="8"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (5)" width="90" x="45" y="442">
    <list key="attribute_values">
    <parameter key="state" value="&quot;georgia&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="1"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (6)" width="90" x="45" y="544">
    <list key="attribute_values">
    <parameter key="state" value="&quot;texas&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (7)" width="90" x="45" y="646">
    <list key="attribute_values">
    <parameter key="state" value="&quot;nevada&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="6"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (8)" width="90" x="45" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;arizona&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (9)" width="90" x="313" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;maryland&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="10"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (10)" width="90" x="514" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;mass.&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (11)" width="90" x="715" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;main&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (12)" width="90" x="849" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;vermont&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="2"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="313" name="Append" width="90" x="514" y="34"/>
    <operator activated="true" class="generate_id" compatibility="7.6.001" expanded="true" height="82" name="Generate ID" width="90" x="648" y="34"/>
    <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
    <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
    <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
    <connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append" to_port="example set 4"/>
    <connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append" to_port="example set 5"/>
    <connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append" to_port="example set 6"/>
    <connect from_op="Generate Data by User Specification (7)" from_port="output" to_op="Append" to_port="example set 7"/>
    <connect from_op="Generate Data by User Specification (8)" from_port="output" to_op="Append" to_port="example set 8"/>
    <connect from_op="Generate Data by User Specification (9)" from_port="output" to_op="Append" to_port="example set 9"/>
    <connect from_op="Generate Data by User Specification (10)" from_port="output" to_op="Append" to_port="example set 10"/>
    <connect from_op="Generate Data by User Specification (11)" from_port="output" to_op="Append" to_port="example set 11"/>
    <connect from_op="Generate Data by User Specification (12)" from_port="output" to_op="Append" to_port="example set 12"/>
    <connect from_op="Append" from_port="merged set" to_op="Generate ID" to_port="example set input"/>
    <connect from_op="Generate ID" from_port="example set output" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="aggregate" compatibility="7.6.001" expanded="true" height="82" name="Aggregate" width="90" x="313" y="85">
    <list key="aggregation_attributes">
    <parameter key="widget" value="average"/>
    </list>
    <parameter key="group_by_attributes" value="region"/>
    </operator>
    <operator activated="true" class="rename" compatibility="7.6.001" expanded="true" height="82" name="Rename" width="90" x="447" y="34">
    <parameter key="old_name" value="average(widget)"/>
    <parameter key="new_name" value="widget"/>
    <list key="rename_additional_attributes"/>
    </operator>
    <operator activated="true" class="generate_id" compatibility="7.6.001" expanded="true" height="82" name="Generate ID (2)" width="90" x="581" y="34"/>
    <operator activated="true" class="filter_examples" compatibility="7.6.001" expanded="true" height="103" name="Filter Examples" width="90" x="447" y="238">
    <parameter key="condition_class" value="no_missing_attributes"/>
    <list key="filters_list"/>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="136">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="region|state"/>
    </operator>
    <operator activated="true" class="join" compatibility="7.6.001" expanded="true" height="82" name="Join" width="90" x="782" y="34">
    <parameter key="join_type" value="left"/>
    <parameter key="use_id_attribute_as_key" value="false"/>
    <list key="key_attributes">
    <parameter key="region" value="region"/>
    </list>
    </operator>
    <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="103" name="Append (2)" width="90" x="916" y="85"/>
    <operator activated="true" class="sort" compatibility="7.6.001" expanded="true" height="82" name="Sort" width="90" x="1050" y="85">
    <parameter key="attribute_name" value="id"/>
    </operator>
    <connect from_op="Subprocess" from_port="out 1" to_op="Aggregate" to_port="example set input"/>
    <connect from_op="Aggregate" from_port="example set output" to_op="Rename" to_port="example set input"/>
    <connect from_op="Aggregate" from_port="original" to_op="Filter Examples" to_port="example set input"/>
    <connect from_op="Rename" from_port="example set output" to_op="Generate ID (2)" to_port="example set input"/>
    <connect from_op="Generate ID (2)" from_port="example set output" to_op="Join" to_port="right"/>
    <connect from_op="Filter Examples" from_port="example set output" to_op="Append (2)" to_port="example set 2"/>
    <connect from_op="Filter Examples" from_port="unmatched example set" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Join" to_port="left"/>
    <connect from_op="Join" from_port="join" to_op="Append (2)" to_port="example set 1"/>
    <connect from_op="Append (2)" from_port="merged set" to_op="Sort" to_port="example set input"/>
    <connect from_op="Sort" from_port="example set output" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>

     

    regards,

     

    Andrew

  • Options
    MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,517 RM Data Scientist

    Hi,

     

    an easier way to do it is to user Group Into Collection from operator toolbox extension. This enables you to do any subprocess with a group_by statement. An example is attached.

     

    Cheers,

    Martin

    <?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="true" class="subprocess" compatibility="7.6.001" expanded="true" height="82" name="Subprocess" width="90" x="45" y="85">
    <process expanded="true">
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="34">
    <list key="attribute_values">
    <parameter key="state" value="&quot;florida&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="5"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="136">
    <list key="attribute_values">
    <parameter key="state" value="&quot;california&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="10"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (3)" width="90" x="45" y="238">
    <list key="attribute_values">
    <parameter key="state" value="&quot;new york&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="15"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (4)" width="90" x="45" y="340">
    <list key="attribute_values">
    <parameter key="state" value="&quot;delaware&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="8"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (5)" width="90" x="45" y="442">
    <list key="attribute_values">
    <parameter key="state" value="&quot;georgia&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="1"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (6)" width="90" x="45" y="544">
    <list key="attribute_values">
    <parameter key="state" value="&quot;texas&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (7)" width="90" x="45" y="646">
    <list key="attribute_values">
    <parameter key="state" value="&quot;nevada&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="6"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (8)" width="90" x="45" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;arizona&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (9)" width="90" x="313" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;maryland&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="10"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (10)" width="90" x="514" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;mass.&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (11)" width="90" x="715" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;main&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (12)" width="90" x="849" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;vermont&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="2"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="313" name="Append" width="90" x="514" y="34"/>
    <operator activated="true" class="generate_id" compatibility="7.6.001" expanded="true" height="82" name="Generate ID" width="90" x="648" y="34"/>
    <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
    <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
    <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
    <connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append" to_port="example set 4"/>
    <connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append" to_port="example set 5"/>
    <connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append" to_port="example set 6"/>
    <connect from_op="Generate Data by User Specification (7)" from_port="output" to_op="Append" to_port="example set 7"/>
    <connect from_op="Generate Data by User Specification (8)" from_port="output" to_op="Append" to_port="example set 8"/>
    <connect from_op="Generate Data by User Specification (9)" from_port="output" to_op="Append" to_port="example set 9"/>
    <connect from_op="Generate Data by User Specification (10)" from_port="output" to_op="Append" to_port="example set 10"/>
    <connect from_op="Generate Data by User Specification (11)" from_port="output" to_op="Append" to_port="example set 11"/>
    <connect from_op="Generate Data by User Specification (12)" from_port="output" to_op="Append" to_port="example set 12"/>
    <connect from_op="Append" from_port="merged set" to_op="Generate ID" to_port="example set input"/>
    <connect from_op="Generate ID" from_port="example set output" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="operator_toolbox:group_into_collection" compatibility="0.7.000" expanded="true" height="82" name="Group Into Collection" width="90" x="246" y="85">
    <parameter key="group_by_attribute" value="region"/>
    </operator>
    <operator activated="true" class="loop_collection" compatibility="7.6.001" expanded="true" height="82" name="Loop Collection" width="90" x="447" y="85">
    <process expanded="true">
    <operator activated="true" class="replace_missing_values" compatibility="7.6.001" expanded="true" height="103" name="Replace Missing Values" width="90" x="179" y="34">
    <list key="columns"/>
    </operator>
    <connect from_port="single" to_op="Replace Missing Values" to_port="example set input"/>
    <connect from_op="Replace Missing Values" from_port="example set output" to_port="output 1"/>
    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="82" name="Append (2)" width="90" x="648" y="85"/>
    <connect from_op="Subprocess" from_port="out 1" to_op="Group Into Collection" to_port="exa"/>
    <connect from_op="Group Into Collection" from_port="col" to_op="Loop Collection" to_port="collection"/>
    <connect from_op="Loop Collection" from_port="output 1" to_op="Append (2)" to_port="example set 1"/>
    <connect from_op="Append (2)" from_port="merged set" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>
    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany
Sign In or Register to comment.