Due to recent updates, all users are required to create an Altair One account to login to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to login to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you login, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Replacing missing values with subgroup averages not total column averages

parisaveparisave Member Posts: 1 Learner II
edited June 2019 in Help

Hi all,

I would like to replace the missing values of attributes with their respective group average instead of  

their column average, eg. :                     averages

state          region       widgets    north     south    east    west

florida         south           5            8.5         3           9        8

california      west         10

new york      north        15

delaware      east           8

georgia        south          1

texas           south          ?   

nevada        west            6

arizona        west            ?

maryland     east          10

mass.          east            ?

maine          north          ?

vermont      north           2

 

 

Tagged:

Answers

  • AndrewAndrew RapidMiner Certified Expert, RapidMiner Certified Master, Member Posts: 47 Guru

    Hello parisave,

     

    You can aggregate by region and use the result to join back to the original where there are missing values.

     

    Relatively complex - here's an example

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="true" class="subprocess" compatibility="7.6.001" expanded="true" height="82" name="Subprocess" width="90" x="112" y="85">
    <process expanded="true">
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="34">
    <list key="attribute_values">
    <parameter key="state" value="&quot;florida&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="5"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="136">
    <list key="attribute_values">
    <parameter key="state" value="&quot;california&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="10"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (3)" width="90" x="45" y="238">
    <list key="attribute_values">
    <parameter key="state" value="&quot;new york&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="15"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (4)" width="90" x="45" y="340">
    <list key="attribute_values">
    <parameter key="state" value="&quot;delaware&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="8"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (5)" width="90" x="45" y="442">
    <list key="attribute_values">
    <parameter key="state" value="&quot;georgia&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="1"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (6)" width="90" x="45" y="544">
    <list key="attribute_values">
    <parameter key="state" value="&quot;texas&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (7)" width="90" x="45" y="646">
    <list key="attribute_values">
    <parameter key="state" value="&quot;nevada&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="6"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (8)" width="90" x="45" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;arizona&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (9)" width="90" x="313" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;maryland&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="10"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (10)" width="90" x="514" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;mass.&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (11)" width="90" x="715" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;main&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (12)" width="90" x="849" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;vermont&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="2"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="313" name="Append" width="90" x="514" y="34"/>
    <operator activated="true" class="generate_id" compatibility="7.6.001" expanded="true" height="82" name="Generate ID" width="90" x="648" y="34"/>
    <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
    <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
    <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
    <connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append" to_port="example set 4"/>
    <connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append" to_port="example set 5"/>
    <connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append" to_port="example set 6"/>
    <connect from_op="Generate Data by User Specification (7)" from_port="output" to_op="Append" to_port="example set 7"/>
    <connect from_op="Generate Data by User Specification (8)" from_port="output" to_op="Append" to_port="example set 8"/>
    <connect from_op="Generate Data by User Specification (9)" from_port="output" to_op="Append" to_port="example set 9"/>
    <connect from_op="Generate Data by User Specification (10)" from_port="output" to_op="Append" to_port="example set 10"/>
    <connect from_op="Generate Data by User Specification (11)" from_port="output" to_op="Append" to_port="example set 11"/>
    <connect from_op="Generate Data by User Specification (12)" from_port="output" to_op="Append" to_port="example set 12"/>
    <connect from_op="Append" from_port="merged set" to_op="Generate ID" to_port="example set input"/>
    <connect from_op="Generate ID" from_port="example set output" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="aggregate" compatibility="7.6.001" expanded="true" height="82" name="Aggregate" width="90" x="313" y="85">
    <list key="aggregation_attributes">
    <parameter key="widget" value="average"/>
    </list>
    <parameter key="group_by_attributes" value="region"/>
    </operator>
    <operator activated="true" class="rename" compatibility="7.6.001" expanded="true" height="82" name="Rename" width="90" x="447" y="34">
    <parameter key="old_name" value="average(widget)"/>
    <parameter key="new_name" value="widget"/>
    <list key="rename_additional_attributes"/>
    </operator>
    <operator activated="true" class="generate_id" compatibility="7.6.001" expanded="true" height="82" name="Generate ID (2)" width="90" x="581" y="34"/>
    <operator activated="true" class="filter_examples" compatibility="7.6.001" expanded="true" height="103" name="Filter Examples" width="90" x="447" y="238">
    <parameter key="condition_class" value="no_missing_attributes"/>
    <list key="filters_list"/>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="136">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="region|state"/>
    </operator>
    <operator activated="true" class="join" compatibility="7.6.001" expanded="true" height="82" name="Join" width="90" x="782" y="34">
    <parameter key="join_type" value="left"/>
    <parameter key="use_id_attribute_as_key" value="false"/>
    <list key="key_attributes">
    <parameter key="region" value="region"/>
    </list>
    </operator>
    <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="103" name="Append (2)" width="90" x="916" y="85"/>
    <operator activated="true" class="sort" compatibility="7.6.001" expanded="true" height="82" name="Sort" width="90" x="1050" y="85">
    <parameter key="attribute_name" value="id"/>
    </operator>
    <connect from_op="Subprocess" from_port="out 1" to_op="Aggregate" to_port="example set input"/>
    <connect from_op="Aggregate" from_port="example set output" to_op="Rename" to_port="example set input"/>
    <connect from_op="Aggregate" from_port="original" to_op="Filter Examples" to_port="example set input"/>
    <connect from_op="Rename" from_port="example set output" to_op="Generate ID (2)" to_port="example set input"/>
    <connect from_op="Generate ID (2)" from_port="example set output" to_op="Join" to_port="right"/>
    <connect from_op="Filter Examples" from_port="example set output" to_op="Append (2)" to_port="example set 2"/>
    <connect from_op="Filter Examples" from_port="unmatched example set" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Join" to_port="left"/>
    <connect from_op="Join" from_port="join" to_op="Append (2)" to_port="example set 1"/>
    <connect from_op="Append (2)" from_port="merged set" to_op="Sort" to_port="example set input"/>
    <connect from_op="Sort" from_port="example set output" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>

     

    regards,

     

    Andrew

  • MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,529 RM Data Scientist

    Hi,

     

    an easier way to do it is to user Group Into Collection from operator toolbox extension. This enables you to do any subprocess with a group_by statement. An example is attached.

     

    Cheers,

    Martin

    <?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="true" class="subprocess" compatibility="7.6.001" expanded="true" height="82" name="Subprocess" width="90" x="45" y="85">
    <process expanded="true">
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="34">
    <list key="attribute_values">
    <parameter key="state" value="&quot;florida&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="5"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="136">
    <list key="attribute_values">
    <parameter key="state" value="&quot;california&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="10"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (3)" width="90" x="45" y="238">
    <list key="attribute_values">
    <parameter key="state" value="&quot;new york&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="15"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (4)" width="90" x="45" y="340">
    <list key="attribute_values">
    <parameter key="state" value="&quot;delaware&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="8"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (5)" width="90" x="45" y="442">
    <list key="attribute_values">
    <parameter key="state" value="&quot;georgia&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="1"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (6)" width="90" x="45" y="544">
    <list key="attribute_values">
    <parameter key="state" value="&quot;texas&quot;"/>
    <parameter key="region" value="&quot;south&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (7)" width="90" x="45" y="646">
    <list key="attribute_values">
    <parameter key="state" value="&quot;nevada&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="6"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (8)" width="90" x="45" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;arizona&quot;"/>
    <parameter key="region" value="&quot;west&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (9)" width="90" x="313" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;maryland&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="10"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (10)" width="90" x="514" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;mass.&quot;"/>
    <parameter key="region" value="&quot;east&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (11)" width="90" x="715" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;main&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="log(-1)"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (12)" width="90" x="849" y="748">
    <list key="attribute_values">
    <parameter key="state" value="&quot;vermont&quot;"/>
    <parameter key="region" value="&quot;north&quot;"/>
    <parameter key="widget" value="2"/>
    </list>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="313" name="Append" width="90" x="514" y="34"/>
    <operator activated="true" class="generate_id" compatibility="7.6.001" expanded="true" height="82" name="Generate ID" width="90" x="648" y="34"/>
    <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
    <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
    <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
    <connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append" to_port="example set 4"/>
    <connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append" to_port="example set 5"/>
    <connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append" to_port="example set 6"/>
    <connect from_op="Generate Data by User Specification (7)" from_port="output" to_op="Append" to_port="example set 7"/>
    <connect from_op="Generate Data by User Specification (8)" from_port="output" to_op="Append" to_port="example set 8"/>
    <connect from_op="Generate Data by User Specification (9)" from_port="output" to_op="Append" to_port="example set 9"/>
    <connect from_op="Generate Data by User Specification (10)" from_port="output" to_op="Append" to_port="example set 10"/>
    <connect from_op="Generate Data by User Specification (11)" from_port="output" to_op="Append" to_port="example set 11"/>
    <connect from_op="Generate Data by User Specification (12)" from_port="output" to_op="Append" to_port="example set 12"/>
    <connect from_op="Append" from_port="merged set" to_op="Generate ID" to_port="example set input"/>
    <connect from_op="Generate ID" from_port="example set output" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="operator_toolbox:group_into_collection" compatibility="0.7.000" expanded="true" height="82" name="Group Into Collection" width="90" x="246" y="85">
    <parameter key="group_by_attribute" value="region"/>
    </operator>
    <operator activated="true" class="loop_collection" compatibility="7.6.001" expanded="true" height="82" name="Loop Collection" width="90" x="447" y="85">
    <process expanded="true">
    <operator activated="true" class="replace_missing_values" compatibility="7.6.001" expanded="true" height="103" name="Replace Missing Values" width="90" x="179" y="34">
    <list key="columns"/>
    </operator>
    <connect from_port="single" to_op="Replace Missing Values" to_port="example set input"/>
    <connect from_op="Replace Missing Values" from_port="example set output" to_port="output 1"/>
    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="82" name="Append (2)" width="90" x="648" y="85"/>
    <connect from_op="Subprocess" from_port="out 1" to_op="Group Into Collection" to_port="exa"/>
    <connect from_op="Group Into Collection" from_port="col" to_op="Loop Collection" to_port="collection"/>
    <connect from_op="Loop Collection" from_port="output 1" to_op="Append (2)" to_port="example set 1"/>
    <connect from_op="Append (2)" from_port="merged set" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>
    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany
Sign In or Register to comment.