Select all attributes having only missing value

zubairali_jzubairali_j Member Posts: 1 Contributor I
edited November 2018 in Help

Hi,

 

How can I select all the attributes which have ONLY missing values in the RapidMiner tool? I don't want to select other attributes which have both missing and non-missing values. If I put 'no_missing_values' in the attribute_filter_type option of the 'Select Attributes' operator and invert the selection, it also selects the attributes which have both missing and non-missing values. But I need to select only the attributes which have all their values missing.

 

Thanks,

Zubair

Tagged:

Answers

  • mschmitzmschmitz Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 2,061  RM Data Scientist

    Dear Zubairali,

     

    Interesting question. I did not find a one-operator solution. Attached is a longer process doing the job. I would be curious if there is an easier way to do it.


    ~Martin

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.3.000">
    <!-- Purpose: build a demo data set containing missing values, count the
         non-missing values of every attribute, drop the attributes whose count
         is 0 from that count table, and then use the surviving attribute names
         as weights to filter the attributes of the original example set. -->
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.3.000" expanded="true" name="Process">
    <process expanded="true">
    <!-- Step 1: demo data. Loads the Golf sample and declares missing values
         in two attributes. -->
    <operator activated="true" class="subprocess" compatibility="7.3.000" expanded="true" height="82" name="Subprocess" width="90" x="45" y="187">
    <process expanded="true">
    <operator activated="true" class="retrieve" compatibility="7.3.000" expanded="true" height="68" name="Retrieve Golf" width="90" x="45" y="34">
    <parameter key="repository_entry" value="//Samples/data/Golf"/>
    </operator>
    <!-- Declares Humidity values as missing wherever the expression Humidity>5
         holds. NOTE(review): presumably true for every Golf row, which would
         make the whole Humidity column missing; confirm against the data. -->
    <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value" width="90" x="179" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="Humidity"/>
    <parameter key="mode" value="expression"/>
    <parameter key="expression_value" value="Humidity&gt;5"/>
    </operator>
    <!-- Declares the value 80.0 in Temperature as missing. No mode is set here,
         so the operator default applies (presumably numeric; confirm). This
         gives an attribute with both missing and non-missing values. -->
    <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value (2)" width="90" x="447" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="Temperature"/>
    <parameter key="numeric_value" value="80.0"/>
    </operator>
    <connect from_op="Retrieve Golf" from_port="output" to_op="Declare Missing Value" to_port="example set input"/>
    <connect from_op="Declare Missing Value" from_port="example set output" to_op="Declare Missing Value (2)" to_port="example set input"/>
    <connect from_op="Declare Missing Value (2)" from_port="example set output" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    <description align="center" color="transparent" colored="false" width="126">Get A data set</description>
    </operator>
    <!-- Defines the macro "examples". No later operator in this process
         references that macro. -->
    <operator activated="true" class="extract_macro" compatibility="7.3.000" expanded="true" height="68" name="Extract Macro" width="90" x="179" y="187">
    <parameter key="macro" value="examples"/>
    <list key="additional_macros"/>
    </operator>
    <!-- Step 2: applies the default aggregation "count (ignoring missings)"
         to the attributes. The "original" output port is wired to
         Select by Weights further down. -->
    <operator activated="true" class="aggregate" compatibility="7.3.000" expanded="true" height="82" name="Aggregate" width="90" x="313" y="187">
    <parameter key="use_default_aggregation" value="true"/>
    <parameter key="default_aggregation_function" value="count (ignoring missings)"/>
    <list key="aggregation_attributes"/>
    </operator>
    <!-- Strips the countWithOutMissings(...) wrapper from the aggregated column
         names, leaving only the captured inner name ($1). -->
    <operator activated="true" class="rename_by_replacing" compatibility="7.3.000" expanded="true" height="82" name="Rename by Replacing" width="90" x="447" y="85">
    <parameter key="replace_what" value="countWithOutMissings\((.*)\)"/>
    <parameter key="replace_by" value="$1"/>
    </operator>
    <!-- Step 3: iterates over the attributes of the count table. -->
    <operator activated="true" class="loop_attributes" compatibility="7.3.000" expanded="true" height="82" name="Loop Attributes" width="90" x="581" y="85">
    <process expanded="true">
    <!-- Reads the data value of the current loop attribute at example index 1
         into the macro "nonMissings". -->
    <operator activated="true" class="extract_macro" compatibility="7.3.000" expanded="true" height="68" name="Extract Macro (2)" width="90" x="179" y="34">
    <parameter key="macro" value="nonMissings"/>
    <parameter key="macro_type" value="data_value"/>
    <parameter key="attribute_name" value="%{loop_attribute}"/>
    <parameter key="example_index" value="1"/>
    <list key="additional_macros"/>
    </operator>
    <!-- Branches on the expression nonMissings != 0.
         NOTE(review): presumably the first sub-process below is the "then"
         case and the second is the "else" case; confirm in the Branch docs. -->
    <operator activated="true" class="branch" compatibility="7.3.000" expanded="true" height="82" name="Branch" width="90" x="380" y="34">
    <parameter key="condition_type" value="expression"/>
    <parameter key="expression" value="%{nonMissings}!=0"/>
    <!-- First sub-process: passes the example set through unchanged. -->
    <process expanded="true">
    <connect from_port="condition" to_port="input 1"/>
    <portSpacing port="source_condition" spacing="0"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_input 1" spacing="0"/>
    <portSpacing port="sink_input 2" spacing="0"/>
    </process>
    <!-- Second sub-process: selects the current loop attribute with inverted
         selection, i.e. removes that single attribute from the count table. -->
    <process expanded="true">
    <operator activated="true" class="select_attributes" compatibility="7.3.000" expanded="true" height="82" name="Select Attributes" width="90" x="112" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="%{loop_attribute}"/>
    <parameter key="invert_selection" value="true"/>
    </operator>
    <connect from_port="condition" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_port="input 1"/>
    <portSpacing port="source_condition" spacing="0"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_input 1" spacing="0"/>
    <portSpacing port="sink_input 2" spacing="0"/>
    </process>
    </operator>
    <connect from_port="example set" to_op="Extract Macro (2)" to_port="example set"/>
    <connect from_op="Extract Macro (2)" from_port="example set" to_op="Branch" to_port="condition"/>
    <connect from_op="Branch" from_port="input 1" to_port="example set"/>
    <portSpacing port="source_example set" spacing="0"/>
    <portSpacing port="sink_example set" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    </process>
    </operator>
    <!-- Step 4: turns the reduced count table into attribute weights and
         applies them to the original example set (Aggregate's "original"
         port). Both operators run with default parameters.
         NOTE(review): with the defaults this presumably keeps the attributes
         that have at least one non-missing value and drops the all-missing
         ones; invert the weight selection if the all-missing attributes are
         the ones wanted. Confirm against the operator defaults. -->
    <operator activated="true" class="data_to_weights" compatibility="7.3.000" expanded="true" height="82" name="Data to Weights" width="90" x="715" y="85"/>
    <operator activated="true" class="select_by_weights" compatibility="7.3.000" expanded="true" height="103" name="Select by Weights" width="90" x="764" y="187"/>
    <connect from_op="Subprocess" from_port="out 1" to_op="Extract Macro" to_port="example set"/>
    <connect from_op="Extract Macro" from_port="example set" to_op="Aggregate" to_port="example set input"/>
    <connect from_op="Aggregate" from_port="example set output" to_op="Rename by Replacing" to_port="example set input"/>
    <connect from_op="Aggregate" from_port="original" to_op="Select by Weights" to_port="example set input"/>
    <connect from_op="Rename by Replacing" from_port="example set output" to_op="Loop Attributes" to_port="example set"/>
    <connect from_op="Loop Attributes" from_port="example set" to_op="Data to Weights" to_port="example set"/>
    <connect from_op="Data to Weights" from_port="weights" to_op="Select by Weights" to_port="weights"/>
    <connect from_op="Select by Weights" from_port="example set output" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>
    - Head of Data Science Services at RapidMiner -
    Dortmund, Germany
  • IngoRMIngoRM Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, Community Manager, RMResearcher, Member, University Professor Posts: 1,621  RM Founder

    Well, a shorter version (although not exactly the same) is to just use "Replace Missing Values" with a constant value NOT in the data.  Then use "Remove Useless Attributes".  Of course this one also removes other attributes which are constant (but how useful are those?).  You can then turn the constant value you have used above back into a missing value again with "Declare Missing Value".

     

    Here is the code:

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.3.000">
    <!-- Purpose: shorter alternative. Fill every missing value with a sentinel
         constant (-99), run Remove Useless Attributes, then declare the
         sentinel as missing again in the remaining attributes. The sentinel
         must be a value that does not occur in the real data. -->
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.3.000" expanded="true" name="Process">
    <process expanded="true">
    <!-- Step 1: demo data (Golf sample with declared missing values).
         This operator carries breakpoints="after".
         NOTE(review): presumably this pauses execution after the sub-process
         so the prepared data can be inspected; remove the attribute for
         unattended runs. -->
    <operator activated="true" breakpoints="after" class="subprocess" compatibility="7.3.000" expanded="true" height="82" name="Subprocess" width="90" x="45" y="34">
    <process expanded="true">
    <operator activated="true" class="retrieve" compatibility="7.3.000" expanded="true" height="68" name="Retrieve Golf" width="90" x="45" y="34">
    <parameter key="repository_entry" value="//Samples/data/Golf"/>
    </operator>
    <!-- Declares Humidity values as missing wherever the expression Humidity>5
         holds. NOTE(review): presumably true for every Golf row, which would
         make the whole Humidity column missing; confirm against the data. -->
    <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value" width="90" x="179" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="Humidity"/>
    <parameter key="mode" value="expression"/>
    <parameter key="expression_value" value="Humidity&gt;5"/>
    </operator>
    <!-- Declares the value 80.0 in Temperature as missing. No mode is set here,
         so the operator default applies (presumably numeric; confirm). -->
    <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value (2)" width="90" x="447" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="Temperature"/>
    <parameter key="numeric_value" value="80.0"/>
    </operator>
    <connect from_op="Retrieve Golf" from_port="output" to_op="Declare Missing Value" to_port="example set input"/>
    <connect from_op="Declare Missing Value" from_port="example set output" to_op="Declare Missing Value (2)" to_port="example set input"/>
    <connect from_op="Declare Missing Value (2)" from_port="example set output" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    <description align="center" color="transparent" colored="false" width="126">Get A data set</description>
    </operator>
    <!-- Step 2: replaces missing values with the constant replenishment
         value -99 (default strategy "value", no per-column overrides). -->
    <operator activated="true" class="replace_missing_values" compatibility="7.3.000" expanded="true" height="103" name="Replace Missing Values" width="90" x="179" y="34">
    <parameter key="default" value="value"/>
    <list key="columns"/>
    <parameter key="replenishment_value" value="-99"/>
    </operator>
    <!-- Step 3: runs with default parameters. Per the accompanying post this
         removes constant attributes, which now includes the formerly
         all-missing ones, but also any other attribute that is constant. -->
    <operator activated="true" class="remove_useless_attributes" compatibility="7.3.000" expanded="true" height="82" name="Remove Useless Attributes" width="90" x="313" y="34"/>
    <!-- Step 4a: turns the nominal value "-99" back into a missing value. -->
    <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value (4)" width="90" x="447" y="34">
    <parameter key="mode" value="nominal"/>
    <parameter key="nominal_value" value="-99"/>
    </operator>
    <!-- Step 4b: turns the numeric value -99.0 back into a missing value.
         No mode is set, so the operator default applies (presumably numeric;
         confirm). -->
    <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value (3)" width="90" x="581" y="34">
    <parameter key="numeric_value" value="-99.0"/>
    </operator>
    <connect from_op="Subprocess" from_port="out 1" to_op="Replace Missing Values" to_port="example set input"/>
    <connect from_op="Replace Missing Values" from_port="example set output" to_op="Remove Useless Attributes" to_port="example set input"/>
    <connect from_op="Remove Useless Attributes" from_port="example set output" to_op="Declare Missing Value (4)" to_port="example set input"/>
    <connect from_op="Declare Missing Value (4)" from_port="example set output" to_op="Declare Missing Value (3)" to_port="example set input"/>
    <connect from_op="Declare Missing Value (3)" from_port="example set output" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>

    Cheers,

    Ingo

Sign In or Register to comment.