RapidMiner

RapidMiner

Select all attributes having only missing value

Contributor

Select all attributes having only missing value

Hi,

 

How can I select all the attributes which have ONLY missing values in RapidMiner? I don't want to select other attributes which have both missing and non-missing values. If I set 'no_missing_values' in the attribute_filter_type option of the 'Select Attributes' operator and invert the selection, it selects the attributes which have both missing and non-missing values. But I need to select only the attributes in which all the values are missing.

 

Thanks,

Zubair

2 REPLIES
RMStaff

Re: Select all attributes having only missing value

Dear Zubairali,

 

Interesting question. I did not find a one-operator solution. Attached is a longer process that does the job. I would be curious whether there is an easier way to do it.


~Martin

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.3.000">
  <!--
    Demo process on the Golf sample data. It counts the non-missing values of
    every attribute, removes attributes from the aggregated table depending on
    that count, converts the remaining table into attribute weights and applies
    those weights to the original data.
    NOTE(review): as wired, the result appears to KEEP the attributes that have
    at least one non-missing value (i.e. the all-missing ones are dropped) -
    confirm this is the intended direction.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.3.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Builds the test data: loads Golf, then declares missing values in Humidity (by expression) and in Temperature (numeric value 80.0). -->
      <operator activated="true" class="subprocess" compatibility="7.3.000" expanded="true" height="82" name="Subprocess" width="90" x="45" y="187">
        <process expanded="true">
          <operator activated="true" class="retrieve" compatibility="7.3.000" expanded="true" height="68" name="Retrieve Golf" width="90" x="45" y="34">
            <parameter key="repository_entry" value="//Samples/data/Golf"/>
          </operator>
          <!-- NOTE(review): presumably every Humidity value in Golf is greater than 5, so this makes the whole column missing - confirm. -->
          <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value" width="90" x="179" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Humidity"/>
            <parameter key="mode" value="expression"/>
            <parameter key="expression_value" value="Humidity&gt;5"/>
          </operator>
          <!-- Only the value 80.0 of Temperature is declared missing here. -->
          <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value (2)" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Temperature"/>
            <parameter key="numeric_value" value="80.0"/>
          </operator>
          <connect from_op="Retrieve Golf" from_port="output" to_op="Declare Missing Value" to_port="example set input"/>
          <connect from_op="Declare Missing Value" from_port="example set output" to_op="Declare Missing Value (2)" to_port="example set input"/>
          <connect from_op="Declare Missing Value (2)" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">Get A data set</description>
      </operator>
      <!-- Stores a macro named "examples" (macro_type left at its default). No other operator in this process reads that macro. -->
      <operator activated="true" class="extract_macro" compatibility="7.3.000" expanded="true" height="68" name="Extract Macro" width="90" x="179" y="187">
        <parameter key="macro" value="examples"/>
        <list key="additional_macros"/>
      </operator>
      <!-- Applies the default aggregation "count (ignoring missings)"; no explicit aggregation attributes are listed. The "original" port feeds Select by Weights below. -->
      <operator activated="true" class="aggregate" compatibility="7.3.000" expanded="true" height="82" name="Aggregate" width="90" x="313" y="187">
        <parameter key="use_default_aggregation" value="true"/>
        <parameter key="default_aggregation_function" value="count (ignoring missings)"/>
        <list key="aggregation_attributes"/>
      </operator>
      <!-- Strips the countWithOutMissings(...) wrapper from the attribute names, keeping only the text inside the parentheses (presumably the original attribute name). -->
      <operator activated="true" class="rename_by_replacing" compatibility="7.3.000" expanded="true" height="82" name="Rename by Replacing" width="90" x="447" y="85">
        <parameter key="replace_what" value="countWithOutMissings\((.*)\)"/>
        <parameter key="replace_by" value="$1"/>
      </operator>
      <!-- For each attribute of the aggregated table: read its value in example 1 into the macro nonMissings, then branch on that value. -->
      <operator activated="true" class="loop_attributes" compatibility="7.3.000" expanded="true" height="82" name="Loop Attributes" width="90" x="581" y="85">
        <process expanded="true">
          <operator activated="true" class="extract_macro" compatibility="7.3.000" expanded="true" height="68" name="Extract Macro (2)" width="90" x="179" y="34">
            <parameter key="macro" value="nonMissings"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="%{loop_attribute}"/>
            <parameter key="example_index" value="1"/>
            <list key="additional_macros"/>
          </operator>
          <!--
            The first inner process passes the data through unchanged; the second
            removes the current loop attribute (single attribute, inverted selection).
            NOTE(review): presumably the first inner process is the "then" branch
            (count != 0) and the second the "else" branch (count == 0) - confirm.
          -->
          <operator activated="true" class="branch" compatibility="7.3.000" expanded="true" height="82" name="Branch" width="90" x="380" y="34">
            <parameter key="condition_type" value="expression"/>
            <parameter key="expression" value="%{nonMissings}!=0"/>
            <process expanded="true">
              <connect from_port="condition" to_port="input 1"/>
              <portSpacing port="source_condition" spacing="0"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="sink_input 1" spacing="0"/>
              <portSpacing port="sink_input 2" spacing="0"/>
            </process>
            <process expanded="true">
              <operator activated="true" class="select_attributes" compatibility="7.3.000" expanded="true" height="82" name="Select Attributes" width="90" x="112" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="%{loop_attribute}"/>
                <parameter key="invert_selection" value="true"/>
              </operator>
              <connect from_port="condition" to_op="Select Attributes" to_port="example set input"/>
              <connect from_op="Select Attributes" from_port="example set output" to_port="input 1"/>
              <portSpacing port="source_condition" spacing="0"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="sink_input 1" spacing="0"/>
              <portSpacing port="sink_input 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="example set" to_op="Extract Macro (2)" to_port="example set"/>
          <connect from_op="Extract Macro (2)" from_port="example set" to_op="Branch" to_port="condition"/>
          <connect from_op="Branch" from_port="input 1" to_port="example set"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="sink_example set" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
        </process>
      </operator>
      <!-- Both operators below run with default parameters: the table left after the loop is turned into attribute weights, which are then applied to the original data coming from Aggregate's "original" port. -->
      <operator activated="true" class="data_to_weights" compatibility="7.3.000" expanded="true" height="82" name="Data to Weights" width="90" x="715" y="85"/>
      <operator activated="true" class="select_by_weights" compatibility="7.3.000" expanded="true" height="103" name="Select by Weights" width="90" x="764" y="187"/>
      <connect from_op="Subprocess" from_port="out 1" to_op="Extract Macro" to_port="example set"/>
      <connect from_op="Extract Macro" from_port="example set" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Rename by Replacing" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="original" to_op="Select by Weights" to_port="example set input"/>
      <connect from_op="Rename by Replacing" from_port="example set output" to_op="Loop Attributes" to_port="example set"/>
      <connect from_op="Loop Attributes" from_port="example set" to_op="Data to Weights" to_port="example set"/>
      <connect from_op="Data to Weights" from_port="weights" to_op="Select by Weights" to_port="weights"/>
      <connect from_op="Select by Weights" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
--------------------------------------------------------------------------
Head of Data Science Services at RapidMiner
RMStaff

Re: Select all attributes having only missing value

Well, a shorter version (although not exactly the same) is to just use "Replace Missing Values" with a constant value that does NOT occur in the data, and then use "Remove Useless Attributes". Of course, this also removes other attributes which are constant (but how useful are those?). You can then turn the constant value you used above back into a missing value with "Declare Missing Value".

 

Here is the code:

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.3.000">
  <!--
    Shorter variant on the Golf sample data: fill every missing value with the
    constant -99, run Remove Useless Attributes, then declare -99 missing again
    (once as a nominal value, once as a numeric value).
    NOTE(review): this assumes -99 does not occur as a genuine value in the
    data - pick a different constant otherwise.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.3.000" expanded="true" name="Process">
    <process expanded="true">
      <!--
        Builds the test data: loads Golf, then declares missing values in Humidity
        (by expression) and in Temperature (numeric value 80.0).
        NOTE(review): breakpoints="after" is set on this operator; presumably it
        pauses execution here so the data can be inspected - remove it for
        unattended runs.
      -->
      <operator activated="true" breakpoints="after" class="subprocess" compatibility="7.3.000" expanded="true" height="82" name="Subprocess" width="90" x="45" y="34">
        <process expanded="true">
          <operator activated="true" class="retrieve" compatibility="7.3.000" expanded="true" height="68" name="Retrieve Golf" width="90" x="45" y="34">
            <parameter key="repository_entry" value="//Samples/data/Golf"/>
          </operator>
          <!-- NOTE(review): presumably every Humidity value in Golf is greater than 5, so this makes the whole column missing - confirm. -->
          <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value" width="90" x="179" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Humidity"/>
            <parameter key="mode" value="expression"/>
            <parameter key="expression_value" value="Humidity&gt;5"/>
          </operator>
          <!-- Only the value 80.0 of Temperature is declared missing here. -->
          <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value (2)" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Temperature"/>
            <parameter key="numeric_value" value="80.0"/>
          </operator>
          <connect from_op="Retrieve Golf" from_port="output" to_op="Declare Missing Value" to_port="example set input"/>
          <connect from_op="Declare Missing Value" from_port="example set output" to_op="Declare Missing Value (2)" to_port="example set input"/>
          <connect from_op="Declare Missing Value (2)" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">Get A data set</description>
      </operator>
      <!-- Replaces missing values with the fixed replenishment value -99 (default strategy "value", no per-column overrides). -->
      <operator activated="true" class="replace_missing_values" compatibility="7.3.000" expanded="true" height="103" name="Replace Missing Values" width="90" x="179" y="34">
        <parameter key="default" value="value"/>
        <list key="columns"/>
        <parameter key="replenishment_value" value="-99"/>
      </operator>
      <!-- Runs with default parameters. Per the accompanying post, this also removes other constant attributes, not only the formerly all-missing ones. -->
      <operator activated="true" class="remove_useless_attributes" compatibility="7.3.000" expanded="true" height="82" name="Remove Useless Attributes" width="90" x="313" y="34"/>
      <!-- Restores the missing values in two passes: first the nominal value "-99", then the numeric value -99.0. -->
      <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value (4)" width="90" x="447" y="34">
        <parameter key="mode" value="nominal"/>
        <parameter key="nominal_value" value="-99"/>
      </operator>
      <operator activated="true" class="declare_missing_value" compatibility="7.3.000" expanded="true" height="82" name="Declare Missing Value (3)" width="90" x="581" y="34">
        <parameter key="numeric_value" value="-99.0"/>
      </operator>
      <connect from_op="Subprocess" from_port="out 1" to_op="Replace Missing Values" to_port="example set input"/>
      <connect from_op="Replace Missing Values" from_port="example set output" to_op="Remove Useless Attributes" to_port="example set input"/>
      <connect from_op="Remove Useless Attributes" from_port="example set output" to_op="Declare Missing Value (4)" to_port="example set input"/>
      <connect from_op="Declare Missing Value (4)" from_port="example set output" to_op="Declare Missing Value (3)" to_port="example set input"/>
      <connect from_op="Declare Missing Value (3)" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Cheers,

Ingo


How to load processes in XML from the forum into RapidMiner: Read this!