RapidMiner

Remove attributes with missing values exceeding a given threshold (percentage)

Contributor I f_laperna
Contributor I

Remove attributes with missing values exceeding a given threshold (percentage)

Hi, I'm new to RapidMiner. I'm trying to do something very simple, but I'm stuck. Given a data set with many attributes, I want to remove the columns in which more than a given percentage of the values are missing (because I would not be able to fill them with fixed values or infer them). I tried the Remove Useless Attributes operator, but I still have columns with almost 90% missing values, so it didn't work as I wanted. Can you help me achieve this? It should be trivial; I remember that in KNIME there was a specific option in the filter node to specify the percentage threshold.

 

Thank you!

1 REPLY
Highlighted
Maven
Maven

Re: Remove attributes with missing values exceeding a given threshold (percentage)

There are probably a few different ways of doing it, but the easiest I can come up with is using the "Remove Useless Attributes" operator. Please take a look at the example process below (just copy it and paste it into your XML panel, then click the green checkmark):

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
  <!--
    Demonstration process: loads the Golf sample data, adds an attribute that is
    missing for every example whose Outlook is not "sunny", drops Outlook and Wind,
    keeps examples 1 to 10, and then applies Remove Useless Attributes to the result.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
    <process expanded="true">
      <!-- Step 1: read the Golf example set from the Samples repository. -->
      <operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Golf" width="90" x="45" y="136">
        <parameter key="repository_entry" value="//Samples/data/Golf"/>
      </operator>
      <!-- Step 2: create the attribute "Missing Value": 1 where Outlook is "sunny", MISSING_NOMINAL otherwise. -->
      <operator activated="true" class="generate_attributes" compatibility="7.6.001" expanded="true" height="82" name="Generate Attributes" width="90" x="179" y="136">
        <list key="function_descriptions">
          <parameter key="Missing Value" value="if([Outlook] == &quot;sunny&quot;,1, MISSING_NOMINAL)"/>
        </list>
      </operator>
      <!-- Step 3: the subset Outlook|Wind is selected with invert_selection=true, i.e. these two attributes are removed and the rest kept. -->
      <operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true" height="82" name="Select Attributes" width="90" x="313" y="136">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="Outlook|Wind"/>
        <parameter key="invert_selection" value="true"/>
      </operator>
      <!-- Step 4: keep only examples 1 through 10. NOTE(review): breakpoints="after" presumably pauses execution after this operator so the intermediate data can be inspected; confirm. -->
      <operator activated="true" breakpoints="after" class="filter_example_range" compatibility="7.6.001" expanded="true" height="82" name="first 10 examples" width="90" x="447" y="136">
        <parameter key="first_example" value="1"/>
        <parameter key="last_example" value="10"/>
      </operator>
      <!-- Step 5: remove useless attributes. NOTE(review): the two thresholds presumably bound the relative frequency of the most frequent nominal value (remove when above 0.6 or below 0.5); how missing values are counted is not visible here, so verify against the operator documentation. -->
      <operator activated="true" class="remove_useless_attributes" compatibility="7.6.001" expanded="true" height="82" name="Remove Useless Attributes" width="90" x="581" y="136">
        <parameter key="nominal_useless_above" value="0.6"/>
        <parameter key="nominal_useless_below" value="0.5"/>
      </operator>
      <!-- Wiring: the operators are chained in the order above; the final example set is delivered to the process port "result 1". -->
      <connect from_op="Golf" from_port="output" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="first 10 examples" to_port="example set input"/>
      <connect from_op="first 10 examples" from_port="example set output" to_op="Remove Useless Attributes" to_port="example set input"/>
      <connect from_op="Remove Useless Attributes" from_port="example set output" to_port="result 1"/>
      <!-- Port spacing values are layout information for the process ports. -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="90"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>