RapidMiner

RapidMiner

[SOLVED] Reduce number of nominal classes in attribute

Contributor II

[SOLVED] Reduce number of nominal classes in attribute

I have a nominal attribute with a lot of infrequently used classes in it.

How can I include only the ten most frequently occurring classes and classify the rest as "Other"?

Thanks.
2 REPLIES
Regular Contributor

Re: Reduce number of nominal classes in attribute

Hi,

There is no single operator for this task, but you can do it with a combination of several operators. See my attached process as an example of how to do this.

Have fun
  Marcin


<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Example RapidMiner process: collapses the less frequent values of the nominal
  attribute att1 into a single "Other" value.
  Steps: generate sample data, count the rows per att1 value, sort the counts in
  decreasing order, attach an id to the sorted rows, join that result back onto
  the original rows, overwrite att1 with "Other" where id is greater than 10,
  then remove the helper attributes count(att1) and id.
-->
<process version="5.3.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
    <process expanded="true">
      <!-- Sample input only: one generated attribute with 20 values. Replace this
           operator with your own data source; the steps below refer to the
           attribute by the name att1. -->
      <operator activated="true" class="generate_nominal_data" compatibility="5.3.008" expanded="true" height="60" name="Generate Nominal Data" width="90" x="45" y="30">
        <parameter key="number_of_attributes" value="1"/>
        <parameter key="number_of_values" value="20"/>
      </operator>
      <!-- Subprocess holding the whole value reduction logic; it receives the
           data on "in 1" and delivers the reduced data on "out 1". -->
      <operator activated="true" class="subprocess" compatibility="5.3.008" expanded="true" height="76" name="Reduce values" width="90" x="246" y="30">
        <process expanded="true">
          <!-- Step 1: count the rows per distinct att1 value. The result column is
               referenced downstream as count(att1). -->
          <operator activated="true" class="aggregate" compatibility="5.3.008" expanded="true" height="76" name="Aggregate" width="90" x="45" y="30">
            <list key="aggregation_attributes">
              <parameter key="att1" value="count"/>
            </list>
            <parameter key="group_by_attributes" value="|att1"/>
          </operator>
          <!-- Step 2: order the per-value counts from most to least frequent. -->
          <operator activated="true" class="sort" compatibility="5.3.008" expanded="true" height="76" name="Sort" width="90" x="246" y="30">
            <parameter key="attribute_name" value="count(att1)"/>
            <parameter key="sorting_direction" value="decreasing"/>
          </operator>
          <!-- Step 3: add an id attribute to the sorted counts. NOTE(review):
               presumably ids are assigned sequentially starting at 1 in row
               order, so id acts as the frequency rank; confirm against the
               Generate ID operator defaults. -->
          <operator activated="true" class="generate_id" compatibility="5.3.008" expanded="true" height="76" name="Generate ID" width="90" x="380" y="30"/>
          <!-- Step 4: join the id-annotated counts (left) with the original rows
               (right, taken from the Aggregate "original" port) on att1, so each
               original row carries the id of its value. The generated id is not
               used as the join key. -->
          <operator activated="true" class="join" compatibility="5.3.008" expanded="true" height="76" name="Join" width="90" x="313" y="210">
            <parameter key="use_id_attribute_as_key" value="false"/>
            <list key="key_attributes">
              <parameter key="att1" value="att1"/>
            </list>
          </operator>
          <!-- Step 5: overwrite att1. The escaped expression reads
               if(id>10, "Other", att1): rows whose id is greater than 10 get
               "Other", all other rows keep their value. The threshold 10 is
               hard-coded here; edit it to keep a different number of values. -->
          <operator activated="true" class="generate_attributes" compatibility="5.3.008" expanded="true" height="76" name="Generate Attributes" width="90" x="447" y="210">
            <list key="function_descriptions">
              <parameter key="att1" value="if(id&gt;10, &quot;Other&quot;, att1)"/>
            </list>
          </operator>
          <!-- Step 6: the selection is inverted, so the listed helper attributes
               count(att1) and id are removed and everything else is kept.
               NOTE(review): include_special_attributes is presumably needed
               because id carries a special role; confirm. -->
          <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="581" y="30">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="|count(att1)|id"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Wiring. Aggregate feeds two branches: its aggregated output goes
               through Sort and Generate ID into the left join input, while its
               "original" port supplies the right join input. -->
          <connect from_port="in 1" to_op="Aggregate" to_port="example set input"/>
          <connect from_op="Aggregate" from_port="example set output" to_op="Sort" to_port="example set input"/>
          <connect from_op="Aggregate" from_port="original" to_op="Join" to_port="right"/>
          <connect from_op="Sort" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_op="Join" to_port="left"/>
          <connect from_op="Join" from_port="join" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Top-level wiring: sample data into the subprocess, reduced data out as
           the first process result. -->
      <connect from_op="Generate Nominal Data" from_port="output" to_op="Reduce values" to_port="in 1"/>
      <connect from_op="Reduce values" from_port="out 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Contributor II

Re: Reduce number of nominal classes in attribute

Wow, thanks so much for this Marcin...

I figured there was a way to do it, I just need to get a better handle on how to use all of the RM operators. 

I really appreciate the help.