RapidMiner

How to aggregate age in polynominal type

Newbie di_zhu
Newbie

How to aggregate age in polynominal type

Hello, all

 

In my raw data set, I have an age attribute like the following example:

 

John, 64,

Alice, 33years,

Bob, 22years,

Mike, 50

 

So some of the values have a redundant 'years' at the end. What I eventually need is to compute the average age, and also to group the examples into age groups (0-9, 10-19, 20-29, etc.).

 

1) If I set the attribute type to integer when reading the file, then every example with the redundant 'years' ends up as a missing value.

2) If I read the attribute as polynominal, then use the Replace operator to remove the redundant part from the attribute value, and then apply Nominal to Numerical, what I get is still not a column of numeric type.

 

Is there a workaround for that?

1 REPLY
Highlighted
RM Staff
RM Staff

Re: How to aggregate age in polynominal type

Hi,

 

Read it in as polynominal, then use Replace on it with a regex like:

(\d+)\D.*

replace by

$1

that way you have only the digits in the column.

 

Then you transform it to numerical with the Parse Numbers operator.

 

Afterwards, you can use one of the Discretize operators to get bins and Aggregate to get avg() per Bin.

 

Cheers,

Martin

 

Edit: And here is an example process for it. Maybe we need to adjust the regex a bit :)

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
  <!--
    Example RapidMiner process: builds a tiny example set with a nominal "age"
    column (some values carry a trailing "years"), strips that suffix, parses
    the column to numbers, bins it, and computes the average age per bin.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
      <!--
        Step 1: sample data. Three single-value example sets ("23", "50years",
        "25") are appended and given an id by Generate ID.
        A breakpoint is set after this subprocess (breakpoints="after").
      -->
      <operator activated="true" breakpoints="after" class="subprocess" compatibility="7.6.001" expanded="true" height="82" name="Subprocess" width="90" x="45" y="34">
        <process expanded="true">
          <!-- The values are quoted (&quot;...&quot;) so they are generated as text, not numbers. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="34">
            <list key="attribute_values">
              <parameter key="age" value="&quot;23&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <!-- The one sample row that carries the redundant "years" suffix. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="136">
            <list key="attribute_values">
              <parameter key="age" value="&quot;50years&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (3)" width="90" x="45" y="238">
            <list key="attribute_values">
              <parameter key="age" value="&quot;25&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Stack the three one-row sets into a single example set, then add an id attribute. -->
          <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="124" name="Append" width="90" x="179" y="34"/>
          <operator activated="true" class="generate_id" compatibility="7.6.001" expanded="true" height="82" name="Generate ID" width="90" x="313" y="34"/>
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
          <connect from_op="Append" from_port="merged set" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">Get Example Data</description>
      </operator>
      <!--
        Step 2: strip the suffix. In "age", text matching (\d+)years is replaced
        by capture group 1, i.e. the digits only. Values without the literal
        "years" suffix do not match and are left as they are.
        NOTE(review): only the exact suffix "years" is handled; other variants
        (e.g. "year", "yrs", a space before the suffix) would need a broader regex.
      -->
      <operator activated="true" class="replace" compatibility="7.6.001" expanded="true" height="82" name="Replace" width="90" x="246" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="age"/>
        <parameter key="replace_what" value="(\d+)years"/>
        <parameter key="replace_by" value="$1"/>
      </operator>
      <!--
        Step 3: convert the cleaned nominal values to numbers. No parameters are
        set here, so the operator runs with its defaults
        (presumably applied to all nominal attributes; confirm for wider data sets).
      -->
      <operator activated="true" class="parse_numbers" compatibility="7.6.001" expanded="true" height="82" name="Parse Numbers" width="90" x="380" y="34"/>
      <!--
        Step 4: bin the numeric age into 10 bins. Both outputs are used below:
        "example set output" (binned) goes to Rename, "original" (unbinned) goes to Join.
        NOTE(review): this looks like equal-width binning over the observed value
        range, not fixed decade groups (0-9, 10-19, ...); confirm this is the
        grouping you want.
      -->
      <operator activated="true" class="discretize_by_bins" compatibility="7.6.001" expanded="true" height="103" name="Discretize" width="90" x="514" y="85">
        <parameter key="number_of_bins" value="10"/>
      </operator>
      <!-- Step 5: rename the binned column to "Binned Age", presumably so it does not clash with the numeric "age" in the join. -->
      <operator activated="true" class="rename" compatibility="7.6.001" expanded="true" height="82" name="Rename" width="90" x="648" y="34">
        <parameter key="old_name" value="age"/>
        <parameter key="new_name" value="Binned Age"/>
        <list key="rename_additional_attributes"/>
      </operator>
      <!--
        Step 6: join the binned set (left) with the original numeric set (right).
        No key attributes are listed; presumably the join uses the id created by
        Generate ID (TODO confirm against the Join operator defaults).
      -->
      <operator activated="true" class="join" compatibility="7.6.001" expanded="true" height="82" name="Join" width="90" x="782" y="85">
        <list key="key_attributes"/>
      </operator>
      <!-- Step 7: average of the numeric "age", grouped by "Binned Age". -->
      <operator activated="true" class="aggregate" compatibility="7.6.001" expanded="true" height="82" name="Aggregate" width="90" x="916" y="85">
        <list key="aggregation_attributes">
          <parameter key="age" value="average"/>
        </list>
        <parameter key="group_by_attributes" value="Binned Age"/>
      </operator>
      <!-- Wiring: Subprocess, Replace, Parse Numbers, Discretize, then Rename and the original into Join, then Aggregate to the process result. -->
      <connect from_op="Subprocess" from_port="out 1" to_op="Replace" to_port="example set input"/>
      <connect from_op="Replace" from_port="example set output" to_op="Parse Numbers" to_port="example set input"/>
      <connect from_op="Parse Numbers" from_port="example set output" to_op="Discretize" to_port="example set input"/>
      <connect from_op="Discretize" from_port="example set output" to_op="Rename" to_port="example set input"/>
      <connect from_op="Discretize" from_port="original" to_op="Join" to_port="right"/>
      <connect from_op="Rename" from_port="example set output" to_op="Join" to_port="left"/>
      <connect from_op="Join" from_port="join" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
--------------------------------------------------------------------------
Head of Data Science Services at RapidMiner