count total occurrence and sort by date

tahsin · May 2021

I have rapidminer example set like this,

ID   Issue       Exp  
100  9/8/2020    11/8/2020
100  8/5/2019    9/5/2019
101  6/3/2020    10/1/2020
102  8/15/2020   12/12/2020

I want to add a new column which will count the occurrence of the ID by adding the numbers and sort by the earliest date so we know at what date how many count I had.

Output like this,

ID   Issue       Exp         Count  
100  8/5/2019    9/5/2019    1
100  9/8/2020    11/8/2020   2
101  6/3/2020    10/1/2020   1
102  8/15/2020   12/12/2020  1

But when I aggregate by ID and do a count it will just count the total instead and show them for the same ID. So, for ID 100 it shows me 2 both the times because it is just adding the numbers both the times.

For example, for ID 100 in 2019 we had only 1 issue date hence count is 1, when we find ID 100 again at 2020 the count will be 2. So, the sort by date is also important because it will help us find the ID occurrence in correct order.

tahsin · May 2021

Yes, if you look at the ID column, for ID 100 the count is 2 both the times. It should be 1 when there was an issue date first time(2019) and 2 when there was another issue date (2020). So, we are incrementing the numbers based on occurrence instead of just counting 2 both times.

Thanks.

MartinLiebig · May 2021

Hi,

have a look at the attached process, it should do the trick.

Best,

Martin

<?xml version="1.0" encoding="UTF-8"?><process version="9.9.000">
<context>
    <input/>
    <output/>
    <macros/>
</context>
<operator activated="true" class="process" compatibility="9.9.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="utility:create_exampleset" compatibility="9.9.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="45" y="34">
        <parameter key="generator_type" value="comma separated text"/>
        <parameter key="number_of_examples" value="100"/>
        <parameter key="use_stepsize" value="false"/>
        <list key="function_descriptions"/>
        <parameter key="add_id_attribute" value="false"/>
        <list key="numeric_series_configuration"/>
        <list key="date_series_configuration"/>
        <list key="date_series_configuration (interval)"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="input_csv_text" value="ID , Issue     , Exp 
100 , 9/8/2020 , 11/8/2020
100 , 8/5/2019 , 9/5/2019
101 , 6/3/2020 , 10/1/2020
102 , 8/15/2020 , 12/12/2020"/>
        <parameter key="column_separator" value=","/>
        <parameter key="parse_all_as_nominal" value="false"/>
        <parameter key="decimal_point_character" value="."/>
        <parameter key="trim_attribute_names" value="true"/>
      </operator>
      <operator activated="true" class="operator_toolbox:group_into_collection" compatibility="3.0.000-SNAPSHOT" expanded="true" height="82" name="Group Into Collection" width="90" x="179" y="85">
        <parameter key="group_by_attribute" value="ID"/>
        <parameter key="group_by_attribute (numerical)" value=""/>
        <parameter key="sorting_order" value="none"/>
      </operator>
      <operator activated="true" class="loop_collection" compatibility="9.9.000" expanded="true" height="82" name="Loop Collection" width="90" x="313" y="85">
        <parameter key="set_iteration_macro" value="false"/>
        <parameter key="macro_name" value="iteration"/>
        <parameter key="macro_start_value" value="1"/>
        <parameter key="unfold" value="false"/>
        <process expanded="true">
          <operator activated="true" class="blending:sort" compatibility="9.9.000" expanded="true" height="82" name="Sort" width="90" x="112" y="34">
            <list key="sort_by">
              <parameter key="Issue" value="ascending"/>
            </list>
          </operator>
          <operator activated="true" class="generate_id" compatibility="9.9.000" expanded="true" height="82" name="Generate ID" width="90" x="313" y="34">
            <parameter key="create_nominal_ids" value="false"/>
            <parameter key="offset" value="0"/>
          </operator>
          <operator activated="true" class="blending:rename" compatibility="9.9.000" expanded="true" height="82" name="Rename" width="90" x="447" y="34">
            <list key="rename attributes">
              <parameter key="id" value="count"/>
            </list>
            <parameter key="from_attribute" value=""/>
            <parameter key="to_attribute" value=""/>
          </operator>
          <connect from_port="single" to_op="Sort" to_port="example set input"/>
          <connect from_op="Sort" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="operator_toolbox:advanced_append" compatibility="3.0.000-SNAPSHOT" expanded="true" height="82" name="Append (Superset)" width="90" x="447" y="85"/>
      <connect from_op="Create ExampleSet" from_port="output" to_op="Group Into Collection" to_port="exa"/>
      <connect from_op="Group Into Collection" from_port="col" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Loop Collection" from_port="output 1" to_op="Append (Superset)" to_port="example set 1"/>
      <connect from_op="Append (Superset)" from_port="merged set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
</operator>
</process>

MartinLiebig · May 2021

Hi,

sounds like you want to aggregate and then join again? Attached is a process on your data.

Bet,

Martin

<?xml version="1.0" encoding="UTF-8"?><process version="9.9.000">
<context>
    <input/>
    <output/>
    <macros/>
</context>
<operator activated="true" class="process" compatibility="9.9.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="utility:create_exampleset" compatibility="9.9.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="45" y="34">
        <parameter key="generator_type" value="comma separated text"/>
        <parameter key="number_of_examples" value="100"/>
        <parameter key="use_stepsize" value="false"/>
        <list key="function_descriptions"/>
        <parameter key="add_id_attribute" value="false"/>
        <list key="numeric_series_configuration"/>
        <list key="date_series_configuration"/>
        <list key="date_series_configuration (interval)"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="input_csv_text" value="ID , Issue     , Exp 
100 , 9/8/2020 , 11/8/2020
100 , 8/5/2019 , 9/5/2019
101 , 6/3/2020 , 10/1/2020
102 , 8/15/2020 , 12/12/2020"/>
        <parameter key="column_separator" value=","/>
        <parameter key="parse_all_as_nominal" value="false"/>
        <parameter key="decimal_point_character" value="."/>
        <parameter key="trim_attribute_names" value="true"/>
      </operator>
      <operator activated="true" class="multiply" compatibility="9.9.000" expanded="true" height="103" name="Multiply" width="90" x="179" y="136"/>
      <operator activated="true" class="aggregate" compatibility="9.9.000" expanded="true" height="82" name="Aggregate" width="90" x="313" y="34">
        <parameter key="use_default_aggregation" value="false"/>
        <parameter key="attribute_filter_type" value="all"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="default_aggregation_function" value="average"/>
        <list key="aggregation_attributes">
          <parameter key="ID" value="count"/>
        </list>
        <parameter key="group_by_attributes" value="ID"/>
        <parameter key="count_all_combinations" value="false"/>
        <parameter key="only_distinct" value="false"/>
        <parameter key="ignore_missings" value="true"/>
      </operator>
      <operator activated="true" class="concurrency:join" compatibility="9.9.000" expanded="true" height="82" name="Join" width="90" x="447" y="136">
        <parameter key="remove_double_attributes" value="true"/>
        <parameter key="join_type" value="inner"/>
        <parameter key="use_id_attribute_as_key" value="false"/>
        <list key="key_attributes">
          <parameter key="ID" value="ID"/>
        </list>
        <parameter key="keep_both_join_attributes" value="false"/>
      </operator>
      <operator activated="true" class="blending:sort" compatibility="9.9.000" expanded="true" height="82" name="Sort" width="90" x="581" y="136">
        <list key="sort_by">
          <parameter key="Exp" value="ascending"/>
        </list>
      </operator>
      <connect from_op="Create ExampleSet" from_port="output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Join" to_port="right"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Join" to_port="left"/>
      <connect from_op="Join" from_port="join" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
</operator>
</process>

tahsin · May 2021

Hi,

Thanks for your reply. I ran your process and this is the result I get. It looks like its the same result I got. Showing 2 for count both the times for ID 100.

Image: https://us.v-cdn.net/6030995/uploads/editor/g9/khwgpse1cffl.png

MartinLiebig · May 2021

Hi,

so why would this be wrong? You want more a cumulative sum?

Best,

Martin

tahsin · May 2021

Great!! Exactly how I expected. Thanks so much for your help.

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

count total occurrence and sort by date

Best Answers

Answers