Have log file Need multiple binomial attributes

srinivassrinivas Member Posts: 2 Contributor I
edited November 2018 in Help
I have following example data (each record recording one group membership):

User, Group
bill, 1
bill, 2
bill, 3
mary, 1
mary, 2
ed, 3
...


From the above, I need example data that indicates, for each user and each group, whether the user has generated an event for that group at least once.  For example:

User, Group_1, Group_2, Group_3,...
bill, true, true, true
mary, true, true, false
ed, false, false, true


There are about 8000 groups (numbered 1-8000).  

The purpose of the analysis is to create association rules that identify connections between groups (e.g. "a user who is a member of group 1 is also likely to be a member of group 7").

Given the number of groups, perhaps there is a better way than the create association rules operator...



Answers

  • MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,485 RM Data Scientist
    Hi,

    attached is a quick and dirty way. For sure this is a pivot-thing. Not sure if that is the easiest way to go.

    ~Martin

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="7.0.000">
      <!-- Turns (User, Group) membership records into one row per User with a
           "true"/"false" column for every Group value that occurs in the data. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.0.000" expanded="true" name="Process">
        <process expanded="true">
          <!-- Builds demo data only: keeps the Iris label, maps the three class names to
               user names, adds a random numeric Group and renames label to User.
               Replace this subprocess with the real (User, Group) log. -->
          <operator activated="true" class="subprocess" compatibility="7.0.000" expanded="true" height="82" name="Subprocess" width="90" x="45" y="34">
            <process expanded="true">
              <operator activated="true" class="retrieve" compatibility="7.0.000" expanded="true" height="68" name="Retrieve Iris" width="90" x="45" y="34">
                <parameter key="repository_entry" value="//Samples/data/Iris"/>
              </operator>
              <operator activated="true" class="select_attributes" compatibility="7.0.000" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="label"/>
                <parameter key="include_special_attributes" value="true"/>
              </operator>
              <operator activated="true" class="map" compatibility="7.0.000" expanded="true" height="82" name="Map" width="90" x="313" y="34">
                <parameter key="include_special_attributes" value="true"/>
                <list key="value_mappings">
                  <parameter key="Iris-setosa" value="Bill"/>
                  <parameter key="Iris-virginica" value="Mary"/>
                  <parameter key="Iris-versicolor" value="Bob"/>
                </list>
              </operator>
              <operator activated="true" class="generate_attributes" compatibility="7.0.000" expanded="true" height="82" name="Generate Attributes" width="90" x="447" y="34">
                <list key="function_descriptions">
                  <parameter key="Group" value="round(rand()*100)"/>
                </list>
              </operator>
              <!-- No target_role is given here; presumably the operator default turns the
                   label back into a regular attribute (TODO confirm). -->
              <operator activated="true" class="set_role" compatibility="7.0.000" expanded="true" height="82" name="Set Role" width="90" x="581" y="34">
                <parameter key="attribute_name" value="label"/>
                <list key="set_additional_roles"/>
              </operator>
              <operator activated="true" class="rename" compatibility="7.0.000" expanded="true" height="82" name="Rename" width="90" x="715" y="34">
                <parameter key="old_name" value="label"/>
                <parameter key="new_name" value="User"/>
                <list key="rename_additional_attributes"/>
              </operator>
              <connect from_op="Retrieve Iris" from_port="output" to_op="Select Attributes" to_port="example set input"/>
              <connect from_op="Select Attributes" from_port="example set output" to_op="Map" to_port="example set input"/>
              <connect from_op="Map" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
              <connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
              <connect from_op="Set Role" from_port="example set output" to_op="Rename" to_port="example set input"/>
              <connect from_op="Rename" from_port="example set output" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
            <description align="center" color="transparent" colored="false" width="126">Get a fitting Data Set</description>
          </operator>
          <!-- Fix: a constant value attribute replaces the former Generate ID step.
               Pivoting on the generated id created one column per input row
               (Group_1.0 ... Group_n.0) instead of one column per group. -->
          <operator activated="true" class="generate_attributes" compatibility="7.0.000" expanded="true" height="82" name="Generate Membership Value" width="90" x="179" y="34">
            <list key="function_descriptions">
              <parameter key="member_of_group" value="1"/>
            </list>
          </operator>
          <!-- One output row per User; Group is the index, so each distinct Group value
               yields one member_of_group column. -->
          <operator activated="true" class="pivot" compatibility="7.0.000" expanded="true" height="82" name="Pivot" width="90" x="313" y="34">
            <parameter key="group_attribute" value="User"/>
            <parameter key="index_attribute" value="Group"/>
            <parameter key="consider_weights" value="false"/>
            <parameter key="skip_constant_attributes" value="false"/>
          </operator>
          <!-- User/Group combinations that never occur come out of Pivot as missing;
               turn them into 0 so the loop below can test for them. -->
          <operator activated="true" class="replace_missing_values" compatibility="7.0.000" expanded="true" height="103" name="Replace Missing Values" width="90" x="447" y="34">
            <parameter key="default" value="zero"/>
            <list key="columns"/>
          </operator>
          <!-- User becomes the id so that it is not one of the attributes rewritten by
               the loop (assumes Loop Attributes visits regular attributes only; TODO confirm). -->
          <operator activated="true" class="set_role" compatibility="7.0.000" expanded="true" height="82" name="Set Role (2)" width="90" x="581" y="34">
            <parameter key="attribute_name" value="User"/>
            <parameter key="target_role" value="id"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Overwrites every pivoted column in place: 0 becomes "false", anything else "true". -->
          <operator activated="true" class="loop_attributes" compatibility="7.0.000" expanded="true" height="82" name="Loop Attributes" width="90" x="849" y="34">
            <process expanded="true">
              <operator activated="true" class="generate_attributes" compatibility="7.0.000" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="112" y="34">
                <list key="function_descriptions">
                  <parameter key="%{loop_attribute}" value="if(#{loop_attribute}==0,&quot;false&quot;,&quot;true&quot;)"/>
                </list>
              </operator>
              <connect from_port="example set" to_op="Generate Attributes (2)" to_port="example set input"/>
              <connect from_op="Generate Attributes (2)" from_port="example set output" to_port="example set"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_result 1" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Subprocess" from_port="out 1" to_op="Generate Membership Value" to_port="example set input"/>
          <connect from_op="Generate Membership Value" from_port="example set output" to_op="Pivot" to_port="example set input"/>
          <connect from_op="Pivot" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
          <connect from_op="Replace Missing Values" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
          <connect from_op="Set Role (2)" from_port="example set output" to_op="Loop Attributes" to_port="example set"/>
          <connect from_op="Loop Attributes" from_port="example set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany
  • srinivassrinivas Member Posts: 2 Contributor I
    Thanks Martin,

    Unfortunately this produces the following output:

    "User","Group_1.0","Group_2.0","Group_3.0","Group_4.0","Group_5.0","Group_6.0"
    "bill","true","true","true","false","false","false"
    "ed","false","false","false","false","false","true"
    "mary","false","false","false","true","true","false"

    Note the introduction of attributes  Group_4.0, Group_5.0, Group_6.0?  My input examples do not contain a group 4-6.  Only groups 1-3.  I think this is caused by use of the generated attribute as index attribute in the pivot.

    I've spent some time with pivot and I can't get it to produce the output I'm looking for.




  • JEdwardJEdward RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 578 Unicorn
    Just spotted Martin's error.   :P :P :P
    What wasn't clear from his example is that the pivot operator works by having both an attribute to group by & also several value attributes which get added up to make the total count in the grouping.  (For example if someone has joined the same group several times)

    Below is the corrected XML with the addition of a value attribute (member_of_group) set to 1.  This means that every group the person is a member of gets a value of 1, and when added up it becomes Bill: 0,1,1,0, etc.  
    I also swapped out the loop with a Numerical to Binominal operator as I think that looks a little neater; both our processes do the same at this point.
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="7.0.000">
      <!-- Pivots (User, Group) membership records into one row per User with a binominal
           column per Group, then feeds the result to FP-Growth and Create Association Rules. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.0.000" expanded="true" name="Process">
        <process expanded="true">
          <!-- Demo input only: the sample records are written out as a document and read
               back as CSV. Replace this subprocess with the real data source. -->
          <operator activated="true" class="subprocess" compatibility="7.0.000" expanded="true" height="82" name="Subprocess" width="90" x="45" y="34">
            <process expanded="true">
              <operator activated="true" class="text:create_document" compatibility="7.0.000" expanded="true" height="68" name="Create Document" width="90" x="45" y="34">
                <parameter key="text" value="User,Group&#10;bill,2&#10;bill,1&#10;bill,3&#10;mary,1&#10;mary,2&#10;ed,3&#10;"/>
              </operator>
              <operator activated="true" class="text:write_document" compatibility="7.0.000" expanded="true" height="82" name="Write Document" width="90" x="179" y="85"/>
              <!-- Row 0 is annotated as the name row; Group is declared polynominal, so the
                   group numbers are read as nominal values rather than numbers. -->
              <operator activated="true" class="read_csv" compatibility="7.0.000" expanded="true" height="68" name="Read CSV" width="90" x="380" y="85">
                <parameter key="column_separators" value=","/>
                <parameter key="use_quotes" value="false"/>
                <parameter key="first_row_as_names" value="false"/>
                <list key="annotations">
                  <parameter key="0" value="Name"/>
                </list>
                <list key="data_set_meta_data_information">
                  <parameter key="0" value="User.true.nominal.attribute"/>
                  <parameter key="1" value="Group.true.polynominal.attribute"/>
                </list>
              </operator>
              <connect from_op="Create Document" from_port="output" to_op="Write Document" to_port="document"/>
              <connect from_op="Write Document" from_port="file" to_op="Read CSV" to_port="file"/>
              <connect from_op="Read CSV" from_port="output" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
            <description align="center" color="transparent" colored="false" width="126">Get a fitting Data Set</description>
          </operator>
          <!-- Constant 1 per record: this is the value attribute that Pivot spreads over
               the Group columns. -->
          <operator activated="true" class="generate_attributes" compatibility="7.0.000" expanded="true" height="82" name="Generate Attributes" width="90" x="179" y="34">
            <list key="function_descriptions">
              <parameter key="member_of_group" value="1"/>
            </list>
            <description align="center" color="transparent" colored="false" width="126">This adds a value attribute to the dataset.</description>
          </operator>
          <!-- One output row per User, one member_of_group column per distinct Group. -->
          <operator activated="true" class="pivot" compatibility="7.0.000" expanded="true" height="82" name="Pivot" width="90" x="313" y="34">
            <parameter key="group_attribute" value="User"/>
            <parameter key="index_attribute" value="Group"/>
            <parameter key="consider_weights" value="false"/>
            <parameter key="skip_constant_attributes" value="false"/>
          </operator>
          <!-- Converts the pivoted numeric columns to binominal using the operator's
               default settings. -->
          <operator activated="true" class="numerical_to_binominal" compatibility="7.0.000" expanded="true" height="82" name="Numerical to Binominal" width="90" x="447" y="34"/>
          <!-- Groups a user never joined are still missing at this point; mark them "false". -->
          <operator activated="true" class="replace_missing_values" compatibility="7.0.000" expanded="true" height="103" name="Replace Missing Values" width="90" x="581" y="34">
            <parameter key="attribute_filter_type" value="value_type"/>
            <parameter key="value_type" value="binominal"/>
            <parameter key="default" value="value"/>
            <list key="columns"/>
            <parameter key="replenishment_value" value="false"/>
          </operator>
          <operator activated="true" class="order_attributes" compatibility="7.0.000" expanded="true" height="82" name="Reorder Attributes" width="90" x="715" y="34">
            <parameter key="attribute_ordering" value="User"/>
            <description align="center" color="transparent" colored="false" width="126">This is just to move the User column back to the front &amp; ensure every other group is also ordered alphabetically.</description>
          </operator>
          <!-- User is given the id role before the data goes to FP-Growth, presumably so
               it is not treated as an item (TODO confirm). -->
          <operator activated="true" class="set_role" compatibility="7.0.000" expanded="true" height="82" name="Set Role" width="90" x="581" y="238">
            <parameter key="attribute_name" value="User"/>
            <parameter key="target_role" value="id"/>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="fp_growth" compatibility="7.0.000" expanded="true" height="82" name="FP-Growth" width="90" x="849" y="238"/>
          <operator activated="true" class="create_association_rules" compatibility="7.0.000" expanded="true" height="82" name="Create Association Rules" width="90" x="849" y="85"/>
          <connect from_op="Subprocess" from_port="out 1" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Pivot" to_port="example set input"/>
          <connect from_op="Pivot" from_port="example set output" to_op="Numerical to Binominal" to_port="example set input"/>
          <connect from_op="Numerical to Binominal" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
          <connect from_op="Replace Missing Values" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
          <connect from_op="Reorder Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="FP-Growth" to_port="example set"/>
          <!-- result 1 = association rules, result 2 = the pivoted example set. -->
          <connect from_op="FP-Growth" from_port="example set" to_port="result 2"/>
          <connect from_op="FP-Growth" from_port="frequent sets" to_op="Create Association Rules" to_port="item sets"/>
          <connect from_op="Create Association Rules" from_port="rules" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>
  • MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,485 RM Data Scientist
    Thanks John,

    and sorry for the mistake :/ That happens if you do such a thing in the last 3 minutes before a sales call starts.


    ~Martin
    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany
Sign In or Register to comment.