Options

"association rules -optimal"

veveveve Member Posts: 63 Contributor II
edited June 2019 in Help
Hello,

I have an association rules workflow and it is too slow.

With 10 000 events it finishes in 3.5 minutes, but with 20 000 events it is still running after half an hour.
(In total I have 1 500 000 events.)


As input I have:
- transactionid;itemid

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.013">
 <!-- Process context: no input bindings, one output location
      ("Data_association_rules1") and one macro ("recipe" = 300387239).
      NOTE(review): no operator visible in this process references the
      "recipe" macro; confirm whether it is still needed. -->
 <context>
   <input/>
   <output>
     <location>Data_association_rules1</location>
   </output>
   <macros>
     <macro>
       <key>recipe</key>
       <value>300387239</value>
     </macro>
   </macros>
 </context>
 <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
   <process expanded="true">
     <!-- "Read fav": loads the table "favoritesWithoutQlikview" from schema
          "rapidanalytics" through the stored connection "rapidanalytics".
          The table is addressed by name (define_query = "table name"), not by
          an SQL query, so presumably every row and column is fetched; TODO
          confirm, since row or column filtering could otherwise be pushed
          into the database. -->
     <operator activated="true" class="read_database" compatibility="5.3.013" expanded="true" height="60" name="Read fav" width="90" x="45" y="30">
       <parameter key="connection" value="rapidanalytics"/>
       <parameter key="define_query" value="table name"/>
       <parameter key="use_default_schema" value="false"/>
       <parameter key="schema_name" value="rapidanalytics"/>
       <parameter key="table_name" value="favoritesWithoutQlikview"/>
       <enumeration key="parameters"/>
     </operator>
     <!-- "Text to Nominal": text_to_nominal operator with no parameters set,
          so it runs with the operator defaults. -->
     <operator activated="true" class="text_to_nominal" compatibility="5.3.013" expanded="true" height="76" name="Text to Nominal" width="90" x="180" y="30"/>
     <!-- "Sort": orders the full example set by "recipeid".
          NOTE(review): this runs before "Sample", i.e. on the complete table,
          and the later pivot groups by "userid". No downstream operator in
          this process visibly depends on the recipeid order; confirm whether
          this sort can be removed. -->
     <operator activated="true" class="sort" compatibility="5.3.013" expanded="true" height="76" name="Sort" width="90" x="315" y="30">
       <parameter key="attribute_name" value="recipeid"/>
     </operator>
     <!-- "Sample": sample_size is 20000; the sampling mode is not set, so the
          operator default applies. All per-class lists are empty.
          NOTE(review): the sample is drawn over event rows, not over users,
          so presumably each user's set of recipes is thinned out before the
          pivot; verify that this is intended for basket analysis. -->
     <operator activated="true" class="sample" compatibility="5.3.013" expanded="true" height="76" name="Sample" width="90" x="447" y="30">
       <parameter key="sample_size" value="20000"/>
       <list key="sample_size_per_class"/>
       <list key="sample_ratio_per_class"/>
       <list key="sample_probability_per_class"/>
     </operator>
     <!-- "Print join": progress marker that prints "finished join" to the
          console. The data passes through unchanged (wired via "through 1").
          NOTE(review): this process contains no join operator; the message
          text looks like a leftover from an earlier version. -->
     <operator activated="true" class="print_to_console" compatibility="5.3.013" expanded="true" height="76" name="Print join" width="90" x="585" y="30">
       <parameter key="log_value" value="finished join"/>
     </operator>
     <!-- "pivoting" subprocess: reshapes the (userid, recipeid) event list
          into the binominal table that is handed to the "rules" subprocess.
          Steps, in wiring order: keep userid/recipeid, add constant Fav = 1,
          convert both ids to polynominal, pivot by userid over recipeid,
          replace missing values by zero, convert to binominal, keep only
          binominal attributes.
          parallelize_nested_chain is enabled for this subprocess. -->
     <operator activated="true" class="subprocess" compatibility="5.3.013" expanded="true" height="76" name="pivoting" width="90" x="720" y="30">
       <parameter key="parallelize_nested_chain" value="true"/>
       <process expanded="true">
         <!-- Keep only the two attributes used by the pivot. -->
         <operator activated="true" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes (2)" width="90" x="45" y="30">
           <parameter key="attribute_filter_type" value="subset"/>
           <parameter key="attributes" value="|userid|recipeid"/>
         </operator>
         <!-- Add the constant attribute Fav = 1; it is the value that gets
              spread across the pivot columns. -->
         <operator activated="true" class="generate_attributes" compatibility="5.3.013" expanded="true" height="76" name="Gen 1" width="90" x="180" y="30">
           <list key="function_descriptions">
             <parameter key="Fav" value="1"/>
           </list>
         </operator>
         <!-- NOTE(review): after the two operators above the example set
              should contain exactly Fav, userid and recipeid, so this subset
              selection looks like a no-op; confirm before removing. -->
         <operator activated="true" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes (3)" width="90" x="315" y="30">
           <parameter key="attribute_filter_type" value="subset"/>
           <parameter key="attributes" value="Fav|userid|recipeid"/>
         </operator>
         <!-- Convert the two id attributes from numerical to polynominal. -->
         <operator activated="true" class="numerical_to_polynominal" compatibility="5.3.013" expanded="true" height="76" name="Numerical to Polynominal" width="90" x="450" y="30">
           <parameter key="attribute_filter_type" value="subset"/>
           <parameter key="attributes" value="|userid|recipeid"/>
         </operator>
         <!-- Give recipeid the "label" role before pivoting.
              NOTE(review): the reason for the label role is not visible
              here; verify that the pivot needs it. -->
         <operator activated="true" class="set_role" compatibility="5.3.013" expanded="true" height="76" name="Set Role" width="90" x="585" y="30">
           <parameter key="attribute_name" value="recipeid"/>
           <parameter key="target_role" value="label"/>
           <list key="set_additional_roles"/>
         </operator>
         <!-- Pivot: group by userid, index by recipeid; weights are not
              considered and constant attributes are not skipped.
              NOTE(review): presumably this yields one row per userid and one
              column per distinct recipeid, which grows quickly with the
              sample size. The data management setting is not specified here;
              the forum reply suggests trying "double_sparse_array". TODO
              confirm that the parameter exists in this version. -->
         <operator activated="true" class="pivot" compatibility="5.3.013" expanded="true" height="76" name="pivot" width="90" x="112" y="300">
           <parameter key="group_attribute" value="userid"/>
           <parameter key="index_attribute" value="recipeid"/>
           <parameter key="consider_weights" value="false"/>
           <parameter key="skip_constant_attributes" value="false"/>
         </operator>
         <!-- Cells without a value after the pivot are set to zero. -->
         <operator activated="true" class="replace_missing_values" compatibility="5.3.013" expanded="true" height="94" name="Replace Missing Values" width="90" x="246" y="300">
           <parameter key="default" value="zero"/>
           <list key="columns"/>
         </operator>
         <!-- Default-parameter conversion of the numerical columns to binominal. -->
         <operator activated="true" class="numerical_to_binominal" compatibility="5.3.013" expanded="true" height="76" name="Numerical to Binominal" width="90" x="380" y="300"/>
         <!-- Keep only attributes of value type binominal for the rule mining. -->
         <operator activated="true" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes (4)" width="90" x="514" y="300">
           <parameter key="attribute_filter_type" value="value_type"/>
           <parameter key="value_type" value="binominal"/>
         </operator>
         <!-- Wiring: a single linear chain from "in 1" through the operators
              above, in the order listed, to "out 1". -->
         <connect from_port="in 1" to_op="Select Attributes (2)" to_port="example set input"/>
         <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Gen 1" to_port="example set input"/>
         <connect from_op="Gen 1" from_port="example set output" to_op="Select Attributes (3)" to_port="example set input"/>
         <connect from_op="Select Attributes (3)" from_port="example set output" to_op="Numerical to Polynominal" to_port="example set input"/>
         <connect from_op="Numerical to Polynominal" from_port="example set output" to_op="Set Role" to_port="example set input"/>
         <connect from_op="Set Role" from_port="example set output" to_op="pivot" to_port="example set input"/>
         <connect from_op="pivot" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
         <connect from_op="Replace Missing Values" from_port="example set output" to_op="Numerical to Binominal" to_port="example set input"/>
         <connect from_op="Numerical to Binominal" from_port="example set output" to_op="Select Attributes (4)" to_port="example set input"/>
         <connect from_op="Select Attributes (4)" from_port="example set output" to_port="out 1"/>
         <portSpacing port="source_in 1" spacing="0"/>
         <portSpacing port="source_in 2" spacing="0"/>
         <portSpacing port="sink_out 1" spacing="0"/>
         <portSpacing port="sink_out 2" spacing="0"/>
       </process>
     </operator>
     <!-- "Print pivot": progress marker that prints "finished pivot" to the
          console once the "pivoting" subprocess has delivered its output.
          The data passes through unchanged (wired via "through 1"). -->
     <operator activated="true" class="print_to_console" compatibility="5.3.013" expanded="true" height="76" name="Print pivot" width="90" x="855" y="30">
       <parameter key="log_value" value="finished pivot"/>
     </operator>
     <!-- "rules" subprocess: mines frequent item sets from the pivoted table
          with FP-Growth and derives association rules from them. A Free
          Memory operator sits in front of each of the two mining steps. -->
     <operator activated="true" class="subprocess" compatibility="5.3.013" expanded="true" height="76" name="rules" width="90" x="983" y="30">
       <process expanded="true">
         <operator activated="true" class="free_memory" compatibility="5.3.013" expanded="true" height="76" name="Free Memory" width="90" x="45" y="30"/>
         <!-- FP-Growth settings: at least 2 item sets, up to 30 retries,
              min_support = 1.0E-20.
              NOTE(review): a support threshold of 1.0E-20 is effectively
              zero. If it is the active threshold, nearly every item
              combination that occurs at all counts as frequent, which is a
              likely cause of the runtime growth reported in the post.
              "find_min_number_of_itemsets" is not set, so its default
              applies; check in the FP-Growth documentation whether
              min_support is used at all in that mode, then choose a
              realistic threshold. -->
         <operator activated="true" class="fp_growth" compatibility="5.3.013" expanded="true" height="76" name="FP-Growth" width="90" x="246" y="30">
           <parameter key="min_number_of_itemsets" value="2"/>
           <parameter key="max_number_of_retries" value="30"/>
           <parameter key="min_support" value="1.0E-20"/>
         </operator>
         <operator activated="true" class="free_memory" compatibility="5.3.013" expanded="true" height="76" name="Free Memory (2)" width="90" x="380" y="75"/>
         <!-- Rule generation: min_confidence = 0.2. The rule criterion is not
              set, so the operator default applies.
              NOTE(review): min_criterion_value and gain_theta are presumably
              only relevant for a non-default criterion; confirm. -->
         <operator activated="true" class="create_association_rules" compatibility="5.3.013" expanded="true" height="76" name="Create Association Rules" width="90" x="514" y="30">
           <parameter key="min_confidence" value="0.2"/>
           <parameter key="min_criterion_value" value="5.0E-4"/>
           <parameter key="gain_theta" value="1.0"/>
         </operator>
         <!-- Wiring: in 1, Free Memory, FP-Growth (frequent sets),
              Free Memory (2), Create Association Rules (rules), out 1. -->
         <connect from_port="in 1" to_op="Free Memory" to_port="through 1"/>
         <connect from_op="Free Memory" from_port="through 1" to_op="FP-Growth" to_port="example set"/>
         <connect from_op="FP-Growth" from_port="frequent sets" to_op="Free Memory (2)" to_port="through 1"/>
         <connect from_op="Free Memory (2)" from_port="through 1" to_op="Create Association Rules" to_port="item sets"/>
         <connect from_op="Create Association Rules" from_port="rules" to_port="out 1"/>
         <portSpacing port="source_in 1" spacing="0"/>
         <portSpacing port="source_in 2" spacing="0"/>
         <portSpacing port="sink_out 1" spacing="0"/>
         <portSpacing port="sink_out 2" spacing="0"/>
       </process>
     </operator>
     <!-- Top-level wiring: one linear chain
          Read fav, Text to Nominal, Sort, Sample, Print join, pivoting,
          Print pivot, rules; the rules output is delivered as "result 1". -->
     <connect from_op="Read fav" from_port="output" to_op="Text to Nominal" to_port="example set input"/>
     <connect from_op="Text to Nominal" from_port="example set output" to_op="Sort" to_port="example set input"/>
     <connect from_op="Sort" from_port="example set output" to_op="Sample" to_port="example set input"/>
     <connect from_op="Sample" from_port="example set output" to_op="Print join" to_port="through 1"/>
     <connect from_op="Print join" from_port="through 1" to_op="pivoting" to_port="in 1"/>
     <connect from_op="pivoting" from_port="out 1" to_op="Print pivot" to_port="through 1"/>
     <connect from_op="Print pivot" from_port="through 1" to_op="rules" to_port="in 1"/>
     <connect from_op="rules" from_port="out 1" to_port="result 1"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
   </process>
 </operator>
</process>

Is there any way to optimize (in time and memory/CPU usage) the pivot part and/or the FP-Growth part?

Thank you!!

Alina

Answers

  • Options
    frasfras Member Posts: 93 Contributor II
    Did you try the setting "double_sparse_array" for the "datamanagement" parameter of the "Pivot" operator?
    Do you really get only two attributes (transactionid;itemid) from the "Read Database" operator? I ask because you sort by the attribute "recipeid".
Sign In or Register to comment.