Options

Modelling similar audiences

mugicagonzalez_mugicagonzalez_ Member Posts: 14 Contributor II
Hello

I was wondering if anyone has some tips for creating a model for identifying similar audiences.
I have a list of 1000+ records (people from a CRM with 80000+ people). Now I would like to identify another 1000 people whose profiles are similar to theirs.

Any advice on how to start on this issue?

Cheers,
Pello
Tagged:

Best Answer

  • Options
    mugicagonzalez_mugicagonzalez_ Member Posts: 14 Contributor II
    Solution Accepted
    I think the "Cross Distances" operator does what I'm looking for. Could anyone confirm?

Answers

  • Options
    yyhuangyyhuang Administrator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 364 RM Data Scientist
    edited April 2019
    You are absolutely correct. Cross Distances will generate pairwise distances between the request and reference documents.
    In case there is anything that can be used as prediction labels (loyalty, or customer lifetime value), you can build predictive models for that. 
    If you build a predictive model for high-value customers using the CRM with 80000+ people, you can apply that model to another 1000+ future records for scoring.
    Check out the examples in //Community Samples/Community Real World Use Cases/TelCo Customer Churn/TelCo Customer Churn process 
    More learning material can be found under //Training Resources/--Getting Started--/Processes       using the customer churn data
    <?xml version="1.0" encoding="UTF-8"?><process version="9.2.001">
      <!--
        Example RapidMiner process: nearest-neighbour lookup with Cross Distances.
        Data flow (see the connect elements near the end of the inner process):
          Retrieve Titanic Training                 -> Cross Distances "reference set"
          Retrieve Titanic Unlabeled -> Sample (5)  -> Cross Distances "request set"
          Cross Distances "result set" -> Sort (distance) -> Sort (2) (request) -> result 1
      -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process" origin="GENERATED_TUTORIAL">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Reference set: the repository entry that request rows are compared against. -->
          <operator activated="true" class="retrieve" compatibility="9.2.001" expanded="true" height="68" name="Retrieve Titanic Training" width="90" x="246" y="289">
            <parameter key="repository_entry" value="//Samples/data/Titanic Training"/>
          </operator>
          <!-- Source of the request rows; it is reduced to a small sample by the next operator. -->
          <operator activated="true" class="retrieve" compatibility="9.2.001" expanded="true" height="68" name="Retrieve Titanic Unlabeled" width="90" x="112" y="85">
            <parameter key="repository_entry" value="//Samples/data/Titanic Unlabeled"/>
          </operator>
          <!--
            Sampling mode is "absolute" with sample_size 5, so five rows are drawn as the request set.
            sample_ratio / sample_probability are also stored here; presumably they are ignored
            while the mode is "absolute" (TODO confirm).
            use_local_random_seed is false, so local_random_seed (1992) is presumably not applied
            and the draw depends on the process-level random_seed instead (TODO confirm).
          -->
          <operator activated="true" class="sample" compatibility="9.2.001" expanded="true" height="82" name="Sample" width="90" x="246" y="85">
            <parameter key="sample" value="absolute"/>
            <parameter key="balance_data" value="false"/>
            <parameter key="sample_size" value="5"/>
            <parameter key="sample_ratio" value="0.1"/>
            <parameter key="sample_probability" value="0.1"/>
            <list key="sample_size_per_class"/>
            <list key="sample_ratio_per_class"/>
            <list key="sample_probability_per_class"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <description align="center" color="transparent" colored="false" width="126">select random 5 request samples</description>
          </operator>
          <!--
            Core step: computes distances between request rows and reference rows.
            measure_types is MixedMeasures with mixed_measure MixedEuclideanDistance.
            The nominal_measure, numerical_measure, divergence and kernel_* values are stored as well;
            presumably they only take effect under other measure_types settings (TODO confirm).
            only_top_k is false, so the output is not limited to k results per request
            (the k value of 10 is presumably unused in that case; TODO confirm).
            compute_similarities is false, so the output carries distances rather than similarities.
          -->
          <operator activated="true" class="cross_distances" compatibility="9.2.001" expanded="true" height="103" name="Cross Distances" origin="GENERATED_TUTORIAL" width="90" x="447" y="136">
            <parameter key="measure_types" value="MixedMeasures"/>
            <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
            <parameter key="nominal_measure" value="NominalDistance"/>
            <parameter key="numerical_measure" value="KernelEuclideanDistance"/>
            <parameter key="divergence" value="GeneralizedIDivergence"/>
            <parameter key="kernel_type" value="radial"/>
            <parameter key="kernel_gamma" value="1.0"/>
            <parameter key="kernel_sigma1" value="1.0"/>
            <parameter key="kernel_sigma2" value="0.0"/>
            <parameter key="kernel_sigma3" value="2.0"/>
            <parameter key="kernel_degree" value="3.0"/>
            <parameter key="kernel_shift" value="1.0"/>
            <parameter key="kernel_a" value="1.0"/>
            <parameter key="kernel_b" value="0.0"/>
            <parameter key="only_top_k" value="false"/>
            <parameter key="k" value="10"/>
            <parameter key="search_for" value="nearest"/>
            <parameter key="compute_similarities" value="false"/>
            <description align="center" color="transparent" colored="false" width="126">request ID linked to the reference document ID, to view the nearest neighboors, sort by increasing distance</description>
          </operator>
          <!-- First sort: order all result rows by the "distance" attribute, smallest first. -->
          <operator activated="true" class="sort" compatibility="9.2.001" expanded="true" height="82" name="Sort" width="90" x="581" y="136">
            <parameter key="attribute_name" value="distance"/>
            <parameter key="sorting_direction" value="increasing"/>
          </operator>
          <!--
            Second sort: order by the "request" attribute.
            NOTE(review): the intent appears to be rows grouped per request with distances ascending
            inside each group; that relies on this sort keeping the previous order for equal
            request values (stable sort). Confirm against the Sort operator documentation.
          -->
          <operator activated="true" class="sort" compatibility="9.2.001" expanded="true" height="82" name="Sort (2)" width="90" x="715" y="136">
            <parameter key="attribute_name" value="request"/>
            <parameter key="sorting_direction" value="increasing"/>
          </operator>
          <!-- Wiring: training data feeds the reference port, the 5-row sample feeds the request port. -->
          <connect from_op="Retrieve Titanic Training" from_port="output" to_op="Cross Distances" to_port="reference set"/>
          <connect from_op="Retrieve Titanic Unlabeled" from_port="output" to_op="Sample" to_port="example set input"/>
          <connect from_op="Sample" from_port="example set output" to_op="Cross Distances" to_port="request set"/>
          <connect from_op="Cross Distances" from_port="result set" to_op="Sort" to_port="example set input"/>
          <connect from_op="Sort" from_port="example set output" to_op="Sort (2)" to_port="example set input"/>
          <connect from_op="Sort (2)" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="90"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    


Sign In or Register to comment.