Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Compute distance to centroid

wesselwessel Member Posts: 537 Maven
edited November 2018 in Help
Dear All,

How can I compute the distance from each example to all cluster centroids?
For example:
1. apply k-means clustering
2. obtain k centroids
3. for each example in the example set:
compute k distances to k centroids

Best regards,

Wessel

Answers

  • wesselwessel Member Posts: 537 Maven
    Here is my best attempt:

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      Computes, for every example of the Sonar sample data, its distance to each
      of the k-means centroids, stores the result as TEMP, and then cross-validates
      a k-NN learner on the stored data.
    -->
    <process version="5.2.006">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.006" expanded="true" name="Process">
        <process expanded="true" height="422" width="705">
          <!-- Stage 1: load the data, normalize it, and cluster it with k-means (k = 10). -->
          <operator activated="true" class="retrieve" compatibility="5.2.006" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
            <parameter key="repository_entry" value="//Samples/data/Sonar"/>
          </operator>
          <operator activated="true" class="normalize" compatibility="5.2.006" expanded="true" height="94" name="Normalize" width="90" x="180" y="30"/>
          <operator activated="true" class="k_means" compatibility="5.2.006" expanded="true" height="76" name="Clustering" width="90" x="315" y="30">
            <parameter key="k" value="10"/>
          </operator>
          <!-- Stage 2: turn the cluster model into an example set of prototypes and
               feed it as the reference set to Cross Distances; the clustered examples
               are the request set. -->
          <operator activated="true" class="extract_prototypes" compatibility="5.2.006" expanded="true" height="76" name="Extract Cluster Prototypes" width="90" x="447" y="30"/>
          <operator activated="true" class="cross_distances" compatibility="5.2.006" expanded="true" height="94" name="Cross Distances" width="90" x="585" y="30"/>
          <!-- Stage 3: pivot the Cross Distances result (grouped by "request", indexed by
               "document") and join it back onto the request set, matching "request" to "id". -->
          <operator activated="true" class="pivot" compatibility="5.2.006" expanded="true" height="76" name="Pivot" width="90" x="45" y="120">
            <parameter key="group_attribute" value="request"/>
            <parameter key="index_attribute" value="document"/>
            <parameter key="consider_weights" value="false"/>
            <parameter key="skip_constant_attributes" value="false"/>
          </operator>
          <operator activated="true" class="join" compatibility="5.2.006" expanded="true" height="76" name="Join" width="90" x="180" y="120">
            <parameter key="join_type" value="right"/>
            <parameter key="use_id_attribute_as_key" value="false"/>
            <list key="key_attributes">
              <parameter key="request" value="id"/>
            </list>
          </operator>
          <!-- Inverted selection: attributes matching "at.*" or "request" are removed. -->
          <operator activated="true" class="select_attributes" compatibility="5.2.006" expanded="true" height="76" name="Select Attributes" width="90" x="315" y="120">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value="at.*|request"/>
            <parameter key="invert_selection" value="true"/>
          </operator>
          <!-- Stage 4: persist the result as TEMP and read it back for validation.
               NOTE(review): "TEMP" is a relative repository entry; presumably the process
               itself must be saved in a repository for it to resolve. Please confirm. -->
          <operator activated="true" class="store" compatibility="5.2.006" expanded="true" height="60" name="Store" width="90" x="447" y="120">
            <parameter key="repository_entry" value="TEMP"/>
          </operator>
          <!-- Named "Retrieve (2)" so it does not clash with the first "Retrieve" operator;
               the connection into Validation below refers to this name. -->
          <operator activated="true" class="retrieve" compatibility="5.2.006" expanded="true" height="60" name="Retrieve (2)" width="90" x="112" y="255">
            <parameter key="repository_entry" value="TEMP"/>
          </operator>
          <!-- Stage 5: cross-validation. Training side uses k-NN (Naive Bayes is present
               but deactivated); testing side applies the model and reports kappa. -->
          <operator activated="true" class="x_validation" compatibility="5.2.006" expanded="true" height="112" name="Validation" width="90" x="313" y="255">
            <process expanded="true" height="421" width="165">
              <operator activated="false" class="naive_bayes" compatibility="5.2.006" expanded="true" height="76" name="Naive Bayes" width="90" x="45" y="30"/>
              <operator activated="true" class="k_nn" compatibility="5.2.006" expanded="true" height="76" name="k-NN" width="90" x="87" y="108"/>
              <connect from_port="training" to_op="k-NN" to_port="training set"/>
              <connect from_op="k-NN" from_port="model" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <process expanded="true" height="421" width="300">
              <operator activated="true" class="apply_model" compatibility="5.2.006" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <operator activated="true" class="performance_classification" compatibility="5.2.006" expanded="true" height="76" name="Performance" width="90" x="180" y="30">
                <parameter key="accuracy" value="false"/>
                <parameter key="kappa" value="true"/>
                <list key="class_weights"/>
              </operator>
              <connect from_port="model" to_op="Apply Model" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
              <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <!-- Wiring of the top-level operators. -->
          <connect from_op="Retrieve" from_port="output" to_op="Normalize" to_port="example set input"/>
          <connect from_op="Normalize" from_port="example set output" to_op="Clustering" to_port="example set"/>
          <connect from_op="Clustering" from_port="cluster model" to_op="Extract Cluster Prototypes" to_port="model"/>
          <connect from_op="Clustering" from_port="clustered set" to_op="Cross Distances" to_port="request set"/>
          <connect from_op="Extract Cluster Prototypes" from_port="example set" to_op="Cross Distances" to_port="reference set"/>
          <connect from_op="Cross Distances" from_port="result set" to_op="Pivot" to_port="example set input"/>
          <connect from_op="Cross Distances" from_port="request set" to_op="Join" to_port="right"/>
          <connect from_op="Pivot" from_port="example set output" to_op="Join" to_port="left"/>
          <connect from_op="Join" from_port="join" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Store" to_port="input"/>
          <connect from_op="Retrieve (2)" from_port="output" to_op="Validation" to_port="training"/>
          <connect from_op="Validation" from_port="model" to_port="result 1"/>
          <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="180"/>
          <portSpacing port="sink_result 2" spacing="18"/>
          <portSpacing port="sink_result 3" spacing="36"/>
        </process>
      </operator>
    </process>
  • fritmorefritmore Member Posts: 90 Contributor II
    hi w
    so is ur best attempt working?
  • wesselwessel Member Posts: 537 Maven
    Hey,

    It works, but it is rather slow.

    Best regards,

    Wessel
  • fritmorefritmore Member Posts: 90 Contributor II
    wessel wrote:

    Hey,

    It works, but it is rather slow.

    Best regards,

    Wessel
    hi W
    I do this kind of post- or pre-processing outside of RM, in tools that are much better suited to it; I personally use Matlab,
    but since the problem you have is algorithmically very simple, it can be done very fast even in Excel. (Python and Perl should do this in a matter of milliseconds by means of a very simple one-loop script, even for tens of thousands of examples.)

    if you need to do this on the fly you may want to look at Rapid Analytics where u can run your current RM k-means clustering process and 'chain' it with an external script.

    I am afraid RM will always be S L O W with loops. Unless u can write your own operator ;)

    br
    f
Sign In or Register to comment.