Compute distance to centroid

wessel · May 2012

Dear All,

How can I compute the distance from each example to all cluster centroids?
For example:
1. apply k-means clustering
2. obtain k centroids
3. for each example in the example set:
compute k distances to k centroids

Best regards,

Wessel

wessel · May 2012

Here is my best attempt:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process: distance from every example to every k-means centroid.

  Stage 1 (top row): load the Sonar sample data, normalize it, cluster it with
  k-means, extract the cluster prototypes (centroids) and compute the cross
  distances between the clustered examples (request set) and the prototypes
  (reference set).
  Stage 2 (middle row): pivot the distance table on request/document, join it
  back onto the examples, drop the attributes matched by the regular
  expression, and store the result as TEMP.
  Stage 3 (bottom row): read TEMP back and cross-validate a k-NN learner on it,
  reporting kappa.
-->
<process version="5.2.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.006" expanded="true" name="Process">
    <process expanded="true" height="422" width="705">
      <!-- Stage 1: data, clustering, centroids, distances. -->
      <operator activated="true" class="retrieve" compatibility="5.2.006" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Samples/data/Sonar"/>
      </operator>
      <operator activated="true" class="normalize" compatibility="5.2.006" expanded="true" height="94" name="Normalize" width="90" x="180" y="30"/>
      <operator activated="true" class="k_means" compatibility="5.2.006" expanded="true" height="76" name="Clustering" width="90" x="315" y="30">
        <parameter key="k" value="10"/>
      </operator>
      <operator activated="true" class="extract_prototypes" compatibility="5.2.006" expanded="true" height="76" name="Extract Cluster Prototypes" width="90" x="447" y="30"/>
      <operator activated="true" class="cross_distances" compatibility="5.2.006" expanded="true" height="94" name="Cross Distances" width="90" x="585" y="30"/>
      <!-- Stage 2: reshape the distance table, grouping by request and indexing by document. -->
      <operator activated="true" class="pivot" compatibility="5.2.006" expanded="true" height="76" name="Pivot" width="90" x="45" y="120">
        <parameter key="group_attribute" value="request"/>
        <parameter key="index_attribute" value="document"/>
        <parameter key="consider_weights" value="false"/>
        <parameter key="skip_constant_attributes" value="false"/>
      </operator>
      <!-- Right join: pivoted distances (left) onto the request set (right), matching request to id. -->
      <operator activated="true" class="join" compatibility="5.2.006" expanded="true" height="76" name="Join" width="90" x="180" y="120">
        <parameter key="join_type" value="right"/>
        <parameter key="use_id_attribute_as_key" value="false"/>
        <list key="key_attributes">
          <parameter key="request" value="id"/>
        </list>
      </operator>
      <!-- Inverted selection: every attribute matching "at.*" or "request" is removed. -->
      <operator activated="true" class="select_attributes" compatibility="5.2.006" expanded="true" height="76" name="Select Attributes" width="90" x="315" y="120">
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value="at.*|request"/>
        <parameter key="invert_selection" value="true"/>
      </operator>
      <operator activated="true" class="store" compatibility="5.2.006" expanded="true" height="60" name="Store" width="90" x="447" y="120">
        <parameter key="repository_entry" value="TEMP"/>
      </operator>
      <!--
        Stage 3: reads back the entry written by Store. This operator is named
        "Retrieve (2)" because connections refer to operators by name; sharing
        the name "Retrieve" with the Sonar loader would wire Validation to the
        wrong operator. No connection links Store to this operator, so Store
        must stay ahead of it in this file (NOTE(review): confirm the execution
        order in the RapidMiner GUI after import).
      -->
      <operator activated="true" class="retrieve" compatibility="5.2.006" expanded="true" height="60" name="Retrieve (2)" width="90" x="112" y="255">
        <parameter key="repository_entry" value="TEMP"/>
      </operator>
      <operator activated="true" class="x_validation" compatibility="5.2.006" expanded="true" height="112" name="Validation" width="90" x="313" y="255">
        <!-- Training subprocess: k-NN is the active learner; Naive Bayes is present but deactivated. -->
        <process expanded="true" height="421" width="165">
          <operator activated="false" class="naive_bayes" compatibility="5.2.006" expanded="true" height="76" name="Naive Bayes" width="90" x="45" y="30"/>
          <operator activated="true" class="k_nn" compatibility="5.2.006" expanded="true" height="76" name="k-NN" width="90" x="87" y="108"/>
          <connect from_port="training" to_op="k-NN" to_port="training set"/>
          <connect from_op="k-NN" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: apply the model to the test set and measure kappa (accuracy is switched off). -->
        <process expanded="true" height="421" width="300">
          <operator activated="true" class="apply_model" compatibility="5.2.006" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_classification" compatibility="5.2.006" expanded="true" height="76" name="Performance" width="90" x="180" y="30">
            <parameter key="accuracy" value="false"/>
            <parameter key="kappa" value="true"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- Wiring for stages 1 and 2. -->
      <connect from_op="Retrieve" from_port="output" to_op="Normalize" to_port="example set input"/>
      <connect from_op="Normalize" from_port="example set output" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_op="Extract Cluster Prototypes" to_port="model"/>
      <connect from_op="Clustering" from_port="clustered set" to_op="Cross Distances" to_port="request set"/>
      <connect from_op="Extract Cluster Prototypes" from_port="example set" to_op="Cross Distances" to_port="reference set"/>
      <connect from_op="Cross Distances" from_port="result set" to_op="Pivot" to_port="example set input"/>
      <connect from_op="Cross Distances" from_port="request set" to_op="Join" to_port="right"/>
      <connect from_op="Pivot" from_port="example set output" to_op="Join" to_port="left"/>
      <connect from_op="Join" from_port="join" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Store" to_port="input"/>
      <!-- Wiring for stage 3: Validation trains on the stored distance table, not on the raw Sonar data. -->
      <connect from_op="Retrieve (2)" from_port="output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="180"/>
      <portSpacing port="sink_result 2" spacing="18"/>
      <portSpacing port="sink_result 3" spacing="36"/>
    </process>
  </operator>
</process>

fritmore · May 2012

hi w
So, is your best attempt working?

wessel · May 2012

Hey,

It works, but it is rather slow.

Best regards,

Wessel

fritmore · May 2012

wessel wrote:

Hey,

It works, but it is rather slow.

Best regards,

Wessel

hi W
I do this kind of pre- or post-processing outside of RM, in tools that are much better suited to it; I personally use Matlab.
But since the problem you have is algorithmically very simple, it can be done very fast even in Excel. (Python or Perl should do this in a matter of milliseconds with a very simple single-loop script, even for tens of thousands of examples.)

If you need to do this on the fly, you may want to look at RapidAnalytics, where you can run your current RM k-means clustering process and 'chain' it with an external script.

I am afraid RM will always be S L O W with loops, unless you can write your own operator.

br
f

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

Compute distance to centroid

Answers