Possible bug in X-Means parameters.

rfuentealbarfuentealba Moderator, RapidMiner Certified Analyst, Member, University Professor Posts: 568 Unicorn
edited December 2018 in Product Feedback - Resolved

Hello, World!

 

I was explaining clustering with k-Means, and my process XML looked like this (note that I'm using the Operator Toolbox extension because I'm too lazy to open Excel for my examples):

 

<?xml version="1.0" encoding="UTF-8"?>
<!--
  k-Means variant of the process.
  "Clustering" (concurrency:k_means) is active and has add_as_label="true".
  "X-Means" and "Set Role" are present but deactivated and not wired to anything.
-->
<process version="8.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Sample data supplied inline as CSV text (header: Color,Status); parse_all_as_nominal is enabled. -->
      <operator activated="true" class="operator_toolbox:create_exampleset" compatibility="1.2.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="45" y="34">
        <parameter key="generator_type" value="comma_separated_text"/>
        <list key="function_descriptions"/>
        <list key="numeric_series_configuration"/>
        <list key="date_series_configuration"/>
        <list key="date_series_configuration (interval)"/>
        <parameter key="input_csv_text" value="Color,Status&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Good&#10;Red,Good&#10;Red,Good&#10;Red,Good&#10;Green,Good&#10;Green,Bad&#10;Red,Good&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Green,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Green,Good&#10;Green,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Green,Good&#10;Red,Bad"/>
        <parameter key="parse_all_as_nominal" value="true"/>
      </operator>
      <operator activated="true" class="generate_id" compatibility="8.2.000" expanded="true" height="82" name="Generate ID" width="90" x="179" y="34"/>
      <!-- Two generated attributes; more_random_stuff is computed from random_stuff. -->
      <operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes" width="90" x="313" y="34">
        <list key="function_descriptions">
          <parameter key="random_stuff" value="rand() * 128"/>
          <parameter key="more_random_stuff" value="rand() * 3.1415926535 / 17 * random_stuff"/>
        </list>
      </operator>
      <!-- Output 1 feeds the clustering branch, output 2 feeds the right side of the Join. -->
      <operator activated="true" class="multiply" compatibility="8.2.000" expanded="true" height="103" name="Multiply" width="90" x="514" y="340"/>
      <!-- Only id, Status and Color are passed on to the clustering branch. -->
      <operator activated="true" class="select_attributes" compatibility="8.2.000" expanded="true" height="82" name="Select Attributes" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="id|Status|Color"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- Deactivated in this variant (activated="false"); no connection references it. -->
      <operator activated="false" class="x_means" compatibility="8.2.000" expanded="true" height="82" name="X-Means" width="90" x="782" y="136">
        <parameter key="measure_types" value="MixedMeasures"/>
        <parameter key="clustering_algorithm" value="FastKMeans"/>
      </operator>
      <!-- Deactivated in this variant (activated="false"); no connection references it. -->
      <operator activated="false" class="set_role" compatibility="8.2.000" expanded="true" height="82" name="Set Role" width="90" x="782" y="238">
        <parameter key="attribute_name" value="cluster"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Active k-Means step: k=4, add_as_label enabled, so its output goes straight to the Decision Tree. -->
      <operator activated="true" class="concurrency:k_means" compatibility="8.2.000" expanded="true" height="82" name="Clustering" width="90" x="782" y="34">
        <parameter key="add_as_label" value="true"/>
        <parameter key="k" value="4"/>
        <parameter key="measure_types" value="MixedMeasures"/>
        <parameter key="max_optimization_steps" value="4"/>
      </operator>
      <!-- Tree trained on the clustered set; pruning and prepruning are both switched off. -->
      <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="8.2.000" expanded="true" height="103" name="Decision Tree" width="90" x="1050" y="187">
        <parameter key="criterion" value="information_gain"/>
        <parameter key="apply_pruning" value="false"/>
        <parameter key="apply_prepruning" value="false"/>
      </operator>
      <!-- Joins the Decision Tree's pass-through example set (left) with Multiply output 2 (right) on id = id. -->
      <operator activated="true" class="concurrency:join" compatibility="8.2.000" expanded="true" height="82" name="Join" width="90" x="1184" y="289">
        <parameter key="use_id_attribute_as_key" value="false"/>
        <list key="key_attributes">
          <parameter key="id" value="id"/>
        </list>
      </operator>
      <!-- Wiring: data preparation chain. -->
      <connect from_op="Create ExampleSet" from_port="output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Join" to_port="right"/>
      <!-- Wiring: clustering branch, then the tree model to result 2 and the joined set to result 3. -->
      <connect from_op="Select Attributes" from_port="example set output" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="clustered set" to_op="Decision Tree" to_port="training set"/>
      <connect from_op="Decision Tree" from_port="model" to_port="result 2"/>
      <connect from_op="Decision Tree" from_port="exampleSet" to_op="Join" to_port="left"/>
      <connect from_op="Join" from_port="join" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="168"/>
      <portSpacing port="sink_result 3" spacing="126"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>

Pay attention to the Clustering operator: it has both "add cluster attribute" and "add as label" checked, and it does add a cluster attribute with the label role.

 

Then I tried to replace k-Means with X-Means:

 

<?xml version="1.0" encoding="UTF-8"?>
<!--
  X-Means variant of the process.
  "X-Means" is active and is followed by a "Set Role" step that gives the attribute
  named cluster the label role before the Decision Tree.
  The k-Means "Clustering" operator is present but deactivated and not wired to anything.
-->
<process version="8.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Sample data supplied inline as CSV text (header: Color,Status); parse_all_as_nominal is enabled. -->
      <operator activated="true" class="operator_toolbox:create_exampleset" compatibility="1.2.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="45" y="34">
        <parameter key="generator_type" value="comma_separated_text"/>
        <list key="function_descriptions"/>
        <list key="numeric_series_configuration"/>
        <list key="date_series_configuration"/>
        <list key="date_series_configuration (interval)"/>
        <parameter key="input_csv_text" value="Color,Status&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Good&#10;Red,Good&#10;Red,Good&#10;Red,Good&#10;Green,Good&#10;Green,Bad&#10;Red,Good&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Green,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Green,Good&#10;Green,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Red,Bad&#10;Green,Bad&#10;Red,Good&#10;Green,Good&#10;Red,Bad&#10;Red,Bad&#10;Green,Good&#10;Red,Bad"/>
        <parameter key="parse_all_as_nominal" value="true"/>
      </operator>
      <operator activated="true" class="generate_id" compatibility="8.2.000" expanded="true" height="82" name="Generate ID" width="90" x="179" y="34"/>
      <!-- Two generated attributes; more_random_stuff is computed from random_stuff. -->
      <operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes" width="90" x="313" y="34">
        <list key="function_descriptions">
          <parameter key="random_stuff" value="rand() * 128"/>
          <parameter key="more_random_stuff" value="rand() * 3.1415926535 / 17 * random_stuff"/>
        </list>
      </operator>
      <!-- Output 1 feeds the clustering branch, output 2 feeds the right side of the Join. -->
      <operator activated="true" class="multiply" compatibility="8.2.000" expanded="true" height="103" name="Multiply" width="90" x="514" y="340"/>
      <!-- Only id, Status and Color are passed on to the clustering branch. -->
      <operator activated="true" class="select_attributes" compatibility="8.2.000" expanded="true" height="82" name="Select Attributes" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="id|Status|Color"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- Active clustering step in this variant; no add_as_label parameter is stored on it here. -->
      <operator activated="true" class="x_means" compatibility="8.2.000" expanded="true" height="82" name="X-Means" width="90" x="782" y="136">
        <parameter key="measure_types" value="MixedMeasures"/>
        <parameter key="clustering_algorithm" value="FastKMeans"/>
      </operator>
      <!-- Workaround step: assigns the label role to the attribute named cluster before the Decision Tree. -->
      <operator activated="true" class="set_role" compatibility="8.2.000" expanded="true" height="82" name="Set Role" width="90" x="916" y="238">
        <parameter key="attribute_name" value="cluster"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Deactivated in this variant (activated="false"); no connection references it. -->
      <operator activated="false" class="concurrency:k_means" compatibility="8.2.000" expanded="true" height="82" name="Clustering" width="90" x="782" y="34">
        <parameter key="add_as_label" value="true"/>
        <parameter key="k" value="4"/>
        <parameter key="measure_types" value="MixedMeasures"/>
        <parameter key="max_optimization_steps" value="4"/>
      </operator>
      <!-- Tree trained on the Set Role output; pruning and prepruning are both switched off. -->
      <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="8.2.000" expanded="true" height="103" name="Decision Tree" width="90" x="1050" y="187">
        <parameter key="criterion" value="information_gain"/>
        <parameter key="apply_pruning" value="false"/>
        <parameter key="apply_prepruning" value="false"/>
      </operator>
      <!-- Joins the Decision Tree's pass-through example set (left) with Multiply output 2 (right) on id = id. -->
      <operator activated="true" class="concurrency:join" compatibility="8.2.000" expanded="true" height="82" name="Join" width="90" x="1184" y="289">
        <parameter key="use_id_attribute_as_key" value="false"/>
        <list key="key_attributes">
          <parameter key="id" value="id"/>
        </list>
      </operator>
      <!-- Wiring: data preparation chain. -->
      <connect from_op="Create ExampleSet" from_port="output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Join" to_port="right"/>
      <!-- Wiring: X-Means, then Set Role, then the tree; model to result 2 and the joined set to result 3. -->
      <connect from_op="Select Attributes" from_port="example set output" to_op="X-Means" to_port="example set"/>
      <connect from_op="X-Means" from_port="clustered set" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Decision Tree" to_port="training set"/>
      <connect from_op="Decision Tree" from_port="model" to_port="result 2"/>
      <connect from_op="Decision Tree" from_port="exampleSet" to_op="Join" to_port="left"/>
      <connect from_op="Join" from_port="join" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="168"/>
      <portSpacing port="sink_result 3" spacing="126"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>

The thing is: if I check "add cluster attribute", it adds the cluster attribute as it says it will. But, and here is the bug report: if I check "add as label", it doesn't give the cluster attribute the label role; instead, it adds an attribute named label. I had to put a Set Role operator after the X-Means operator to actually get a label and feed the Decision Tree.

 

Is this the intended behavior? If so, why? It's counterintuitive.

 

All the best,

 

Rodrigo.

 

Tagged:
0
0 votes

Fixed and Released · Last Updated

9.0.0

Comments

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager
  • rfuentealbarfuentealba Moderator, RapidMiner Certified Analyst, Member, University Professor Posts: 568 Unicorn

    Hi there,

     

    Update: in RapidMiner 9.0 Beta, it no longer renames the cluster attribute to label, yet the operator still ignores the "add as label" checkbox. To use the cluster as a label (e.g. to interpret results with a decision tree, which is my favourite trick up my sleeve), I still need to apply Set Role. 50% of the bug/feature has been resolved.

     

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Minimal reproduction on RapidMiner 9.0.000-BETA: X-Means with add_as_label="true"
      applied to three attributes of the Titanic Training sample data. Both the cluster
      model and the clustered set are delivered as process results.
    -->
    <process version="9.0.000-BETA">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.0.000-BETA" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="retrieve" compatibility="9.0.000-BETA" expanded="true" height="68" name="Retrieve Titanic Training" width="90" x="45" y="34">
            <parameter key="repository_entry" value="//Samples/data/Titanic Training"/>
          </operator>
          <!-- Targets the Survived attribute; no target_role is stored, so the operator's default role applies (TODO confirm which). -->
          <operator activated="true" class="set_role" compatibility="9.0.000-BETA" expanded="true" height="82" name="Set Role" width="90" x="179" y="34">
            <parameter key="attribute_name" value="Survived"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Only Sex, Passenger Class and Age are handed to the clustering step. -->
          <operator activated="true" class="select_attributes" compatibility="9.0.000-BETA" expanded="true" height="82" name="Select Attributes" width="90" x="313" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="Sex|Passenger Class|Age"/>
          </operator>
          <!-- Operator under test: add_as_label is explicitly enabled here. -->
          <operator activated="true" class="x_means" compatibility="9.0.000-BETA" expanded="true" height="82" name="X-Means" width="90" x="447" y="34">
            <parameter key="add_as_label" value="true"/>
            <parameter key="measure_types" value="MixedMeasures"/>
            <parameter key="clustering_algorithm" value="FastKMeans"/>
          </operator>
          <!-- Wiring: linear chain, with the cluster model on result 1 and the clustered set on result 2. -->
          <connect from_op="Retrieve Titanic Training" from_port="output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="X-Means" to_port="example set"/>
          <connect from_op="X-Means" from_port="cluster model" to_port="result 1"/>
          <connect from_op="X-Means" from_port="clustered set" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>

    Is there anything I can do to help?

     

    All the best,

     

    Rodrigo.

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager

    thx @rfuentealba. I forwarded your comment to the dev team and they will follow up if needed. Thanks!

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager

    Bug fix will be in Studio 9.0 release. Thanks for reporting @rfuentealba!

Sign In or Register to comment.