Identify common attributes using clusters

richyeivrichyeiv Member Posts: 2 Contributor I
edited December 2018 in Help

Hi, I'm quite new to RapidMiner. I was working with the white wine quality dataset from http://www3.dsi.uminho.pt/pcortez/wine/. I have to identify common attributes that generate wines with quality superior to 6 using clustering (k-means and k-medoids). This is the process:

white-wine.PNG

I really don't know how to achieve this.

Thank you in advance.

Answers

  • Pavithra_RaoPavithra_Rao Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 123 RM Data Scientist

    Hi,

     

    Could you please share the XML code of the RapidMiner process you have built (the one in the screenshot you shared)?

     

    This would help to recreate the process with exact parameters of the operators you have set and work on getting the results you are looking for.

     

    Thanks,

    Pavithra

     

  • richyeivrichyeiv Member Posts: 2 Contributor I

    Sorry, the process is quite simple:

     

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      RapidMiner process (version 7.5.003): retrieve the white-wine data, filter
      rows, cluster with K-Means (k = 3), then output the cross distances between
      the cluster prototypes and the clustered examples together with a cluster
      distance performance vector.
    -->
    <process version="7.5.003">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.5.003" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Load the data set from the local repository. -->
          <operator activated="true" class="retrieve" compatibility="7.5.003" expanded="true" height="68" name="Retrieve winequality-white" width="90" x="45" y="34">
            <parameter key="repository_entry" value="//Local Repository/proyecto_mineriaI/winequality-white"/>
          </operator>
          <!--
            Row filter with three custom conditions combined with AND
            (filters_logic_and = true): fixed acidity <= 8.3,
            volatile acidity <= 0.43 and citric acid <= 0.51.
          -->
          <operator activated="true" class="filter_examples" compatibility="7.5.003" expanded="true" height="103" name="Filter outliers" width="90" x="179" y="34">
            <parameter key="parameter_expression" value=""/>
            <parameter key="condition_class" value="custom_filters"/>
            <parameter key="invert_filter" value="false"/>
            <list key="filters_list">
              <parameter key="filters_entry_key" value="fixed acidity.le.8\.3"/>
              <parameter key="filters_entry_key" value="volatile acidity.le.0\.43"/>
              <parameter key="filters_entry_key" value="citric acid.le.0\.51"/>
            </list>
            <parameter key="filters_logic_and" value="true"/>
            <parameter key="filters_check_metadata" value="true"/>
          </operator>
          <!-- Only "output 1" of this Multiply is wired below. -->
          <operator activated="true" class="multiply" compatibility="7.5.003" expanded="true" height="82" name="Multiply" width="90" x="313" y="34"/>
          <!--
            K-Means with k = 3 and at most 10 runs. The cluster id is added as a
            cluster attribute, not as the label (add_as_label = false).
            NOTE(review): no Set Role precedes this operator, so "quality" is
            presumably still a regular attribute and takes part in the distance
            computation; confirm whether that is intended.
          -->
          <operator activated="true" class="k_means" compatibility="7.5.003" expanded="true" height="82" name="K-Means" width="90" x="514" y="34">
            <parameter key="add_cluster_attribute" value="true"/>
            <parameter key="add_as_label" value="false"/>
            <parameter key="remove_unlabeled" value="false"/>
            <parameter key="k" value="3"/>
            <parameter key="max_runs" value="10"/>
            <parameter key="determine_good_start_values" value="false"/>
            <parameter key="measure_types" value="MixedMeasures"/>
            <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
            <parameter key="nominal_measure" value="NominalDistance"/>
            <parameter key="numerical_measure" value="EuclideanDistance"/>
            <parameter key="divergence" value="SquaredEuclideanDistance"/>
            <parameter key="kernel_type" value="radial"/>
            <parameter key="kernel_gamma" value="1.0"/>
            <parameter key="kernel_sigma1" value="1.0"/>
            <parameter key="kernel_sigma2" value="0.0"/>
            <parameter key="kernel_sigma3" value="2.0"/>
            <parameter key="kernel_degree" value="3.0"/>
            <parameter key="kernel_shift" value="1.0"/>
            <parameter key="kernel_a" value="1.0"/>
            <parameter key="kernel_b" value="0.0"/>
            <parameter key="max_optimization_steps" value="100"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
          </operator>
          <!-- Turns the cluster model into an example set of prototypes and passes the model through. -->
          <operator activated="true" class="extract_prototypes" compatibility="7.5.003" expanded="true" height="82" name="Extract Cluster Prototypes" width="90" x="648" y="34"/>
          <!-- Duplicates the clustered set for Cross Distances and Perf kmeans. -->
          <operator activated="true" class="multiply" compatibility="7.5.003" expanded="true" height="103" name="Multiply (3)" width="90" x="648" y="187"/>
          <!-- Cluster distance performance; main criterion is the average within-centroid distance. -->
          <operator activated="true" class="cluster_distance_performance" compatibility="7.5.003" expanded="true" height="103" name="Perf kmeans" width="90" x="849" y="187">
            <parameter key="main_criterion" value="Avg. within centroid distance"/>
            <parameter key="main_criterion_only" value="false"/>
            <parameter key="normalize" value="false"/>
            <parameter key="maximize" value="false"/>
          </operator>
          <!--
            Distances between the prototypes (request set) and the clustered
            examples (reference set); only_top_k = false.
          -->
          <operator activated="true" class="cross_distances" compatibility="7.5.003" expanded="true" height="103" name="Cross Distances" width="90" x="916" y="34">
            <parameter key="measure_types" value="MixedMeasures"/>
            <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
            <parameter key="nominal_measure" value="NominalDistance"/>
            <parameter key="numerical_measure" value="EuclideanDistance"/>
            <parameter key="divergence" value="GeneralizedIDivergence"/>
            <parameter key="kernel_type" value="radial"/>
            <parameter key="kernel_gamma" value="1.0"/>
            <parameter key="kernel_sigma1" value="1.0"/>
            <parameter key="kernel_sigma2" value="0.0"/>
            <parameter key="kernel_sigma3" value="2.0"/>
            <parameter key="kernel_degree" value="3.0"/>
            <parameter key="kernel_shift" value="1.0"/>
            <parameter key="kernel_a" value="1.0"/>
            <parameter key="kernel_b" value="0.0"/>
            <parameter key="only_top_k" value="false"/>
            <parameter key="k" value="10"/>
            <parameter key="search_for" value="nearest"/>
            <parameter key="compute_similarities" value="false"/>
          </operator>
          <connect from_op="Retrieve winequality-white" from_port="output" to_op="Filter outliers" to_port="example set input"/>
          <!--
            Wire the filtered output downstream. This connection previously used
            the "original" port, which hands on the unfiltered input, so the
            three filter conditions had no effect on the clustering.
          -->
          <connect from_op="Filter outliers" from_port="example set output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="K-Means" to_port="example set"/>
          <connect from_op="K-Means" from_port="cluster model" to_op="Extract Cluster Prototypes" to_port="model"/>
          <connect from_op="K-Means" from_port="clustered set" to_op="Multiply (3)" to_port="input"/>
          <connect from_op="Extract Cluster Prototypes" from_port="example set" to_op="Cross Distances" to_port="request set"/>
          <connect from_op="Extract Cluster Prototypes" from_port="model" to_op="Perf kmeans" to_port="cluster model"/>
          <connect from_op="Multiply (3)" from_port="output 1" to_op="Cross Distances" to_port="reference set"/>
          <connect from_op="Multiply (3)" from_port="output 2" to_op="Perf kmeans" to_port="example set"/>
          <!-- Results 1 to 3 come from Cross Distances, results 4 and 5 from Perf kmeans. -->
          <connect from_op="Perf kmeans" from_port="performance" to_port="result 4"/>
          <connect from_op="Perf kmeans" from_port="cluster model" to_port="result 5"/>
          <connect from_op="Cross Distances" from_port="result set" to_port="result 1"/>
          <connect from_op="Cross Distances" from_port="request set" to_port="result 2"/>
          <connect from_op="Cross Distances" from_port="reference set" to_port="result 3"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
          <portSpacing port="sink_result 5" spacing="0"/>
          <portSpacing port="sink_result 6" spacing="0"/>
        </process>
      </operator>
    </process>
  • Pavithra_RaoPavithra_Rao Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 123 RM Data Scientist

    Hi,

     

    Thanks for sharing the XML code.

    I have tried clustering with a slightly different approach. As seen in the process screenshot,

    1. The Set Role operator is used to tell RapidMiner that "quality" is the target variable, which ensures that the clusters are not based on id/special attributes.
    2. The Normalize operator is used to transform the values and normalize the data to produce better statistical results.
    3. The X-Means operator is used instead of k-means, because X-Means automatically determines the value of k depending on the data. More details about this operator can be found in the operator's help window.

    From the Plot tab of the X-Means output, we can observe that Cluster 1 is distinct from the other clusters with respect to chloride content, as its centroid value is greater than the centroid values of the other attributes.

    4. Further, I have used Filter Examples to keep only the examples with a quality value >= 6, to narrow down the analysis. We can use the charting options in the Filter Examples results window to explore the data and observe the distributions of the attribute values in the subset and their cluster groupings.

    5. I have used weighting by relevance (the MR-Weighting operator) to see the importance of these attributes in the data.

     

    Hope this helps. Let me know if you have any further questions or concerns. The screenshots and XML code are attached.

     

    Lastly, I have a question: the goal here is to identify common attributes that generate wines with quality superior to 6. This is more like a supervised learning problem (the target is quality) than an unsupervised learning problem (clustering). Is there any specific reason for choosing a clustering approach here?

     

    Thanks,

    Cheers,

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      RapidMiner process (version 7.6.000): mark "quality" as the label,
      normalize the listed attributes, cluster with X-Means, then (a) extract
      the cluster prototypes and (b) filter the clustered set to quality >= 6
      and compute attribute weights on that subset.
    -->
    <process version="7.6.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.6.000" expanded="true" name="Process">
        <process expanded="true">
          <!-- Load the data set; a breakpoint is set after this operator (breakpoints="after"). -->
          <operator activated="true" breakpoints="after" class="retrieve" compatibility="7.6.000" expanded="true" height="68" name="Retrieve winequality-white" width="90" x="45" y="136">
            <parameter key="repository_entry" value="//MyRepository/winequality/winequality-white"/>
          </operator>
          <!-- Give the "quality" attribute the label role. -->
          <operator activated="true" class="set_role" compatibility="7.6.000" expanded="true" height="82" name="Set Role" width="90" x="179" y="136">
            <parameter key="attribute_name" value="quality"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- The attributes parameter names eleven columns; "quality" is not among them. -->
          <operator activated="true" class="normalize" compatibility="7.6.000" expanded="true" height="103" name="Normalize" width="90" x="313" y="136">
            <parameter key="attributes" value="|volatile acidity|total sulfur dioxide|sulphates|residual sugar|pH|free sulfur dioxide|fixed acidity|density|citric acid|chlorides|alcohol"/>
          </operator>
          <!--
            X-Means clustering.
            NOTE(review): add_as_label = true presumably stores the cluster id
            under the label role, which may displace the "quality" label set by
            Set Role; if so, MR-Weighting below weights against the cluster id.
            Confirm that this is intended.
          -->
          <operator activated="true" class="x_means" compatibility="7.6.000" expanded="true" height="82" name="X-Means" width="90" x="447" y="136">
            <parameter key="add_as_label" value="true"/>
            <parameter key="measure_types" value="MixedMeasures"/>
          </operator>
          <!-- Keep only the examples whose quality is 6 or higher. -->
          <operator activated="true" class="filter_examples" compatibility="7.6.000" expanded="true" height="103" name="Filter Examples" width="90" x="581" y="238">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="quality.ge.6"/>
            </list>
          </operator>
          <!-- Extension operator (class prefix "featselext:"); produces attribute weights on the filtered subset. -->
          <operator activated="true" class="featselext:maximum_relevance_weighting" compatibility="1.1.004" expanded="true" height="82" name="MR-Weighting" width="90" x="782" y="238"/>
          <!-- Duplicates the cluster model: one copy to Extract Cluster Prototypes, one straight to result 1. -->
          <operator activated="true" class="multiply" compatibility="7.6.000" expanded="true" height="103" name="Multiply" width="90" x="581" y="34"/>
          <operator activated="true" class="extract_prototypes" compatibility="7.6.000" expanded="true" height="82" name="Extract Cluster Prototypes" width="90" x="782" y="34"/>
          <!-- Main chain: Retrieve, Set Role, Normalize, X-Means. -->
          <connect from_op="Retrieve winequality-white" from_port="output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Normalize" to_port="example set input"/>
          <connect from_op="Normalize" from_port="example set output" to_op="X-Means" to_port="example set"/>
          <!-- Branches: the cluster model goes to Multiply, the clustered set goes to Filter Examples. -->
          <connect from_op="X-Means" from_port="cluster model" to_op="Multiply" to_port="input"/>
          <connect from_op="X-Means" from_port="clustered set" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="MR-Weighting" to_port="example set"/>
          <connect from_op="MR-Weighting" from_port="weights" to_port="result 4"/>
          <connect from_op="MR-Weighting" from_port="example set" to_port="result 5"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Extract Cluster Prototypes" to_port="model"/>
          <connect from_op="Multiply" from_port="output 2" to_port="result 1"/>
          <connect from_op="Extract Cluster Prototypes" from_port="example set" to_port="result 2"/>
          <connect from_op="Extract Cluster Prototypes" from_port="model" to_port="result 3"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
          <portSpacing port="sink_result 5" spacing="0"/>
          <portSpacing port="sink_result 6" spacing="0"/>
        </process>
      </operator>
    </process>

     

     


    Attachments: cluster_plot.png (cluster centroid plot), process.png (process), weighting.png (weightings)

     

Sign In or Register to comment.