"k-means validation"

MarcosRLMarcosRL Member Posts: 53 Contributor II
edited June 2019 in Help
Hello friends of the community, I have a question.
I have built a clustering with k-means and would like to apply internal quality metrics, such as Sum of Squared Error (SSE) or Silhouette. I need to measure the cohesion and separation of the clusters.
Are these metrics available in RapidMiner?
Tagged:

Answers

  • awchisholmawchisholm RapidMiner Certified Expert, Member Posts: 458 Unicorn
    Hello

    You could use R - I have an example here.

    http://rapidminernotes.blogspot.com/2011/06/counting-clusters-part-r.html

    regards

    Andrew

  • MarcosRLMarcosRL Member Posts: 53 Contributor II
    Hello friends of the community, I have a question.
    I want to validate some clusters produced with k-means using the "Silhouette" metric. For this I downloaded the .jar from this page:

    Then I copied the .jar to the lib/plugins folder.
    Then I connected the output of the "k-means clustering" operator to the "Silhouette" operator, but it seems I am missing something, because I get the following error:
    "Process failed: No data was delivered at port Performance.similarity (disconnected)."
    The process XML is attached below:

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- RapidMiner process definition (process version 5.2.008).
         Pipeline: "Process Documents from Files" delivers an example set to a
         k_means operator ("Clustering", k=3), whose cluster model and clustered
         set are fed to a cluster_silhouette operator ("Performance"). -->
    <process version="5.2.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
        <process expanded="true" height="386" width="681">
          <!-- Reads documents from the three directories listed under
               "text_directories" (doc1, doc2, doc3). The nested process below
               receives each document on its "document" port and hands the result
               back on "document 1". -->
          <operator activated="true" class="text:process_document_from_file" compatibility="5.2.004" expanded="true" height="76" name="Process Documents from Files" width="90" x="112" y="75">
            <list key="text_directories">
              <parameter key="doc1" value="C:\Users\marcos\Desktop\Datos de prueba para clustering\Caso de prueba 2\En español\doc1"/>
              <parameter key="doc2" value="C:\Users\marcos\Desktop\Datos de prueba para clustering\Caso de prueba 2\En español\doc2"/>
              <parameter key="doc3" value="C:\Users\marcos\Desktop\Datos de prueba para clustering\Caso de prueba 2\En español\doc3"/>
            </list>
            <!-- Document preprocessing chain, in the order given by the <connect>
                 elements: Transform Cases, Tokenize, two dictionary based stopword
                 filters, the English stopword filter, then the token length filter
                 (min_chars=3). -->
            <process expanded="true" height="415" width="758">
              <operator activated="true" class="text:transform_cases" compatibility="5.2.004" expanded="true" height="60" name="Transform Cases" width="90" x="45" y="30"/>
              <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize" width="90" x="45" y="120"/>
              <!-- Custom stopword lists are loaded from local text files. -->
              <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.2.004" expanded="true" height="76" name="Filter stopwords_pronombres_preposiciones" width="90" x="45" y="210">
                <parameter key="file" value="C:\Users\marcos\Desktop\stopwords\stopwords_pronombres_preposiciones.txt"/>
              </operator>
              <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.2.004" expanded="true" height="76" name="Filter stopwords_caratula" width="90" x="45" y="300">
                <parameter key="file" value="C:\Users\marcos\Desktop\stopwords\stopwords_caratula.txt"/>
              </operator>
              <!-- NOTE(review): the input directory names suggest Spanish documents,
                   yet an English stopword filter is applied here; verify this is
                   intended. -->
              <operator activated="true" class="text:filter_stopwords_english" compatibility="5.2.004" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="179" y="30"/>
              <operator activated="true" class="text:filter_by_length" compatibility="5.2.004" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="179" y="120">
                <parameter key="min_chars" value="3"/>
              </operator>
              <connect from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Filter stopwords_pronombres_preposiciones" to_port="document"/>
              <connect from_op="Filter stopwords_pronombres_preposiciones" from_port="document" to_op="Filter stopwords_caratula" to_port="document"/>
              <connect from_op="Filter stopwords_caratula" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- k-means with k=3; the numerical measure is set to CosineSimilarity
               and add_as_label is enabled. -->
          <operator activated="true" class="k_means" compatibility="5.2.008" expanded="true" height="76" name="Clustering" width="90" x="320" y="132">
            <parameter key="add_as_label" value="true"/>
            <parameter key="k" value="3"/>
            <parameter key="measure_types" value="NumericalMeasures"/>
            <parameter key="numerical_measure" value="CosineSimilarity"/>
          </operator>
          <!-- Silhouette operator from the rmx_cpplugin extension.
               NOTE(review): only its "cluster model" and "example set" inputs are
               connected below. The error reported for this process names the port
               "Performance.similarity" as disconnected, so this operator
               presumably also needs a similarity input; confirm against the
               extension's documentation. -->
          <operator activated="true" class="rmx_cpplugin:cluster_silhouette" compatibility="0.2.000" expanded="true" height="112" name="Performance" width="90" x="541" y="206"/>
          <connect from_op="Process Documents from Files" from_port="example set" to_op="Clustering" to_port="example set"/>
          <connect from_op="Clustering" from_port="cluster model" to_op="Performance" to_port="cluster model"/>
          <connect from_op="Clustering" from_port="clustered set" to_op="Performance" to_port="example set"/>
          <!-- The operator's example set and performance vector are the process results. -->
          <connect from_op="Performance" from_port="example set" to_port="result 1"/>
          <connect from_op="Performance" from_port="performance vector" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>

    regards
Sign In or Register to comment.