Due to recent updates, all users are required to create an Altair One account to login to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to login to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you login, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Entropy and purity of k-means ?

p013498p013498 Member Posts: 1 Learner II
edited January 2020 in Help

Hello,

I have 11 docuemtns which contains unstractiured data. I am using k-means to cluster these documents into 3 clusters. How to validate the k-means using Entropy and Purity ?

 

Is there any direct operater to use and get the real values of Entropy and Purity

 

----------------------------------------------------------------------------------------------------------

XML file:

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.6.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.6.003" expanded="true" name="Process">
    <parameter key="logverbosity" value="all"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="text:process_document_from_file" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="45" y="34">
        <list key="text_directories">
          <parameter key="doc1" value="C:\Papidminer_Data_tags1\doc1"/>
          <parameter key="doc2" value="C:\Papidminer_Data_tags1\doc2"/>
          <parameter key="doc3" value="C:\Papidminer_Data_tags1\doc3"/>
          <parameter key="doc4" value="C:\Papidminer_Data_tags1\doc4"/>
          <parameter key="doc5" value="C:\Papidminer_Data_tags1\doc5"/>
          <parameter key="doc6" value="C:\Papidminer_Data_tags1\doc6"/>
          <parameter key="doc7" value="C:\Papidminer_Data_tags1\doc7"/>
          <parameter key="doc7" value="C:\Papidminer_Data_tags1\doc7"/>
          <parameter key="doc8" value="C:\Papidminer_Data_tags1\doc8"/>
          <parameter key="doc9" value="C:\Papidminer_Data_tags1\doc9"/>
          <parameter key="doc10" value="C:\Papidminer_Data_tags1\doc10"/>
          <parameter key="doc11" value="C:\Papidminer_Data_tags1\doc11"/>
        </list>
        <parameter key="file_pattern" value="*"/>
        <parameter key="extract_text_only" value="true"/>
        <parameter key="use_file_extension_as_type" value="true"/>
        <parameter key="content_type" value="txt"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="34">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="k_means" compatibility="7.6.003" expanded="true" height="82" name="Clustering" width="90" x="313" y="187">
        <parameter key="add_cluster_attribute" value="true"/>
        <parameter key="add_as_label" value="false"/>
        <parameter key="remove_unlabeled" value="false"/>
        <parameter key="k" value="3"/>
        <parameter key="max_runs" value="10"/>
        <parameter key="determine_good_start_values" value="false"/>
        <parameter key="measure_types" value="BregmanDivergences"/>
        <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
        <parameter key="nominal_measure" value="NominalDistance"/>
        <parameter key="numerical_measure" value="DiceSimilarity"/>
        <parameter key="divergence" value="SquaredEuclideanDistance"/>
        <parameter key="kernel_type" value="radial"/>
        <parameter key="kernel_gamma" value="1.0"/>
        <parameter key="kernel_sigma1" value="1.0"/>
        <parameter key="kernel_sigma2" value="0.0"/>
        <parameter key="kernel_sigma3" value="2.0"/>
        <parameter key="kernel_degree" value="3.0"/>
        <parameter key="kernel_shift" value="1.0"/>
        <parameter key="kernel_a" value="1.0"/>
        <parameter key="kernel_b" value="0.0"/>
        <parameter key="max_optimization_steps" value="100"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_port="result 1"/>
      <connect from_op="Clustering" from_port="clustered set" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

 

--------------------------------------------------------------------------------------------------

 

Tagged:
Sign In or Register to comment.