Due to recent updates, all users are required to create an Altair One account to login to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to login to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you login, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
Analyzing information for each cluster
carlitos_mg
Member Posts: 2 Contributor I
Hi,
I need help and advice with this problem in RM. I am trying to group scientific documents into 5 clusters and get the topic of each cluster (I have the documents in PDF and in TXT and I tried both, so the format is not the problem). As you can see in the process, after Extract Cluster Prototypes I try to separate each cluster so that I can analyze the most common words in every cluster and extract its topic. I would like to filter the words, for example removing the ones with a value below 0.01, using Select Attributes or Remove Useless Attributes, but these operators are not working. Instead, I have to transpose the rows, and then I can remove the words that have a value of 0 by adding another Filter Examples operator. I tried adding Text to Nominal because I thought RM might not detect that the frequency is a number after the clusters are separated, but it still does not work.
I hope you understand the problem. I have included the process code below.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (version 5.3.015).
  Pipeline, as wired by the connect elements below:
    1. "Process Documents from Files" turns the text files into an example set.
    2. "Clustering" (k_means, k = 5, CosineSimilarity) clusters that example set.
    3. "Extract Cluster Prototypes" produces an example set from the cluster model.
    4. "Multiply" copies the prototype set into five branches, one per cluster
       (cluster_0 .. cluster_4); each branch filters, transposes and filters again.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">

      <!-- Step 1: read the text files from the configured directory and run the token pipeline on each document. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="75">
        <list key="text_directories">
          <parameter key="Fluid" value="C:\Users\Carlos\Desktop\Rapidminer\Data\Engineering\All TXT"/>
        </list>
        <process expanded="true">
          <!-- Token pipeline operators; their execution chain is defined by the connect elements further down. -->
          <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="246" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="45" y="120"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="179" y="120"/>
          <!-- Content filter on the string "cluster" with the condition inverted; presumably removes tokens matching "cluster" (confirm against the operator documentation). -->
          <operator activated="true" class="text:filter_tokens_by_content" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Content)" width="90" x="313" y="120">
            <parameter key="string" value="cluster"/>
            <parameter key="invert condition" value="true"/>
          </operator>
          <!-- Additional stop word list loaded from a local file. -->
          <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.3.002" expanded="true" height="76" name="Filter Stopwords (Dictionary)" width="90" x="447" y="120">
            <parameter key="file" value="C:\Users\Carlos\Desktop\Rapidminer\Data\Unnecesry words.txt"/>
          </operator>
          <!-- Chain: Transform Cases, Tokenize, Filter Stopwords (English), Filter Tokens (by Length), Filter Tokens (by Content), Filter Stopwords (Dictionary), then out on "document 1". -->
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
          <connect from_op="Filter Tokens (by Content)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
          <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Step 2: k-means with k = 5 using the CosineSimilarity numerical measure. -->
      <!-- NOTE(review): add_as_label is true while add_cluster_attribute is false; confirm this combination is intended. -->
      <operator activated="true" class="k_means" compatibility="5.3.015" expanded="true" height="76" name="Clustering" width="90" x="45" y="210">
        <parameter key="add_cluster_attribute" value="false"/>
        <parameter key="add_as_label" value="true"/>
        <parameter key="k" value="5"/>
        <parameter key="measure_types" value="NumericalMeasures"/>
        <parameter key="numerical_measure" value="CosineSimilarity"/>
      </operator>

      <!-- Step 3: derive the prototype example set from the cluster model and copy it once per cluster branch. -->
      <operator activated="true" class="extract_prototypes" compatibility="5.3.015" expanded="true" height="76" name="Extract Cluster Prototypes" width="90" x="45" y="345"/>
      <operator activated="true" class="multiply" compatibility="5.3.015" expanded="true" height="148" name="Multiply" width="90" x="179" y="165"/>

      <!-- Branch for cluster_4: keep examples with cluster=cluster_4, transpose, then drop examples where att_1=0.0 (the condition is inverted). -->
      <!-- NOTE(review): whether att_1 is numeric or nominal after Transpose is not visible here; verify before using numeric thresholds. -->
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (6)" width="90" x="313" y="390">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="cluster=cluster_4"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="5.3.015" expanded="true" height="76" name="Transpose (5)" width="90" x="447" y="390"/>
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (10)" width="90" x="581" y="390">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="att_1=0.0"/>
        <parameter key="invert_filter" value="true"/>
      </operator>

      <!-- Branch for cluster_3: same three operators as the cluster_4 branch. -->
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (5)" width="90" x="313" y="300">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="cluster=cluster_3"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="5.3.015" expanded="true" height="76" name="Transpose (4)" width="90" x="447" y="300"/>
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (9)" width="90" x="581" y="300">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="att_1=0.0"/>
        <parameter key="invert_filter" value="true"/>
      </operator>

      <!-- Branch for cluster_2. -->
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (4)" width="90" x="313" y="210">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="cluster=cluster_2"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="5.3.015" expanded="true" height="76" name="Transpose (3)" width="90" x="447" y="210"/>
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (8)" width="90" x="581" y="210">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="att_1=0.0"/>
        <parameter key="invert_filter" value="true"/>
      </operator>

      <!-- Branch for cluster_1. -->
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (3)" width="90" x="313" y="120">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="cluster=cluster_1"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="5.3.015" expanded="true" height="76" name="Transpose (2)" width="90" x="447" y="120"/>
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (7)" width="90" x="581" y="120">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="att_1=0.0"/>
        <parameter key="invert_filter" value="true"/>
      </operator>

      <!-- Branch for cluster_0. -->
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples" width="90" x="313" y="30">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="cluster=cluster_0"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="5.3.015" expanded="true" height="76" name="Transpose" width="90" x="447" y="30"/>
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (2)" width="90" x="581" y="30">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="att_1=0.0"/>
        <parameter key="invert_filter" value="true"/>
      </operator>

      <!-- Wiring, main flow: documents to clustering, cluster model to prototypes, prototypes to Multiply. -->
      <!-- The clustered set is delivered on result 7 and the model passed through Extract Cluster Prototypes on result 6. -->
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_op="Extract Cluster Prototypes" to_port="model"/>
      <connect from_op="Clustering" from_port="clustered set" to_port="result 7"/>
      <connect from_op="Extract Cluster Prototypes" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Extract Cluster Prototypes" from_port="model" to_port="result 6"/>

      <!-- Wiring, fan-out: Multiply outputs 1 to 5 feed the cluster_0 to cluster_4 branches respectively. -->
      <connect from_op="Multiply" from_port="output 1" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Filter Examples (3)" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 3" to_op="Filter Examples (4)" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 4" to_op="Filter Examples (5)" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 5" to_op="Filter Examples (6)" to_port="example set input"/>

      <!-- Wiring, per-cluster branches: result 5 = cluster_4, result 4 = cluster_3, result 3 = cluster_2, result 2 = cluster_1, result 1 = cluster_0. -->
      <connect from_op="Filter Examples (6)" from_port="example set output" to_op="Transpose (5)" to_port="example set input"/>
      <connect from_op="Transpose (5)" from_port="example set output" to_op="Filter Examples (10)" to_port="example set input"/>
      <connect from_op="Filter Examples (10)" from_port="example set output" to_port="result 5"/>
      <connect from_op="Filter Examples (5)" from_port="example set output" to_op="Transpose (4)" to_port="example set input"/>
      <connect from_op="Transpose (4)" from_port="example set output" to_op="Filter Examples (9)" to_port="example set input"/>
      <connect from_op="Filter Examples (9)" from_port="example set output" to_port="result 4"/>
      <connect from_op="Filter Examples (4)" from_port="example set output" to_op="Transpose (3)" to_port="example set input"/>
      <connect from_op="Transpose (3)" from_port="example set output" to_op="Filter Examples (8)" to_port="example set input"/>
      <connect from_op="Filter Examples (8)" from_port="example set output" to_port="result 3"/>
      <connect from_op="Filter Examples (3)" from_port="example set output" to_op="Transpose (2)" to_port="example set input"/>
      <connect from_op="Transpose (2)" from_port="example set output" to_op="Filter Examples (7)" to_port="example set input"/>
      <connect from_op="Filter Examples (7)" from_port="example set output" to_port="result 2"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Transpose" to_port="example set input"/>
      <connect from_op="Transpose" from_port="example set output" to_op="Filter Examples (2)" to_port="example set input"/>
      <connect from_op="Filter Examples (2)" from_port="example set output" to_port="result 1"/>

      <!-- Layout-only port spacing for the process canvas. -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
      <portSpacing port="sink_result 6" spacing="0"/>
      <portSpacing port="sink_result 7" spacing="0"/>
      <portSpacing port="sink_result 8" spacing="0"/>
    </process>
  </operator>
</process>
Thank you very much.
I need help and advice with this problem in RM. I am trying to group scientific documents into 5 clusters and get the topic of each cluster (I have the documents in PDF and in TXT and I tried both, so the format is not the problem). As you can see in the process, after Extract Cluster Prototypes I try to separate each cluster so that I can analyze the most common words in every cluster and extract its topic. I would like to filter the words, for example removing the ones with a value below 0.01, using Select Attributes or Remove Useless Attributes, but these operators are not working. Instead, I have to transpose the rows, and then I can remove the words that have a value of 0 by adding another Filter Examples operator. I tried adding Text to Nominal because I thought RM might not detect that the frequency is a number after the clusters are separated, but it still does not work.
I hope you understand the problem. I have included the process code below.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (version 5.3.015).
  Pipeline, as wired by the connect elements below:
    1. "Process Documents from Files" turns the text files into an example set.
    2. "Clustering" (k_means, k = 5, CosineSimilarity) clusters that example set.
    3. "Extract Cluster Prototypes" produces an example set from the cluster model.
    4. "Multiply" copies the prototype set into five branches, one per cluster
       (cluster_0 .. cluster_4); each branch filters, transposes and filters again.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">

      <!-- Step 1: read the text files from the configured directory and run the token pipeline on each document. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="75">
        <list key="text_directories">
          <parameter key="Fluid" value="C:\Users\Carlos\Desktop\Rapidminer\Data\Engineering\All TXT"/>
        </list>
        <process expanded="true">
          <!-- Token pipeline operators; their execution chain is defined by the connect elements further down. -->
          <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="246" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="45" y="120"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="179" y="120"/>
          <!-- Content filter on the string "cluster" with the condition inverted; presumably removes tokens matching "cluster" (confirm against the operator documentation). -->
          <operator activated="true" class="text:filter_tokens_by_content" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Content)" width="90" x="313" y="120">
            <parameter key="string" value="cluster"/>
            <parameter key="invert condition" value="true"/>
          </operator>
          <!-- Additional stop word list loaded from a local file. -->
          <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.3.002" expanded="true" height="76" name="Filter Stopwords (Dictionary)" width="90" x="447" y="120">
            <parameter key="file" value="C:\Users\Carlos\Desktop\Rapidminer\Data\Unnecesry words.txt"/>
          </operator>
          <!-- Chain: Transform Cases, Tokenize, Filter Stopwords (English), Filter Tokens (by Length), Filter Tokens (by Content), Filter Stopwords (Dictionary), then out on "document 1". -->
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
          <connect from_op="Filter Tokens (by Content)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
          <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Step 2: k-means with k = 5 using the CosineSimilarity numerical measure. -->
      <!-- NOTE(review): add_as_label is true while add_cluster_attribute is false; confirm this combination is intended. -->
      <operator activated="true" class="k_means" compatibility="5.3.015" expanded="true" height="76" name="Clustering" width="90" x="45" y="210">
        <parameter key="add_cluster_attribute" value="false"/>
        <parameter key="add_as_label" value="true"/>
        <parameter key="k" value="5"/>
        <parameter key="measure_types" value="NumericalMeasures"/>
        <parameter key="numerical_measure" value="CosineSimilarity"/>
      </operator>

      <!-- Step 3: derive the prototype example set from the cluster model and copy it once per cluster branch. -->
      <operator activated="true" class="extract_prototypes" compatibility="5.3.015" expanded="true" height="76" name="Extract Cluster Prototypes" width="90" x="45" y="345"/>
      <operator activated="true" class="multiply" compatibility="5.3.015" expanded="true" height="148" name="Multiply" width="90" x="179" y="165"/>

      <!-- Branch for cluster_4: keep examples with cluster=cluster_4, transpose, then drop examples where att_1=0.0 (the condition is inverted). -->
      <!-- NOTE(review): whether att_1 is numeric or nominal after Transpose is not visible here; verify before using numeric thresholds. -->
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (6)" width="90" x="313" y="390">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="cluster=cluster_4"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="5.3.015" expanded="true" height="76" name="Transpose (5)" width="90" x="447" y="390"/>
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (10)" width="90" x="581" y="390">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="att_1=0.0"/>
        <parameter key="invert_filter" value="true"/>
      </operator>

      <!-- Branch for cluster_3: same three operators as the cluster_4 branch. -->
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (5)" width="90" x="313" y="300">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="cluster=cluster_3"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="5.3.015" expanded="true" height="76" name="Transpose (4)" width="90" x="447" y="300"/>
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (9)" width="90" x="581" y="300">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="att_1=0.0"/>
        <parameter key="invert_filter" value="true"/>
      </operator>

      <!-- Branch for cluster_2. -->
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (4)" width="90" x="313" y="210">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="cluster=cluster_2"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="5.3.015" expanded="true" height="76" name="Transpose (3)" width="90" x="447" y="210"/>
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (8)" width="90" x="581" y="210">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="att_1=0.0"/>
        <parameter key="invert_filter" value="true"/>
      </operator>

      <!-- Branch for cluster_1. -->
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (3)" width="90" x="313" y="120">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="cluster=cluster_1"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="5.3.015" expanded="true" height="76" name="Transpose (2)" width="90" x="447" y="120"/>
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (7)" width="90" x="581" y="120">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="att_1=0.0"/>
        <parameter key="invert_filter" value="true"/>
      </operator>

      <!-- Branch for cluster_0. -->
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples" width="90" x="313" y="30">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="cluster=cluster_0"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="5.3.015" expanded="true" height="76" name="Transpose" width="90" x="447" y="30"/>
      <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (2)" width="90" x="581" y="30">
        <parameter key="condition_class" value="attribute_value_filter"/>
        <parameter key="parameter_string" value="att_1=0.0"/>
        <parameter key="invert_filter" value="true"/>
      </operator>

      <!-- Wiring, main flow: documents to clustering, cluster model to prototypes, prototypes to Multiply. -->
      <!-- The clustered set is delivered on result 7 and the model passed through Extract Cluster Prototypes on result 6. -->
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_op="Extract Cluster Prototypes" to_port="model"/>
      <connect from_op="Clustering" from_port="clustered set" to_port="result 7"/>
      <connect from_op="Extract Cluster Prototypes" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Extract Cluster Prototypes" from_port="model" to_port="result 6"/>

      <!-- Wiring, fan-out: Multiply outputs 1 to 5 feed the cluster_0 to cluster_4 branches respectively. -->
      <connect from_op="Multiply" from_port="output 1" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Filter Examples (3)" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 3" to_op="Filter Examples (4)" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 4" to_op="Filter Examples (5)" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 5" to_op="Filter Examples (6)" to_port="example set input"/>

      <!-- Wiring, per-cluster branches: result 5 = cluster_4, result 4 = cluster_3, result 3 = cluster_2, result 2 = cluster_1, result 1 = cluster_0. -->
      <connect from_op="Filter Examples (6)" from_port="example set output" to_op="Transpose (5)" to_port="example set input"/>
      <connect from_op="Transpose (5)" from_port="example set output" to_op="Filter Examples (10)" to_port="example set input"/>
      <connect from_op="Filter Examples (10)" from_port="example set output" to_port="result 5"/>
      <connect from_op="Filter Examples (5)" from_port="example set output" to_op="Transpose (4)" to_port="example set input"/>
      <connect from_op="Transpose (4)" from_port="example set output" to_op="Filter Examples (9)" to_port="example set input"/>
      <connect from_op="Filter Examples (9)" from_port="example set output" to_port="result 4"/>
      <connect from_op="Filter Examples (4)" from_port="example set output" to_op="Transpose (3)" to_port="example set input"/>
      <connect from_op="Transpose (3)" from_port="example set output" to_op="Filter Examples (8)" to_port="example set input"/>
      <connect from_op="Filter Examples (8)" from_port="example set output" to_port="result 3"/>
      <connect from_op="Filter Examples (3)" from_port="example set output" to_op="Transpose (2)" to_port="example set input"/>
      <connect from_op="Transpose (2)" from_port="example set output" to_op="Filter Examples (7)" to_port="example set input"/>
      <connect from_op="Filter Examples (7)" from_port="example set output" to_port="result 2"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Transpose" to_port="example set input"/>
      <connect from_op="Transpose" from_port="example set output" to_op="Filter Examples (2)" to_port="example set input"/>
      <connect from_op="Filter Examples (2)" from_port="example set output" to_port="result 1"/>

      <!-- Layout-only port spacing for the process canvas. -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
      <portSpacing port="sink_result 6" spacing="0"/>
      <portSpacing port="sink_result 7" spacing="0"/>
      <portSpacing port="sink_result 8" spacing="0"/>
    </process>
  </operator>
</process>
Thank you very much.
Tagged:
0
Answers