Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email address that you previously used to log in to the RapidMiner community. This will ensure that any previously created content is synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email the Community team with any questions.
Tailoring Text Mining Clusters
kulturevulture
Member Posts: 4 Contributor I
Hello. I am trying to classify electronic notebook items into one of three defined categories (clusters). My process does run and I get a count of notebooks into each of 3 clusters. Is there a way to tailor the creation of the clusters since I know generally the identity of what I want each of the 3 clusters to be but not the actual words which will identify the cluster? I'm thinking of something like PCA where I can select the words which would define a cluster. Any ideas would be appreciated.
Here is my XML:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.003">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.2.003" expanded="true" name="Process">
<process expanded="true" height="386" width="415">
<operator activated="true" class="read_excel" compatibility="5.2.003" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
<parameter key="excel_file" value="E:\R-2.14.0\bin\i386\ELN.xls"/>
<parameter key="imported_cell_range" value="A1:C301"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<list key="data_set_meta_data_information">
<parameter key="0" value="ELN.true.nominal.id"/>
<parameter key="1" value="Title.true.text.attribute"/>
<parameter key="2" value="Text.true.text.attribute"/>
</list>
<parameter key="read_not_matching_values_as_missings" value="false"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.2.001" expanded="true" height="76" name="Process Documents from Data" width="90" x="112" y="120">
<parameter key="keep_text" value="true"/>
<parameter key="prune_below_absolute" value="1"/>
<!-- NOTE(review): prune_above_absolute="3" discards every term occurring more
     than 3 times across the corpus, i.e. it keeps only rare terms. If the
     intent was the usual "drop very rare terms", these bounds are likely
     inverted - confirm against the intended pruning behavior. -->
<parameter key="prune_above_absolute" value="3"/>
<list key="specify_weights"/>
<process expanded="true" height="426" width="501">
<operator activated="true" class="text:transform_cases" compatibility="5.2.001" expanded="true" height="60" name="Transform Cases" width="90" x="45" y="30"/>
<operator activated="true" class="text:stem_snowball" compatibility="5.2.001" expanded="true" height="60" name="Stem (Snowball)" width="90" x="179" y="30"/>
<operator activated="true" class="text:tokenize" compatibility="5.2.001" expanded="true" height="60" name="Tokenize" width="90" x="313" y="30"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.2.001" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="45" y="165"/>
<operator activated="true" class="text:filter_by_length" compatibility="5.2.001" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="179" y="210">
<parameter key="min_chars" value="2"/>
</operator>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="5.2.001" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="313" y="255"/>
<!-- FIX: the original wiring ran Transform Cases and Stem (Snowball) on the
     whole untokenized document and filtered stopwords AFTER stemming, so
     stemmed forms no longer matched the English stopword list. Reordered to
     the standard pipeline: Tokenize -> Transform Cases -> Filter Stopwords ->
     Filter Tokens (by Length) -> Stem (Snowball) -> Generate n-Grams. -->
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
<connect from_op="Stem (Snowball)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="k_means" compatibility="5.2.003" expanded="true" height="76" name="Clustering" width="90" x="45" y="300">
<parameter key="add_as_label" value="true"/>
<parameter key="k" value="3"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.2.003" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="300"/>
<operator activated="true" class="write_csv" compatibility="5.2.003" expanded="true" height="76" name="Write CSV" width="90" x="313" y="300"/>
<connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Clustering" to_port="example set"/>
<connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
<connect from_op="Clustering" from_port="clustered set" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Write CSV" to_port="input"/>
<connect from_op="Write CSV" from_port="file" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
Here is my XML:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.003" expanded="true" name="Process">
    <process expanded="true" height="386" width="415">
      <!-- Data source: Excel sheet with an id column plus two text columns -->
      <operator activated="true" class="read_excel" compatibility="5.2.003" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <parameter key="excel_file" value="E:\R-2.14.0\bin\i386\ELN.xls"/>
        <parameter key="imported_cell_range" value="A1:C301"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="ELN.true.nominal.id"/>
          <parameter key="1" value="Title.true.text.attribute"/>
          <parameter key="2" value="Text.true.text.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
      </operator>
      <!-- Text vectorization: nested subprocess applied to each document -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.001" expanded="true" height="76" name="Process Documents from Data" width="90" x="112" y="120">
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_below_absolute" value="1"/>
        <parameter key="prune_above_absolute" value="3"/>
        <list key="specify_weights"/>
        <process expanded="true" height="426" width="501">
          <operator activated="true" class="text:transform_cases" compatibility="5.2.001" expanded="true" height="60" name="Transform Cases" width="90" x="45" y="30"/>
          <operator activated="true" class="text:stem_snowball" compatibility="5.2.001" expanded="true" height="60" name="Stem (Snowball)" width="90" x="179" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.2.001" expanded="true" height="60" name="Tokenize" width="90" x="313" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.2.001" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="45" y="165"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.001" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="179" y="210">
            <parameter key="min_chars" value="2"/>
          </operator>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="5.2.001" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="313" y="255"/>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- k-means with k=3; cluster assignment is written as the label -->
      <operator activated="true" class="k_means" compatibility="5.2.003" expanded="true" height="76" name="Clustering" width="90" x="45" y="300">
        <parameter key="add_as_label" value="true"/>
        <parameter key="k" value="3"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="5.2.003" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="300"/>
      <operator activated="true" class="write_csv" compatibility="5.2.003" expanded="true" height="76" name="Write CSV" width="90" x="313" y="300"/>
      <!-- Top-level wiring: Excel -> vectorize -> cluster -> select -> CSV -->
      <connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
      <connect from_op="Clustering" from_port="clustered set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Write CSV" to_port="input"/>
      <connect from_op="Write CSV" from_port="file" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Tagged:
0