"Training a Text classification model with more than 1 training data inputs/outputs"

svtorykhsvtorykh Member Posts: 35 Guru
edited June 2019 in Help

Hi All!

 

I have a process that is able to take text column and topic column, build the model and train it. I then can use this model to assign topic to new text.

However, I now need to be able to have 2nd topic column added and may be 3rd and 4th to tell the model that this also can be an option. For example text document can contain both Innovation and Teamwork topics. So I want the model to recognize those 2 topics in a piece of text and then provide the output accordingly. Any idea how to implement it in Rapidminer?

 

Thanks much for support!

Answers

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager

    hello @svtorykh - welcome to the Community.  Can you please post the XML from your process using the </> button?

     

    Thanks.

     

    Scott

     

  • svtorykhsvtorykh Member Posts: 35 Guru

    Hi Scott,

     

    Here you go!

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
    <operator activated="true" class="read_excel" compatibility="7.6.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
    <parameter key="excel_file" value="C:\Users\svtorykh\Desktop\TAR\NLP Analysis\Adhoc-Extra\Mastery\RapidMiner Text Tutorials\All Real Deal Theme Training.xlsx"/>
    <parameter key="sheet_number" value="1"/>
    <parameter key="imported_cell_range" value="A1:B11605"/>
    <parameter key="encoding" value="SYSTEM"/>
    <parameter key="first_row_as_names" value="false"/>
    <list key="annotations">
    <parameter key="0" value="Name"/>
    </list>
    <parameter key="date_format" value=""/>
    <parameter key="time_zone" value="SYSTEM"/>
    <parameter key="locale" value="English (United States)"/>
    <list key="data_set_meta_data_information">
    <parameter key="0" value="Text.true.text.attribute"/>
    <parameter key="1" value="Theme.true.polynominal.attribute"/>
    </list>
    <parameter key="read_not_matching_values_as_missings" value="true"/>
    <parameter key="datamanagement" value="double_array"/>
    <parameter key="data_management" value="auto"/>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="45" y="136">
    <parameter key="create_word_vector" value="true"/>
    <parameter key="vector_creation" value="TF-IDF"/>
    <parameter key="add_meta_information" value="true"/>
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_method" value="absolute"/>
    <parameter key="prune_below_percent" value="3.0"/>
    <parameter key="prune_above_percent" value="30.0"/>
    <parameter key="prune_below_absolute" value="200"/>
    <parameter key="prune_above_absolute" value="12000"/>
    <parameter key="prune_below_rank" value="0.05"/>
    <parameter key="prune_above_rank" value="0.95"/>
    <parameter key="datamanagement" value="double_sparse_array"/>
    <parameter key="data_management" value="auto"/>
    <parameter key="select_attributes_and_weights" value="false"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
    <parameter key="mode" value="non letters"/>
    <parameter key="characters" value=".:"/>
    <parameter key="language" value="English"/>
    <parameter key="max_token_length" value="3"/>
    </operator>
    <operator activated="true" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="179" y="34">
    <parameter key="min_chars" value="2"/>
    <parameter key="max_chars" value="25"/>
    </operator>
    <operator activated="true" class="text:filter_stopwords_english" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="179" y="187"/>
    <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="7.5.000" expanded="true" height="82" name="Filter Stopwords (Dictionary)" width="90" x="380" y="136">
    <parameter key="file" value="C:\Users\svtorykh\Desktop\TAR\NLP Analysis\Adhoc-Extra\Mastery\Stopwords.txt"/>
    <parameter key="case_sensitive" value="false"/>
    <parameter key="encoding" value="SYSTEM"/>
    </operator>
    <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="34">
    <parameter key="transform_to" value="lower case"/>
    </operator>
    <operator activated="true" class="text:stem_porter" compatibility="7.5.000" expanded="true" height="68" name="Stem (Porter)" width="90" x="581" y="34"/>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
    <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
    <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
    <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_op="Stem (Porter)" to_port="document"/>
    <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="store" compatibility="7.6.001" expanded="true" height="68" name="Store" width="90" x="45" y="238">
    <parameter key="repository_entry" value="../data/Wordlist"/>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="85">
    <parameter key="attribute_filter_type" value="no_missing_values"/>
    <parameter key="attribute" value=""/>
    <parameter key="attributes" value=""/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="attribute_value"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="time"/>
    <parameter key="block_type" value="attribute_block"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="value_matrix_row_start"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    </operator>
    <operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Set Role" width="90" x="179" y="187">
    <parameter key="attribute_name" value="Theme"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="concurrency:cross_validation" compatibility="7.6.001" expanded="true" height="145" name="Cross Validation" width="90" x="313" y="85">
    <parameter key="split_on_batch_attribute" value="false"/>
    <parameter key="leave_one_out" value="false"/>
    <parameter key="number_of_folds" value="5"/>
    <parameter key="sampling_type" value="automatic"/>
    <parameter key="use_local_random_seed" value="false"/>
    <parameter key="local_random_seed" value="1992"/>
    <parameter key="enable_parallel_execution" value="true"/>
    <process expanded="true">
    <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="7.6.001" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="112" y="34">
    <parameter key="number_of_trees" value="20"/>
    <parameter key="reproducible" value="false"/>
    <parameter key="maximum_number_of_threads" value="4"/>
    <parameter key="use_local_random_seed" value="false"/>
    <parameter key="local_random_seed" value="1992"/>
    <parameter key="maximal_depth" value="5"/>
    <parameter key="min_rows" value="10.0"/>
    <parameter key="min_split_improvement" value="0.0"/>
    <parameter key="number_of_bins" value="20"/>
    <parameter key="learning_rate" value="0.1"/>
    <parameter key="sample_rate" value="1.0"/>
    <parameter key="distribution" value="AUTO"/>
    <parameter key="early_stopping" value="false"/>
    <parameter key="stopping_rounds" value="1"/>
    <parameter key="stopping_metric" value="AUTO"/>
    <parameter key="stopping_tolerance" value="0.001"/>
    <parameter key="max_runtime_seconds" value="0"/>
    <list key="expert_parameters"/>
    </operator>
    <connect from_port="training set" to_op="Gradient Boosted Trees" to_port="training set"/>
    <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/>
    <portSpacing port="source_training set" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
    </process>
    <process expanded="true">
    <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model" width="90" x="112" y="34">
    <list key="application_parameters"/>
    <parameter key="create_view" value="false"/>
    </operator>
    <operator activated="true" class="performance" compatibility="7.6.001" expanded="true" height="82" name="Performance" width="90" x="112" y="136">
    <parameter key="use_example_weights" value="true"/>
    </operator>
    <connect from_port="model" to_op="Apply Model" to_port="model"/>
    <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
    <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
    <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="sink_test set results" spacing="0"/>
    <portSpacing port="sink_performance 1" spacing="0"/>
    <portSpacing port="sink_performance 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="store" compatibility="7.6.001" expanded="true" height="68" name="Store (2)" width="90" x="447" y="34">
    <parameter key="repository_entry" value="../data/Model"/>
    </operator>
    <operator activated="false" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve Wordlist" width="90" x="45" y="340">
    <parameter key="repository_entry" value="../data/Wordlist"/>
    </operator>
    <operator activated="false" class="read_excel" compatibility="7.6.001" expanded="true" height="68" name="Read Excel (2)" width="90" x="179" y="391">
    <parameter key="excel_file" value="C:\Users\svtorykh\Desktop\TAR\NLP Analysis\Adhoc-Extra\Mastery\RapidMiner Text Tutorials\Q3 Real Deal Theme Assignment.xlsx"/>
    <parameter key="sheet_number" value="1"/>
    <parameter key="imported_cell_range" value="A1:A1034"/>
    <parameter key="encoding" value="SYSTEM"/>
    <parameter key="first_row_as_names" value="false"/>
    <list key="annotations">
    <parameter key="0" value="Name"/>
    </list>
    <parameter key="date_format" value=""/>
    <parameter key="time_zone" value="SYSTEM"/>
    <parameter key="locale" value="English (United States)"/>
    <list key="data_set_meta_data_information">
    <parameter key="0" value="Text.true.text.attribute"/>
    </list>
    <parameter key="read_not_matching_values_as_missings" value="true"/>
    <parameter key="datamanagement" value="double_array"/>
    <parameter key="data_management" value="auto"/>
    </operator>
    <operator activated="false" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="179" y="289">
    <parameter key="create_word_vector" value="true"/>
    <parameter key="vector_creation" value="TF-IDF"/>
    <parameter key="add_meta_information" value="true"/>
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_method" value="percentual"/>
    <parameter key="prune_below_percent" value="3.0"/>
    <parameter key="prune_above_percent" value="100.0"/>
    <parameter key="prune_below_rank" value="0.05"/>
    <parameter key="prune_above_rank" value="0.95"/>
    <parameter key="datamanagement" value="double_sparse_array"/>
    <parameter key="data_management" value="auto"/>
    <parameter key="select_attributes_and_weights" value="false"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34">
    <parameter key="mode" value="non letters"/>
    <parameter key="characters" value=".:"/>
    <parameter key="language" value="English"/>
    <parameter key="max_token_length" value="3"/>
    </operator>
    <operator activated="true" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (2)" width="90" x="179" y="34">
    <parameter key="min_chars" value="2"/>
    <parameter key="max_chars" value="25"/>
    </operator>
    <operator activated="true" class="text:filter_stopwords_english" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (2)" width="90" x="179" y="187"/>
    <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="7.5.000" expanded="true" height="82" name="Filter Stopwords (3)" width="90" x="380" y="136">
    <parameter key="file" value="C:\Users\svtorykh\Desktop\TAR\NLP Analysis\Adhoc-Extra\Mastery\Stopwords.txt"/>
    <parameter key="case_sensitive" value="false"/>
    <parameter key="encoding" value="SYSTEM"/>
    </operator>
    <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="447" y="34">
    <parameter key="transform_to" value="lower case"/>
    </operator>
    <operator activated="true" class="text:stem_porter" compatibility="7.5.000" expanded="true" height="68" name="Stem (2)" width="90" x="581" y="34"/>
    <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
    <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
    <connect from_op="Filter Tokens (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
    <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Stopwords (3)" to_port="document"/>
    <connect from_op="Filter Stopwords (3)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
    <connect from_op="Transform Cases (2)" from_port="document" to_op="Stem (2)" to_port="document"/>
    <connect from_op="Stem (2)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="false" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve Model" width="90" x="380" y="238">
    <parameter key="repository_entry" value="../data/Model"/>
    </operator>
    <operator activated="false" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="514" y="340">
    <list key="application_parameters"/>
    <parameter key="create_view" value="false"/>
    </operator>
    <connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Process Documents from Data" from_port="word list" to_op="Store" to_port="input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="Cross Validation" to_port="example set"/>
    <connect from_op="Cross Validation" from_port="model" to_op="Store (2)" to_port="input"/>
    <connect from_op="Cross Validation" from_port="example set" to_port="result 1"/>
    <connect from_op="Cross Validation" from_port="test result set" to_port="result 2"/>
    <connect from_op="Cross Validation" from_port="performance 1" to_port="result 3"/>
    <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
    <connect from_op="Retrieve Model" from_port="output" to_op="Apply Model (2)" to_port="model"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    </process>
    </operator>
    </process>

     

     

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager

    hmm - not 100% sure I understand but basically the data you use to create your training set model has to be robust enough so that it can handle any data types that comes from your test set.  Any unforeseen input types from your test set will naturally be classified poorly.  Perhaps this helps?

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="true" class="read_excel" compatibility="7.6.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
    <parameter key="excel_file" value="C:\Users\svtorykh\Desktop\TAR\NLP Analysis\Adhoc-Extra\Mastery\RapidMiner Text Tutorials\All Real Deal Theme Training.xlsx"/>
    <parameter key="imported_cell_range" value="A1:B11605"/>
    <parameter key="first_row_as_names" value="false"/>
    <list key="annotations">
    <parameter key="0" value="Name"/>
    </list>
    <list key="data_set_meta_data_information">
    <parameter key="0" value="Text.true.text.attribute"/>
    <parameter key="1" value="Theme.true.polynominal.attribute"/>
    </list>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="45" y="136">
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_method" value="absolute"/>
    <parameter key="prune_below_absolute" value="200"/>
    <parameter key="prune_above_absolute" value="12000"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
    <operator activated="true" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="179" y="34">
    <parameter key="min_chars" value="2"/>
    </operator>
    <operator activated="true" class="text:filter_stopwords_english" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="179" y="187"/>
    <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="7.5.000" expanded="true" height="82" name="Filter Stopwords (Dictionary)" width="90" x="380" y="136">
    <parameter key="file" value="C:\Users\svtorykh\Desktop\TAR\NLP Analysis\Adhoc-Extra\Mastery\Stopwords.txt"/>
    </operator>
    <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="34"/>
    <operator activated="true" class="text:stem_porter" compatibility="7.5.000" expanded="true" height="68" name="Stem (Porter)" width="90" x="581" y="34"/>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
    <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
    <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
    <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_op="Stem (Porter)" to_port="document"/>
    <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="store" compatibility="7.6.001" expanded="true" height="68" name="Store" width="90" x="45" y="238">
    <parameter key="repository_entry" value="../data/Wordlist"/>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="85">
    <parameter key="attribute_filter_type" value="no_missing_values"/>
    </operator>
    <operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Set Role" width="90" x="179" y="187">
    <parameter key="attribute_name" value="Theme"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="concurrency:cross_validation" compatibility="7.6.001" expanded="true" height="145" name="Cross Validation" width="90" x="313" y="85">
    <parameter key="number_of_folds" value="5"/>
    <process expanded="true">
    <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="7.6.001" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="112" y="34">
    <list key="expert_parameters"/>
    </operator>
    <connect from_port="training set" to_op="Gradient Boosted Trees" to_port="training set"/>
    <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/>
    <portSpacing port="source_training set" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
    </process>
    <process expanded="true">
    <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model" width="90" x="112" y="34">
    <list key="application_parameters"/>
    </operator>
    <operator activated="true" class="performance" compatibility="7.6.001" expanded="true" height="82" name="Performance" width="90" x="112" y="136"/>
    <connect from_port="model" to_op="Apply Model" to_port="model"/>
    <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
    <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
    <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="sink_test set results" spacing="0"/>
    <portSpacing port="sink_performance 1" spacing="0"/>
    <portSpacing port="sink_performance 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve Wordlist" width="90" x="45" y="340">
    <parameter key="repository_entry" value="../data/Wordlist"/>
    </operator>
    <operator activated="true" class="read_excel" compatibility="7.6.001" expanded="true" height="68" name="Read Excel (2)" width="90" x="45" y="493">
    <parameter key="excel_file" value="C:\Users\svtorykh\Desktop\TAR\NLP Analysis\Adhoc-Extra\Mastery\RapidMiner Text Tutorials\Q3 Real Deal Theme Assignment.xlsx"/>
    <parameter key="imported_cell_range" value="A1:A1034"/>
    <parameter key="first_row_as_names" value="false"/>
    <list key="annotations">
    <parameter key="0" value="Name"/>
    </list>
    <list key="data_set_meta_data_information">
    <parameter key="0" value="Text.true.text.attribute"/>
    </list>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="179" y="442">
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_method" value="percentual"/>
    <parameter key="prune_above_percent" value="100.0"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34"/>
    <operator activated="true" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (2)" width="90" x="179" y="34">
    <parameter key="min_chars" value="2"/>
    </operator>
    <operator activated="true" class="text:filter_stopwords_english" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (2)" width="90" x="179" y="187"/>
    <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="7.5.000" expanded="true" height="82" name="Filter Stopwords (3)" width="90" x="380" y="136">
    <parameter key="file" value="C:\Users\svtorykh\Desktop\TAR\NLP Analysis\Adhoc-Extra\Mastery\Stopwords.txt"/>
    </operator>
    <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="447" y="34"/>
    <operator activated="true" class="text:stem_porter" compatibility="7.5.000" expanded="true" height="68" name="Stem (2)" width="90" x="581" y="34"/>
    <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
    <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
    <connect from_op="Filter Tokens (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
    <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Stopwords (3)" to_port="document"/>
    <connect from_op="Filter Stopwords (3)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
    <connect from_op="Transform Cases (2)" from_port="document" to_op="Stem (2)" to_port="document"/>
    <connect from_op="Stem (2)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="514" y="340">
    <list key="application_parameters"/>
    </operator>
    <connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Process Documents from Data" from_port="word list" to_op="Store" to_port="input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="Cross Validation" to_port="example set"/>
    <connect from_op="Cross Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
    <connect from_op="Cross Validation" from_port="performance 1" to_port="result 3"/>
    <connect from_op="Retrieve Wordlist" from_port="output" to_op="Process Documents from Data (2)" to_port="word list"/>
    <connect from_op="Read Excel (2)" from_port="output" to_op="Process Documents from Data (2)" to_port="example set"/>
    <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
    <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
    <connect from_op="Apply Model (2)" from_port="model" to_port="result 2"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    </process>
    </operator>
    </process>
  • svtorykhsvtorykh Member Posts: 35 Guru

    Thanks for reply Scott.

    Let me rephrase the problem in business language.

     

    I have a set of text comments with 1 topic assigned for every comment in my data set.

    For example "I like doing my job" is tagged as Meaningful Work, "We have great innovative products" is tagged as Innovation. Comment is located in column A and Topic in column B. This data set is used as training data set to train the model. I can then take this model and assign same topics to comments that are not tagged. I.e. classify/categorize the text.

    Sometimes the comment may contain multiple topics, e.g. "I like my job as I'm continuously innovating every day". Ideally, I would tag this comment as both Meaningful Work and Innovation but second topic would need to come to column C. So my question is, how to build the model using training set with mutiple topics assigned to the same comment. Comment must stay as 1 row in excel with topics being added as columns. Is it clearer now?

     

    Regards,

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager

    Is this what you're talking about at the meetup next week, @IngoRM?  Or am I missing something obvious?

  • IngoRMIngoRM Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, Community Manager, RMResearcher, Member, University Professor Posts: 1,751 RM Founder

    Nope.  What we are looking for here is the operator "Generate Prediction Ranking".  This operator from the Scoring group can be used after the operator "Apply Model" to identify the k most likely classes.  Below is a simple example using the Iris data set which assigns the 2 most likely classes to each test example. This should do the trick.

     

    Of course you could follow this with an operator "Generate Attributes" so that you only keep one single class if the model is very confident (for example whenever "confidence_1" is higher than 90%, replace "class_2" by missing or something like that).

     

    Hope this helps,

    Ingo

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve Iris" width="90" x="45" y="136">
    <parameter key="repository_entry" value="//Samples/data/Iris"/>
    </operator>
    <operator activated="true" class="split_data" compatibility="7.6.001" expanded="true" height="103" name="Split Data" width="90" x="179" y="136">
    <enumeration key="partitions">
    <parameter key="ratio" value="0.7"/>
    <parameter key="ratio" value="0.3"/>
    </enumeration>
    </operator>
    <operator activated="true" class="naive_bayes" compatibility="7.6.001" expanded="true" height="82" name="Naive Bayes" width="90" x="313" y="34"/>
    <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model" width="90" x="447" y="85">
    <list key="application_parameters"/>
    </operator>
    <operator activated="true" class="generate_prediction_ranking" compatibility="7.6.001" expanded="true" height="82" name="Generate Prediction Ranking" width="90" x="581" y="85">
    <parameter key="number_of_ranks" value="2"/>
    </operator>
    <connect from_op="Retrieve Iris" from_port="output" to_op="Split Data" to_port="example set"/>
    <connect from_op="Split Data" from_port="partition 1" to_op="Naive Bayes" to_port="training set"/>
    <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model" to_port="unlabelled data"/>
    <connect from_op="Naive Bayes" from_port="model" to_op="Apply Model" to_port="model"/>
    <connect from_op="Apply Model" from_port="labelled data" to_op="Generate Prediction Ranking" to_port="example set input"/>
    <connect from_op="Generate Prediction Ranking" from_port="example set output" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>
  • svtorykhsvtorykh Member Posts: 35 Guru

    Thanks folks! Let me check it out!

Sign In or Register to comment.