Similarity of two nominal attributes

jhillerjhiller Member Posts: 12 Contributor II
edited November 2018 in Help

Dear all,

 

I want to compare two nominal product descriptions by using the "Data to Similarity" operator.

My dataset looks like this: id; ai_description_mod; gp_description_mod; [and some other uninteresting attributes...]

 

My idea was to "loop attributes" of the exampleset.

The subprocess of "loop attributes" should multiply the input. After forking the input, in one branch "descriptionA" is selected and renamed to "description", and in the other branch "descriptionB" is selected and renamed to "description". After that, the two example sets are put together by the "append" operator.

Then I continue as is done in this tutorial: http://vancouverdata.blogspot.de/2010/11/text-analytics-with-rapidminer-part-4.html using "Process documents from data", "Tokenize", "Transform Cases" and finally "Data to similarity".

 

Unfortunately, "Data to similarity" is computed over all examples, not only over the two descriptions with the same id that I wanted to compare. Later I just want to continue working with the similarity value.

 

This is my code:

 

          </operator>
<operator activated="true" class="loop_examples" compatibility="7.4.000" expanded="true" height="82" name="Loop Examples" width="90" x="1117" y="34">
<parameter key="iteration_macro" value="example"/>
<process expanded="true">
<operator activated="true" class="multiply" compatibility="7.4.000" expanded="true" height="103" name="Multiply" width="90" x="45" y="34"/>
<operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="selectAi_desc" width="90" x="179" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="ai_description_mod"/>
<parameter key="attributes" value=""/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="attribute_value"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="time"/>
<parameter key="block_type" value="attribute_block"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="value_matrix_row_start"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
</operator>
<operator activated="true" class="rename" compatibility="7.4.000" expanded="true" height="82" name="Rename" width="90" x="313" y="34">
<parameter key="old_name" value="ai_description_mod"/>
<parameter key="new_name" value="description"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="selectGp_desc" width="90" x="179" y="136">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="gp_description_mod"/>
<parameter key="attributes" value=""/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="attribute_value"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="time"/>
<parameter key="block_type" value="attribute_block"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="value_matrix_row_start"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
</operator>
<operator activated="true" class="rename" compatibility="7.4.000" expanded="true" height="82" name="Rename (2)" width="90" x="313" y="136">
<parameter key="old_name" value="gp_description_mod"/>
<parameter key="new_name" value="description"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="append" compatibility="7.4.000" expanded="true" height="103" name="Append" width="90" x="447" y="85">
<parameter key="datamanagement" value="double_array"/>
<parameter key="data_management" value="auto"/>
<parameter key="merge_type" value="all"/>
</operator>
<operator activated="true" breakpoints="after" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="85">
<parameter key="create_word_vector" value="true"/>
<parameter key="vector_creation" value="TF-IDF"/>
<parameter key="add_meta_information" value="true"/>
<parameter key="keep_text" value="false"/>
<parameter key="prune_method" value="none"/>
<parameter key="prune_below_percent" value="3.0"/>
<parameter key="prune_above_percent" value="30.0"/>
<parameter key="prune_below_rank" value="0.05"/>
<parameter key="prune_above_rank" value="0.95"/>
<parameter key="datamanagement" value="double_sparse_array"/>
<parameter key="select_attributes_and_weights" value="true"/>
<list key="specify_weights">
<parameter key="description" value="1.0"/>
</list>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
<parameter key="mode" value="non letters"/>
<parameter key="characters" value=".:"/>
<parameter key="language" value="English"/>
<parameter key="max_token_length" value="3"/>
</operator>
<operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34">
<parameter key="transform_to" value="lower case"/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" breakpoints="after" class="data_to_similarity" compatibility="7.4.000" expanded="true" height="82" name="Data to Similarity" width="90" x="715" y="85">
<parameter key="measure_types" value="NumericalMeasures"/>
<parameter key="mixed_measure" value="MixedEuclideanDistance"/>
<parameter key="nominal_measure" value="NominalDistance"/>
<parameter key="numerical_measure" value="CosineSimilarity"/>
<parameter key="divergence" value="GeneralizedIDivergence"/>
<parameter key="kernel_type" value="radial"/>
<parameter key="kernel_gamma" value="1.0"/>
<parameter key="kernel_sigma1" value="1.0"/>
<parameter key="kernel_sigma2" value="0.0"/>
<parameter key="kernel_sigma3" value="2.0"/>
<parameter key="kernel_degree" value="3.0"/>
<parameter key="kernel_shift" value="1.0"/>
<parameter key="kernel_a" value="1.0"/>
<parameter key="kernel_b" value="0.0"/>
</operator>
<connect from_port="example set" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="selectAi_desc" to_port="example set input"/>
<connect from_op="Multiply" from_port="output 2" to_op="selectGp_desc" to_port="example set input"/>
<connect from_op="selectAi_desc" from_port="example set output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Append" to_port="example set 1"/>
<connect from_op="selectGp_desc" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
<connect from_op="Rename (2)" from_port="example set output" to_op="Append" to_port="example set 2"/>
<connect from_op="Append" from_port="merged set" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Data to Similarity" to_port="example set"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
</process>
</operator>

 

 

Does anyone have an idea how to do that?

 

Yours

Johannes

Best Answer

  • Thomas_OttThomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn
    Solution Accepted

    Yes, you'd have to use macros and the generic Loop operator, with a Filter Examples inside it, for that.

     

    This feels a bit hackish, but I would go back and check whether this works for you, and maybe clean out the duplicates earlier with some logic.

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.5.000">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
    <!-- Load the input from the repository entry "../data/temp". -->
    <operator activated="true" class="retrieve" compatibility="7.5.000" expanded="true" height="68" name="Retrieve" width="90" x="45" y="34">
    <parameter key="repository_entry" value="../data/temp"/>
    </operator>
    <!-- Copy gp_description into a new attribute gp_description_mod; the Replace operators further down are configured on the *_mod attributes only.
         NOTE(review): the operator name is spelled "desription". The connect elements of this process refer to it by that exact spelling, so a rename has to be applied in both places. -->
    <operator activated="true" class="generate_copy" compatibility="7.5.000" expanded="true" height="82" name="Generate Copy gp_desription" width="90" x="179" y="34">
    <parameter key="attribute_name" value="gp_description"/>
    <parameter key="new_name" value="gp_description_mod"/>
    </operator>
    <!-- Copy ai_description into ai_description_mod.
         NOTE(review): the operator name says "ap_desription" although the attribute is ai_description; the connect elements use this exact name as well. -->
    <operator activated="true" class="generate_copy" compatibility="7.5.000" expanded="true" height="82" name="Generate Copy ap_desription" width="90" x="313" y="34">
    <parameter key="attribute_name" value="ai_description"/>
    <parameter key="new_name" value="ai_description_mod"/>
    </operator>
    <!-- Text clean-up chain, applied to ai_description_mod and gp_description_mod only (attribute_filter_type="subset").
         Pattern notes below assume Java regular expression syntax (TODO confirm for this operator). -->
    <!-- Despite its name, this pattern matches a run of 14 or more space characters, not tab characters.
         No replace_by is given, so the operator default applies (presumably the match is removed; confirm). -->
    <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteTabs" width="90" x="447" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
    <parameter key="replace_what" value="(\ ){14,}"/>
    </operator>
    <!-- Targets a line break at the very beginning of the value (\A = start of input, \R = any line break sequence).
         Again no replace_by is set, so the operator default applies. -->
    <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteStartingNL" width="90" x="581" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
    <parameter key="replace_what" value="\A\R"/>
    </operator>
    <!-- Replace every line break that is still present with a single space. -->
    <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="ReplNewLines" width="90" x="715" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
    <parameter key="replace_what" value="\R"/>
    <parameter key="replace_by" value=" "/>
    </operator>
    <!-- Collapse each run of two or more spaces into a single space. Runs after ReplNewLines, so spaces introduced there are collapsed too. -->
    <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteDoubleSpaces" width="90" x="849" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
    <parameter key="replace_what" value="(\ ){2,}"/>
    <parameter key="replace_by" value=" "/>
    </operator>
    <!-- Deactivated operator (activated="false"). No connect element of the enclosing process references "Loop Examples",
         so it is not wired into the data flow. Its subprocess contains the same chain that the active operators after it
         build at the top level: fork the input, select ai_description_mod and gp_description_mod separately, rename both
         to "description", append, process the text, and compute cosine similarity. -->
    <operator activated="false" class="loop_examples" compatibility="7.5.000" expanded="true" height="82" name="Loop Examples" width="90" x="983" y="34">
    <process expanded="true">
    <operator activated="true" class="multiply" compatibility="7.5.000" expanded="true" height="103" name="Multiply" width="90" x="45" y="34"/>
    <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectAi_desc" width="90" x="179" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="ai_description_mod"/>
    </operator>
    <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename" width="90" x="313" y="34">
    <parameter key="old_name" value="ai_description_mod"/>
    <parameter key="new_name" value="description"/>
    <list key="rename_additional_attributes"/>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectGp_desc" width="90" x="179" y="136">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="gp_description_mod"/>
    </operator>
    <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (2)" width="90" x="313" y="136">
    <parameter key="old_name" value="gp_description_mod"/>
    <parameter key="new_name" value="description"/>
    <list key="rename_additional_attributes"/>
    </operator>
    <operator activated="true" class="append" compatibility="7.5.000" expanded="true" height="103" name="Append" width="90" x="447" y="85"/>
    <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="85">
    <parameter key="select_attributes_and_weights" value="true"/>
    <list key="specify_weights">
    <parameter key="description" value="1.0"/>
    </list>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
    <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="data_to_similarity" compatibility="7.5.000" expanded="true" height="82" name="Data to Similarity" width="90" x="715" y="85">
    <parameter key="measure_types" value="NumericalMeasures"/>
    <parameter key="numerical_measure" value="CosineSimilarity"/>
    </operator>
    <connect from_port="example set" to_op="Multiply" to_port="input"/>
    <connect from_op="Multiply" from_port="output 1" to_op="selectAi_desc" to_port="example set input"/>
    <connect from_op="Multiply" from_port="output 2" to_op="selectGp_desc" to_port="example set input"/>
    <connect from_op="selectAi_desc" from_port="example set output" to_op="Rename" to_port="example set input"/>
    <connect from_op="Rename" from_port="example set output" to_op="Append" to_port="example set 1"/>
    <connect from_op="selectGp_desc" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
    <connect from_op="Rename (2)" from_port="example set output" to_op="Append" to_port="example set 2"/>
    <connect from_op="Append" from_port="merged set" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Data to Similarity" to_port="example set"/>
    <connect from_op="Data to Similarity" from_port="example set" to_port="example set"/>
    <portSpacing port="source_example set" spacing="0"/>
    <portSpacing port="sink_example set" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    </process>
    </operator>
    <!-- Fork the cleaned data into two branches (one per description attribute). -->
    <operator activated="true" class="multiply" compatibility="7.5.000" expanded="true" height="103" name="Multiply (2)" width="90" x="983" y="187"/>
    <!-- Branch fed from Multiply output 2: keep gp_description_mod and rename it to the shared name "description". -->
    <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectGp_desc (2)" width="90" x="1117" y="289">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="gp_description_mod"/>
    </operator>
    <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (4)" width="90" x="1251" y="289">
    <parameter key="old_name" value="gp_description_mod"/>
    <parameter key="new_name" value="description"/>
    <list key="rename_additional_attributes"/>
    </operator>
    <!-- Branch fed from Multiply output 1: keep ai_description_mod and rename it to "description" as well. -->
    <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectAi_desc (2)" width="90" x="1117" y="187">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="ai_description_mod"/>
    </operator>
    <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (3)" width="90" x="1251" y="187">
    <parameter key="old_name" value="ai_description_mod"/>
    <parameter key="new_name" value="description"/>
    <list key="rename_additional_attributes"/>
    </operator>
    <!-- Merge both branches into one example set. The identical attribute name is what makes the two sets appendable;
         per the connect elements the ai branch arrives on "example set 1" and the gp branch on "example set 2". -->
    <operator activated="true" class="append" compatibility="7.5.000" expanded="true" height="103" name="Append (2)" width="90" x="1385" y="238"/>
    <!-- Text processing of the appended set: only the "description" attribute is listed (weight 1.0).
         All vectorisation settings are left at operator defaults. The inner subprocess passes each document
         through Tokenize and then Transform Cases, both with default parameters. -->
    <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="1519" y="238">
    <parameter key="select_attributes_and_weights" value="true"/>
    <list key="specify_weights">
    <parameter key="description" value="1.0"/>
    </list>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34"/>
    <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34"/>
    <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
    <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
    <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <!-- Compute similarities on the processed data using the numerical measure CosineSimilarity. -->
    <operator activated="true" class="data_to_similarity" compatibility="7.5.000" expanded="true" height="82" name="Data to Similarity (2)" width="90" x="1653" y="238">
    <parameter key="measure_types" value="NumericalMeasures"/>
    <parameter key="numerical_measure" value="CosineSimilarity"/>
    </operator>
    <!-- Convert the similarity result back into an example set so it can be filtered.
         The Loop that follows reads attributes named FIRST_ID and SECOND_ID, presumably produced here (confirm). -->
    <operator activated="true" class="similarity_to_data" compatibility="7.5.000" expanded="true" height="82" name="Similarity to Data" width="90" x="1787" y="238"/>
    <!-- Define the macro "num", which the following Loop uses as its iteration count.
         macro_type is not set, so the operator default applies (presumably the number of examples; confirm). -->
    <operator activated="true" class="extract_macro" compatibility="7.5.000" expanded="true" height="68" name="Extract Macro (2)" width="90" x="1921" y="238">
    <parameter key="macro" value="num"/>
    <list key="additional_macros"/>
    </operator>
    <!-- Run the subprocess %{num} times ("num" is set by "Extract Macro (2)").
         %{iteration} is not defined anywhere in this process; presumably it is the Loop operator's built-in
         iteration macro (confirm that its start value matches the numbering expected by example_index). -->
    <operator activated="true" class="concurrency:loop" compatibility="7.5.000" expanded="true" height="82" name="Loop" width="90" x="2055" y="238">
    <parameter key="number_of_iterations" value="%{num}"/>
    <process expanded="true">
    <!-- Read the FIRST_ID value of the example at index %{iteration} into the macro "extract_id". -->
    <operator activated="true" class="extract_macro" compatibility="7.5.000" expanded="true" height="68" name="Extract Macro" width="90" x="112" y="34">
    <parameter key="macro" value="extract_id"/>
    <parameter key="macro_type" value="data_value"/>
    <parameter key="attribute_name" value="FIRST_ID"/>
    <parameter key="example_index" value="%{iteration}"/>
    <list key="additional_macros"/>
    </operator>
    <!-- Two filter conditions: FIRST_ID equals extract_id, and SECOND_ID equals extract_id.
         How the two entries are combined is not set here, so the operator default applies (presumably AND,
         i.e. only rows pairing an id with itself survive; confirm). -->
    <operator activated="true" class="filter_examples" compatibility="7.5.000" expanded="true" height="103" name="Filter Examples" width="90" x="246" y="34">
    <list key="filters_list">
    <parameter key="filters_entry_key" value="FIRST_ID.eq.%{extract_id}"/>
    <parameter key="filters_entry_key" value="SECOND_ID.eq.%{extract_id}"/>
    </list>
    </operator>
    <connect from_port="input 1" to_op="Extract Macro" to_port="example set"/>
    <connect from_op="Extract Macro" from_port="example set" to_op="Filter Examples" to_port="example set input"/>
    <connect from_op="Filter Examples" from_port="example set output" to_port="output 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="source_input 2" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    </process>
    </operator>
    <!-- Combine the filtered sets delivered on the Loop's "output 1" port into a single example set. -->
    <operator activated="true" class="append" compatibility="7.5.000" expanded="true" height="82" name="Append (3)" width="90" x="2189" y="238"/>
    <!-- Drop repeated rows from the combined set; the same id can be selected in more than one loop pass. -->
    <operator activated="true" class="remove_duplicates" compatibility="7.5.000" expanded="true" height="103" name="Remove Duplicates" width="90" x="2323" y="238"/>
    <!-- Top-level wiring. Linear chain: Retrieve, both Generate Copy operators, then the four Replace clean-up steps.
         The cleaned set is forked by "Multiply (2)" into the ai and gp branches, which are merged again by "Append (2)".
         After that: text processing, similarity computation, conversion to data, macro extraction, Loop, "Append (3)"
         and "Remove Duplicates", whose output is the only result delivered (port "result 1").
         Operators are referenced by their exact name attribute, including the misspelled "desription" names. -->
    <connect from_op="Retrieve" from_port="output" to_op="Generate Copy gp_desription" to_port="example set input"/>
    <connect from_op="Generate Copy gp_desription" from_port="example set output" to_op="Generate Copy ap_desription" to_port="example set input"/>
    <connect from_op="Generate Copy ap_desription" from_port="example set output" to_op="DeleteTabs" to_port="example set input"/>
    <connect from_op="DeleteTabs" from_port="example set output" to_op="DeleteStartingNL" to_port="example set input"/>
    <connect from_op="DeleteStartingNL" from_port="example set output" to_op="ReplNewLines" to_port="example set input"/>
    <connect from_op="ReplNewLines" from_port="example set output" to_op="DeleteDoubleSpaces" to_port="example set input"/>
    <connect from_op="DeleteDoubleSpaces" from_port="example set output" to_op="Multiply (2)" to_port="input"/>
    <connect from_op="Multiply (2)" from_port="output 1" to_op="selectAi_desc (2)" to_port="example set input"/>
    <connect from_op="Multiply (2)" from_port="output 2" to_op="selectGp_desc (2)" to_port="example set input"/>
    <connect from_op="selectGp_desc (2)" from_port="example set output" to_op="Rename (4)" to_port="example set input"/>
    <connect from_op="Rename (4)" from_port="example set output" to_op="Append (2)" to_port="example set 2"/>
    <connect from_op="selectAi_desc (2)" from_port="example set output" to_op="Rename (3)" to_port="example set input"/>
    <connect from_op="Rename (3)" from_port="example set output" to_op="Append (2)" to_port="example set 1"/>
    <connect from_op="Append (2)" from_port="merged set" to_op="Process Documents from Data (2)" to_port="example set"/>
    <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Data to Similarity (2)" to_port="example set"/>
    <connect from_op="Data to Similarity (2)" from_port="similarity" to_op="Similarity to Data" to_port="similarity"/>
    <connect from_op="Data to Similarity (2)" from_port="example set" to_op="Similarity to Data" to_port="exampleSet"/>
    <connect from_op="Similarity to Data" from_port="exampleSet" to_op="Extract Macro (2)" to_port="example set"/>
    <connect from_op="Extract Macro (2)" from_port="example set" to_op="Loop" to_port="input 1"/>
    <connect from_op="Loop" from_port="output 1" to_op="Append (3)" to_port="example set 1"/>
    <connect from_op="Append (3)" from_port="merged set" to_op="Remove Duplicates" to_port="example set input"/>
    <connect from_op="Remove Duplicates" from_port="example set output" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>

Answers

  • Thomas_OttThomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn

    The XML code you posted is invalid; try again or just export the process and attach that. Would you also attach a sample of the data?

  • jhillerjhiller Member Posts: 12 Contributor II

    This is the whole process:

    <?xml version="1.0" encoding="UTF-8"?><process version="7.4.000">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.4.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
    <operator activated="true" class="retrieve" compatibility="7.4.000" expanded="true" height="68" name="Retrieve" width="90" x="45" y="34">
    <parameter key="repository_entry" value="../data/temp"/>
    </operator>
    <operator activated="true" class="generate_copy" compatibility="7.4.000" expanded="true" height="82" name="Generate Copy gp_desription" width="90" x="179" y="34">
    <parameter key="attribute_name" value="gp_description"/>
    <parameter key="new_name" value="gp_description_mod"/>
    </operator>
    <operator activated="true" class="generate_copy" compatibility="7.4.000" expanded="true" height="82" name="Generate Copy ap_desription" width="90" x="313" y="34">
    <parameter key="attribute_name" value="ai_description"/>
    <parameter key="new_name" value="ai_description_mod"/>
    </operator>
    <operator activated="true" class="replace" compatibility="7.4.000" expanded="true" height="82" name="DeleteTabs" width="90" x="447" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attribute" value=""/>
    <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="nominal"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="file_path"/>
    <parameter key="block_type" value="single_value"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="single_value"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    <parameter key="replace_what" value="(\ ){14,}"/>
    </operator>
    <operator activated="true" class="replace" compatibility="7.4.000" expanded="true" height="82" name="DeleteStartingNL" width="90" x="581" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attribute" value=""/>
    <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="nominal"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="file_path"/>
    <parameter key="block_type" value="single_value"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="single_value"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    <parameter key="replace_what" value="\A\R"/>
    </operator>
    <operator activated="true" class="replace" compatibility="7.4.000" expanded="true" height="82" name="ReplNewLines" width="90" x="715" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attribute" value=""/>
    <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="nominal"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="file_path"/>
    <parameter key="block_type" value="single_value"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="single_value"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    <parameter key="replace_what" value="\R"/>
    <parameter key="replace_by" value=" "/>
    </operator>
    <operator activated="true" class="replace" compatibility="7.4.000" expanded="true" height="82" name="DeleteDoubleSpaces" width="90" x="849" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attribute" value=""/>
    <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="nominal"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="file_path"/>
    <parameter key="block_type" value="single_value"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="single_value"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    <parameter key="replace_what" value="(\ ){2,}"/>
    <parameter key="replace_by" value=" "/>
    </operator>
    <operator activated="true" class="loop_examples" compatibility="7.4.000" expanded="true" height="82" name="Loop Examples" width="90" x="983" y="34">
    <parameter key="iteration_macro" value="example"/>
    <process expanded="true">
    <operator activated="true" class="multiply" compatibility="7.4.000" expanded="true" height="103" name="Multiply" width="90" x="45" y="34"/>
    <operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="selectAi_desc" width="90" x="179" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="ai_description_mod"/>
    <parameter key="attributes" value=""/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="attribute_value"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="time"/>
    <parameter key="block_type" value="attribute_block"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="value_matrix_row_start"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    </operator>
    <operator activated="true" class="rename" compatibility="7.4.000" expanded="true" height="82" name="Rename" width="90" x="313" y="34">
    <parameter key="old_name" value="ai_description_mod"/>
    <parameter key="new_name" value="description"/>
    <list key="rename_additional_attributes"/>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="selectGp_desc" width="90" x="179" y="136">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="gp_description_mod"/>
    <parameter key="attributes" value=""/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="attribute_value"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="time"/>
    <parameter key="block_type" value="attribute_block"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="value_matrix_row_start"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    </operator>
    <operator activated="true" class="rename" compatibility="7.4.000" expanded="true" height="82" name="Rename (2)" width="90" x="313" y="136">
    <parameter key="old_name" value="gp_description_mod"/>
    <parameter key="new_name" value="description"/>
    <list key="rename_additional_attributes"/>
    </operator>
    <!-- Stacks the two renamed branches (ai first, gp second) into one example set
         whose text column is "description". -->
    <operator activated="true" class="append" compatibility="7.4.000" expanded="true"
              height="103" name="Append" width="90" x="447" y="85">
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
      <parameter key="merge_type" value="all"/>
    </operator>
    <!-- Turns the "description" text of every appended row into a TF-IDF word vector.
         Pruning is switched off (prune_method = none), so the prune_* thresholds below
         are presumably not applied. -->
    <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true"
              height="82" name="Process Documents from Data" width="90" x="581" y="85">
      <parameter key="create_word_vector" value="true"/>
      <parameter key="vector_creation" value="TF-IDF"/>
      <parameter key="add_meta_information" value="true"/>
      <parameter key="keep_text" value="false"/>
      <parameter key="prune_method" value="none"/>
      <parameter key="prune_below_percent" value="3.0"/>
      <parameter key="prune_above_percent" value="30.0"/>
      <parameter key="prune_below_rank" value="0.05"/>
      <parameter key="prune_above_rank" value="0.95"/>
      <parameter key="datamanagement" value="double_sparse_array"/>
      <parameter key="select_attributes_and_weights" value="true"/>
      <!-- Only the "description" attribute is processed, with weight 1.0. -->
      <list key="specify_weights">
        <parameter key="description" value="1.0"/>
      </list>
      <!-- Per-document pipeline: tokenize, then lower-case every token. -->
      <process expanded="true">
        <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true"
                  height="68" name="Tokenize" width="90" x="45" y="34">
          <parameter key="mode" value="non letters"/>
          <parameter key="characters" value=".:"/>
          <parameter key="language" value="English"/>
          <parameter key="max_token_length" value="3"/>
        </operator>
        <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true"
                  height="68" name="Transform Cases" width="90" x="179" y="34">
          <parameter key="transform_to" value="lower case"/>
        </operator>
        <connect from_port="document" to_op="Tokenize" to_port="document"/>
        <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
        <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
        <portSpacing port="source_document" spacing="0"/>
        <portSpacing port="sink_document 1" spacing="0"/>
        <portSpacing port="sink_document 2" spacing="0"/>
      </process>
    </operator>
    <!-- Computes similarities on the word vectors using a numerical measure,
         CosineSimilarity. The mixed/nominal/divergence/kernel settings are exported
         too but belong to measure types that are not selected here. -->
    <operator activated="true" class="data_to_similarity" compatibility="7.4.000" expanded="true"
              height="82" name="Data to Similarity" width="90" x="715" y="85">
      <parameter key="measure_types" value="NumericalMeasures"/>
      <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
      <parameter key="nominal_measure" value="NominalDistance"/>
      <parameter key="numerical_measure" value="CosineSimilarity"/>
      <parameter key="divergence" value="GeneralizedIDivergence"/>
      <parameter key="kernel_type" value="radial"/>
      <parameter key="kernel_gamma" value="1.0"/>
      <parameter key="kernel_sigma1" value="1.0"/>
      <parameter key="kernel_sigma2" value="0.0"/>
      <parameter key="kernel_sigma3" value="2.0"/>
      <parameter key="kernel_degree" value="3.0"/>
      <parameter key="kernel_shift" value="1.0"/>
      <parameter key="kernel_a" value="1.0"/>
      <parameter key="kernel_b" value="0.0"/>
    </operator>
    <connect from_port="example set" to_op="Multiply" to_port="input"/>
    <connect from_op="Multiply" from_port="output 1" to_op="selectAi_desc" to_port="example set input"/>
    <connect from_op="Multiply" from_port="output 2" to_op="selectGp_desc" to_port="example set input"/>
    <connect from_op="selectAi_desc" from_port="example set output" to_op="Rename" to_port="example set input"/>
    <connect from_op="Rename" from_port="example set output" to_op="Append" to_port="example set 1"/>
    <connect from_op="selectGp_desc" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
    <connect from_op="Rename (2)" from_port="example set output" to_op="Append" to_port="example set 2"/>
    <connect from_op="Append" from_port="merged set" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Data to Similarity" to_port="example set"/>
    <portSpacing port="source_example set" spacing="0"/>
    <portSpacing port="sink_example set" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    </process>
    </operator>
    <connect from_op="Retrieve" from_port="output" to_op="Generate Copy gp_desription" to_port="example set input"/>
    <connect from_op="Generate Copy gp_desription" from_port="example set output" to_op="Generate Copy ap_desription" to_port="example set input"/>
    <connect from_op="Generate Copy ap_desription" from_port="example set output" to_op="DeleteTabs" to_port="example set input"/>
    <connect from_op="DeleteTabs" from_port="example set output" to_op="DeleteStartingNL" to_port="example set input"/>
    <connect from_op="DeleteStartingNL" from_port="example set output" to_op="ReplNewLines" to_port="example set input"/>
    <connect from_op="ReplNewLines" from_port="example set output" to_op="DeleteDoubleSpaces" to_port="example set input"/>
    <connect from_op="DeleteDoubleSpaces" from_port="example set output" to_op="Loop Examples" to_port="example set"/>
    <connect from_op="Loop Examples" from_port="example set" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>

    For the exampleset see attached zip-file.

     

    Thanks a lot!

  • Thomas_OttThomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn

    Ok, so in the example you posted you have two rows that have an ID of 5 and 21. You want to compare the text-processed results of example row 5 and example row 21 via similarity, right?

  • jhillerjhiller Member Posts: 12 Contributor II

    Yes, that's right.

     

    Do you have an idea?

  • Thomas_OttThomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn

    I think what you are looking for is the Similarity to Data operator. It will give you a table of ID 5 vs ID 21 and so forth.

     

    Something like this?

     

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Cleans the two description columns, stacks them into one "description" column,
      builds word vectors, computes cosine similarity between all rows and converts
      the similarity measure into a table with Similarity to Data.
    -->
    <process version="7.5.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <operator activated="true" class="retrieve" compatibility="7.5.000" expanded="true" height="68" name="Retrieve" width="90" x="45" y="34">
            <parameter key="repository_entry" value="../data/temp"/>
          </operator>
          <!-- Create *_mod copies; all cleaning below is applied to the copies only. -->
          <operator activated="true" class="generate_copy" compatibility="7.5.000" expanded="true" height="82" name="Generate Copy gp_desription" width="90" x="179" y="34">
            <parameter key="attribute_name" value="gp_description"/>
            <parameter key="new_name" value="gp_description_mod"/>
          </operator>
          <operator activated="true" class="generate_copy" compatibility="7.5.000" expanded="true" height="82" name="Generate Copy ap_desription" width="90" x="313" y="34">
            <parameter key="attribute_name" value="ai_description"/>
            <parameter key="new_name" value="ai_description_mod"/>
          </operator>
          <!-- Matches runs of 14 or more spaces; no replace_by is given, so the match is
               presumably removed. -->
          <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteTabs" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
            <parameter key="replace_what" value="(\ ){14,}"/>
          </operator>
          <!-- Matches a line break at the very start of the text (again without replace_by). -->
          <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteStartingNL" width="90" x="581" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
            <parameter key="replace_what" value="\A\R"/>
          </operator>
          <!-- Every remaining line break becomes a single space. -->
          <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="ReplNewLines" width="90" x="715" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
            <parameter key="replace_what" value="\R"/>
            <parameter key="replace_by" value=" "/>
          </operator>
          <!-- Runs of two or more spaces collapse to one space. -->
          <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteDoubleSpaces" width="90" x="849" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
            <parameter key="replace_what" value="(\ ){2,}"/>
            <parameter key="replace_by" value=" "/>
          </operator>
          <!-- Deactivated (activated="false") and not referenced by any connect of the
               outer process: the per-example loop variant is kept for reference only. -->
          <operator activated="false" class="loop_examples" compatibility="7.5.000" expanded="true" height="82" name="Loop Examples" width="90" x="983" y="34">
            <process expanded="true">
              <operator activated="true" class="multiply" compatibility="7.5.000" expanded="true" height="103" name="Multiply" width="90" x="45" y="34"/>
              <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectAi_desc" width="90" x="179" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="ai_description_mod"/>
              </operator>
              <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename" width="90" x="313" y="34">
                <parameter key="old_name" value="ai_description_mod"/>
                <parameter key="new_name" value="description"/>
                <list key="rename_additional_attributes"/>
              </operator>
              <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectGp_desc" width="90" x="179" y="136">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="gp_description_mod"/>
              </operator>
              <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (2)" width="90" x="313" y="136">
                <parameter key="old_name" value="gp_description_mod"/>
                <parameter key="new_name" value="description"/>
                <list key="rename_additional_attributes"/>
              </operator>
              <operator activated="true" class="append" compatibility="7.5.000" expanded="true" height="103" name="Append" width="90" x="447" y="85"/>
              <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="85">
                <parameter key="select_attributes_and_weights" value="true"/>
                <list key="specify_weights">
                  <parameter key="description" value="1.0"/>
                </list>
                <process expanded="true">
                  <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
                  <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
                  <connect from_port="document" to_op="Tokenize" to_port="document"/>
                  <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
                  <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_document" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <operator activated="true" class="data_to_similarity" compatibility="7.5.000" expanded="true" height="82" name="Data to Similarity" width="90" x="715" y="85">
                <parameter key="measure_types" value="NumericalMeasures"/>
                <parameter key="numerical_measure" value="CosineSimilarity"/>
              </operator>
              <connect from_port="example set" to_op="Multiply" to_port="input"/>
              <connect from_op="Multiply" from_port="output 1" to_op="selectAi_desc" to_port="example set input"/>
              <connect from_op="Multiply" from_port="output 2" to_op="selectGp_desc" to_port="example set input"/>
              <connect from_op="selectAi_desc" from_port="example set output" to_op="Rename" to_port="example set input"/>
              <connect from_op="Rename" from_port="example set output" to_op="Append" to_port="example set 1"/>
              <connect from_op="selectGp_desc" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
              <connect from_op="Rename (2)" from_port="example set output" to_op="Append" to_port="example set 2"/>
              <connect from_op="Append" from_port="merged set" to_op="Process Documents from Data" to_port="example set"/>
              <connect from_op="Process Documents from Data" from_port="example set" to_op="Data to Similarity" to_port="example set"/>
              <connect from_op="Data to Similarity" from_port="example set" to_port="example set"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
            </process>
          </operator>
          <!-- Active path: the same select/rename/append/vectorise chain, applied once to
               the whole cleaned example set instead of once per example. -->
          <operator activated="true" class="multiply" compatibility="7.5.000" expanded="true" height="103" name="Multiply (2)" width="90" x="983" y="187"/>
          <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectAi_desc (2)" width="90" x="1117" y="187">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="ai_description_mod"/>
          </operator>
          <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (3)" width="90" x="1251" y="187">
            <parameter key="old_name" value="ai_description_mod"/>
            <parameter key="new_name" value="description"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectGp_desc (2)" width="90" x="1117" y="289">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="gp_description_mod"/>
          </operator>
          <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (4)" width="90" x="1251" y="289">
            <parameter key="old_name" value="gp_description_mod"/>
            <parameter key="new_name" value="description"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <operator activated="true" class="append" compatibility="7.5.000" expanded="true" height="103" name="Append (2)" width="90" x="1385" y="238"/>
          <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="1519" y="238">
            <parameter key="select_attributes_and_weights" value="true"/>
            <list key="specify_weights">
              <parameter key="description" value="1.0"/>
            </list>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34"/>
              <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34"/>
              <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
              <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="data_to_similarity" compatibility="7.5.000" expanded="true" height="82" name="Data to Similarity (2)" width="90" x="1653" y="238">
            <parameter key="measure_types" value="NumericalMeasures"/>
            <parameter key="numerical_measure" value="CosineSimilarity"/>
          </operator>
          <!-- Receives both the similarity measure and the example set it was built from,
               and delivers the table that becomes the process result. -->
          <operator activated="true" class="similarity_to_data" compatibility="7.5.000" expanded="true" height="82" name="Similarity to Data" width="90" x="1787" y="238"/>
          <connect from_op="Retrieve" from_port="output" to_op="Generate Copy gp_desription" to_port="example set input"/>
          <connect from_op="Generate Copy gp_desription" from_port="example set output" to_op="Generate Copy ap_desription" to_port="example set input"/>
          <connect from_op="Generate Copy ap_desription" from_port="example set output" to_op="DeleteTabs" to_port="example set input"/>
          <connect from_op="DeleteTabs" from_port="example set output" to_op="DeleteStartingNL" to_port="example set input"/>
          <connect from_op="DeleteStartingNL" from_port="example set output" to_op="ReplNewLines" to_port="example set input"/>
          <connect from_op="ReplNewLines" from_port="example set output" to_op="DeleteDoubleSpaces" to_port="example set input"/>
          <connect from_op="DeleteDoubleSpaces" from_port="example set output" to_op="Multiply (2)" to_port="input"/>
          <connect from_op="Multiply (2)" from_port="output 1" to_op="selectAi_desc (2)" to_port="example set input"/>
          <connect from_op="Multiply (2)" from_port="output 2" to_op="selectGp_desc (2)" to_port="example set input"/>
          <connect from_op="selectAi_desc (2)" from_port="example set output" to_op="Rename (3)" to_port="example set input"/>
          <connect from_op="Rename (3)" from_port="example set output" to_op="Append (2)" to_port="example set 1"/>
          <connect from_op="selectGp_desc (2)" from_port="example set output" to_op="Rename (4)" to_port="example set input"/>
          <connect from_op="Rename (4)" from_port="example set output" to_op="Append (2)" to_port="example set 2"/>
          <connect from_op="Append (2)" from_port="merged set" to_op="Process Documents from Data (2)" to_port="example set"/>
          <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Data to Similarity (2)" to_port="example set"/>
          <connect from_op="Data to Similarity (2)" from_port="similarity" to_op="Similarity to Data" to_port="similarity"/>
          <connect from_op="Data to Similarity (2)" from_port="example set" to_op="Similarity to Data" to_port="exampleSet"/>
          <connect from_op="Similarity to Data" from_port="exampleSet" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • jhillerjhiller Member Posts: 12 Contributor II

    Ok, that works, but I just need rows 2 and 5. Is it possible to check similarity only for examples with the same id?

     result.JPG

  • jhillerjhiller Member Posts: 12 Contributor II

    Thanks a lot! Your solution is working. I will keep trying to improve it and will post the result if I succeed.

  • Thomas_OttThomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn

    Great to hear that. The one thing you'll learn about RapidMiner is that it's super flexible and you can get really creative.  Good luck!

  • jhillerjhiller Member Posts: 12 Contributor II

    Hi,

    here is a much more efficient solution. @Thomas_Ott: What do you think about it? Can you explain to me why it is not possible to use the operator "Loop Examples"? I thought that this operator does what we do with the macro "setNum" and "loopNumExamples".

    <?xml version="1.0" encoding="UTF-8"?><process version="7.4.000">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.4.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
    <!-- Loads the example set stored at the relative repository path ../data/temp. -->
    <operator activated="true" class="retrieve" compatibility="7.4.000" expanded="true"
              height="68" name="Retrieve" width="90" x="45" y="34">
      <parameter key="repository_entry" value="../data/temp"/>
    </operator>
    <operator activated="true" class="subprocess" compatibility="7.4.000" expanded="true" height="82" name="getDescrSimilarity" width="90" x="179" y="34">
    <process expanded="true">
    <!-- Fans the input out three ways: output 1 feeds the left side of Join,
         output 2 the ai branch and output 3 the gp branch. -->
    <operator activated="true" class="multiply" compatibility="7.4.000" expanded="true"
              height="124" name="Multiply (2)" width="90" x="45" y="34"/>
    <!-- gp branch: keeps the single regular attribute gp_description.
         Only attribute_filter_type and attribute are set to select it; the remaining
         parameters are exported alongside them. -->
    <operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true"
              height="82" name="selectGp_descr" width="90" x="179" y="187">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="gp_description"/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="attribute_value"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="time"/>
      <parameter key="block_type" value="attribute_block"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_matrix_row_start"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
    </operator>
    <!-- gp branch: gives gp_description the shared column name "description"
         before the result is fed into appendDescr (port "example set 2"). -->
    <operator activated="true" class="rename" compatibility="7.4.000" expanded="true"
              height="82" name="renameGp_descr" width="90" x="313" y="187">
      <parameter key="old_name" value="gp_description"/>
      <parameter key="new_name" value="description"/>
      <list key="rename_additional_attributes"/>
    </operator>
    <!-- ai branch: keeps the single regular attribute ai_description.
         Only attribute_filter_type and attribute are set to select it; the remaining
         parameters are exported alongside them. -->
    <operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true"
              height="82" name="selectAi_descr" width="90" x="179" y="85">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="ai_description"/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="attribute_value"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="time"/>
      <parameter key="block_type" value="attribute_block"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_matrix_row_start"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
    </operator>
    <!-- ai branch: gives ai_description the shared column name "description"
         before the result is fed into appendDescr (port "example set 1"). -->
    <operator activated="true" class="rename" compatibility="7.4.000" expanded="true"
              height="82" name="renameAi_descr" width="90" x="313" y="85">
      <parameter key="old_name" value="ai_description"/>
      <parameter key="new_name" value="description"/>
      <list key="rename_additional_attributes"/>
    </operator>
    <!-- Stacks the ai rows (set 1) and the gp rows (set 2) into one example set
         whose text column is "description". -->
    <operator activated="true" class="append" compatibility="7.4.000" expanded="true"
              height="103" name="appendDescr" width="90" x="447" y="136">
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
      <parameter key="merge_type" value="all"/>
    </operator>
    <!-- Turns the "description" text of every appended row into a TF-IDF word vector.
         This runs once, before the loop, so the vectors are built over the complete
         appended set. Pruning is off (prune_method = none), so the prune_* thresholds
         are presumably not applied. -->
    <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true"
              height="82" name="tokenizeTransform" width="90" x="581" y="136">
      <parameter key="create_word_vector" value="true"/>
      <parameter key="vector_creation" value="TF-IDF"/>
      <parameter key="add_meta_information" value="true"/>
      <parameter key="keep_text" value="false"/>
      <parameter key="prune_method" value="none"/>
      <parameter key="prune_below_percent" value="3.0"/>
      <parameter key="prune_above_percent" value="30.0"/>
      <parameter key="prune_below_rank" value="0.05"/>
      <parameter key="prune_above_rank" value="0.95"/>
      <parameter key="datamanagement" value="double_sparse_array"/>
      <parameter key="select_attributes_and_weights" value="true"/>
      <!-- Only the "description" attribute is processed, with weight 1.0. -->
      <list key="specify_weights">
        <parameter key="description" value="1.0"/>
      </list>
      <!-- Per-document pipeline: tokenize, then lower-case every token. -->
      <process expanded="true">
        <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true"
                  height="68" name="Tokenize (2)" width="90" x="45" y="34">
          <parameter key="mode" value="non letters"/>
          <parameter key="characters" value=".:"/>
          <parameter key="language" value="English"/>
          <parameter key="max_token_length" value="3"/>
        </operator>
        <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true"
                  height="68" name="Transform Cases (2)" width="90" x="179" y="34">
          <parameter key="transform_to" value="lower case"/>
        </operator>
        <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
        <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
        <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
        <portSpacing port="source_document" spacing="0"/>
        <portSpacing port="sink_document 1" spacing="0"/>
        <portSpacing port="sink_document 2" spacing="0"/>
      </process>
    </operator>
    <!-- Stores the number of examples of the incoming set in the macro "num";
         loopNumExamples uses %{num} as its iteration count. -->
    <operator activated="true" class="extract_macro" compatibility="7.4.000" expanded="true"
              height="68" name="setNum" width="90" x="715" y="136">
      <parameter key="macro" value="num"/>
      <parameter key="macro_type" value="number_of_examples"/>
      <parameter key="statistics" value="average"/>
      <parameter key="attribute_name" value=""/>
      <list key="additional_macros"/>
    </operator>
    <!-- Runs the subprocess %{num} times. Each pass reads the id of the example at
         index %{iteration}, keeps only the examples carrying that id, and emits their
         cosine similarity as a long table.
         NOTE(review): the appended set presumably holds two rows per id, so each id
         would be processed twice; Remove Duplicates further downstream appears to be
         there to drop the repeated rows. Confirm against the data. -->
    <operator activated="true" class="concurrency:loop" compatibility="7.4.000" expanded="true"
              height="82" name="loopNumExamples" width="90" x="849" y="136">
      <parameter key="number_of_iterations" value="%{num}"/>
      <parameter key="iteration_macro" value="iteration"/>
      <parameter key="reuse_results" value="false"/>
      <parameter key="enable_parallel_execution" value="true"/>
      <process expanded="true">
        <!-- Step 1: copy the id value of the current example into macro extract_id. -->
        <operator activated="true" class="extract_macro" compatibility="7.4.000" expanded="true"
                  height="68" name="Extract Macro" width="90" x="112" y="34">
          <parameter key="macro" value="extract_id"/>
          <parameter key="macro_type" value="data_value"/>
          <parameter key="statistics" value="average"/>
          <parameter key="attribute_name" value="id"/>
          <parameter key="example_index" value="%{iteration}"/>
          <list key="additional_macros"/>
        </operator>
        <!-- Step 2: keep only the rows whose id equals that value. -->
        <operator activated="true" class="filter_examples" compatibility="7.4.000" expanded="true"
                  height="103" name="Filter Examples" width="90" x="246" y="34">
          <parameter key="parameter_expression" value=""/>
          <parameter key="condition_class" value="custom_filters"/>
          <parameter key="invert_filter" value="false"/>
          <list key="filters_list">
            <parameter key="filters_entry_key" value="id.eq.%{extract_id}"/>
          </list>
          <parameter key="filters_logic_and" value="true"/>
          <parameter key="filters_check_metadata" value="true"/>
        </operator>
        <!-- Step 3: cosine similarity on the word vectors of the remaining rows. -->
        <operator activated="true" class="data_to_similarity" compatibility="7.4.000" expanded="true"
                  height="82" name="getSimilarity" width="90" x="380" y="34">
          <parameter key="measure_types" value="NumericalMeasures"/>
          <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
          <parameter key="nominal_measure" value="NominalDistance"/>
          <parameter key="numerical_measure" value="CosineSimilarity"/>
          <parameter key="divergence" value="GeneralizedIDivergence"/>
          <parameter key="kernel_type" value="radial"/>
          <parameter key="kernel_gamma" value="1.0"/>
          <parameter key="kernel_sigma1" value="1.0"/>
          <parameter key="kernel_sigma2" value="0.0"/>
          <parameter key="kernel_sigma3" value="2.0"/>
          <parameter key="kernel_degree" value="3.0"/>
          <parameter key="kernel_shift" value="1.0"/>
          <parameter key="kernel_a" value="1.0"/>
          <parameter key="kernel_b" value="0.0"/>
        </operator>
        <!-- Step 4: convert the similarity measure into a table (long_table layout). -->
        <operator activated="true" class="similarity_to_data" compatibility="7.4.000" expanded="true"
                  height="82" name="transformSimilarity" width="90" x="514" y="34">
          <parameter key="table_type" value="long_table"/>
        </operator>
        <connect from_port="input 1" to_op="Extract Macro" to_port="example set"/>
        <connect from_op="Extract Macro" from_port="example set" to_op="Filter Examples" to_port="example set input"/>
        <connect from_op="Filter Examples" from_port="example set output" to_op="getSimilarity" to_port="example set"/>
        <connect from_op="getSimilarity" from_port="similarity" to_op="transformSimilarity" to_port="similarity"/>
        <connect from_op="getSimilarity" from_port="example set" to_op="transformSimilarity" to_port="exampleSet"/>
        <connect from_op="transformSimilarity" from_port="exampleSet" to_port="output 1"/>
        <portSpacing port="source_input 1" spacing="0"/>
        <portSpacing port="source_input 2" spacing="0"/>
        <portSpacing port="sink_output 1" spacing="0"/>
        <portSpacing port="sink_output 2" spacing="0"/>
      </process>
    </operator>
    <!-- Merges what loopNumExamples delivers on "output 1" into a single example set. -->
    <operator activated="true" class="append" compatibility="7.4.000" expanded="true"
              height="82" name="appendResults" width="90" x="983" y="136">
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
      <parameter key="merge_type" value="all"/>
    </operator>
    <!-- Drops repeated rows from the appended loop results; the attribute filter is
         "all", so the comparison is not restricted to a subset of attributes. -->
    <operator activated="true" class="remove_duplicates" compatibility="7.4.000" expanded="true"
              height="82" name="Remove Duplicates" width="90" x="1117" y="136">
      <parameter key="attribute_filter_type" value="all"/>
      <parameter key="attribute" value=""/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="attribute_value"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="time"/>
      <parameter key="block_type" value="attribute_block"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_matrix_row_start"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
      <parameter key="treat_missing_values_as_duplicates" value="false"/>
    </operator>
    <!-- Gives FIRST_ID the "id" role. Join is configured with
         use_id_attribute_as_key = true, so this is presumably what lets the similarity
         rows be matched back to the original examples. -->
    <operator activated="true" class="set_role" compatibility="7.4.000" expanded="true"
              height="82" name="Set Role" width="90" x="1251" y="136">
      <parameter key="attribute_name" value="FIRST_ID"/>
      <parameter key="target_role" value="id"/>
      <list key="set_additional_roles"/>
    </operator>
    <!-- Renames the SIMILARITY column to r_sim_descr. -->
    <operator activated="true" class="rename" compatibility="7.4.000" expanded="true"
              height="82" name="renameSIMILARITY" width="90" x="1385" y="136">
      <parameter key="old_name" value="SIMILARITY"/>
      <parameter key="new_name" value="r_sim_descr"/>
      <list key="rename_additional_attributes"/>
    </operator>
    <!-- Drops SECOND_ID: the filter selects that single attribute and
         invert_selection = true keeps everything except it.
         No breakpoint is set on this operator, so the process runs through without
         pausing here. -->
    <operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true"
              height="82" name="removeSecondID" width="90" x="1519" y="136">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="SECOND_ID"/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="attribute_value"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="time"/>
      <parameter key="block_type" value="attribute_block"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_matrix_row_start"/>
      <parameter key="invert_selection" value="true"/>
      <parameter key="include_special_attributes" value="false"/>
    </operator>
    <!-- Inner join keyed on the id attribute. The left input is the unmodified data
         from Multiply (2) output 1; the right input is presumably the similarity
         table produced by the chain above (that connection is not shown here). -->
    <operator activated="true" class="join" compatibility="7.4.000" expanded="true"
              height="82" name="Join" width="90" x="1653" y="34">
      <parameter key="remove_double_attributes" value="true"/>
      <parameter key="join_type" value="inner"/>
      <parameter key="use_id_attribute_as_key" value="true"/>
      <list key="key_attributes"/>
      <parameter key="keep_both_join_attributes" value="false"/>
    </operator>
    <!-- Wiring of this subprocess. The input "in 1" is fanned out by "Multiply (2)" into
         three copies. -->
    <connect from_port="in 1" to_op="Multiply (2)" to_port="input"/>
    <!-- Copy 1: passed straight to the left side of the final Join. -->
    <connect from_op="Multiply (2)" from_port="output 1" to_op="Join" to_port="left"/>
    <!-- Copies 2 and 3: each goes through its own select + rename pair
         (selectAi_descr/renameAi_descr and selectGp_descr/renameGp_descr) and the two
         results are stacked by "appendDescr" (Ai branch on port 1, Gp branch on port 2). -->
    <connect from_op="Multiply (2)" from_port="output 2" to_op="selectAi_descr" to_port="example set input"/>
    <connect from_op="Multiply (2)" from_port="output 3" to_op="selectGp_descr" to_port="example set input"/>
    <connect from_op="selectGp_descr" from_port="example set output" to_op="renameGp_descr" to_port="example set input"/>
    <connect from_op="renameGp_descr" from_port="example set output" to_op="appendDescr" to_port="example set 2"/>
    <connect from_op="selectAi_descr" from_port="example set output" to_op="renameAi_descr" to_port="example set input"/>
    <connect from_op="renameAi_descr" from_port="example set output" to_op="appendDescr" to_port="example set 1"/>
    <!-- The appended set then runs through a single chain:
         tokenizeTransform, setNum, loopNumExamples, appendResults, Remove Duplicates,
         Set Role, renameSIMILARITY, removeSecondID. -->
    <connect from_op="appendDescr" from_port="merged set" to_op="tokenizeTransform" to_port="example set"/>
    <connect from_op="tokenizeTransform" from_port="example set" to_op="setNum" to_port="example set"/>
    <connect from_op="setNum" from_port="example set" to_op="loopNumExamples" to_port="input 1"/>
    <connect from_op="loopNumExamples" from_port="output 1" to_op="appendResults" to_port="example set 1"/>
    <connect from_op="appendResults" from_port="merged set" to_op="Remove Duplicates" to_port="example set input"/>
    <connect from_op="Remove Duplicates" from_port="example set output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="renameSIMILARITY" to_port="example set input"/>
    <connect from_op="renameSIMILARITY" from_port="example set output" to_op="removeSecondID" to_port="example set input"/>
    <!-- The end of that chain feeds the right side of the Join, whose result is the
         subprocess output "out 1". -->
    <connect from_op="removeSecondID" from_port="example set output" to_op="Join" to_port="right"/>
    <connect from_op="Join" from_port="join" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="source_in 2" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <!-- Outer wiring: the output of "Retrieve" is fed into "getDescrSimilarity" on "in 1",
         and its "out 1" is delivered as the process result "result 1". -->
    <connect from_op="Retrieve" from_port="output" to_op="getDescrSimilarity" to_port="in 1"/>
    <connect from_op="getDescrSimilarity" from_port="out 1" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>

    Yours

    Johannes

  • Thomas_OttThomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn

    I think you could use Loop Examples, but I usually defer to the generic Loop operator.

Sign In or Register to comment.