RapidMiner

Contributor II jhiller
Contributor II

Similarity of two nominal attributes

Dear all,

 

I want to compare two nominal product descriptions using the "Data to Similarity" operator.

My dataset looks like this: id; ai_description_mod; gp_description_mod; [and some other uninteresting attributes...]

 

My idea was to "loop attributes" of the exampleset.

The subprocess of "loop attributes" should multiply the input. After forking the input, "descriptionA" is selected and renamed to "description" in one branch, and "descriptionB" is selected and renamed to "description" in the other. After that, the two example sets are put together by the "Append" operator.

Then I continue as in this tutorial: http://vancouverdata.blogspot.de/2010/11/text-analytics-with-rapidminer-part-4.html using "Process Documents from Data", "Tokenize", "Transform Cases" and finally "Data to Similarity".

 

Unfortunately, "Data to Similarity" is computed over all examples, not only over the two descriptions with the same id that I wanted to compare. Later I just want to continue working with the similarity value.

 

This is my code:

 

          </operator>
          <operator activated="true" class="loop_examples" compatibility="7.4.000" expanded="true" height="82" name="Loop Examples" width="90" x="1117" y="34">
            <!-- Runs the subprocess once per example of the incoming ExampleSet.
                 The macro named by iteration_macro ("example") carries the index
                 of the current example and is used below to isolate that row. -->
            <parameter key="iteration_macro" value="example"/>
            <process expanded="true">
              <!-- FIX: the loop hands the complete ExampleSet to the subprocess on
                   every iteration, so the similarity was computed over all examples.
                   Restrict the data to the current example before forking it. -->
              <operator activated="true" class="filter_example_range" compatibility="7.4.000" expanded="true" height="82" name="Filter Current Example" width="90" x="45" y="34">
                <parameter key="first_example" value="%{example}"/>
                <parameter key="last_example" value="%{example}"/>
                <parameter key="invert_filter" value="false"/>
              </operator>
              <!-- Fork the single-row set: one copy per description attribute. -->
              <operator activated="true" class="multiply" compatibility="7.4.000" expanded="true" height="103" name="Multiply" width="90" x="179" y="34"/>
              <!-- Branch 1: keep only ai_description_mod and rename it to "description". -->
              <operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="selectAi_desc" width="90" x="313" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="ai_description_mod"/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="attribute_value"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="time"/>
                <parameter key="block_type" value="attribute_block"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="value_matrix_row_start"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
              </operator>
              <operator activated="true" class="rename" compatibility="7.4.000" expanded="true" height="82" name="Rename" width="90" x="447" y="34">
                <parameter key="old_name" value="ai_description_mod"/>
                <parameter key="new_name" value="description"/>
                <list key="rename_additional_attributes"/>
              </operator>
              <!-- Branch 2: keep only gp_description_mod and rename it to "description". -->
              <operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="selectGp_desc" width="90" x="313" y="136">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="gp_description_mod"/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="attribute_value"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="time"/>
                <parameter key="block_type" value="attribute_block"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="value_matrix_row_start"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
              </operator>
              <operator activated="true" class="rename" compatibility="7.4.000" expanded="true" height="82" name="Rename (2)" width="90" x="447" y="136">
                <parameter key="old_name" value="gp_description_mod"/>
                <parameter key="new_name" value="description"/>
                <list key="rename_additional_attributes"/>
              </operator>
              <!-- Stack both branches: the result has exactly two rows, one per description. -->
              <operator activated="true" class="append" compatibility="7.4.000" expanded="true" height="103" name="Append" width="90" x="581" y="85">
                <parameter key="datamanagement" value="double_array"/>
                <parameter key="data_management" value="auto"/>
                <parameter key="merge_type" value="all"/>
              </operator>
              <operator activated="true" breakpoints="after" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="715" y="85">
                <parameter key="create_word_vector" value="true"/>
                <!-- FIX: was "TF-IDF". With only two documents per iteration, a token
                     that occurs in both texts has document frequency 2 of 2, so its
                     IDF factor log(2/2) is 0 and the shared tokens would not
                     contribute to the cosine similarity. Plain term frequency
                     keeps them. -->
                <parameter key="vector_creation" value="Term Frequency"/>
                <parameter key="add_meta_information" value="true"/>
                <parameter key="keep_text" value="false"/>
                <parameter key="prune_method" value="none"/>
                <parameter key="prune_below_percent" value="3.0"/>
                <parameter key="prune_above_percent" value="30.0"/>
                <parameter key="prune_below_rank" value="0.05"/>
                <parameter key="prune_above_rank" value="0.95"/>
                <parameter key="datamanagement" value="double_sparse_array"/>
                <parameter key="select_attributes_and_weights" value="true"/>
                <list key="specify_weights">
                  <parameter key="description" value="1.0"/>
                </list>
                <process expanded="true">
                  <!-- Per document: split on non-letter characters, then lower-case the tokens. -->
                  <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
                    <parameter key="mode" value="non letters"/>
                    <parameter key="characters" value=".:"/>
                    <parameter key="language" value="English"/>
                    <parameter key="max_token_length" value="3"/>
                  </operator>
                  <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34">
                    <parameter key="transform_to" value="lower case"/>
                  </operator>
                  <connect from_port="document" to_op="Tokenize" to_port="document"/>
                  <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
                  <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_document" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <!-- Cosine similarity between the two word vectors of the current example. -->
              <operator activated="true" breakpoints="after" class="data_to_similarity" compatibility="7.4.000" expanded="true" height="82" name="Data to Similarity" width="90" x="849" y="85">
                <parameter key="measure_types" value="NumericalMeasures"/>
                <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
                <parameter key="nominal_measure" value="NominalDistance"/>
                <parameter key="numerical_measure" value="CosineSimilarity"/>
                <parameter key="divergence" value="GeneralizedIDivergence"/>
                <parameter key="kernel_type" value="radial"/>
                <parameter key="kernel_gamma" value="1.0"/>
                <parameter key="kernel_sigma1" value="1.0"/>
                <parameter key="kernel_sigma2" value="0.0"/>
                <parameter key="kernel_sigma3" value="2.0"/>
                <parameter key="kernel_degree" value="3.0"/>
                <parameter key="kernel_shift" value="1.0"/>
                <parameter key="kernel_a" value="1.0"/>
                <parameter key="kernel_b" value="0.0"/>
              </operator>
              <connect from_port="example set" to_op="Filter Current Example" to_port="example set input"/>
              <connect from_op="Filter Current Example" from_port="example set output" to_op="Multiply" to_port="input"/>
              <connect from_op="Multiply" from_port="output 1" to_op="selectAi_desc" to_port="example set input"/>
              <connect from_op="Multiply" from_port="output 2" to_op="selectGp_desc" to_port="example set input"/>
              <connect from_op="selectAi_desc" from_port="example set output" to_op="Rename" to_port="example set input"/>
              <connect from_op="Rename" from_port="example set output" to_op="Append" to_port="example set 1"/>
              <connect from_op="selectGp_desc" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
              <connect from_op="Rename (2)" from_port="example set output" to_op="Append" to_port="example set 2"/>
              <connect from_op="Append" from_port="merged set" to_op="Process Documents from Data" to_port="example set"/>
              <connect from_op="Process Documents from Data" from_port="example set" to_op="Data to Similarity" to_port="example set"/>
              <!-- FIX: the similarity result was not connected to anything. Deliver it
                   at "output 1" so the loop collects one result per example. The inner
                   "example set" sink is deliberately left unconnected so that the
                   filtered single-row set is not fed into the next iteration. -->
              <connect from_op="Data to Similarity" from_port="similarity" to_port="output 1"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>

 

 

Has anyone an idea how to do that?

 

Yours

Johannes

11 REPLIES
RM Certified Expert
RM Certified Expert

Re: Similarity of two nominal attributes

The XML code you posted is invalid; try again, or just export the process and attach that. Would you also attach a sample of the data?

Contributor II jhiller
Contributor II

Re: Similarity of two nominal attributes

This is the whole process:

<?xml version="1.0" encoding="UTF-8"?><process version="7.4.000">
  <!-- Computes, for every example, the cosine similarity between its two product
       descriptions (ai_description and gp_description).
       Steps: retrieve data, copy both description attributes, normalise
       whitespace in the copies, then loop over the examples and compare the two
       texts of each single example. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.4.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="7.4.000" expanded="true" height="68" name="Retrieve" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/temp"/>
      </operator>
      <!-- Work on copies (*_mod) so the original description attributes stay untouched.
           Operator names corrected: "desription" typo, and "ap" for the operator
           that copies ai_description. -->
      <operator activated="true" class="generate_copy" compatibility="7.4.000" expanded="true" height="82" name="Generate Copy gp_description" width="90" x="179" y="34">
        <parameter key="attribute_name" value="gp_description"/>
        <parameter key="new_name" value="gp_description_mod"/>
      </operator>
      <operator activated="true" class="generate_copy" compatibility="7.4.000" expanded="true" height="82" name="Generate Copy ai_description" width="90" x="313" y="34">
        <parameter key="attribute_name" value="ai_description"/>
        <parameter key="new_name" value="ai_description_mod"/>
      </operator>
      <!-- Whitespace clean-up on both copies. This operator matches runs of 14 or
           more spaces; no replace_by is given. -->
      <operator activated="true" class="replace" compatibility="7.4.000" expanded="true" height="82" name="DeleteTabs" width="90" x="447" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="replace_what" value="(\ ){14,}"/>
      </operator>
      <!-- Matches a line break at the very start of the value; no replace_by is given. -->
      <operator activated="true" class="replace" compatibility="7.4.000" expanded="true" height="82" name="DeleteStartingNL" width="90" x="581" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="replace_what" value="\A\R"/>
      </operator>
      <!-- Replaces every remaining line break by a single space. -->
      <operator activated="true" class="replace" compatibility="7.4.000" expanded="true" height="82" name="ReplNewLines" width="90" x="715" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="replace_what" value="\R"/>
        <parameter key="replace_by" value=" "/>
      </operator>
      <!-- Collapses runs of two or more spaces into one space. -->
      <operator activated="true" class="replace" compatibility="7.4.000" expanded="true" height="82" name="DeleteDoubleSpaces" width="90" x="849" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="replace_what" value="(\ ){2,}"/>
        <parameter key="replace_by" value=" "/>
      </operator>
      <!-- Runs the subprocess once per example. The macro named by iteration_macro
           ("example") carries the index of the current example. -->
      <operator activated="true" class="loop_examples" compatibility="7.4.000" expanded="true" height="82" name="Loop Examples" width="90" x="983" y="34">
        <parameter key="iteration_macro" value="example"/>
        <process expanded="true">
          <!-- FIX: the loop hands the complete ExampleSet to the subprocess on every
               iteration, so the similarity was computed over all examples.
               Restrict the data to the current example before forking it. -->
          <operator activated="true" class="filter_example_range" compatibility="7.4.000" expanded="true" height="82" name="Filter Current Example" width="90" x="45" y="34">
            <parameter key="first_example" value="%{example}"/>
            <parameter key="last_example" value="%{example}"/>
            <parameter key="invert_filter" value="false"/>
          </operator>
          <!-- Fork the single-row set: one copy per description attribute. -->
          <operator activated="true" class="multiply" compatibility="7.4.000" expanded="true" height="103" name="Multiply" width="90" x="179" y="34"/>
          <!-- Branch 1: keep only ai_description_mod and rename it to "description". -->
          <operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="selectAi_desc" width="90" x="313" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="ai_description_mod"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <operator activated="true" class="rename" compatibility="7.4.000" expanded="true" height="82" name="Rename" width="90" x="447" y="34">
            <parameter key="old_name" value="ai_description_mod"/>
            <parameter key="new_name" value="description"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!-- Branch 2: keep only gp_description_mod and rename it to "description". -->
          <operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="selectGp_desc" width="90" x="313" y="136">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="gp_description_mod"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <operator activated="true" class="rename" compatibility="7.4.000" expanded="true" height="82" name="Rename (2)" width="90" x="447" y="136">
            <parameter key="old_name" value="gp_description_mod"/>
            <parameter key="new_name" value="description"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!-- Stack both branches: the result has exactly two rows, one per description. -->
          <operator activated="true" class="append" compatibility="7.4.000" expanded="true" height="103" name="Append" width="90" x="581" y="85">
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="merge_type" value="all"/>
          </operator>
          <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="715" y="85">
            <parameter key="create_word_vector" value="true"/>
            <!-- FIX: was "TF-IDF". With only two documents per iteration, a token
                 that occurs in both texts has document frequency 2 of 2, so its IDF
                 factor log(2/2) is 0 and the shared tokens would not contribute to
                 the cosine similarity. Plain term frequency keeps them. -->
            <parameter key="vector_creation" value="Term Frequency"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="none"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="select_attributes_and_weights" value="true"/>
            <list key="specify_weights">
              <parameter key="description" value="1.0"/>
            </list>
            <process expanded="true">
              <!-- Per document: split on non-letter characters, then lower-case the tokens. -->
              <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Cosine similarity between the two word vectors of the current example. -->
          <operator activated="true" class="data_to_similarity" compatibility="7.4.000" expanded="true" height="82" name="Data to Similarity" width="90" x="849" y="85">
            <parameter key="measure_types" value="NumericalMeasures"/>
            <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
            <parameter key="nominal_measure" value="NominalDistance"/>
            <parameter key="numerical_measure" value="CosineSimilarity"/>
            <parameter key="divergence" value="GeneralizedIDivergence"/>
            <parameter key="kernel_type" value="radial"/>
            <parameter key="kernel_gamma" value="1.0"/>
            <parameter key="kernel_sigma1" value="1.0"/>
            <parameter key="kernel_sigma2" value="0.0"/>
            <parameter key="kernel_sigma3" value="2.0"/>
            <parameter key="kernel_degree" value="3.0"/>
            <parameter key="kernel_shift" value="1.0"/>
            <parameter key="kernel_a" value="1.0"/>
            <parameter key="kernel_b" value="0.0"/>
          </operator>
          <connect from_port="example set" to_op="Filter Current Example" to_port="example set input"/>
          <connect from_op="Filter Current Example" from_port="example set output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="selectAi_desc" to_port="example set input"/>
          <connect from_op="Multiply" from_port="output 2" to_op="selectGp_desc" to_port="example set input"/>
          <connect from_op="selectAi_desc" from_port="example set output" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_op="Append" to_port="example set 1"/>
          <connect from_op="selectGp_desc" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
          <connect from_op="Rename (2)" from_port="example set output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Append" from_port="merged set" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Data to Similarity" to_port="example set"/>
          <!-- FIX: the similarity result was not connected to anything. Deliver it at
               "output 1" so the loop collects one result per example. The inner
               "example set" sink is deliberately left unconnected so that the
               filtered single-row set is not fed into the next iteration. -->
          <connect from_op="Data to Similarity" from_port="similarity" to_port="output 1"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="sink_example set" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Retrieve" from_port="output" to_op="Generate Copy gp_description" to_port="example set input"/>
      <connect from_op="Generate Copy gp_description" from_port="example set output" to_op="Generate Copy ai_description" to_port="example set input"/>
      <connect from_op="Generate Copy ai_description" from_port="example set output" to_op="DeleteTabs" to_port="example set input"/>
      <connect from_op="DeleteTabs" from_port="example set output" to_op="DeleteStartingNL" to_port="example set input"/>
      <connect from_op="DeleteStartingNL" from_port="example set output" to_op="ReplNewLines" to_port="example set input"/>
      <connect from_op="ReplNewLines" from_port="example set output" to_op="DeleteDoubleSpaces" to_port="example set input"/>
      <connect from_op="DeleteDoubleSpaces" from_port="example set output" to_op="Loop Examples" to_port="example set"/>
      <connect from_op="Loop Examples" from_port="example set" to_port="result 1"/>
      <!-- Per-example similarity results collected by the loop. -->
      <connect from_op="Loop Examples" from_port="output 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

For the exampleset see attached zip-file.

 

Thanks a lot!

RM Certified Expert
RM Certified Expert

Re: Similarity of two nominal attributes

Ok, so in the example you posted you have two rows that have an ID of 5 and 21. You want to compare the text-processed results of example row 5 and example row 21 via similarity, right?

Contributor II jhiller
Contributor II

Re: Similarity of two nominal attributes

Yes, that's right.

 

Do you have an idea?

RM Certified Expert
RM Certified Expert

Re: Similarity of two nominal attributes

I think what you are looking for is the Similarity to Data operator. It will give you a table of ID 5 vs ID 21 and so forth.

 

Something like this?

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.5.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Preprocessing chain: load the data, copy both description attributes,
           then clean up whitespace in the copies. -->
      <!-- Loads the repository entry "../data/temp". -->
      <operator activated="true" class="retrieve" compatibility="7.5.000" expanded="true" height="68" name="Retrieve" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/temp"/>
      </operator>
      <!-- Copies gp_description into a new attribute gp_description_mod. -->
      <operator activated="true" class="generate_copy" compatibility="7.5.000" expanded="true" height="82" name="Generate Copy gp_desription" width="90" x="179" y="34">
        <parameter key="attribute_name" value="gp_description"/>
        <parameter key="new_name" value="gp_description_mod"/>
      </operator>
      <!-- Copies ai_description into a new attribute ai_description_mod
           (the operator name says "ap", the parameters say "ai"). -->
      <operator activated="true" class="generate_copy" compatibility="7.5.000" expanded="true" height="82" name="Generate Copy ap_desription" width="90" x="313" y="34">
        <parameter key="attribute_name" value="ai_description"/>
        <parameter key="new_name" value="ai_description_mod"/>
      </operator>
      <!-- Matches runs of 14 or more spaces in both *_mod attributes. No replace_by
           is set; presumably the match is replaced by the operator's default. -->
      <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteTabs" width="90" x="447" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="replace_what" value="(\ ){14,}"/>
      </operator>
      <!-- Matches a line break at the very beginning of the value. No replace_by is set. -->
      <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteStartingNL" width="90" x="581" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="replace_what" value="\A\R"/>
      </operator>
      <!-- Replaces every line break by a single space. -->
      <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="ReplNewLines" width="90" x="715" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="replace_what" value="\R"/>
        <parameter key="replace_by" value=" "/>
      </operator>
      <!-- Collapses runs of two or more spaces into a single space. -->
      <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteDoubleSpaces" width="90" x="849" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="replace_what" value="(\ ){2,}"/>
        <parameter key="replace_by" value=" "/>
      </operator>
      <!-- This Loop Examples operator is deactivated (activated="false"), so it is
           kept in the process but marked inactive. Its subprocess forks the
           incoming set, selects one description attribute per branch, renames both
           to "description", appends the branches, vectorises the text and computes
           a cosine similarity. -->
      <operator activated="false" class="loop_examples" compatibility="7.5.000" expanded="true" height="82" name="Loop Examples" width="90" x="983" y="34">
        <process expanded="true">
          <operator activated="true" class="multiply" compatibility="7.5.000" expanded="true" height="103" name="Multiply" width="90" x="45" y="34"/>
          <!-- Branch 1: keep ai_description_mod only, then rename it to "description". -->
          <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectAi_desc" width="90" x="179" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="ai_description_mod"/>
          </operator>
          <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename" width="90" x="313" y="34">
            <parameter key="old_name" value="ai_description_mod"/>
            <parameter key="new_name" value="description"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!-- Branch 2: keep gp_description_mod only, then rename it to "description". -->
          <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectGp_desc" width="90" x="179" y="136">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="gp_description_mod"/>
          </operator>
          <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (2)" width="90" x="313" y="136">
            <parameter key="old_name" value="gp_description_mod"/>
            <parameter key="new_name" value="description"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!-- Stacks the two renamed branches into one set. -->
          <operator activated="true" class="append" compatibility="7.5.000" expanded="true" height="103" name="Append" width="90" x="447" y="85"/>
          <!-- Builds word vectors from the "description" attribute (weight 1.0);
               the inner process tokenizes and then transforms the case of each document. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="85">
            <parameter key="select_attributes_and_weights" value="true"/>
            <list key="specify_weights">
              <parameter key="description" value="1.0"/>
            </list>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
              <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Numerical measure: cosine similarity over the word vectors. -->
          <operator activated="true" class="data_to_similarity" compatibility="7.5.000" expanded="true" height="82" name="Data to Similarity" width="90" x="715" y="85">
            <parameter key="measure_types" value="NumericalMeasures"/>
            <parameter key="numerical_measure" value="CosineSimilarity"/>
          </operator>
          <connect from_port="example set" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="selectAi_desc" to_port="example set input"/>
          <connect from_op="Multiply" from_port="output 2" to_op="selectGp_desc" to_port="example set input"/>
          <connect from_op="selectAi_desc" from_port="example set output" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_op="Append" to_port="example set 1"/>
          <connect from_op="selectGp_desc" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
          <connect from_op="Rename (2)" from_port="example set output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Append" from_port="merged set" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Data to Similarity" to_port="example set"/>
          <!-- Only the "example set" port of Data to Similarity is wired to the loop's
               "example set" sink; its similarity result is not connected here. -->
          <connect from_op="Data to Similarity" from_port="example set" to_port="example set"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="sink_example set" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
        </process>
      </operator>
      <!-- Fork the cleaned data into two branches, one per description attribute. -->
      <operator activated="true" class="multiply" compatibility="7.5.000" expanded="true" height="103" name="Multiply (2)" width="90" x="983" y="187"/>
      <!-- Branch 1: select the single attribute ai_description_mod and rename it to "description". -->
      <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectAi_desc (2)" width="90" x="1117" y="187">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="ai_description_mod"/>
      </operator>
      <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (3)" width="90" x="1251" y="187">
        <parameter key="old_name" value="ai_description_mod"/>
        <parameter key="new_name" value="description"/>
        <list key="rename_additional_attributes"/>
      </operator>
      <!-- Branch 2: select the single attribute gp_description_mod and rename it to "description". -->
      <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectGp_desc (2)" width="90" x="1117" y="289">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="gp_description_mod"/>
      </operator>
      <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (4)" width="90" x="1251" y="289">
        <parameter key="old_name" value="gp_description_mod"/>
        <parameter key="new_name" value="description"/>
        <list key="rename_additional_attributes"/>
      </operator>
      <!-- Merge both branches; they share the attribute name "description" after the renames. -->
      <operator activated="true" class="append" compatibility="7.5.000" expanded="true" height="103" name="Append (2)" width="90" x="1385" y="238"/>
      <!-- Text processing of the merged set: only the "description" attribute is selected
           (weight 1.0) and each document runs through the nested subprocess. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="1519" y="238">
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights">
          <parameter key="description" value="1.0"/>
        </list>
        <process expanded="true">
          <!-- Tokenize, then Transform Cases; both run with their default parameters
               (presumably non-letter tokenization and lower-casing; confirm). -->
          <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34"/>
          <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Compute cosine similarity (numerical measure) on the processed documents. -->
      <operator activated="true" class="data_to_similarity" compatibility="7.5.000" expanded="true" height="82" name="Data to Similarity (2)" width="90" x="1653" y="238">
        <parameter key="measure_types" value="NumericalMeasures"/>
        <parameter key="numerical_measure" value="CosineSimilarity"/>
      </operator>
      <!-- Convert the similarity result back into an example set so it can be delivered as a
           regular process result. -->
      <operator activated="true" class="similarity_to_data" compatibility="7.5.000" expanded="true" height="82" name="Similarity to Data" width="90" x="1787" y="238"/>
      <!-- Preparation chain: Retrieve, the two Generate Copy operators, then the four
           clean-up operators (DeleteTabs, DeleteStartingNL, ReplNewLines, DeleteDoubleSpaces). -->
      <connect from_op="Retrieve" from_port="output" to_op="Generate Copy gp_desription" to_port="example set input"/>
      <connect from_op="Generate Copy gp_desription" from_port="example set output" to_op="Generate Copy ap_desription" to_port="example set input"/>
      <connect from_op="Generate Copy ap_desription" from_port="example set output" to_op="DeleteTabs" to_port="example set input"/>
      <connect from_op="DeleteTabs" from_port="example set output" to_op="DeleteStartingNL" to_port="example set input"/>
      <connect from_op="DeleteStartingNL" from_port="example set output" to_op="ReplNewLines" to_port="example set input"/>
      <connect from_op="ReplNewLines" from_port="example set output" to_op="DeleteDoubleSpaces" to_port="example set input"/>
      <!-- The cleaned data bypasses Loop Examples and goes straight to Multiply (2), which
           feeds the two select/rename branches that Append (2) merges again. -->
      <connect from_op="DeleteDoubleSpaces" from_port="example set output" to_op="Multiply (2)" to_port="input"/>
      <connect from_op="Multiply (2)" from_port="output 1" to_op="selectAi_desc (2)" to_port="example set input"/>
      <connect from_op="Multiply (2)" from_port="output 2" to_op="selectGp_desc (2)" to_port="example set input"/>
      <connect from_op="selectAi_desc (2)" from_port="example set output" to_op="Rename (3)" to_port="example set input"/>
      <connect from_op="Rename (3)" from_port="example set output" to_op="Append (2)" to_port="example set 1"/>
      <connect from_op="selectGp_desc (2)" from_port="example set output" to_op="Rename (4)" to_port="example set input"/>
      <connect from_op="Rename (4)" from_port="example set output" to_op="Append (2)" to_port="example set 2"/>
      <!-- Merged set, then text processing, then similarity computation. Similarity to Data
           receives both the similarity and the example set from Data to Similarity (2). -->
      <connect from_op="Append (2)" from_port="merged set" to_op="Process Documents from Data (2)" to_port="example set"/>
      <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Data to Similarity (2)" to_port="example set"/>
      <connect from_op="Data to Similarity (2)" from_port="similarity" to_op="Similarity to Data" to_port="similarity"/>
      <connect from_op="Data to Similarity (2)" from_port="example set" to_op="Similarity to Data" to_port="exampleSet"/>
      <!-- The similarity table is the only delivered process result. -->
      <connect from_op="Similarity to Data" from_port="exampleSet" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Contributor II jhiller
Contributor II

Re: Similarity of two nominal attributes

OK, that works, but I just need rows 2 and 5. Is it possible to check the similarity only for examples with the same id?

 result.JPG

RM Certified Expert
RM Certified Expert
Solution

Re: Similarity of two nominal attributes

Yes, you'd have to use macros and the generic Loop operator together with Filter Examples for that.

 

This feels a bit hackish, but I would go back and check whether this works for you, and maybe clean out the duplicates earlier with some logic.

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.5.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Load the source data from the repository entry ../data/temp. -->
      <operator activated="true" class="retrieve" compatibility="7.5.000" expanded="true" height="68" name="Retrieve" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/temp"/>
      </operator>
      <!-- Create working copies (gp_description_mod, ai_description_mod) of the two
           description attributes; the clean-up operators of this process are configured on
           these copies. The operator names below are misspelled ("desription", "ap_") but
           are referenced verbatim by the connect elements, so they must not be renamed in
           isolation. -->
      <operator activated="true" class="generate_copy" compatibility="7.5.000" expanded="true" height="82" name="Generate Copy gp_desription" width="90" x="179" y="34">
        <parameter key="attribute_name" value="gp_description"/>
        <parameter key="new_name" value="gp_description_mod"/>
      </operator>
      <operator activated="true" class="generate_copy" compatibility="7.5.000" expanded="true" height="82" name="Generate Copy ap_desription" width="90" x="313" y="34">
        <parameter key="attribute_name" value="ai_description"/>
        <parameter key="new_name" value="ai_description_mod"/>
      </operator>
      <!-- Whitespace clean-up chain; every step is restricted to the attribute subset
           ai_description_mod and gp_description_mod. -->
      <!-- Step 1: despite the operator name, the pattern matches runs of 14 or more space
           characters, not tab characters. No replace_by is set, so the operator default
           applies (presumably the match is removed; confirm). -->
      <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteTabs" width="90" x="447" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="replace_what" value="(\ ){14,}"/>
      </operator>
      <!-- Step 2: matches a single line break at the very beginning of the value (\A\R).
           No replace_by is set here either (presumably removes the match; confirm). -->
      <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteStartingNL" width="90" x="581" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="replace_what" value="\A\R"/>
      </operator>
      <!-- Step 3: replace every remaining line break (\R) with a single space. -->
      <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="ReplNewLines" width="90" x="715" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="replace_what" value="\R"/>
        <parameter key="replace_by" value=" "/>
      </operator>
      <!-- Step 4: collapse runs of two or more spaces into a single space. -->
      <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="DeleteDoubleSpaces" width="90" x="849" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|ai_description_mod|gp_description_mod"/>
        <parameter key="replace_what" value="(\ ){2,}"/>
        <parameter key="replace_by" value=" "/>
      </operator>
      <!-- Disabled (activated="false") per-example loop, apparently kept from the original
           attempt. No top-level connect element of this process references it, so it plays
           no part in the result. Its body mirrors the pipeline built from Multiply (2)
           onwards: fork, select and rename each description attribute, append, process the
           documents, compute cosine similarity. -->
      <operator activated="false" class="loop_examples" compatibility="7.5.000" expanded="true" height="82" name="Loop Examples" width="90" x="983" y="34">
        <process expanded="true">
          <!-- Duplicate the incoming example set for the two branches. -->
          <operator activated="true" class="multiply" compatibility="7.5.000" expanded="true" height="103" name="Multiply" width="90" x="45" y="34"/>
          <!-- Branch 1: keep ai_description_mod and rename it to "description". -->
          <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectAi_desc" width="90" x="179" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="ai_description_mod"/>
          </operator>
          <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename" width="90" x="313" y="34">
            <parameter key="old_name" value="ai_description_mod"/>
            <parameter key="new_name" value="description"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!-- Branch 2: keep gp_description_mod and rename it to "description". -->
          <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectGp_desc" width="90" x="179" y="136">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="gp_description_mod"/>
          </operator>
          <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (2)" width="90" x="313" y="136">
            <parameter key="old_name" value="gp_description_mod"/>
            <parameter key="new_name" value="description"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!-- Merge both branches into one set. -->
          <operator activated="true" class="append" compatibility="7.5.000" expanded="true" height="103" name="Append" width="90" x="447" y="85"/>
          <!-- Text processing restricted to the "description" attribute (weight 1.0);
               documents are tokenized and then passed through Transform Cases. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="85">
            <parameter key="select_attributes_and_weights" value="true"/>
            <list key="specify_weights">
              <parameter key="description" value="1.0"/>
            </list>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
              <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Cosine similarity on numerical measures. -->
          <operator activated="true" class="data_to_similarity" compatibility="7.5.000" expanded="true" height="82" name="Data to Similarity" width="90" x="715" y="85">
            <parameter key="measure_types" value="NumericalMeasures"/>
            <parameter key="numerical_measure" value="CosineSimilarity"/>
          </operator>
          <!-- Wiring of the loop body; the loop returns the "example set" port of
               Data to Similarity. -->
          <connect from_port="example set" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="selectAi_desc" to_port="example set input"/>
          <connect from_op="Multiply" from_port="output 2" to_op="selectGp_desc" to_port="example set input"/>
          <connect from_op="selectAi_desc" from_port="example set output" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_op="Append" to_port="example set 1"/>
          <connect from_op="selectGp_desc" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
          <connect from_op="Rename (2)" from_port="example set output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Append" from_port="merged set" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Data to Similarity" to_port="example set"/>
          <connect from_op="Data to Similarity" from_port="example set" to_port="example set"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="sink_example set" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
        </process>
      </operator>
      <!-- Fork the cleaned data into two branches, one per description attribute. -->
      <operator activated="true" class="multiply" compatibility="7.5.000" expanded="true" height="103" name="Multiply (2)" width="90" x="983" y="187"/>
      <!-- gp branch: select the single attribute gp_description_mod and rename it to
           "description". -->
      <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectGp_desc (2)" width="90" x="1117" y="289">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="gp_description_mod"/>
      </operator>
      <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (4)" width="90" x="1251" y="289">
        <parameter key="old_name" value="gp_description_mod"/>
        <parameter key="new_name" value="description"/>
        <list key="rename_additional_attributes"/>
      </operator>
      <!-- ai branch: select the single attribute ai_description_mod and rename it to
           "description". -->
      <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="selectAi_desc (2)" width="90" x="1117" y="187">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="ai_description_mod"/>
      </operator>
      <operator activated="true" class="rename" compatibility="7.5.000" expanded="true" height="82" name="Rename (3)" width="90" x="1251" y="187">
        <parameter key="old_name" value="ai_description_mod"/>
        <parameter key="new_name" value="description"/>
        <list key="rename_additional_attributes"/>
      </operator>
      <!-- Merge both branches; they share the attribute name "description" after the renames. -->
      <operator activated="true" class="append" compatibility="7.5.000" expanded="true" height="103" name="Append (2)" width="90" x="1385" y="238"/>
      <!-- Text processing of the merged set: only the "description" attribute is selected
           (weight 1.0) and each document runs through the nested subprocess. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="1519" y="238">
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights">
          <parameter key="description" value="1.0"/>
        </list>
        <process expanded="true">
          <!-- Tokenize, then Transform Cases; both run with their default parameters
               (presumably non-letter tokenization and lower-casing; confirm). -->
          <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34"/>
          <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Compute cosine similarity (numerical measure) on the processed documents. -->
      <operator activated="true" class="data_to_similarity" compatibility="7.5.000" expanded="true" height="82" name="Data to Similarity (2)" width="90" x="1653" y="238">
        <parameter key="measure_types" value="NumericalMeasures"/>
        <parameter key="numerical_measure" value="CosineSimilarity"/>
      </operator>
      <!-- Convert the similarity result into an example set; the Loop further down reads
           the attributes FIRST_ID and SECOND_ID from that table. -->
      <operator activated="true" class="similarity_to_data" compatibility="7.5.000" expanded="true" height="82" name="Similarity to Data" width="90" x="1787" y="238"/>
      <!-- Store a value derived from the similarity table in the macro "num", which the Loop
           operator uses as its number of iterations. No macro_type is set, so the operator
           default applies (presumably the number of examples; confirm). -->
      <operator activated="true" class="extract_macro" compatibility="7.5.000" expanded="true" height="68" name="Extract Macro (2)" width="90" x="1921" y="238">
        <parameter key="macro" value="num"/>
        <list key="additional_macros"/>
      </operator>
      <!-- Generic loop that runs %{num} times over the similarity table. Each iteration reads
           the FIRST_ID value of the row at index %{iteration} into the macro extract_id and
           then filters the table on that id. -->
      <operator activated="true" class="concurrency:loop" compatibility="7.5.000" expanded="true" height="82" name="Loop" width="90" x="2055" y="238">
        <parameter key="number_of_iterations" value="%{num}"/>
        <process expanded="true">
          <!-- Read the FIRST_ID data value of the current row into the macro extract_id. -->
          <operator activated="true" class="extract_macro" compatibility="7.5.000" expanded="true" height="68" name="Extract Macro" width="90" x="112" y="34">
            <parameter key="macro" value="extract_id"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="FIRST_ID"/>
            <parameter key="example_index" value="%{iteration}"/>
            <list key="additional_macros"/>
          </operator>
          <!-- Two filter conditions: FIRST_ID equals extract_id and SECOND_ID equals extract_id.
               How they are combined is left at the operator default (presumably logical AND,
               which would keep only rows that pair an id with itself; confirm). -->
          <operator activated="true" class="filter_examples" compatibility="7.5.000" expanded="true" height="103" name="Filter Examples" width="90" x="246" y="34">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="FIRST_ID.eq.%{extract_id}"/>
              <parameter key="filters_entry_key" value="SECOND_ID.eq.%{extract_id}"/>
            </list>
          </operator>
          <!-- Loop input, then Extract Macro, then Filter Examples, then loop output 1. -->
          <connect from_port="input 1" to_op="Extract Macro" to_port="example set"/>
          <connect from_op="Extract Macro" from_port="example set" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Merge the per-iteration results of the Loop into a single example set. -->
      <operator activated="true" class="append" compatibility="7.5.000" expanded="true" height="82" name="Append (3)" width="90" x="2189" y="238"/>
      <!-- Drop duplicate rows from the merged result; the same id can be extracted in more
           than one iteration, so the same filtered rows may be collected repeatedly. -->
      <operator activated="true" class="remove_duplicates" compatibility="7.5.000" expanded="true" height="103" name="Remove Duplicates" width="90" x="2323" y="238"/>
      <!-- Preparation chain: Retrieve, the two Generate Copy operators, then the four
           clean-up operators (DeleteTabs, DeleteStartingNL, ReplNewLines, DeleteDoubleSpaces). -->
      <connect from_op="Retrieve" from_port="output" to_op="Generate Copy gp_desription" to_port="example set input"/>
      <connect from_op="Generate Copy gp_desription" from_port="example set output" to_op="Generate Copy ap_desription" to_port="example set input"/>
      <connect from_op="Generate Copy ap_desription" from_port="example set output" to_op="DeleteTabs" to_port="example set input"/>
      <connect from_op="DeleteTabs" from_port="example set output" to_op="DeleteStartingNL" to_port="example set input"/>
      <connect from_op="DeleteStartingNL" from_port="example set output" to_op="ReplNewLines" to_port="example set input"/>
      <connect from_op="ReplNewLines" from_port="example set output" to_op="DeleteDoubleSpaces" to_port="example set input"/>
      <!-- The cleaned data skips the deactivated Loop Examples and goes to Multiply (2),
           which feeds the two select/rename branches that Append (2) merges again. -->
      <connect from_op="DeleteDoubleSpaces" from_port="example set output" to_op="Multiply (2)" to_port="input"/>
      <connect from_op="Multiply (2)" from_port="output 1" to_op="selectAi_desc (2)" to_port="example set input"/>
      <connect from_op="Multiply (2)" from_port="output 2" to_op="selectGp_desc (2)" to_port="example set input"/>
      <connect from_op="selectGp_desc (2)" from_port="example set output" to_op="Rename (4)" to_port="example set input"/>
      <connect from_op="Rename (4)" from_port="example set output" to_op="Append (2)" to_port="example set 2"/>
      <connect from_op="selectAi_desc (2)" from_port="example set output" to_op="Rename (3)" to_port="example set input"/>
      <connect from_op="Rename (3)" from_port="example set output" to_op="Append (2)" to_port="example set 1"/>
      <!-- Merged set, then text processing, then similarity computation, then conversion of
           the similarity into an example set. -->
      <connect from_op="Append (2)" from_port="merged set" to_op="Process Documents from Data (2)" to_port="example set"/>
      <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Data to Similarity (2)" to_port="example set"/>
      <connect from_op="Data to Similarity (2)" from_port="similarity" to_op="Similarity to Data" to_port="similarity"/>
      <connect from_op="Data to Similarity (2)" from_port="example set" to_op="Similarity to Data" to_port="exampleSet"/>
      <!-- Post-processing: extract the iteration count, loop and filter by id, merge the
           loop results, remove duplicates, and deliver the outcome as result 1. -->
      <connect from_op="Similarity to Data" from_port="exampleSet" to_op="Extract Macro (2)" to_port="example set"/>
      <connect from_op="Extract Macro (2)" from_port="example set" to_op="Loop" to_port="input 1"/>
      <connect from_op="Loop" from_port="output 1" to_op="Append (3)" to_port="example set 1"/>
      <connect from_op="Append (3)" from_port="merged set" to_op="Remove Duplicates" to_port="example set input"/>
      <connect from_op="Remove Duplicates" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Highlighted
Contributor II jhiller
Contributor II

Re: Similarity of two nominal attributes

Thanks a lot! Your solution is working. I will keep trying to improve it and will post the result if I succeed.

RM Certified Expert
RM Certified Expert

Re: Similarity of two nominal attributes

Great to hear that. The one thing you'll learn about RapidMiner is that it's super flexible and you can get really creative.  Good luck!

Polls
How can RapidMiner increase participation in our new competitions?
Twitter Feed