Comparing a Document with Multiple Example Data

Nicson · January 2018

Hello,

I would like to first process one or more documents (tokenize, n-grams, etc. -> done) and then compare each document with several sample data lists. If there is a match/similarity, the name of the respective list should be assigned to the original document. If the documents contain common tokens but do not agree with a list, then "Others" should be assigned in addition. It should later be possible to trace which lists fit a document. I imagine this to be similar to a sentiment analysis with a training model, except that besides positive and negative there are many more possible assignments. Unfortunately, I can't find an approach for how to proceed.

I would appreciate your help :smileyhappy:

lionelderkrikor · January 2018

Hi @Nicson,

If I understand correctly what you want, here is a starting point: a process with one wordlist and one document.

I create an attribute with the value :

- "wordlistname_documentname" if all the words of the wordlist are present in the document

- "wordlistname_documentname (others)" if only a part of the words of the wordlist are present in the document.

Here is the process:

<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
  <!--
    Compares one word list with one document and writes a label attribute
    named "wordlist" whose value is built from the two macros wordListName and
    documentName, with the suffix " (others)" appended in the second Branch
    subprocess.
    NOTE(review): uses operators from the operator_toolbox, text and
    rmx_toolkit extensions (see the class prefixes below); confirm they are
    installed before importing.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- Word list: built from inline CSV text with one column "att1" holding apples, oranges, bananas. -->
      <operator activated="true" class="operator_toolbox:create_exampleset_from_doc" compatibility="0.7.000" expanded="true" height="68" name="Word_list_1" width="90" x="45" y="34">
        <parameter key="Input Csv" value="att1&#10;apples&#10;oranges&#10;bananas&#10;"/>
        <parameter key="Parse all as Nominal" value="true"/>
      </operator>
      <!-- Document to be checked against the word list. -->
      <operator activated="true" class="text:create_document" compatibility="7.5.000" expanded="true" height="68" name="Document_1" width="90" x="45" y="136">
        <parameter key="text" value="apples are sweeter than oranges but bananas are the sweetest of them all"/>
      </operator>
      <!-- Fills the macro documentName; presumably with the name of the operator that produced the document (verify against the extension docs). -->
      <operator activated="true" class="operator_toolbox:get_source_of_object_as_macro" compatibility="0.7.000" expanded="true" height="68" name="Extract Last Modifying Operator" width="90" x="246" y="136">
        <parameter key="macro name" value="documentName"/>
      </operator>
      <!-- Document branch: document to example set, tokenize, then export the resulting word list as an example set. -->
      <operator activated="true" class="text:documents_to_data" compatibility="7.5.000" expanded="true" height="82" name="Documents to Data" width="90" x="447" y="136">
        <parameter key="text_attribute" value="text"/>
      </operator>
      <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="648" y="136">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize (3)" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize (3)" to_port="document"/>
          <connect from_op="Tokenize (3)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:wordlist_to_data" compatibility="7.5.000" expanded="true" height="82" name="WordList to Data" width="90" x="782" y="187"/>
      <!-- Fills the macro wordListName from the word-list example set (same operator type as for documentName above). -->
      <operator activated="true" class="operator_toolbox:get_source_of_object_as_macro" compatibility="0.7.000" expanded="true" height="68" name="Extract Last Modifying Operator (2)" width="90" x="246" y="34">
        <parameter key="macro name" value="wordListName"/>
      </operator>
      <!-- Word-list branch: the same tokenize / word-list export steps, applied to the word list itself. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="447" y="34">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:wordlist_to_data" compatibility="7.5.000" expanded="true" height="82" name="WordList to Data (2)" width="90" x="648" y="34"/>
      <operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply" width="90" x="782" y="34"/>
      <!--
        Comparison: the word-list tokens are the "input set" and the document
        tokens are the "subtrahend set 1" of Set Minus. The result is appended
        to a second copy of the word-list tokens and passed to Remove
        Duplicates, whose "duplicates" output drives the Branch condition.
        NOTE(review): presumably the duplicates are exactly the word-list
        entries that are missing from the document; confirm which attributes
        Set Minus and Remove Duplicates compare on.
      -->
      <operator activated="true" class="rmx_toolkit:set_minus_advanced" compatibility="2.1.692" expanded="true" height="124" name="Set Minus (Advanced)" width="90" x="983" y="85"/>
      <operator activated="true" class="append" compatibility="8.0.001" expanded="true" height="103" name="Append" width="90" x="1117" y="34"/>
      <operator activated="true" class="remove_duplicates" compatibility="8.0.001" expanded="true" height="103" name="Remove Duplicates" width="90" x="1251" y="85"/>
      <operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply (2)" width="90" x="1385" y="85"/>
      <!--
        Branch on the duplicates set with condition max_examples = 0.
        The first subprocess labels the data "<wordListName>_<documentName>",
        the second one "<wordListName>_<documentName> (others)".
        NOTE(review): presumably the first subprocess is the one executed when
        the condition holds (no missing words); confirm in the Branch docs.
      -->
      <operator activated="true" class="branch" compatibility="8.0.001" expanded="true" height="103" name="Branch" width="90" x="1519" y="85">
        <parameter key="condition_type" value="max_examples"/>
        <parameter key="condition_value" value="0"/>
        <process expanded="true">
          <operator activated="true" class="generate_attributes" compatibility="8.0.001" expanded="true" height="82" name="Generate Attributes (3)" width="90" x="112" y="34">
            <list key="function_descriptions">
              <parameter key="wordlist" value="concat(%{wordListName},&quot;_&quot;,%{documentName})"/>
            </list>
          </operator>
          <connect from_port="input 1" to_op="Generate Attributes (3)" to_port="example set input"/>
          <connect from_op="Generate Attributes (3)" from_port="example set output" to_port="input 1"/>
          <portSpacing port="source_condition" spacing="0"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_input 1" spacing="0"/>
          <portSpacing port="sink_input 2" spacing="0"/>
        </process>
        <process expanded="true">
          <operator activated="true" class="generate_attributes" compatibility="8.0.001" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="179" y="34">
            <list key="function_descriptions">
              <parameter key="wordlist" value="concat(%{wordListName},&quot;_&quot;,%{documentName},&quot; (others)&quot;)"/>
            </list>
          </operator>
          <connect from_port="input 1" to_op="Generate Attributes (2)" to_port="example set input"/>
          <connect from_op="Generate Attributes (2)" from_port="example set output" to_port="input 1"/>
          <portSpacing port="source_condition" spacing="0"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_input 1" spacing="0"/>
          <portSpacing port="sink_input 2" spacing="0"/>
        </process>
      </operator>
      <!-- Wiring. Process outputs: result 1 = labelled example set from Branch, result 2 = the duplicates set. -->
      <connect from_op="Word_list_1" from_port="output" to_op="Extract Last Modifying Operator (2)" to_port="through"/>
      <connect from_op="Document_1" from_port="output" to_op="Extract Last Modifying Operator" to_port="through"/>
      <connect from_op="Extract Last Modifying Operator" from_port="through" to_op="Documents to Data" to_port="documents 1"/>
      <connect from_op="Documents to Data" from_port="example set" to_op="Process Documents from Data (2)" to_port="example set"/>
      <connect from_op="Process Documents from Data (2)" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Set Minus (Advanced)" to_port="subtrahend set 1"/>
      <connect from_op="Extract Last Modifying Operator (2)" from_port="through" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data (2)" to_port="word list"/>
      <connect from_op="WordList to Data (2)" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Set Minus (Advanced)" to_port="input set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Append" to_port="example set 2"/>
      <connect from_op="Set Minus (Advanced)" from_port="result set" to_op="Append" to_port="example set 1"/>
      <connect from_op="Append" from_port="merged set" to_op="Remove Duplicates" to_port="example set input"/>
      <connect from_op="Remove Duplicates" from_port="example set output" to_op="Branch" to_port="input 1"/>
      <connect from_op="Remove Duplicates" from_port="duplicates" to_op="Multiply (2)" to_port="input"/>
      <connect from_op="Multiply (2)" from_port="output 1" to_op="Branch" to_port="condition"/>
      <connect from_op="Multiply (2)" from_port="output 2" to_port="result 2"/>
      <connect from_op="Branch" from_port="input 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

I think this process can be improved, maybe with a Loop operator and/or a Select Subprocess operator, to generalize it to N documents and N wordlists.

I hope it will be helpful.

Regards,

Lionel

MartinLiebig · January 2018

Hey @Nicson,

I think what you want to do is tokenize / n-gram the reference data set and the normal data set in the same way, and afterwards use a Cross Distances operator with cosine similarity to find similar items.

Best,

Martin

Nicson · January 2018

Thank you for your answers.

Yes, I have a reference dataset in every list and I want to compare it with every actual document. I created a little visualization to illustrate my project.

The list "Documents" contains all documents, List_A - List_C are the reference lists, which should be checked for their similarity to the contents of the documents. It is also important that the reference data is not only single words but also word pairs (n_grams).

The second picture shows how I imagine the output of the data.

kind regards

sgenzer · January 2018

hello @Nicson - welcome to the community. Helpful hint from moderator: attach your csv/xls files to your posts so the kind people helping you don't have to recreate them.

Scott

Nicson · January 2018

@sgenzer Thanks for your advice, I'll take it into account for future postings.

@mschmitz

I have just been looking at the Cross Distances operator and its tutorial process. I understand what this operator does, but I am having trouble applying it to my project. Assuming I have a single document that I want to compare with a word list, what should this process look like?

MartinLiebig · January 2018

Hi,

Have a look at the attached process. This would be my first try. Another way could be to use the Dictionary Based Sentiment Learner and misuse it to check how many tokens of your list are in the text.

Cheers,

Martin

<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
  <!--
    Computes the cosine similarity between one word list and one test document.
    Both are tokenized by Process Documents operators; the word list produced
    from the list of words is also passed to the operator that processes the
    test document. Cross Distances then compares the two resulting example sets.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- The document under test. -->
      <operator activated="true" class="text:create_document" compatibility="7.5.000" expanded="true" height="68" name="Create Document" width="90" x="45" y="340">
        <parameter key="text" value="Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."/>
        <description align="center" color="transparent" colored="false" width="126">Document to Test</description>
      </operator>
      <!-- The reference word list, entered as a document with one term per line. -->
      <operator activated="true" class="text:create_document" compatibility="7.5.000" expanded="true" height="68" name="Create Document (2)" width="90" x="45" y="85">
        <parameter key="text" value="Lorem &#10;Ipsum &#10;Dolor&#10;AnotherTerm"/>
        <description align="center" color="transparent" colored="false" width="126">List of Words</description>
      </operator>
      <!-- Vectorizes the word list (Term Occurrences); its word list output is reused for the test document below. -->
      <operator activated="true" class="text:process_documents" compatibility="7.5.000" expanded="true" height="103" name="Process Documents (2)" width="90" x="246" y="85">
        <parameter key="vector_creation" value="Term Occurrences"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="179" y="34"/>
          <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!--
        Vectorizes the test document, receiving the word list from
        Process Documents (2) on its "word list" input.
        NOTE(review): no vector_creation parameter is set here, so the
        operator default applies, unlike Process Documents (2); confirm this
        asymmetry is intended.
      -->
      <operator activated="true" class="text:process_documents" compatibility="7.5.000" expanded="true" height="103" name="Process Documents" width="90" x="447" y="238">
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Cosine similarity with the word-list vector as reference set and the test-document vector as request set. -->
      <operator activated="true" class="cross_distances" compatibility="8.0.001" expanded="true" height="103" name="Cross Distances" width="90" x="581" y="85">
        <parameter key="measure_types" value="NumericalMeasures"/>
        <parameter key="numerical_measure" value="CosineSimilarity"/>
      </operator>
      <connect from_op="Create Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Create Document (2)" from_port="output" to_op="Process Documents (2)" to_port="documents 1"/>
      <connect from_op="Process Documents (2)" from_port="example set" to_op="Cross Distances" to_port="reference set"/>
      <connect from_op="Process Documents (2)" from_port="word list" to_op="Process Documents" to_port="word list"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Cross Distances" to_port="request set"/>
      <connect from_op="Cross Distances" from_port="result set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

Comparing a Document with Multiple Example Data

Answers