Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Extract Information & Tokenize delivers funny results

Legacy UserLegacy User Member Posts: 0 Newbie
edited November 2018 in Help
Hi there,

I am trying to tokenize text that I extract from a Webpage via XPath.
The extraction works fine, but when I use the Tokenize Operator the resulting Wordlist contains a lot of words that are not part of the extracted data.

I am guessing the Tokenize Operator tokenizes the original text (which would be the complete Webpage), but how can I get RM to only tokenize the extracted text?

I cannot find any solution in this forum, so I hope someone has a good idea.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Process posted with the question. Get Page feeds one Process Documents operator whose
     inner process wires Extract Information (XPath) directly into Tokenize. The poster
     reports that the resulting word list contains words that are not part of the
     extracted text. -->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- Get Page is configured with the forum topic URL and random_user_agent=true. -->
      <operator activated="true" class="web:get_webpage" compatibility="5.3.001" expanded="true" height="60" name="Get Page" width="90" x="112" y="30">
        <parameter key="url" value="http://www.tripadvisor.com/ShowTopic-g194739-i7938-k1383033-Europcar_rental_office-Chiusi_Tuscany.html"/>
        <parameter key="random_user_agent" value="true"/>
        <list key="query_parameters"/>
        <list key="request_properties"/>
      </operator>
      <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Documents" width="90" x="246" y="30">
        <process expanded="true">
          <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information (3)" width="90" x="45" y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <!-- NOTE(review): query_type is XPath, so this regular-region entry is presumably
                 an unused leftover; confirm before removing. -->
            <list key="regular_region_queries">
              <parameter key="extract" value="&lt;p&gt;+.&lt;/p&gt;"/>
            </list>
            <!-- The query result is stored under the key EXTRACT. It selects div elements with
                 class 'postBody' whose text does not contain 'http://www.'. Under XPath 1.0
                 semantics string() of a node-set yields the string value of the first node
                 only; TODO confirm this holds for the XPath engine RapidMiner uses. The h:
                 prefix is not declared in the namespaces list below; presumably it is bound
                 by default, confirm. -->
            <list key="xpath_queries">
              <parameter key="EXTRACT" value="string(//h:div[@class='postBody'][not(contains(.,'http://www.'))])"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (3)" width="90" x="380" y="30"/>
          <!-- The document output of Extract Information is passed straight to Tokenize, and
               the Tokenize output becomes the inner process result. -->
          <connect from_port="document" to_op="Extract Information (3)" to_port="document"/>
          <connect from_op="Extract Information (3)" from_port="document" to_op="Tokenize (3)" to_port="document"/>
          <connect from_op="Tokenize (3)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Top-level wiring: the page goes into Process Documents; its example set and word
           list are delivered as results 1 and 2. -->
      <connect from_op="Get Page" from_port="output" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents" from_port="word list" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Answers

  • mmaragmmarag Member Posts: 35 Maven
    I tried this one but still nothing.....

    its very strange indeed
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Attempt posted by mmarag, who reports that it still does not work. Extraction and
         tokenization are split: Process Documents only runs Extract Information, then
         Select Attributes and Nominal to Text feed a Process Documents from Data operator
         that contains Tokenize. -->
    <process version="5.3.015">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="web:get_webpage" compatibility="5.3.001" expanded="true" height="60" name="Get Page" width="90" x="112" y="30">
            <parameter key="url" value="http://www.tripadvisor.com/ShowTopic-g194739-i7938-k1383033-Europcar_rental_office-Chiusi_Tuscany.html"/>
            <parameter key="random_user_agent" value="true"/>
            <list key="query_parameters"/>
            <list key="request_properties"/>
          </operator>
          <!-- First stage: the inner process contains Extract Information only, no Tokenize. -->
          <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Documents" width="90" x="313" y="75">
            <process expanded="true">
              <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information (3)" width="90" x="45" y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries">
                  <parameter key="extract" value="&lt;p&gt;+.&lt;/p&gt;"/>
                </list>
                <!-- Same XPath query as in the original question; result key is EXTRACT. -->
                <list key="xpath_queries">
                  <parameter key="EXTRACT" value="string(//h:div[@class='postBody'][not(contains(.,'http://www.'))])"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>
              <connect from_port="document" to_op="Extract Information (3)" to_port="document"/>
              <connect from_op="Extract Information (3)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Select Attributes is configured for the single attribute EXTRACT, with special
               attributes included. -->
          <operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="447" y="75">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="EXTRACT"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Nominal to Text has no attribute filter set here, only include_special_attributes. -->
          <operator activated="true" class="nominal_to_text" compatibility="5.3.015" expanded="true" height="76" name="Nominal to Text" width="90" x="648" y="120">
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Second stage: the inner process contains Tokenize only. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="849" y="120">
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="112" y="120"/>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Top-level chain: Get Page, Process Documents, Select Attributes, Nominal to Text,
               Process Documents from Data. Only the outputs of the last operator are
               delivered as results. -->
          <connect from_op="Get Page" from_port="output" to_op="Process Documents" to_port="documents 1"/>
          <connect from_op="Process Documents" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>
  • Legacy UserLegacy User Member Posts: 0 Newbie
    Thanks mmarag, I figured it might be an issue with datatypes, but it still does not make any sense to me.

    anyone else got an idea? ???
  • Legacy UserLegacy User Member Posts: 0 Newbie
    I played around a bit more and found a workaround:
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Workaround posted by the original asker. The first Process Documents only runs
         Extract Information; its example set is converted back into documents by Data to
         Documents and then tokenized in a second Process Documents operator. -->
    <process version="5.3.015">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="web:get_webpage" compatibility="5.3.001" expanded="true" height="60" name="Get Page" width="90" x="112" y="30">
            <parameter key="url" value="http://www.tripadvisor.com/ShowTopic-g194739-i7938-k1383033-Europcar_rental_office-Chiusi_Tuscany.html"/>
            <parameter key="random_user_agent" value="true"/>
            <list key="query_parameters"/>
            <list key="request_properties"/>
          </operator>
          <!-- First stage: extraction only, no Tokenize in the inner process. -->
          <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Documents" width="90" x="246" y="30">
            <process expanded="true">
              <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information (3)" width="90" x="45" y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries">
                  <parameter key="extract" value="&lt;p&gt;+.&lt;/p&gt;"/>
                </list>
                <!-- Same XPath query as in the original question; result key is EXTRACT. -->
                <list key="xpath_queries">
                  <parameter key="EXTRACT" value="string(//h:div[@class='postBody'][not(contains(.,'http://www.'))])"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>
              <connect from_port="document" to_op="Extract Information (3)" to_port="document"/>
              <connect from_op="Extract Information (3)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Data to Documents has attribute selection enabled and lists EXTRACT with
               weight 1.0 as the only entry. -->
          <operator activated="true" class="text:data_to_documents" compatibility="5.3.002" expanded="true" height="60" name="Data to Documents" width="90" x="380" y="165">
            <parameter key="select_attributes_and_weights" value="true"/>
            <list key="specify_weights">
              <parameter key="EXTRACT" value="1.0"/>
            </list>
          </operator>
          <!-- Second stage: the inner process contains Tokenize only. -->
          <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Documents (2)" width="90" x="581" y="165">
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="112" y="30"/>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Results: result 1 is the word list of the first (extraction-only) stage;
               results 2 and 3 are the example set and word list of the tokenizing stage. -->
          <connect from_op="Get Page" from_port="output" to_op="Process Documents" to_port="documents 1"/>
          <connect from_op="Process Documents" from_port="example set" to_op="Data to Documents" to_port="example set"/>
          <connect from_op="Process Documents" from_port="word list" to_port="result 1"/>
          <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents (2)" to_port="documents 1"/>
          <connect from_op="Process Documents (2)" from_port="example set" to_port="result 2"/>
          <connect from_op="Process Documents (2)" from_port="word list" to_port="result 3"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
        </process>
      </operator>
    </process>

    But I would still like to know why the tokenize op does not like the extract information op?
  • frasfras Member Posts: 93 Contributor II
    You need a second "Process Documents" operator, but the "from Data" variant in this case because you have example sets.
    And you have to switch the role of one (or more) attributes to "text" to start tokenizing.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Process posted by fras as the suggested solution. The first Process Documents only
         extracts (word vector creation disabled), Nominal to Text is applied to the EXTRACT
         attribute, and Process Documents from Data then tokenizes. -->
    <process version="6.0.002">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="web:get_webpage" compatibility="5.3.001" expanded="true" height="60" name="Get Page" width="90" x="45" y="30">
            <parameter key="url" value="http://www.tripadvisor.com/ShowTopic-g194739-i7938-k1383033-Europcar_rental_office-Chiusi_Tuscany.html"/>
            <parameter key="random_user_agent" value="true"/>
            <list key="query_parameters"/>
            <list key="request_properties"/>
          </operator>
          <!-- First stage: extraction only; create_word_vector is switched off. -->
          <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Documents" width="90" x="179" y="30">
            <parameter key="create_word_vector" value="false"/>
            <process expanded="true">
              <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information (3)" width="90" x="45" y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries">
                  <parameter key="extract" value="&lt;p&gt;+.&lt;/p&gt;"/>
                </list>
                <!-- Same XPath query as in the original question; result key is EXTRACT. -->
                <list key="xpath_queries">
                  <parameter key="EXTRACT" value="string(//h:div[@class='postBody'][not(contains(.,'http://www.'))])"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>
              <connect from_port="document" to_op="Extract Information (3)" to_port="document"/>
              <connect from_op="Extract Information (3)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Nominal to Text is restricted to the single attribute EXTRACT; per the
               accompanying post, this is the switch to "text" that tokenizing needs. -->
          <operator activated="true" class="nominal_to_text" compatibility="6.0.002" expanded="true" height="76" name="Nominal to Text" width="90" x="313" y="30">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="EXTRACT"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Second stage: vector_creation is Term Occurrences and keep_text is true; the
               inner process contains Tokenize only. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="447" y="30">
            <parameter key="vector_creation" value="Term Occurrences"/>
            <parameter key="keep_text" value="true"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="246" y="30"/>
              <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Results: the word list is delivered as result 1 and the example set as result 2. -->
          <connect from_op="Get Page" from_port="output" to_op="Process Documents" to_port="documents 1"/>
          <connect from_op="Process Documents" from_port="example set" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_port="result 2"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>

Sign In or Register to comment.