Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Xpath - Getting text from div

b00122599 Member Posts: 26 Contributor II
edited November 2019 in Help
Hey folks,

I've tested my XPath in Google Docs and it's working fine; however, I can't get it to work in RapidMiner. In the Extract Information operator, if I use the query "//h:div[2]/h:div/h:ul/h:li[1]/h:div", I get the result below:

<div xmlns="http://www.w3.org/1999/xhtml" class="stats-value">
  <svg class="icon">
    <use xmlns:xlink="urn:x-prefix:xlink" xlink:href="#trophy" />
  </svg>
  Text that I want
</div>

However, I just want the text "Text that I want", not the surrounding HTML. So I tried "//h:div[2]/h:div/h:ul/h:li[1]/h:div/text()". This doesn't give me an error, but it's not returning anything, no text at all.

Any help is much appreciated.

Neil. 

Best Answer

Answers

  • lionelderkrikor RapidMiner Certified Analyst, Member Posts: 1,195 Unicorn
    Hi @b00122599,

    So that we can find a solution, can you please share:

     - your process
     - your data (a priori your excel file where your links are stored)

    Regards,

    Lionel
  • b00122599 Member Posts: 26 Contributor II
    Please find code below and excel file with links. Thanks for your help

    <?xml version="1.0" encoding="UTF-8"?><process version="9.5.000">
      <!-- Process as posted by the question author. Operator chain (see the connect elements near the end):
           Read Excel (2), then Get Pages (2), then Process Documents from Data (2). -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.5.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Reads sheet number 1 of the workbook; the first row is used as attribute names. -->
          <operator activated="true" class="read_excel" compatibility="9.5.000" expanded="true" height="68" name="Read Excel (2)" width="90" x="45" y="136">
            <parameter key="excel_file" value="D:\OneDrive\snookersmall.xlsx"/>
            <parameter key="sheet_selection" value="sheet number"/>
            <parameter key="sheet_number" value="1"/>
            <parameter key="imported_cell_range" value="A1"/>
            <parameter key="encoding" value="SYSTEM"/>
            <parameter key="first_row_as_names" value="true"/>
            <list key="annotations"/>
            <parameter key="date_format" value=""/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="locale" value="English (United States)"/>
            <parameter key="read_all_values_as_polynominal" value="false"/>
            <list key="data_set_meta_data_information"/>
            <parameter key="read_not_matching_values_as_missings" value="true"/>
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
          </operator>
          <!-- Link attribute is LINKS; fixed "googlebot" user agent, GET requests, both timeouts set to 10000
               (unit not shown here, presumably milliseconds; confirm against the operator documentation). -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="9.0.000" expanded="true" height="68" name="Get Pages (2)" width="90" x="313" y="136">
            <parameter key="link_attribute" value="LINKS"/>
            <parameter key="random_user_agent" value="false"/>
            <parameter key="user_agent" value="googlebot"/>
            <parameter key="connection_timeout" value="10000"/>
            <parameter key="read_timeout" value="10000"/>
            <parameter key="follow_redirects" value="true"/>
            <parameter key="accept_cookies" value="none"/>
            <parameter key="cookie_scope" value="global"/>
            <parameter key="request_method" value="GET"/>
            <parameter key="delay" value="none"/>
            <parameter key="delay_amount" value="1000"/>
            <parameter key="min_delay_amount" value="0"/>
            <parameter key="max_delay_amount" value="1000"/>
          </operator>
          <!-- The nested process below chains Extract Information (2) into Extract Content (2). -->
          <operator activated="true" class="text:process_document_from_data" compatibility="8.2.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="581" y="136">
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="TF-IDF"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="none"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:extract_information" compatibility="8.2.000" expanded="true" height="68" name="Extract Information (2)" width="90" x="246" y="34">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <parameter key="attribute_type" value="Nominal"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <!-- Two XPath queries. Both use the h: prefix although the namespaces list below is empty;
                     assume_html is true. -->
                <!-- NOTE(review): the author reports that the Titleswon query returns nothing. In the sample
                     output quoted in the post the wanted text follows the svg child, so the first text node of
                     the div may be whitespace only; text()[normalize-space()] might be worth trying. Unverified. -->
                <list key="xpath_queries">
                  <parameter key="Snookname" value="//h:div[1]/h:h1/text()"/>
                  <parameter key="Titleswon" value="//h:div[2]/h:div/h:ul/h:li[1]/h:div/text()"/>
                </list>
                <list key="namespaces"/>
                <parameter key="ignore_CDATA" value="true"/>
                <parameter key="assume_html" value="true"/>
                <list key="index_queries"/>
                <list key="jsonpath_queries"/>
              </operator>
              <!-- minimum_text_block_length is 500 here. -->
              <operator activated="true" class="web:extract_html_text_content" compatibility="9.0.000" expanded="true" height="68" name="Extract Content (2)" width="90" x="514" y="34">
                <parameter key="extract_content" value="true"/>
                <parameter key="minimum_text_block_length" value="500"/>
                <parameter key="override_content_type_information" value="true"/>
                <parameter key="neglegt_span_tags" value="true"/>
                <parameter key="neglect_p_tags" value="true"/>
                <parameter key="neglect_b_tags" value="true"/>
                <parameter key="neglect_i_tags" value="true"/>
                <parameter key="neglect_br_tags" value="true"/>
                <parameter key="ignore_non_html_tags" value="true"/>
              </operator>
              <connect from_port="document" to_op="Extract Information (2)" to_port="document"/>
              <connect from_op="Extract Information (2)" from_port="document" to_op="Extract Content (2)" to_port="document"/>
              <connect from_op="Extract Content (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Top-level wiring: Excel output into Get Pages, pages into Process Documents, its example set to result 1. -->
          <connect from_op="Read Excel (2)" from_port="output" to_op="Get Pages (2)" to_port="Example Set"/>
          <connect from_op="Get Pages (2)" from_port="Example Set" to_op="Process Documents from Data (2)" to_port="example set"/>
          <connect from_op="Process Documents from Data (2)" from_port="example set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    


  • lionelderkrikor RapidMiner Certified Analyst, Member Posts: 1,195 Unicorn
    @b00122599,

    It seems that your Excel file is broken. I cannot open it with Excel and I cannot load the file in RapidMiner...

    Can you share the links directly here?

    Regards,

    Lionel
  • lionelderkrikor RapidMiner Certified Analyst, Member Posts: 1,195 Unicorn
    Hi again,

    Sorry, no need to share the links!

    I was able to load your Excel file and run your process....

    Regards,

    Lionel
  • lionelderkrikor RapidMiner Certified Analyst, Member Posts: 1,195 Unicorn
    @b00122599,

    Yes, it's strange.
    To extract the relevant information, I proceeded in 2 steps.
    But I'm not able to join the 2 resulting example sets (RapidMiner is raising an error).
    Anyway, you can take a look at this process; your information ("Text that I want") is in the second resulting example set...

    <?xml version="1.0" encoding="UTF-8"?><process version="9.5.000">
      <!-- Two-step workaround from the reply: the Titleswon XPath keeps the whole div element, then a
           Generate Extract operator cuts the wanted text out of it with a regular expression. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.5.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- NOTE(review): the file name in excel_file is wrapped in single quotes. The reply says the
               process ran, so the value is left exactly as posted. -->
          <operator activated="true" class="read_excel" compatibility="9.5.000" expanded="true" height="68" name="Read Excel (2)" width="90" x="45" y="85">
            <parameter key="excel_file" value="C:\Users\Lionel\Downloads\'snookersmall.xlsx'"/>
            <parameter key="sheet_selection" value="sheet number"/>
            <parameter key="sheet_number" value="1"/>
            <parameter key="imported_cell_range" value="A1"/>
            <parameter key="encoding" value="SYSTEM"/>
            <parameter key="first_row_as_names" value="true"/>
            <list key="annotations"/>
            <parameter key="date_format" value=""/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="locale" value="English (United States)"/>
            <parameter key="read_all_values_as_polynominal" value="false"/>
            <list key="data_set_meta_data_information"/>
            <parameter key="read_not_matching_values_as_missings" value="true"/>
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
          </operator>
          <!-- Link attribute is LINKS; fixed "googlebot" user agent, GET requests. -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="9.0.000" expanded="true" height="68" name="Get Pages (2)" width="90" x="179" y="85">
            <parameter key="link_attribute" value="LINKS"/>
            <parameter key="random_user_agent" value="false"/>
            <parameter key="user_agent" value="googlebot"/>
            <parameter key="connection_timeout" value="10000"/>
            <parameter key="read_timeout" value="10000"/>
            <parameter key="follow_redirects" value="true"/>
            <parameter key="accept_cookies" value="none"/>
            <parameter key="cookie_scope" value="global"/>
            <parameter key="request_method" value="GET"/>
            <parameter key="delay" value="none"/>
            <parameter key="delay_amount" value="1000"/>
            <parameter key="min_delay_amount" value="0"/>
            <parameter key="max_delay_amount" value="1000"/>
          </operator>
          <!-- This operator carries breakpoints="after" (presumably left from debugging; confirm before reuse). -->
          <operator activated="true" breakpoints="after" class="text:process_document_from_data" compatibility="8.2.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="380" y="85">
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="TF-IDF"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="none"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:extract_information" compatibility="8.2.000" expanded="true" height="68" name="Extract Information (2)" width="90" x="45" y="34">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <parameter key="attribute_type" value="Nominal"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <!-- Step 1: the Titleswon query has no trailing text() step, so it selects the div element itself. -->
                <list key="xpath_queries">
                  <parameter key="Snookname" value="//h:div[1]/h:h1/text()"/>
                  <parameter key="Titleswon" value="//h:div[2]/h:div/h:ul/h:li[1]/h:div"/>
                </list>
                <list key="namespaces"/>
                <parameter key="ignore_CDATA" value="true"/>
                <parameter key="assume_html" value="true"/>
                <list key="index_queries"/>
                <list key="jsonpath_queries"/>
              </operator>
              <!-- minimum_text_block_length is 1 here (the process posted in the question uses 500). -->
              <operator activated="true" class="web:extract_html_text_content" compatibility="9.0.000" expanded="true" height="68" name="Extract Content (2)" width="90" x="179" y="34">
                <parameter key="extract_content" value="true"/>
                <parameter key="minimum_text_block_length" value="1"/>
                <parameter key="override_content_type_information" value="true"/>
                <parameter key="neglegt_span_tags" value="true"/>
                <parameter key="neglect_p_tags" value="true"/>
                <parameter key="neglect_b_tags" value="true"/>
                <parameter key="neglect_i_tags" value="true"/>
                <parameter key="neglect_br_tags" value="true"/>
                <parameter key="ignore_non_html_tags" value="true"/>
              </operator>
              <connect from_port="document" to_op="Extract Information (2)" to_port="document"/>
              <connect from_op="Extract Information (2)" from_port="document" to_op="Extract Content (2)" to_port="document"/>
              <connect from_op="Extract Content (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Multiply sends output 1 to Select Attributes and output 2 straight to result 1 (see connect elements). -->
          <operator activated="true" class="multiply" compatibility="9.5.000" expanded="true" height="103" name="Multiply" width="90" x="514" y="85"/>
          <!-- Subset filter; the attributes parameter lists LINKS and Titleswon. -->
          <operator activated="true" class="select_attributes" compatibility="9.5.000" expanded="true" height="82" name="Select Attributes" width="90" x="659" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value="Titleswon"/>
            <parameter key="attributes" value="LINKS|Titleswon"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Step 2: regular-expression query on the Titleswon attribute. The single capture group sits
               between the closing svg tag and the closing div tag; the query key is Titleswon_2
               (presumably the name of the generated attribute; confirm in the results). -->
          <operator activated="true" class="text:generate_extract" compatibility="8.2.000" expanded="true" height="68" name="Generate Extract" width="90" x="793" y="34">
            <parameter key="source_attribute" value="Titleswon"/>
            <parameter key="query_type" value="Regular Expression"/>
            <list key="string_machting_queries"/>
            <parameter key="attribute_type" value="Nominal"/>
            <list key="regular_expression_queries">
              <parameter key="Titleswon_2" value="&lt;/svg&gt;(.*?)&lt;/div&gt;"/>
            </list>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="true"/>
            <parameter key="assume_html" value="true"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
          </operator>
          <!-- Two results are delivered: result 1 is the unmodified copy from Multiply, result 2 is the
               Generate Extract output. The two example sets are not joined in this process. -->
          <connect from_op="Read Excel (2)" from_port="output" to_op="Get Pages (2)" to_port="Example Set"/>
          <connect from_op="Get Pages (2)" from_port="Example Set" to_op="Process Documents from Data (2)" to_port="example set"/>
          <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Multiply" from_port="output 2" to_port="result 1"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Generate Extract" to_port="Example Set"/>
          <connect from_op="Generate Extract" from_port="Example Set" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>
    


    Hope this helps,

    Regards,

    Lionel
  • b00122599 Member Posts: 26 Contributor II
    Thanks for trying, I appreciate it. Neil.
  • b00122599 Member Posts: 26 Contributor II
    Thanks again. I think I may need to look at a different XPath approach; maybe I should be looking at classes. Cheers, Neil.
Sign In or Register to comment.