The RapidMiner community is on read-only mode until further notice. Technical support via cases will continue to work as is. For any urgent licensing related requests from Students/Faculty members, please use the Altair academic forum here.

Help with xpath

robinrobin Member Posts: 100 Guru
edited December 2018 in Help

I see there are a lot of topics on XPath; however, I lack experience in this field and really need some help. 

 

Below is the first entry from a client feed; there are numerous entries after the first. For some reason I am only picking up the first entry, and I need help with the correct syntax or operators to pull the entire feed into a MySQL database. 

 

The client feed: 

 

 

<?xml version="1.0" encoding="UTF-8"?>
<!-- Sample of the client product feed: an Atom <feed> whose <entry> children carry
     the product fields in the g: namespace (http://base.google.com/ns/1.0). -->
<feed xmlns="http://www.w3.org/2005/Atom" xmlns:g="http://base.google.com/ns/1.0">
  <entry>
    <g:additional_image_link>http://www.client.co.za/cs/groups/public/documents/client.co.za_portal_webassets/file1518447240125huge</g:additional_image_link>
    <g:availability>In Stock</g:availability>
    <g:brand>Alcatel One Touch</g:brand>
    <g:condition>new</g:condition>
    <g:description>500MB 24 Month Data Top Up Price Plan</g:description>
    <g:google_product_category>Electronics &gt; Computers &gt; Tablet Computers</g:google_product_category>
    <g:gtin>DV8FY44</g:gtin>
    <g:id>470654</g:id>
    <g:link>http://www.client.co.za/vodacom/shopping/devices/alcatelonetouch-deals/client-smart-tab-2-3g/500mb-24-month-data-top-up-price-plan/470654</g:link>
    <g:mpn>DV8FY44</g:mpn>
    <g:price>129.00</g:price>
    <g:product_type>Tablet</g:product_type>
    <!-- Shipping is a nested group; note it has its own g:price child. -->
    <g:shipping>
      <g:country>SA</g:country>
      <g:price>R0.00</g:price>
      <g:service>Free next day delivery</g:service>
    </g:shipping>
    <g:title>Smart Tab 2 3G on 500MB 24 Month Data Top Up Price Plan on a 24 month contract</g:title>
  </entry>
  <!-- Further <entry> elements follow here in the real feed. -->
</feed>

 

There would be a stack more entries between the </entry> and </feed> lines.

 

Here is my RapidMiner process:

 

<?xml version="1.0" encoding="UTF-8"?>
<!-- "Get Page" operator (web:get_webpage): requests the feed URL with a GET request. -->
<process version="8.1.000">
  <operator activated="true"
            class="web:get_webpage"
            compatibility="7.3.000"
            expanded="true"
            height="68"
            name="Get Page"
            width="90"
            x="45"
            y="136">
    <parameter key="url" value="http://www.client.co.za/client/GoogleDealFeed"/>
    <parameter key="random_user_agent" value="true"/>
    <parameter key="connection_timeout" value="9910000"/>
    <parameter key="read_timeout" value="9910000"/>
    <parameter key="follow_redirects" value="true"/>
    <parameter key="accept_cookies" value="all"/>
    <parameter key="cookie_scope" value="global"/>
    <parameter key="request_method" value="GET"/>
    <list key="query_parameters"/>
    <list key="request_properties"/>
    <parameter key="override_encoding" value="false"/>
    <parameter key="encoding" value="SYSTEM"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- "Documents to Data" operator (text:documents_to_data); both the text and the
     label attribute are named "text". -->
<process version="8.1.000">
  <operator activated="true"
            class="text:documents_to_data"
            compatibility="7.5.000"
            expanded="true"
            height="82"
            name="Documents to Data"
            width="90"
            x="179"
            y="136">
    <parameter key="text_attribute" value="text"/>
    <parameter key="label_attribute" value="text"/>
    <parameter key="add_meta_information" value="false"/>
    <parameter key="datamanagement" value="double_sparse_array"/>
    <parameter key="data_management" value="auto"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- "Data to Documents" operator (text:data_to_documents), with no attribute
     selection or weights configured. -->
<process version="8.1.000">
  <operator activated="true"
            class="text:data_to_documents"
            compatibility="7.5.000"
            expanded="true"
            height="68"
            name="Data to Documents"
            width="90"
            x="313"
            y="136">
    <parameter key="select_attributes_and_weights" value="false"/>
    <list key="specify_weights"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- "Loop Collection" operator: each incoming document is passed to Write Document,
     whose file output feeds Read XML; the Read XML result is the loop output. -->
<process version="8.1.000">
  <operator activated="true"
            class="loop_collection"
            compatibility="8.1.000"
            expanded="true"
            height="82"
            name="Loop Collection"
            width="90"
            x="447"
            y="136">
    <parameter key="set_iteration_macro" value="true"/>
    <parameter key="macro_name" value="iteration"/>
    <parameter key="macro_start_value" value="1"/>
    <parameter key="unfold" value="false"/>
    <process expanded="true">
      <operator activated="true"
                class="text:write_document"
                compatibility="7.5.000"
                expanded="true"
                height="82"
                name="Write Document"
                width="90"
                x="380"
                y="289">
        <parameter key="overwrite" value="true"/>
        <parameter key="encoding" value="SYSTEM"/>
      </operator>
      <operator activated="true"
                class="advanced_file_connectors:read_xml"
                compatibility="8.1.000"
                expanded="true"
                height="68"
                name="Read XML"
                width="90"
                x="581"
                y="289">
        <parameter key="file" value="/Users/robinmeisel/Desktop/set-up.xml"/>
        <!-- Examples are selected at the feed element; every attribute XPath below
             is written relative to it and anchored at default:entry[1]. -->
        <parameter key="xpath_for_examples" value="//default:feed"/>
        <enumeration key="xpaths_for_attributes">
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:availability/node()"/>
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:brand/node()"/>
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:condition/node()"/>
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:description/node()"/>
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:google_product_category/node()"/>
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:gtin/node()"/>
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:id/node()"/>
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:link/node()"/>
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:mpn/node()"/>
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:price/node()"/>
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:product_type/node()"/>
          <parameter key="xpath_for_attribute" value="default:entry[1]/g:title/node()"/>
        </enumeration>
        <!-- Prefix map used by the XPaths: g = Google Base, default = Atom. -->
        <parameter key="use_namespaces" value="true"/>
        <list key="namespaces">
          <parameter key="g" value="http://base.google.com/ns/1.0"/>
          <parameter key="default" value="http://www.w3.org/2005/Atom"/>
        </list>
        <parameter key="use_default_namespace" value="true"/>
        <parameter key="default_namespace" value="http://www.w3.org/2005/Atom"/>
        <parameter key="parse_numbers" value="false"/>
        <parameter key="decimal_character" value="."/>
        <parameter key="grouped_digits" value="false"/>
        <parameter key="grouping_character" value=","/>
        <parameter key="date_format" value=""/>
        <list key="annotations"/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="default:entry[1]/g:availability[*]/text().true.attribute_value.attribute"/>
          <parameter key="1" value="default:entry[1]/g:brand[*]/text().true.attribute_value.attribute"/>
          <parameter key="2" value="default:entry[1]/g:condition[*]/text().true.attribute_value.attribute"/>
          <parameter key="3" value="default:entry[1]/g:description[*]/text().true.attribute_value.attribute"/>
          <parameter key="4" value="default:entry[1]/g:google_product_category[*]/text().true.attribute_value.attribute"/>
          <parameter key="5" value="default:entry[1]/g:gtin[*]/text().true.attribute_value.attribute"/>
          <parameter key="6" value="default:entry[1]/g:id[1]/text().true.attribute_value.attribute"/>
          <parameter key="7" value="default:entry[1]/g:link[1]/text().true.attribute_value.attribute"/>
          <parameter key="8" value="default:entry[1]/g:mpn[1]/text().true.attribute_value.attribute"/>
          <parameter key="9" value="default:entry[1]/g:price[1]/text().true.attribute_value.attribute"/>
          <parameter key="10" value="default:entry[1]/g:product_type[1]/text().true.attribute_value.attribute"/>
          <parameter key="11" value="default:entry[1]/g:title[1]/text().true.attribute_value.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="true"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>
      <!-- Wiring: loop input to Write Document, its file to Read XML, result out. -->
      <connect from_port="single" to_op="Write Document" to_port="document"/>
      <connect from_op="Write Document" from_port="file" to_op="Read XML" to_port="file"/>
      <connect from_op="Read XML" from_port="output" to_port="output 1"/>
      <portSpacing port="source_single" spacing="0"/>
      <portSpacing port="sink_output 1" spacing="0"/>
      <portSpacing port="sink_output 2" spacing="0"/>
    </process>
  </operator>
</process>

 

As you can see in the RapidMiner operators I have used xpath queries of  

<parameter key="xpath_for_attribute" value="default:entry[1]/g:mpn/node()"/>

But that only returns the first character. What is the correct xpath to obtain this information from the feed? 

Best Answer

  • kaymankayman Member Posts: 662 Unicorn
    Solution Accepted

    Bit hard to explain, but you had to go one level deeper as you are now calling root level, and then the first node, which is indeed always the first one.

     

    You used :

     

    <parameter key="xpath_for_examples" value="//default:feed"/>

    whereas it had to be :

     

    <parameter key="xpath_for_examples" value="//default:feed/default:entry"/>

     

    by using default:entry[1], the script will always take the first entry from the root. When going one level deeper you will however loop through all of the entries, and for each of the entries take the first of the attributes you want. 

    Now the XPath becomes something like g:price[1]/text(), which is shorthand for //feed/entry/g:price[1]/text() (in other words: start at the root (feed), loop through all the entry nodes, and give me the text of the first price tag you find in each).

     

    The operator below uses your xml (I copied the entry just 4 times for testing purposes) and it returns everything as expected, so hopefully it can get you running again.

     

    <?xml version="1.0" encoding="UTF-8"?>
    <!-- Read XML configured to produce one example per default:entry; every attribute
         XPath is relative to that entry, including the nested g:shipping fields. -->
    <process version="7.6.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true"
                class="process"
                compatibility="7.6.001"
                expanded="true"
                name="Process">
        <process expanded="true">
          <operator activated="true"
                    class="advanced_file_connectors:read_xml"
                    compatibility="7.6.001"
                    expanded="true"
                    height="68"
                    name="Read XML"
                    width="90"
                    x="447"
                    y="34">
            <parameter key="file" value="C:\change_with_your_path\your.xml"/>
            <parameter key="xpath_for_examples" value="//default:feed/default:entry"/>
            <enumeration key="xpaths_for_attributes">
              <parameter key="xpath_for_attribute" value="g:additional_image_link[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:availability[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:brand[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:condition[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:description[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:google_product_category[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:gtin[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:id[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:link[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:mpn[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:price[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:product_type[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:shipping[1]/g:country[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:shipping[1]/g:price[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:shipping[1]/g:service[1]/text()"/>
              <parameter key="xpath_for_attribute" value="g:title[1]/text()"/>
            </enumeration>
            <!-- Prefix map used by the XPaths: g = Google Base, default = Atom. -->
            <list key="namespaces">
              <parameter key="g" value="http://base.google.com/ns/1.0"/>
              <parameter key="default" value="http://www.w3.org/2005/Atom"/>
            </list>
            <parameter key="default_namespace" value="http://www.w3.org/2005/Atom"/>
            <list key="annotations"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="g:additional_image_link[1]/text().true.attribute_value.attribute"/>
              <parameter key="1" value="g:availability[1]/text().true.attribute_value.attribute"/>
              <parameter key="2" value="g:brand[1]/text().true.attribute_value.attribute"/>
              <parameter key="3" value="g:condition[1]/text().true.attribute_value.attribute"/>
              <parameter key="4" value="g:description[1]/text().true.attribute_value.attribute"/>
              <parameter key="5" value="g:google_product_category[1]/text().true.attribute_value.attribute"/>
              <parameter key="6" value="g:gtin[1]/text().true.attribute_value.attribute"/>
              <parameter key="7" value="g:id[1]/text().true.attribute_value.attribute"/>
              <parameter key="8" value="g:link[1]/text().true.attribute_value.attribute"/>
              <parameter key="9" value="g:mpn[1]/text().true.attribute_value.attribute"/>
              <parameter key="10" value="g:price[1]/text().true.attribute_value.attribute"/>
              <parameter key="11" value="g:product_type[1]/text().true.attribute_value.attribute"/>
              <parameter key="12" value="g:shipping[1]/g:country[1]/text().true.attribute_value.attribute"/>
              <parameter key="13" value="g:shipping[1]/g:price[1]/text().true.attribute_value.attribute"/>
              <parameter key="14" value="g:shipping[1]/g:service[1]/text().true.attribute_value.attribute"/>
              <parameter key="15" value="g:title[1]/text().true.attribute_value.attribute"/>
            </list>
          </operator>
          <connect from_op="Read XML" from_port="output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

    Personally I prefer the 'Process XSLT' operator when dealing with XML, as it is much more flexible towards XPath, but it requires some knowledge of the language.

Answers

Sign In or Register to comment.