Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you previously used to log in to the RapidMiner community. This will ensure that any previously created content is synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

[Solved] XPath function in RapidMiner: info from multiple grandparents

Kate_Strydom Member Posts: 19 Contributor II
edited November 2018 in Help
Hi,

I am learning to use the web crawler operator and then to turn the extracted HTML/text file into a database. Currently, I am struggling to obtain more than one grandparent node in my RapidMiner output. For example, I want to extract all the shop names matched by
//h:div/h:ul/h:li[position()]/h:a/text(). The li node is the one that changes, and the number of li nodes varies per document (that is, there are n of them). I just can't work out how to get all the grandparents from my file: only the first one appears, unless I change the predicate to position()=3, in which case I get only the third one.

I would appreciate it if someone could share their knowledge with me on how to achieve my objective.
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.0">
 <!--
   RapidMiner process, in the order wired by the connect elements below:
     Get Page -> Cut Document -> Documents to Data -> result 1.
   Inside Cut Document each segment goes through Remove Document Parts and
   Multiply, and is then cut again by Cut Document (2) and Cut Document (3).
 -->
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.0.11" expanded="true" name="Process">
   <process expanded="true" height="415" width="685">
     <!-- Fetches the page at the configured URL; no query parameters are set. -->
     <operator activated="true" class="web:get_webpage" compatibility="5.0.4" expanded="true" height="60" name="Get Page" width="90" x="45" y="255">
       <parameter key="url" value="http://forum.spiegel.de/showthread.php?t=22981&amp;page=6"/>
       <list key="query_parameters"/>
     </operator>
     <!--
       Outer cut: XPath mode with a single query named "Segmenter", an absolute
       path ending in h:table. Presumably one segment is produced per matched
       table; confirm against the operator documentation.
     -->
     <operator activated="true" class="text:cut_document" compatibility="5.0.7" expanded="true" height="60" name="Cut Document" width="90" x="313" y="120">
       <parameter key="query_type" value="XPath"/>
       <!-- NOTE(review): the key spelling "string_machting_queries" is kept as found; presumably it is the name the operator expects, so do not correct it here. -->
       <list key="string_machting_queries"/>
       <list key="regular_expression_queries"/>
       <list key="regular_region_queries"/>
       <list key="xpath_queries">
         <parameter key="Segmenter" value="/h:html/h:body/h:div[4]/h:div[1]/h:div[2]/h:div[2]/h:div[2]/h:div/h:div/h:div/h:div/h:table"/>
       </list>
       <!--
         NOTE(review): only the prefix "xx" is declared in this list, while every
         query in this process uses the "h:" prefix. Presumably "h" is bound by
         the operator's own defaults; confirm.
       -->
       <list key="namespaces">
         <parameter key="xx" value="xml"/>
       </list>
       <parameter key="ignore_CDATA" value="false"/>
       <list key="index_queries"/>
       <process expanded="true" height="499" width="750">
         <!-- Deletes every match of the regex, i.e. the literal text <br clear="none" />. -->
         <operator activated="true" class="text:remove_document_parts" compatibility="5.0.7" expanded="true" height="60" name="Remove Document Parts" width="90" x="112" y="75">
           <parameter key="deletion_regex" value="(&lt;br clear=&quot;none&quot; /&gt;)"/>
         </operator>
         <!-- Feeds the cleaned document to both inner cut operators (output 1 and output 2 below). -->
         <operator activated="true" class="multiply" compatibility="5.0.11" expanded="true" height="94" name="Multiply" width="90" x="279" y="97"/>
         <!-- Query "Zitate" (German: quotations): text nodes of any h:div whose style attribute is exactly 'font-style:italic'. -->
         <operator activated="true" class="text:cut_document" compatibility="5.0.7" expanded="true" height="60" name="Cut Document (2)" width="90" x="447" y="120">
           <parameter key="query_type" value="XPath"/>
           <list key="string_machting_queries"/>
           <list key="regular_expression_queries"/>
           <list key="regular_region_queries"/>
           <list key="xpath_queries">
             <parameter key="Zitate" value="//h:div[@style='font-style:italic']/text()"/>
           </list>
           <list key="namespaces"/>
           <parameter key="ignore_CDATA" value="false"/>
           <list key="index_queries"/>
           <process expanded="true" height="499" width="750">
             <!-- Pass-through: each segment is emitted unchanged. -->
             <connect from_port="segment" to_port="document 1"/>
             <portSpacing port="source_segment" spacing="0"/>
             <portSpacing port="sink_document 1" spacing="0"/>
             <portSpacing port="sink_document 2" spacing="0"/>
           </process>
         </operator>
         <!--
           Query "Posting": a union (|) of two paths to the second div of the
           second cell in the second table row.
           NOTE(review): the two alternatives are not symmetric. The first starts
           with // and selects text()[2]; the second starts at the document root
           (/h:table/h:tbody) and selects every text() node. Confirm this is intended.
         -->
         <operator activated="true" class="text:cut_document" compatibility="5.0.7" expanded="true" height="60" name="Cut Document (3)" width="90" x="447" y="255">
           <parameter key="query_type" value="XPath"/>
           <list key="string_machting_queries"/>
           <list key="regular_expression_queries"/>
           <list key="regular_region_queries"/>
           <list key="xpath_queries">
             <parameter key="Posting" value="//h:table/h:tr[2]/h:td[2]/h:div[2]/text()[2]|/h:table/h:tbody/h:tr[2]/h:td[2]/h:div[2]/text()"/>
           </list>
           <list key="namespaces"/>
           <parameter key="ignore_CDATA" value="false"/>
           <list key="index_queries"/>
           <process expanded="true" height="499" width="750">
             <!-- Pass-through: each segment is emitted unchanged. -->
             <connect from_port="segment" to_port="document 1"/>
             <portSpacing port="source_segment" spacing="0"/>
             <portSpacing port="sink_document 1" spacing="0"/>
             <portSpacing port="sink_document 2" spacing="0"/>
           </process>
         </operator>
         <!-- Inner wiring: segment -> Remove Document Parts -> Multiply -> both cut operators -> document 1 / document 2. -->
         <connect from_port="segment" to_op="Remove Document Parts" to_port="document"/>
         <connect from_op="Remove Document Parts" from_port="document" to_op="Multiply" to_port="input"/>
         <connect from_op="Multiply" from_port="output 1" to_op="Cut Document (2)" to_port="document"/>
         <connect from_op="Multiply" from_port="output 2" to_op="Cut Document (3)" to_port="document"/>
         <connect from_op="Cut Document (2)" from_port="documents" to_port="document 1"/>
         <connect from_op="Cut Document (3)" from_port="documents" to_port="document 2"/>
         <portSpacing port="source_segment" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
         <portSpacing port="sink_document 3" spacing="0"/>
       </process>
     </operator>
     <!-- Converts the cut documents into an example set; the text attribute is named "Testattr" and the label attribute "testattribut". -->
     <operator activated="true" class="text:documents_to_data" compatibility="5.0.7" expanded="true" height="76" name="Documents to Data" width="90" x="581" y="120">
       <parameter key="text_attribute" value="Testattr"/>
       <parameter key="label_attribute" value="testattribut"/>
     </operator>
     <!-- Top-level wiring: Get Page -> Cut Document -> Documents to Data -> result 1. -->
     <connect from_op="Get Page" from_port="output" to_op="Cut Document" to_port="document"/>
     <connect from_op="Cut Document" from_port="documents" to_op="Documents to Data" to_port="documents 1"/>
     <connect from_op="Documents to Data" from_port="example set" to_port="result 1"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
   </process>
 </operator>
</process>
Many thanks.
Kate

Answers

  • Kate_Strydom Member Posts: 19 Contributor II
    Sorry for the post... The Cut Document operator appears to have solved my problem. I have no clue how I will put the documents back together, but at least I can move along.

Sign In or Register to comment.