[SOLVED] Get Pages - continue on error

avkavk Member Posts: 4 Contributor I
edited August 2020 in Help
Hello,

I'm trying to fetch a lot of pages using Get Pages, but the process fails on the first error (site down, etc.). Is there a way to ignore the error and move on to the next example?
Tagged:

Answers

  • mschmitzmschmitz Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,249 RM Data Scientist
    You could use the Handle Exception Operator for this
    - Head of Data Science Services at RapidMiner -
    Dortmund, Germany
  • avkavk Member Posts: 4 Contributor I
    Thank you, combining "Get Page", "Handle Exception" and "Loop Examples" worked.
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      RapidMiner process: for each example, read a URL from the "site" attribute
      into the macro "href", fetch that page and extract its text content.
      The fetch is nested inside Handle Exception so that a failure on one URL
      is handled per example instead of failing the whole process.
    -->
    <process version="6.2.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
        <parameter key="logverbosity" value="all"/>
        <!-- NOTE(review): logfile and resultfile are machine-specific absolute Windows paths; adjust or remove them before running elsewhere. -->
        <parameter key="logfile" value="C:\Documents and Settings\charper\My Documents\ALiNBUS\googlepagesstep3log"/>
        <parameter key="resultfile" value="C:\Documents and Settings\charper\My Documents\ALiNBUS\googlepagesstep3result"/>
        <process expanded="true">
          <!--
            NOTE(review): this process level contains no <connect> elements, so
            Loop Examples is not wired to any input or result port here. Supply an
            example set that has a "site" attribute to its input before running.
          -->
          <!--
            Loop Examples: hosts the per-example sub-process below. The sub-process
            reads the macro %{example} as the example index; presumably that is the
            operator's default iteration macro, since no such parameter is set here
            (TODO confirm).
          -->
          <operator activated="true" class="loop_examples" compatibility="6.2.000" expanded="true" height="94" name="Loop Examples" width="90" x="313" y="255">
            <process expanded="true">
              <!-- Extract Macro: stores the data value of attribute "site" at example index %{example} in the macro "href". -->
              <operator activated="true" class="extract_macro" compatibility="6.2.000" expanded="true" height="60" name="Extract Macro" width="90" x="45" y="30">
                <parameter key="macro" value="href"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="site"/>
                <parameter key="example_index" value="%{example}"/>
                <list key="additional_macros"/>
              </operator>
              <!--
                Handle Exception: the first nested process holds the guarded work
                (Get Page, Extract Content); the second nested process is the fallback.
                exception_macro names the macro ("exceptionmsg") configured to hold the
                exception message (see the operator documentation).
              -->
              <operator activated="true" class="handle_exception" compatibility="6.2.000" expanded="true" height="94" name="Handle Exception" width="90" x="246" y="30">
                <parameter key="exception_macro" value="exceptionmsg"/>
                <process expanded="true">
                  <!-- The operators marked activated="false" in this sub-process are disabled and are not referenced by any <connect> below. -->
                  <operator activated="false" class="text:extract_token_number" compatibility="6.1.000" expanded="true" height="60" name="Extract Token Number" width="90" x="45" y="435">
                    <parameter key="condition" value="matches"/>
                    <parameter key="regular_expression" value="LIBRAR"/>
                  </operator>
                  <operator activated="false" class="text:extract_token_number" compatibility="6.1.000" expanded="true" height="60" name="Extract Token Number (2)" width="90" x="179" y="480">
                    <parameter key="metadata_key" value="LIBRARnumber"/>
                    <parameter key="condition" value="matches"/>
                    <parameter key="regular_expression" value="LIBRAR"/>
                    <parameter key="invert_condition" value="true"/>
                  </operator>
                  <!-- Get Page: requests the URL held in the macro %{href}, with random_user_agent enabled. -->
                  <operator activated="true" class="web:get_webpage" compatibility="5.3.002" expanded="true" height="60" name="Get Page" width="90" x="45" y="75">
                    <parameter key="url" value="%{href}"/>
                    <parameter key="random_user_agent" value="true"/>
                    <list key="query_parameters"/>
                    <list key="request_properties"/>
                  </operator>
                  <!-- Extract Content: receives the document produced by Get Page (see the <connect> elements below). -->
                  <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.002" expanded="true" height="60" name="Extract Content" width="90" x="179" y="75"/>
                  <operator activated="false" class="text:tokenize" compatibility="6.1.000" expanded="true" height="60" name="Tokenize" width="90" x="179" y="165"/>
                  <operator activated="false" class="text:stem_snowball" compatibility="6.1.000" expanded="true" height="60" name="Stem (Snowball)" width="90" x="179" y="255">
                    <parameter key="language" value="Russian"/>
                  </operator>
                  <operator activated="false" class="text:documents_to_data" compatibility="6.1.000" expanded="true" height="60" name="Documents to Data (2)" width="90" x="112" y="345">
                    <parameter key="text_attribute" value="contents"/>
                    <parameter key="label_attribute" value="Href"/>
                  </operator>
                  <!-- Wiring: "in 1" is passed straight through to "out 1"; the document from Extract Content is delivered on "out 2". -->
                  <connect from_port="in 1" to_port="out 1"/>
                  <connect from_op="Get Page" from_port="output" to_op="Extract Content" to_port="document"/>
                  <connect from_op="Extract Content" from_port="document" to_port="out 2"/>
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="source_in 2" spacing="0"/>
                  <portSpacing port="source_in 3" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                  <portSpacing port="sink_out 3" spacing="0"/>
                </process>
                <!--
                  Fallback sub-process: contains no operators and forwards "in 1" and
                  "in 2" unchanged. NOTE(review): the enclosing process only connects
                  "in 1" of Handle Exception, so "out 2" presumably carries nothing when
                  this branch runs (TODO confirm).
                -->
                <process expanded="true">
                  <connect from_port="in 1" to_port="out 1"/>
                  <connect from_port="in 2" to_port="out 2"/>
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="source_in 2" spacing="0"/>
                  <portSpacing port="source_in 3" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                  <portSpacing port="sink_out 3" spacing="0"/>
                </process>
              </operator>
              <!-- Loop body wiring: example set, then Extract Macro, then Handle Exception "in 1"; "out 1" returns the example set and "out 2" feeds "output 1". -->
              <connect from_port="example set" to_op="Extract Macro" to_port="example set"/>
              <connect from_op="Extract Macro" from_port="example set" to_op="Handle Exception" to_port="in 1"/>
              <connect from_op="Handle Exception" from_port="out 1" to_port="example set"/>
              <connect from_op="Handle Exception" from_port="out 2" to_port="output 1"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
        </process>
      </operator>
    </process>
Sign In or Register to comment.