Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

[SOLVED] Get Pages - continue on error

avkavk Member Posts: 4 Contributor I
edited August 2020 in Help
Hello,

I'm trying to fetch a lot of pages using Get Pages, but the process fails on the first error (site down, etc). Is there a way to ignore the error and move on to the next example?
Tagged:

Answers

  • MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,529 RM Data Scientist
    You could use the Handle Exception Operator for this
    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany
  • avkavk Member Posts: 4 Contributor I
    Thank you, combining "Get Page", "Handle Exception" and "Loop Examples" worked.
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      RapidMiner process: for each example, read a URL from the "site" attribute
      into the macro "href", fetch that URL with Get Page inside a Handle Exception
      operator, and extract the text content of the fetched page.
    -->
    <process version="6.2.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
        <parameter key="logverbosity" value="all"/>
        <!-- NOTE(review): logfile and resultfile are absolute paths into one user's Windows profile; adjust before running elsewhere. -->
        <parameter key="logfile" value="C:\Documents and Settings\charper\My Documents\ALiNBUS\googlepagesstep3log"/>
        <parameter key="resultfile" value="C:\Documents and Settings\charper\My Documents\ALiNBUS\googlepagesstep3result"/>
        <process expanded="true">
          <!-- Loop Examples: its subprocess receives the "example set" port and feeds it to Extract Macro, then Handle Exception. -->
          <operator activated="true" class="loop_examples" compatibility="6.2.000" expanded="true" height="94" name="Loop Examples" width="90" x="313" y="255">
            <process expanded="true">
              <!-- Extract Macro: stores the data value of attribute "site" at index %{example} in the macro "href". -->
              <!-- Presumably %{example} is the iteration macro provided by Loop Examples; confirm against the operator documentation. -->
              <operator activated="true" class="extract_macro" compatibility="6.2.000" expanded="true" height="60" name="Extract Macro" width="90" x="45" y="30">
                <parameter key="macro" value="href"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="site"/>
                <parameter key="example_index" value="%{example}"/>
                <list key="additional_macros"/>
              </operator>
              <!-- Handle Exception: holds two subprocesses; the exception macro is named "exceptionmsg". -->
              <operator activated="true" class="handle_exception" compatibility="6.2.000" expanded="true" height="94" name="Handle Exception" width="90" x="246" y="30">
                <parameter key="exception_macro" value="exceptionmsg"/>
                <!-- First subprocess: the page fetch. Only Get Page and Extract Content are activated and connected. -->
                <process expanded="true">
                  <!-- The operators below with activated="false" have no connections in this subprocess. -->
                  <operator activated="false" class="text:extract_token_number" compatibility="6.1.000" expanded="true" height="60" name="Extract Token Number" width="90" x="45" y="435">
                    <parameter key="condition" value="matches"/>
                    <parameter key="regular_expression" value="LIBRAR"/>
                  </operator>
                  <operator activated="false" class="text:extract_token_number" compatibility="6.1.000" expanded="true" height="60" name="Extract Token Number (2)" width="90" x="179" y="480">
                    <parameter key="metadata_key" value="LIBRARnumber"/>
                    <parameter key="condition" value="matches"/>
                    <parameter key="regular_expression" value="LIBRAR"/>
                    <parameter key="invert_condition" value="true"/>
                  </operator>
                  <!-- Get Page: requests the URL held in the "href" macro, with random_user_agent enabled. -->
                  <operator activated="true" class="web:get_webpage" compatibility="5.3.002" expanded="true" height="60" name="Get Page" width="90" x="45" y="75">
                    <parameter key="url" value="%{href}"/>
                    <parameter key="random_user_agent" value="true"/>
                    <list key="query_parameters"/>
                    <list key="request_properties"/>
                  </operator>
                  <!-- Extract Content: receives the document produced by Get Page; no parameters are overridden here. -->
                  <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.002" expanded="true" height="60" name="Extract Content" width="90" x="179" y="75"/>
                  <operator activated="false" class="text:tokenize" compatibility="6.1.000" expanded="true" height="60" name="Tokenize" width="90" x="179" y="165"/>
                  <operator activated="false" class="text:stem_snowball" compatibility="6.1.000" expanded="true" height="60" name="Stem (Snowball)" width="90" x="179" y="255">
                    <parameter key="language" value="Russian"/>
                  </operator>
                  <operator activated="false" class="text:documents_to_data" compatibility="6.1.000" expanded="true" height="60" name="Documents to Data (2)" width="90" x="112" y="345">
                    <parameter key="text_attribute" value="contents"/>
                    <parameter key="label_attribute" value="Href"/>
                  </operator>
                  <!-- "in 1" is passed straight through to "out 1"; the extracted document leaves on "out 2". -->
                  <connect from_port="in 1" to_port="out 1"/>
                  <connect from_op="Get Page" from_port="output" to_op="Extract Content" to_port="document"/>
                  <connect from_op="Extract Content" from_port="document" to_port="out 2"/>
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="source_in 2" spacing="0"/>
                  <portSpacing port="source_in 3" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                  <portSpacing port="sink_out 3" spacing="0"/>
                </process>
                <!-- Second subprocess: contains no operators and only forwards "in 1" and "in 2" to "out 1" and "out 2". -->
                <!-- Presumably this is the fallback that runs when the first subprocess fails; confirm against the Handle Exception documentation. -->
                <process expanded="true">
                  <connect from_port="in 1" to_port="out 1"/>
                  <connect from_port="in 2" to_port="out 2"/>
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="source_in 2" spacing="0"/>
                  <portSpacing port="source_in 3" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                  <portSpacing port="sink_out 3" spacing="0"/>
                </process>
              </operator>
              <!-- Loop body wiring: example set, then Extract Macro, then Handle Exception; "out 1" returns to the "example set" sink and "out 2" goes to "output 1". -->
              <connect from_port="example set" to_op="Extract Macro" to_port="example set"/>
              <connect from_op="Extract Macro" from_port="example set" to_op="Handle Exception" to_port="in 1"/>
              <connect from_op="Handle Exception" from_port="out 1" to_port="example set"/>
              <connect from_op="Handle Exception" from_port="out 2" to_port="output 1"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <!-- NOTE(review): there are no connect elements at this level, so Loop Examples has no input wired in and no result wired out as posted; a source of examples with a "site" attribute presumably has to be connected first. -->
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
        </process>
      </operator>
    </process>
Sign In or Register to comment.