crawl web and get links data with macro

crazy_m1nercrazy_m1ner Member Posts: 2 Contributor I
edited November 2019 in Help

I want to get data from the Google policies in many languages.
I am using the Crawl Web operator, which points to the Greek-language policies.
I am setting the language as the label and filtering to keep only the URLs that contain policies.
So I end up with an ExampleSet that has all the URLs for the Greek policies.
What I want to achieve is to get all the data from these policies into one file.
I found that the "Get Page" operator works perfectly for what I want, but it handles only one URL.
The "Get Pages" operator has a problem with UTF-8 encoding and gives me the wrong output.
So I tried using a macro to loop over the ExampleSet I have.
But I can't figure out exactly how a macro works, and the process never compiles.
This is my XML code:

<?xml version="1.0" encoding="UTF-8"?><process version="8.2.000">
<!-- Process as originally posted in the question. The asker reports that it never runs; a reply further down this thread points at the operator order and the wiring inside Loop Examples. -->
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
<parameter key="encoding" value="UTF-8"/>
<process expanded="true">
<!-- Crawl Web: starts crawling at the Google privacy policy archive URL (hl=el, gl=gr); no crawling rules are configured. -->
<operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
<parameter key="url" value="https://policies.google.com/privacy/archive?hl=el&amp;gl=gr"/>
<list key="crawling_rules"/>
</operator>
<!-- Generate Attributes: adds policy_language, set to "EL" when Link contains "intl/el_GR/policies" and to "Other" otherwise. -->
<operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes" width="90" x="179" y="34">
<list key="function_descriptions">
<parameter key="policy_language" value="if(contains(Link,&quot;intl/el_GR/policies&quot;),&quot;EL&quot;,&quot;Other&quot;)"/>
</list>
</operator>
<!-- Set Role: gives policy_language the label role. -->
<operator activated="true" class="set_role" compatibility="8.2.000" expanded="true" height="82" name="Set Role" width="90" x="313" y="34">
<parameter key="attribute_name" value="policy_language"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<!-- Filter Examples: keeps only the examples whose policy_language equals EL. -->
<operator activated="true" class="filter_examples" compatibility="8.2.000" expanded="true" height="103" name="Filter Examples" width="90" x="447" y="34">
<list key="filters_list">
<parameter key="filters_entry_key" value="policy_language.equals.EL"/>
</list>
</operator>
<!-- Loop Examples: the subprocess below is meant to fetch one page per filtered link. -->
<operator activated="true" class="loop_examples" compatibility="8.2.000" expanded="true" height="82" name="Loop Examples" width="90" x="581" y="187">
<process expanded="true">
<!-- Get Page is listed first in this subprocess, ahead of Extract Macro which defines the macro named test. Its url value is written "%test", unlike the "%{example}" form used below. NOTE(review): the reply in this thread states that Get Page was executed before the test macro was created. -->
<operator activated="true" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="447" y="136">
<parameter key="url" value="%test"/>
<list key="query_parameters"/>
<list key="request_properties"/>
</operator>
<!-- Filter Example Range: first_example and last_example are both %{example}, so the range covers a single example. NOTE(review): %{example} is presumably the current loop index; confirm against the Loop Examples documentation. -->
<operator activated="true" class="filter_example_range" compatibility="8.2.000" expanded="true" height="82" name="Filter Example Range" width="90" x="45" y="34">
<parameter key="first_example" value="%{example}"/>
<parameter key="last_example" value="%{example}"/>
</operator>
<!-- Extract Macro: stores the data value of attribute Link at example index 1 in the macro named test. -->
<operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro" width="90" x="246" y="34">
<parameter key="macro" value="test"/>
<parameter key="macro_type" value="data_value"/>
<parameter key="attribute_name" value="Link"/>
<parameter key="example_index" value="1"/>
<list key="additional_macros"/>
</operator>
<connect from_port="example set" to_op="Filter Example Range" to_port="example set input"/>
<!-- The output of Get Page is wired to the example set port of the loop. NOTE(review): the reply in this thread reports that this connection produced an error. -->
<connect from_op="Get Page" from_port="output" to_port="example set"/>
<connect from_op="Filter Example Range" from_port="example set output" to_op="Extract Macro" to_port="example set"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
</process>
</operator>
<!-- Top-level wiring: Crawl Web, Generate Attributes, Set Role, Filter Examples, Loop Examples, then result 1. -->
<connect from_op="Crawl Web" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Loop Examples" to_port="example set"/>
<connect from_op="Loop Examples" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>

Answers

  • jczogallajczogalla Employee, Member Posts: 144 RM Engineering

    Hi @crazy_m1ner!

     

    Below you find the updated XML for your process. There were some small problems. For one, the execution order was wrong, so the Get Page operator was executed before the test macro was created. Also, the output of Get Page was connected to the example set port of the Loop Examples, which produced another error. Now it works. :)

     

    <?xml version="1.0" encoding="UTF-8"?>
    <process version="8.2.001-SNAPSHOT">
      <!-- Corrected process: crawl the Greek policy archive, keep the policy links, then fetch each linked page inside a loop. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="8.2.001-SNAPSHOT" expanded="true" name="Process">
        <parameter key="encoding" value="UTF-8"/>
        <process expanded="true">

          <!-- Step 1: crawl from the Google privacy policy archive URL (hl=el, gl=gr); no crawling rules are set. -->
          <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.001-SNAPSHOT" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
            <parameter key="url" value="https://policies.google.com/privacy/archive?hl=el&amp;gl=gr"/>
            <list key="crawling_rules"/>
          </operator>

          <!-- Step 2: tag every crawled link as "EL" if it contains "intl/el_GR/policies", otherwise as "Other". -->
          <operator activated="true" class="generate_attributes" compatibility="8.2.001-SNAPSHOT" expanded="true" height="82" name="Generate Attributes" width="90" x="179" y="34">
            <list key="function_descriptions">
              <parameter key="policy_language" value="if(contains(Link,&quot;intl/el_GR/policies&quot;),&quot;EL&quot;,&quot;Other&quot;)"/>
            </list>
          </operator>

          <!-- Step 3: mark policy_language as the label. -->
          <operator activated="true" class="set_role" compatibility="8.2.001-SNAPSHOT" expanded="true" height="82" name="Set Role" width="90" x="313" y="34">
            <parameter key="attribute_name" value="policy_language"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>

          <!-- Step 4: keep only the examples tagged EL. -->
          <operator activated="true" class="filter_examples" compatibility="8.2.001-SNAPSHOT" expanded="true" height="103" name="Filter Examples" width="90" x="447" y="34">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="policy_language.equals.EL"/>
            </list>
          </operator>

          <!-- Step 5: for each remaining example, read its Link into a macro and fetch that page. -->
          <operator activated="true" class="loop_examples" compatibility="8.2.001-SNAPSHOT" expanded="true" height="103" name="Loop Examples" width="90" x="581" y="187">
            <process expanded="true">

              <!-- Extract Macro is listed before Get Page so that the test macro is set before Get Page reads %{test}. It takes the Link value at example index %{example}. -->
              <operator activated="true" class="extract_macro" compatibility="8.2.001-SNAPSHOT" expanded="true" height="68" name="Extract Macro" width="90" x="246" y="34">
                <parameter key="macro" value="test"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="Link"/>
                <parameter key="example_index" value="%{example}"/>
                <list key="additional_macros"/>
              </operator>

              <!-- Get Page fetches the URL held in the test macro. NOTE(review): a later reply in this thread suggests enabling this operator's "override encoding" option with UTF-8 if the returned HTML is garbled. -->
              <operator activated="true" class="web:get_webpage" compatibility="7.3.001-SNAPSHOT" expanded="true" height="68" name="Get Page" width="90" x="447" y="34">
                <parameter key="url" value="%{test}"/>
                <list key="query_parameters"/>
                <list key="request_properties"/>
              </operator>

              <!-- Subprocess wiring: the loop's example set feeds Extract Macro; the fetched page leaves through output 1. -->
              <connect from_port="example set" to_op="Extract Macro" to_port="example set"/>
              <connect from_op="Get Page" from_port="output" to_port="output 1"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>

          <!-- Top-level wiring: the operators are chained in order, and output 1 of the loop is delivered as result 1. -->
          <connect from_op="Crawl Web" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Loop Examples" to_port="example set"/>
          <connect from_op="Loop Examples" from_port="output 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

    Cheers

    Jan

  • crazy_m1nercrazy_m1ner Member Posts: 2 Contributor I

    It does work.
    But the HTML output is still wrongly encoded.
    Even when I try another language, I still get wrongly encoded HTML.
    I get the same output as the Get Pages operator (in terms of encoding).

  • SGolbertSGolbert RapidMiner Certified Analyst, Member Posts: 344 Unicorn

    Hi @crazy_m1ner,

     

    The Get Page operator has the parameter "override encoding". You can enable it and select UTF-8, and then you will have the right encoding.

     

    If you use UTF-8 frequently, you can also set the default encoding of RapidMiner to UTF-8 in Settings -> Preferences -> General -> Encoding.

     

    Regards,

    Sebastian

Sign In or Register to comment.