Options

Manipulate string in URL in a loop

greg_lorincz79greg_lorincz79 Member Posts: 18 Maven
edited September 2019 in Help

I want to generate a URL for The Guardian API. When querying a search word (here "Brexit"), the API returns only the first 10 hits of the list as JSON. To see all the results, I need to be able to change the 'page' parameter here: https://content.guardianapis.com/search?page=1&q=Brexit&api-key=a2d0...

 

Here's an example process. What I would love is to be able to loop through all the pages, i.e. to increase the page number by 1 on each request. Any ideas would be appreciated!

 

<?xml version="1.0" encoding="UTF-8"?>
<process version="8.2.000">
  <!--
    Create ExampleSet: builds the input table from inline CSV text
    (generator_type = comma_separated_text). The CSV has a header line "url"
    followed by one data line containing the Guardian search request.
  -->
  <operator activated="true" class="operator_toolbox:create_exampleset" compatibility="1.1.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="182" y="78">
    <parameter key="generator_type" value="comma_separated_text"/>
    <parameter key="number_of_examples" value="100"/>
    <parameter key="use_stepsize" value="false"/>
    <list key="function_descriptions">
      <parameter key="url" value="https://content.guardianapis.com/search?page=3&amp;q=Brexit&amp;api-key=..."/>
    </list>
    <parameter key="add_id_attribute" value="false"/>
    <list key="numeric_series_configuration"/>
    <list key="date_series_configuration"/>
    <list key="date_series_configuration (interval)"/>
    <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
    <!--
      &#10; is a line feed separating the CSV header from the data row.
      The API key is a credential and must not be published: it is replaced
      here by the placeholder YOUR_API_KEY. Insert your own key before running.
    -->
    <parameter key="input_csv_text" value="url&#10;https://content.guardianapis.com/search?page=3&amp;q=Brexit&amp;api-key=YOUR_API_KEY"/>
    <parameter key="column_separator" value=","/>
    <parameter key="parse_all_as_nominal" value="false"/>
    <parameter key="decimal_point_character" value="."/>
    <parameter key="trim_attribute_names" value="true"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<process version="8.2.000">
  <!--
    Get Pages (web:retrieve_webpages): issues a GET request for each link.
    link_attribute is set to "url", matching the column name produced by the
    CSV header of the Create ExampleSet snippet in the same post.
  -->
  <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="411" y="78">
    <!-- Which attribute of the incoming data holds the links. -->
    <parameter key="link_attribute" value="url"/>
    <parameter key="random_user_agent" value="false"/>
    <!-- Connection handling: both timeouts share the same value; redirects are followed, cookies are not accepted. -->
    <parameter key="connection_timeout" value="10000"/>
    <parameter key="read_timeout" value="10000"/>
    <parameter key="follow_redirects" value="true"/>
    <parameter key="accept_cookies" value="none"/>
    <parameter key="cookie_scope" value="global"/>
    <parameter key="request_method" value="GET"/>
    <!-- delay is "none"; the amount settings below are presumably only consulted when a delay mode is selected (to be confirmed). -->
    <parameter key="delay" value="none"/>
    <parameter key="delay_amount" value="1000"/>
    <parameter key="min_delay_amount" value="0"/>
    <parameter key="max_delay_amount" value="1000"/>
  </operator>
</process>

Best Answer

  • Options
    MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,507 RM Data Scientist
    Solution Accepted

    Hi @greg_lorincz79,

    your XML does not work for me. There seems to be some issue?

     

    In any case, the solution is a loop operator. Loop provides you with a macro called iteration. You can just use this in the page parameter like this:

     

    https://content.guardianapis.com/search?page=%{iteration}&q=Brexit&api-key=XXXXX

    %{iteration} is always replaced with the current iteration count.

     

    I would recommend that you delete your API key from your initial post. API keys are like passwords: you don't share them.

     

    Best,

    Martin

    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany

Answers

  • Options
    greg_lorincz79greg_lorincz79 Member Posts: 18 Maven

    Thank you, I managed to sort out the looping with a macro. 

    <?xml version="1.0" encoding="UTF-8"?>
    <process version="8.2.000">
      <!--
        Create ExampleSet: inline CSV text with header "macro1" and a single
        data line "10". The Extract Macro snippet in the same post reads the
        attribute named macro1.
      -->
      <operator activated="true" class="operator_toolbox:create_exampleset" compatibility="1.1.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="259" y="204">
        <parameter key="generator_type" value="comma_separated_text"/>
        <parameter key="number_of_examples" value="100"/>
        <parameter key="use_stepsize" value="false"/>
        <list key="function_descriptions"/>
        <parameter key="add_id_attribute" value="false"/>
        <list key="numeric_series_configuration"/>
        <list key="date_series_configuration"/>
        <list key="date_series_configuration (interval)"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <!-- &#10; is a line feed: first line is the column name, second line the value. -->
        <parameter key="input_csv_text" value="macro1&#10;10"/>
        <parameter key="column_separator" value=","/>
        <parameter key="parse_all_as_nominal" value="false"/>
        <parameter key="decimal_point_character" value="."/>
        <parameter key="trim_attribute_names" value="true"/>
      </operator>
    </process>
    <?xml version="1.0" encoding="UTF-8"?>
    <process version="8.2.000">
      <!--
        Extract Macro: defines a macro named "i" from a data value
        (macro_type = data_value), taken from attribute "macro1" at
        example_index 1. The Loop snippet in the same post uses %{i} as its
        number of iterations.
      -->
      <operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro" width="90" x="413" y="205">
        <parameter key="macro" value="i"/>
        <parameter key="macro_type" value="data_value"/>
        <!-- statistics is set to "average", but macro_type is data_value; presumably it is not used in this mode (to be confirmed). -->
        <parameter key="statistics" value="average"/>
        <parameter key="attribute_name" value="macro1"/>
        <parameter key="example_index" value="1"/>
        <list key="additional_macros"/>
      </operator>
    </process>
    <?xml version="1.0" encoding="UTF-8"?>
    <process version="8.2.000">
      <!--
        Loop: runs the inner process %{i} times and exposes the current
        iteration count through the macro named "iteration".
      -->
      <operator activated="true" class="concurrency:loop" compatibility="8.2.000" expanded="true" height="82" name="Loop" width="90" x="565" y="204">
        <parameter key="number_of_iterations" value="%{i}"/>
        <parameter key="iteration_macro" value="iteration"/>
        <parameter key="reuse_results" value="false"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="375" y="241">
            <!--
              The page number is taken from the loop macro so that each
              iteration requests a different result page; a fixed URL here
              would fetch the same document on every iteration.
              XXXXX is a placeholder: insert your own API key and never
              publish it.
            -->
            <parameter key="url" value="https://content.guardianapis.com/search?page=%{iteration}&amp;q=Brexit&amp;api-key=XXXXX"/>
            <parameter key="random_user_agent" value="false"/>
            <parameter key="connection_timeout" value="10000"/>
            <parameter key="read_timeout" value="10000"/>
            <parameter key="follow_redirects" value="true"/>
            <parameter key="accept_cookies" value="none"/>
            <parameter key="cookie_scope" value="global"/>
            <parameter key="request_method" value="GET"/>
            <list key="query_parameters"/>
            <list key="request_properties"/>
            <parameter key="override_encoding" value="false"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <!-- Hand the fetched page to the first output port of the loop. -->
          <connect from_op="Get Page" from_port="output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
    </process>
Sign In or Register to comment.