Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you previously used to log in to the RapidMiner community. This will ensure that any previously created content is synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Manipulate string in URL in a loop

greg_lorincz79greg_lorincz79 Member Posts: 18 Maven
edited September 2019 in Help

I want to generate a URL for The Guardian API. When querying a search word (here "Brexit"), the API returns the first 10 hits of the list in JSON. In order to be able to see all the results, I need to be able to change the 'page' parameter here: https://content.guardianapis.com/search?page=1&q=Brexit&api-key=a2d0...

 

Here's an example process. What I would love is to be able to loop through all the pages, i.e. to increase the page number by 1 on each request. Any ideas would be appreciated!

 

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Create ExampleSet operator: generator_type is comma_separated_text, so the
  data comes from the inline CSV in input_csv_text. That text is a header line
  "url" followed by a single row holding the Guardian search request URL.

  SECURITY: the API key that was embedded in the request URL has been replaced
  by the placeholder YOUR_API_KEY. API keys are credentials and must not be
  published; substitute your own key before running this process.

  NOTE(review): the "url" entry under function_descriptions is presumably not
  used while generator_type is comma_separated_text. TODO confirm.
-->
<process version="8.2.000">
  <operator activated="true"
            class="operator_toolbox:create_exampleset"
            compatibility="1.1.000"
            expanded="true"
            height="68"
            name="Create ExampleSet"
            width="90"
            x="182"
            y="78">
    <parameter key="generator_type" value="comma_separated_text"/>
    <parameter key="number_of_examples" value="100"/>
    <parameter key="use_stepsize" value="false"/>
    <list key="function_descriptions">
      <parameter key="url" value="https://content.guardianapis.com/search?page=3&amp;q=Brexit&amp;api-key=..."/>
    </list>
    <parameter key="add_id_attribute" value="false"/>
    <list key="numeric_series_configuration"/>
    <list key="date_series_configuration"/>
    <list key="date_series_configuration (interval)"/>
    <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
    <!-- Header line "url", then one data row; &#10; is the line break between them. -->
    <parameter key="input_csv_text" value="url&#10;https://content.guardianapis.com/search?page=3&amp;q=Brexit&amp;api-key=YOUR_API_KEY"/>
    <parameter key="column_separator" value=","/>
    <parameter key="parse_all_as_nominal" value="false"/>
    <parameter key="decimal_point_character" value="."/>
    <parameter key="trim_attribute_names" value="true"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Get Pages operator (class web:retrieve_webpages). link_attribute is set to
  "url"; presumably the operator requests each link found in that attribute of
  the incoming ExampleSet (TODO confirm against the operator documentation).
  Requests use GET, follow redirects, accept no cookies, and apply no delay.
-->
<process version="8.2.000">
  <operator activated="true"
            class="web:retrieve_webpages"
            compatibility="7.3.000"
            expanded="true"
            height="68"
            name="Get Pages"
            width="90"
            x="411"
            y="78">
    <parameter key="link_attribute" value="url"/>
    <parameter key="random_user_agent" value="false"/>
    <parameter key="connection_timeout" value="10000"/>
    <parameter key="read_timeout" value="10000"/>
    <parameter key="follow_redirects" value="true"/>
    <parameter key="accept_cookies" value="none"/>
    <parameter key="cookie_scope" value="global"/>
    <parameter key="request_method" value="GET"/>
    <!-- delay is "none"; the three delay amounts below are kept as posted. -->
    <parameter key="delay" value="none"/>
    <parameter key="delay_amount" value="1000"/>
    <parameter key="min_delay_amount" value="0"/>
    <parameter key="max_delay_amount" value="1000"/>
  </operator>
</process>

Best Answer

  • MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,528 RM Data Scientist
    Solution Accepted

    Hi @greg_lorincz79,

    your XML does not work for me. There seems to be some issue?

     

    In any case, the solution is a loop operator. Loop provides you with a macro called iteration. You can just use this in the page parameter like this:

     

    https://content.guardianapis.com/search?page=%{iteration}&q=Brexit&api-key=XXXXX

    %{iteration} is always replaced with the current iteration count.

     

    I would recommend that you delete your API key from your initial post. API keys are like passwords; you don't share them.

     

    Best,

    Martin

    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany

Answers

  • greg_lorincz79greg_lorincz79 Member Posts: 18 Maven

    Thank you, I managed to sort out the looping with a macro. 

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Create ExampleSet operator: generator_type is comma_separated_text, and
      the inline CSV in input_csv_text is a header line "macro1" followed by a
      single row with the value 10.
    -->
    <process version="8.2.000">
      <operator activated="true"
                class="operator_toolbox:create_exampleset"
                compatibility="1.1.000"
                expanded="true"
                height="68"
                name="Create ExampleSet"
                width="90"
                x="259"
                y="204">
        <parameter key="generator_type" value="comma_separated_text"/>
        <parameter key="number_of_examples" value="100"/>
        <parameter key="use_stepsize" value="false"/>
        <list key="function_descriptions"/>
        <parameter key="add_id_attribute" value="false"/>
        <list key="numeric_series_configuration"/>
        <list key="date_series_configuration"/>
        <list key="date_series_configuration (interval)"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <!-- &#10; is the line break separating the header from the data row. -->
        <parameter key="input_csv_text" value="macro1&#10;10"/>
        <parameter key="column_separator" value=","/>
        <parameter key="parse_all_as_nominal" value="false"/>
        <parameter key="decimal_point_character" value="."/>
        <parameter key="trim_attribute_names" value="true"/>
      </operator>
    </process>
    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Extract Macro operator: defines the macro "i" with macro_type data_value,
      reading attribute "macro1" at example_index 1.

      NOTE(review): the statistics parameter ("average") is presumably not
      used when macro_type is data_value. TODO confirm.
    -->
    <process version="8.2.000">
      <operator activated="true"
                class="extract_macro"
                compatibility="8.2.000"
                expanded="true"
                height="68"
                name="Extract Macro"
                width="90"
                x="413"
                y="205">
        <parameter key="macro" value="i"/>
        <parameter key="macro_type" value="data_value"/>
        <parameter key="statistics" value="average"/>
        <parameter key="attribute_name" value="macro1"/>
        <parameter key="example_index" value="1"/>
        <list key="additional_macros"/>
      </operator>
    </process>
    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Loop operator (class concurrency:loop): number_of_iterations is taken
      from the macro %{i}, the iteration macro is named "iteration", and
      parallel execution is enabled. The nested process holds one Get Page
      operator whose output is wired to the loop port "output 1".

      NOTE(review): the Get Page url below is a fixed address and does not
      reference %{iteration}, so each iteration appears to request the same
      page. Confirm whether the url was meant to include the iteration macro.
    -->
    <process version="8.2.000">
      <operator activated="true"
                class="concurrency:loop"
                compatibility="8.2.000"
                expanded="true"
                height="82"
                name="Loop"
                width="90"
                x="565"
                y="204">
        <parameter key="number_of_iterations" value="%{i}"/>
        <parameter key="iteration_macro" value="iteration"/>
        <parameter key="reuse_results" value="false"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true"
                    class="web:get_webpage"
                    compatibility="7.3.000"
                    expanded="true"
                    height="68"
                    name="Get Page"
                    width="90"
                    x="375"
                    y="241">
            <parameter key="url" value="https://docs.aylien.com/textapi/rapidminer-extension/#step-2-extracting-the-article-titles-and-body"/>
            <parameter key="random_user_agent" value="false"/>
            <parameter key="connection_timeout" value="10000"/>
            <parameter key="read_timeout" value="10000"/>
            <parameter key="follow_redirects" value="true"/>
            <parameter key="accept_cookies" value="none"/>
            <parameter key="cookie_scope" value="global"/>
            <parameter key="request_method" value="GET"/>
            <list key="query_parameters"/>
            <list key="request_properties"/>
            <parameter key="override_encoding" value="false"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <!-- Hand the fetched page to the first output port of the loop. -->
          <connect from_op="Get Page" from_port="output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
    </process>
Sign In or Register to comment.