Due to recent updates, all users are required to create an Altair One account to login to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to login to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you login, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
Duplicate attribute name: Content-Type
Hi all,
Rapid Miner is a fantastic tool I am using.
I am trying to get Keyword clustering using web mining and text mining example by http://www.simafore.com/blog/bid/116340/ , but I get a "Duplicate attribute name: Content-Type" error.
I have to read a mysql database table and get the LINK information as attribute.
(mysql)
LINK attribute is:
http://www.liberoquotidiano.it/news/cronaca/1261117/Veneto--Zaia--necessario-assicurarsi-contro-eventi-catastrofici.html
http://www.liberoquotidiano.it/news/sostenibilita/1257087/L-Agenzia-europea-per-l-ambiente-lancia-l-allarme-clima--rischio-permanente----.html
http://www.liberoquotidiano.it/news/cronaca/1254046/Maltempo--Grosseto--sopralluogo-di-Marras-con-D-Angelis-in-zone-alluvione.html
I'd like to get keyword clusters that are based on those web pages content.
Do You know a way to get this process working ?
I attach the xml process here.
I thank You for good collaboration in advance !
Have a good day.
Alex
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.008">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_database" compatibility="5.3.008" expanded="true" height="60" name="Read Database" width="90" x="45" y="75">
<parameter key="define_connection" value="url"/>
<parameter key="connection" value="libero"/>
<parameter key="database_url" value="jdbc:mysql://localhost:3306/libero"/>
<parameter key="username" value="root"/>
<parameter key="password" value="***********************"/>
<parameter key="define_query" value="table name"/>
<parameter key="table_name" value="textmine"/>
<enumeration key="parameters"/>
</operator>
<operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
<parameter key="link_attribute" value="Link"/>
<parameter key="page_attribute" value="PAGE"/>
<parameter key="random_user_agent" value="true"/>
<parameter key="delay" value="random"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="210">
<parameter key="attribute_filter_type" value="no_missing_values"/>
<parameter key="attribute" value="PAGEOUTPUT"/>
<parameter key="attributes" value="PAGEOUTPUT"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="75">
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000" expanded="true" height="60" name="Extract Content (2)" width="90" x="447" y="210">
<parameter key="ignore_non_html_tags" value="false"/>
</operator>
<connect from_port="document" to_op="Extract Content (2)" to_port="document"/>
<connect from_op="Extract Content (2)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="multiply" compatibility="5.3.008" expanded="true" height="94" name="Multiply" width="90" x="380" y="345"/>
<operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="648" y="390">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="|text"/>
<parameter key="numeric_condition" value="&lt;5"/>
</operator>
<operator activated="true" class="k_medoids" compatibility="5.3.008" expanded="true" height="76" name="Clustering" width="90" x="849" y="435"/>
<connect from_op="Read Database" from_port="output" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
<connect from_op="Process Documents from Data" from_port="word list" to_port="result 4"/>
<connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
<connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Clustering" to_port="example set"/>
<connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
<connect from_op="Clustering" from_port="clustered set" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>
Rapid Miner is a fantastic tool I am using.
I am trying to get Keyword clustering using web mining and text mining example by http://www.simafore.com/blog/bid/116340/ , but I get a "Duplicate attribute name: Content-Type" error.
I have to read a mysql database table and get the LINK information as attribute.
(mysql)
LINK attribute is:
http://www.liberoquotidiano.it/news/cronaca/1261117/Veneto--Zaia--necessario-assicurarsi-contro-eventi-catastrofici.html
http://www.liberoquotidiano.it/news/sostenibilita/1257087/L-Agenzia-europea-per-l-ambiente-lancia-l-allarme-clima--rischio-permanente----.html
http://www.liberoquotidiano.it/news/cronaca/1254046/Maltempo--Grosseto--sopralluogo-di-Marras-con-D-Angelis-in-zone-alluvione.html
I'd like to get keyword clusters that are based on those web pages content.
Do You know a way to get this process working ?
I attach the xml process here.
I thank You for good collaboration in advance !
Have a good day.
Alex
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.008">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_database" compatibility="5.3.008" expanded="true" height="60" name="Read Database" width="90" x="45" y="75">
<parameter key="define_connection" value="url"/>
<parameter key="connection" value="libero"/>
<parameter key="database_url" value="jdbc:mysql://localhost:3306/libero"/>
<parameter key="username" value="root"/>
<parameter key="password" value="***********************"/>
<parameter key="define_query" value="table name"/>
<parameter key="table_name" value="textmine"/>
<enumeration key="parameters"/>
</operator>
<operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
<parameter key="link_attribute" value="Link"/>
<parameter key="page_attribute" value="PAGE"/>
<parameter key="random_user_agent" value="true"/>
<parameter key="delay" value="random"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="210">
<parameter key="attribute_filter_type" value="no_missing_values"/>
<parameter key="attribute" value="PAGEOUTPUT"/>
<parameter key="attributes" value="PAGEOUTPUT"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="75">
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000" expanded="true" height="60" name="Extract Content (2)" width="90" x="447" y="210">
<parameter key="ignore_non_html_tags" value="false"/>
</operator>
<connect from_port="document" to_op="Extract Content (2)" to_port="document"/>
<connect from_op="Extract Content (2)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="multiply" compatibility="5.3.008" expanded="true" height="94" name="Multiply" width="90" x="380" y="345"/>
<operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="648" y="390">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="|text"/>
<parameter key="numeric_condition" value="&lt;5"/>
</operator>
<operator activated="true" class="k_medoids" compatibility="5.3.008" expanded="true" height="76" name="Clustering" width="90" x="849" y="435"/>
<connect from_op="Read Database" from_port="output" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
<connect from_op="Process Documents from Data" from_port="word list" to_port="result 4"/>
<connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
<connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Clustering" to_port="example set"/>
<connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
<connect from_op="Clustering" from_port="clustered set" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>
0
Answers
for me the process you supplied runs fine if I replace the Read Database operator with a data set that contains a Link attribute with the links you provided.
Can you send me a link for which the described error occurs?
Best regards,
Marius
I am very happy to read You rapid reply.
I replaced the Read Database operator, following your suggestion.
We can't get any content from the linked web pages, I don't know why, maybe I have to change the "Get Pages" operator.
The csv file contains now:
Link;
http://corrieredelveneto.corriere.it/notizie/politica/2013/28-maggio-2013/vincitori-vinti-disperati-2221364926711.shtml,
http://www.corriere.it/sette/13_maggio_22/2013-21-gramigna-aulla_12fb6dea-c2e8-11e2-b767-d844a9f1da92.shtml,
http://corrieredelveneto.corriere.it/notizie/cronaca/2013/23-maggio-2013/alluvione-stretta-controlli-ma-resta-nodo-bacini-2221283139532.shtml
Here the whole process:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.008">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_csv" compatibility="5.3.008" expanded="true" height="60" name="Read CSV" width="90" x="45" y="75">
<parameter key="csv_file" value="/home/alex/Scrivania/url.csv"/>
<parameter key="column_separators" value=","/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<parameter key="locale" value="Italian (Italy)"/>
<parameter key="encoding" value="UTF-8"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="Link.true.binominal.attribute"/>
</list>
</operator>
<operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
<parameter key="link_attribute" value="Link"/>
<parameter key="page_attribute" value="PAGE"/>
<parameter key="random_user_agent" value="true"/>
<parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; rv:20.0) Gecko/20100101 Firefox/20.0"/>
<parameter key="accept_cookies" value="all"/>
<parameter key="delay" value="random"/>
<parameter key="min_delay_amount" value="1000"/>
<parameter key="max_delay_amount" value="2000"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="210">
<parameter key="attribute_filter_type" value="no_missing_values"/>
<parameter key="attribute" value="PAGEOUTPUT"/>
<parameter key="attributes" value="PAGEOUTPUT"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="75">
<parameter key="vector_creation" value="Term Occurrences"/>
<parameter key="keep_text" value="true"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000" expanded="true" height="60" name="Extract Content" width="90" x="246" y="75">
<parameter key="neglegt_span_tags" value="false"/>
<parameter key="neglect_p_tags" value="false"/>
<parameter key="neglect_b_tags" value="false"/>
<parameter key="neglect_i_tags" value="false"/>
<parameter key="neglect_br_tags" value="false"/>
<parameter key="ignore_non_html_tags" value="false"/>
</operator>
<connect from_port="document" to_op="Extract Content" to_port="document"/>
<connect from_op="Extract Content" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="multiply" compatibility="5.3.008" expanded="true" height="112" name="Multiply" width="90" x="380" y="345"/>
<operator activated="true" class="write_csv" compatibility="5.3.008" expanded="true" height="76" name="Write CSV" width="90" x="581" y="570">
<parameter key="csv_file" value="/home/alex/Scrivania/out.csv"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="648" y="390">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="|Link"/>
<parameter key="numeric_condition" value="&lt;5"/>
</operator>
<operator activated="true" class="k_medoids" compatibility="5.3.008" expanded="true" height="76" name="Clustering" width="90" x="849" y="435">
<parameter key="add_as_label" value="true"/>
</operator>
<connect from_op="Read CSV" from_port="output" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
<connect from_op="Process Documents from Data" from_port="word list" to_port="result 4"/>
<connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
<connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Multiply" from_port="output 3" to_op="Write CSV" to_port="input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Clustering" to_port="example set"/>
<connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
<connect from_op="Clustering" from_port="clustered set" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>
Can You suggest a solution and attach a full working process for text Keyword clustering ?
I thank You for Your good support Marius!
Have a good evening.
Alex
Best regards,
Marius
I succeed using the Read Csv operator !
now for a scientific research I need to get earthquake (=terremoto) related italian article data from a freely available newspaper article archive search engine
http://sitesearch.corriere.it/archivioStoricoEngine?q=terremoto
Searching for " terremoto " You will find 11210 articles.
The pagination system uses a javascript script to assign value to the pageNumber input variable.
// Builds the archive search action URL from the current query field and
// submits the pager form for the requested page (POST via the form).
function submitform(page) {
    var searchTerm = document.getElementById("queryString").value;
    // Base action; the q parameter is appended below (empty when no term).
    var targetAction = "archivioStoricoEngine" + "?q=";
    if (searchTerm != null && searchTerm != "") {
        targetAction += searchTerm;
    }
    // Stash the requested page number in the hidden input, then submit.
    document.getElementById("pageNumber").value = page;
    var pagerForm = document.getElementById("pagerForm");
    pagerForm.action = targetAction;
    pagerForm.submit();
}
The form uses POST Method and hidden inputed variables, instead of GET method.
Maybe for You it is a simple question, but I am a newbie in the data mining field, so please explain to me how I can proceed.
What Rapid Miner operators have I to use?
How can I set the javascript pageNumber variable to loop the article extraction?
Is it possible to add a Referer ?
Here my process, it works for ordinary search engine web page, but I don't know how to extract data from Form POSTing search engine results.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.008">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="loop" compatibility="5.3.008" expanded="true" height="94" name="Loop" width="90" x="179" y="210">
<parameter key="set_iteration_macro" value="true"/>
<parameter key="iterations" value="317"/>
<parameter key="timeout" value="120"/>
<parameter key="parallelize_iteration" value="true"/>
<process expanded="true">
<operator activated="true" class="generate_macro" compatibility="5.3.008" expanded="true" height="76" name="Generate Macro" width="90" x="45" y="30">
<list key="function_descriptions">
<parameter key="Pagepos" value="(%{iteration})+1"/>
</list>
</operator>
<operator activated="true" class="log" compatibility="5.3.008" expanded="true" height="76" name="Log" width="90" x="512" y="30">
<parameter key="filename" value="/home/alex/Documents/Logs/log-perfetto.txt"/>
<list key="log">
<parameter key="time" value="operator.Crawl Web.value.time"/>
<parameter key="execution time" value="operator.Crawl Web.value.execution-time"/>
<parameter key="looptime" value="operator.Crawl Web.value.looptime"/>
<parameter key="cpu execution time" value="operator.Crawl Web.value.cpu-execution-time"/>
<parameter key="Max Token Length" value="operator.Tokenize.parameter.max_token_length"/>
</list>
</operator>
<operator activated="true" class="web:process_web" compatibility="5.3.000" expanded="true" height="60" name="Process Documents from Web" width="90" x="112" y="300">
<parameter key="url" value="http://sitesearch.corriere.it/archivioStoricoEngine?q=terremoto&amp;queryMode=simpleany&amp;autore=&amp;fromDay=01&amp;fromMonth=01&amp;fromYear=1992&amp;toDay=31&amp;toMonth=12&amp;toYear=2013&amp;orderBy=data&amp;sectionCorriere=true&amp;__checkbox_sectionCorriere=true&amp;__checkbox_sectionLavoro=true&amp;__checkbox_sectionEconomia=true&amp;__checkbox_sectionSalute=true&amp;__checkbox_sectionSoldi=true&amp;__checkbox_sectionViviMilano=true&amp;Ricerca=Cerca&amp;pageNumber=%{Pagepos}"/>
<list key="crawling_rules">
<parameter key="follow_link_with_matching_text" value="terremoto"/>
</list>
<parameter key="add_pages_as_attribute" value="true"/>
<parameter key="max_page_size" value="10000"/>
<parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1"/>
<parameter key="really_ignore_exclusion" value="true"/>
<parameter key="parallelize_process_webpage" value="true"/>
<process expanded="true">
<operator activated="true" class="text:cut_document" compatibility="5.3.000" expanded="true" height="60" name="Cut Document" width="90" x="205" y="30">
<parameter key="query_type" value="XPath"/>
<list key="string_machting_queries">
<parameter key="Article" value="&lt;div&gt; &lt;h1&gt;.&lt;/p&gt; &lt;/div&gt;"/>
</list>
<list key="regular_expression_queries"/>
<list key="regular_region_queries"/>
<list key="xpath_queries">
<parameter key="Article" value="//h:div"/>
</list>
<list key="namespaces"/>
<parameter key="ignore_CDATA" value="false"/>
<list key="index_queries"/>
<parameter key="parallelize_segment_processing" value="true"/>
<process expanded="true">
<operator activated="true" class="text:extract_information" compatibility="5.3.000" expanded="true" height="60" name="Extract Information" width="90" x="214" y="30">
<parameter key="query_type" value="XPath"/>
<list key="string_machting_queries"/>
<list key="regular_expression_queries"/>
<list key="regular_region_queries"/>
<list key="xpath_queries">
<parameter key="Date" value="//h:div/h:p/h:span[1]"/>
<parameter key="Article" value="//h:div"/>
<parameter key="article-link" value="//h:div/h:h1/h:a"/>
</list>
<list key="namespaces"/>
<parameter key="ignore_CDATA" value="false"/>
<list key="index_queries"/>
</operator>
<connect from_port="segment" to_op="Extract Information" to_port="document"/>
<connect from_op="Extract Information" from_port="document" to_port="document 1"/>
<portSpacing port="source_segment" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_port="document" to_op="Cut Document" to_port="document"/>
<connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="write_database" compatibility="5.3.008" expanded="true" height="60" name="Write Database" width="90" x="514" y="300">
<parameter key="define_connection" value="url"/>
<parameter key="connection" value="italiaoggi"/>
<parameter key="database_url" value="jdbc:mysql://localhost:3306/corriere"/>
<parameter key="username" value="root"/>
<parameter key="password" value="*****************"/>
<parameter key="table_name" value="textmine"/>
<parameter key="overwrite_mode" value="append"/>
<parameter key="default_varchar_length" value="10000"/>
<parameter key="db_key_attribute_name" value="Link"/>
</operator>
<connect from_port="input 1" to_op="Generate Macro" to_port="through 1"/>
<connect from_op="Generate Macro" from_port="through 1" to_op="Log" to_port="through 1"/>
<connect from_op="Log" from_port="through 1" to_port="output 1"/>
<connect from_op="Process Documents from Web" from_port="example set" to_op="Write Database" to_port="input"/>
<connect from_op="Write Database" from_port="through" to_port="output 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
</process>
</operator>
<connect from_op="Loop" from_port="output 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
I wait for Your kind and good suggestion.
Have a wonderful day Marius.
Alex
the Get Page operator supports POST requests. Maybe you can play around a bit with that operator, and if you manage to retrieve one page successfully, you can probably use it in a loop to retrieve all pages.
Just as a side note, did you check that the site policy/copyright allows you to machine-crawl the archive of the Corriere della Sera?
Una buona giornata anche a te!
Marius