Due to recent updates, all users are required to create an Altair One account to login to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to login to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you login, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Duplicate attribute name: Content-Type

rapidoxrapidox Member Posts: 3 Contributor I
edited November 2018 in Help
Hi all,
Rapid Miner is a fantastic tool I am using.

I am trying to reproduce the keyword-clustering example (web mining plus text mining) from http://www.simafore.com/blog/bid/116340/ , but I get a "Duplicate attribute name: Content-Type" error.

I have to read a mysql database table and get the LINK information as attribute.

(mysql)
LINK attribute is:

http://www.liberoquotidiano.it/news/cronaca/1261117/Veneto--Zaia--necessario-assicurarsi-contro-eventi-catastrofici.html
http://www.liberoquotidiano.it/news/sostenibilita/1257087/L-Agenzia-europea-per-l-ambiente-lancia-l-allarme-clima--rischio-permanente----.html
http://www.liberoquotidiano.it/news/cronaca/1254046/Maltempo--Grosseto--sopralluogo-di-Marras-con-D-Angelis-in-zone-alluvione.html

I'd like to get keyword clusters that are based on those web pages content.

Do You know a way to get this process working ?

I attach the xml process here.

I thank You for good collaboration in advance !

Have a good day.
Alex

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition (process version 5.3.008).
  Wiring, as given by the <connect> elements of the inner process:
    Read Database, then Get Pages, then Select Attributes, then
    Process Documents from Data, then Multiply.
    Multiply output 1 goes to result 1; output 2 goes through
    Select Attributes (2) into Clustering (results 2 and 3).
    The word list of Process Documents from Data goes to result 4.
-->
<process version="5.3.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
    <process expanded="true">
      <!-- Source data: the table "textmine" of the local MySQL database "libero", addressed by JDBC URL. -->
      <operator activated="true" class="read_database" compatibility="5.3.008" expanded="true" height="60" name="Read Database" width="90" x="45" y="75">
        <parameter key="define_connection" value="url"/>
        <parameter key="connection" value="libero"/>
        <parameter key="database_url" value="jdbc:mysql://localhost:3306/libero"/>
        <parameter key="username" value="root"/>
        <parameter key="password" value="***********************"/>
        <parameter key="define_query" value="table name"/>
        <parameter key="table_name" value="textmine"/>
        <enumeration key="parameters"/>
      </operator>
      <!--
        Web retrieval step: link_attribute names the column holding the URLs, page_attribute is set to PAGE.
        NOTE(review): the accompanying post spells the column LINK while this parameter says Link;
        confirm the spelling matches the column delivered by Read Database.
      -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
        <parameter key="link_attribute" value="Link"/>
        <parameter key="page_attribute" value="PAGE"/>
        <parameter key="random_user_agent" value="true"/>
        <parameter key="delay" value="random"/>
      </operator>
      <!--
        Filter type is no_missing_values.
        NOTE(review): the attribute/attributes values PAGEOUTPUT are presumably not used
        with this filter type; confirm.
      -->
      <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="210">
        <parameter key="attribute_filter_type" value="no_missing_values"/>
        <parameter key="attribute" value="PAGEOUTPUT"/>
        <parameter key="attributes" value="PAGEOUTPUT"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- Text processing: the inner process passes each document through Extract Content (2) only. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="75">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000" expanded="true" height="60" name="Extract Content (2)" width="90" x="447" y="210">
            <parameter key="ignore_non_html_tags" value="false"/>
          </operator>
          <connect from_port="document" to_op="Extract Content (2)" to_port="document"/>
          <connect from_op="Extract Content (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Multiply feeds the same example set to result 1 and to the clustering branch. -->
      <operator activated="true" class="multiply" compatibility="5.3.008" expanded="true" height="94" name="Multiply" width="90" x="380" y="345"/>
      <!-- Subset filter with the attribute list "|text"; the numeric_condition is stored but the filter type is subset. -->
      <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="648" y="390">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|text"/>
        <parameter key="numeric_condition" value="&lt;5"/>
      </operator>
      <!-- k-medoids clustering with no parameters set explicitly in this file. -->
      <operator activated="true" class="k_medoids" compatibility="5.3.008" expanded="true" height="76" name="Clustering" width="90" x="849" y="435"/>
      <connect from_op="Read Database" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 4"/>
      <connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
      <connect from_op="Clustering" from_port="clustered set" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>


Answers

  • MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    Hi,

    for me the process you supplied runs fine if I replace the Read Database operator with a data set that contains a Link attribute with the links you provided.

    Can you send me a link for which the described error occurs?

    Best regards,
    Marius
  • rapidoxrapidox Member Posts: 3 Contributor I
    Hi Marius,
    I am very happy to read your rapid reply.

    I replaced the Read Database operator, following your suggestion.

    We can't get any content from the linked web pages, I don't know why, maybe I have to change the "Get Pages" operator.

    The csv file contains now:

    Link;
    http://corrieredelveneto.corriere.it/notizie/politica/2013/28-maggio-2013/vincitori-vinti-disperati-2221364926711.shtml,
    http://www.corriere.it/sette/13_maggio_22/2013-21-gramigna-aulla_12fb6dea-c2e8-11e2-b767-d844a9f1da92.shtml,
    http://corrieredelveneto.corriere.it/notizie/cronaca/2013/23-maggio-2013/alluvione-stretta-controlli-ma-resta-nodo-bacini-2221283139532.shtml


    Here the whole process:

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      RapidMiner process definition (process version 5.3.008), CSV variant.
      Wiring, as given by the <connect> elements of the inner process:
        Read CSV, then Get Pages, then Select Attributes, then
        Process Documents from Data, then Multiply.
        Multiply output 1 goes to result 1, output 2 goes through
        Select Attributes (2) into Clustering (results 2 and 3),
        output 3 goes to Write CSV.
        The word list of Process Documents from Data goes to result 4.
    -->
    <process version="5.3.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
        <process expanded="true">
          <!--
            Reads the list of URLs. Row 0 is annotated as the name row.
            Link is declared polynominal: the input file lists more than two distinct URLs,
            which a binominal (two-valued) attribute cannot hold.
          -->
          <operator activated="true" class="read_csv" compatibility="5.3.008" expanded="true" height="60" name="Read CSV" width="90" x="45" y="75">
            <parameter key="csv_file" value="/home/alex/Scrivania/url.csv"/>
            <parameter key="column_separators" value=","/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="locale" value="Italian (Italy)"/>
            <parameter key="encoding" value="UTF-8"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="Link.true.polynominal.attribute"/>
            </list>
          </operator>
          <!--
            Web retrieval step with a random delay between 1000 and 2000 (min/max_delay_amount).
            user_agent holds only the agent string, without the header name.
            NOTE(review): random_user_agent is true, so this fixed user_agent is presumably not used; confirm.
          -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
            <parameter key="link_attribute" value="Link"/>
            <parameter key="page_attribute" value="PAGE"/>
            <parameter key="random_user_agent" value="true"/>
            <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; rv:20.0) Gecko/20100101 Firefox/20.0"/>
            <parameter key="accept_cookies" value="all"/>
            <parameter key="delay" value="random"/>
            <parameter key="min_delay_amount" value="1000"/>
            <parameter key="max_delay_amount" value="2000"/>
          </operator>
          <!--
            Filter type is no_missing_values.
            NOTE(review): the attribute/attributes values PAGEOUTPUT are presumably not used
            with this filter type; confirm.
          -->
          <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="210">
            <parameter key="attribute_filter_type" value="no_missing_values"/>
            <parameter key="attribute" value="PAGEOUTPUT"/>
            <parameter key="attributes" value="PAGEOUTPUT"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Text processing with term occurrences as vector values; keep_text is enabled. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="75">
            <parameter key="vector_creation" value="Term Occurrences"/>
            <parameter key="keep_text" value="true"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <!-- The key "neglegt_span_tags" is reproduced exactly as exported; do not correct its spelling here. -->
              <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000" expanded="true" height="60" name="Extract Content" width="90" x="246" y="75">
                <parameter key="neglegt_span_tags" value="false"/>
                <parameter key="neglect_p_tags" value="false"/>
                <parameter key="neglect_b_tags" value="false"/>
                <parameter key="neglect_i_tags" value="false"/>
                <parameter key="neglect_br_tags" value="false"/>
                <parameter key="ignore_non_html_tags" value="false"/>
              </operator>
              <connect from_port="document" to_op="Extract Content" to_port="document"/>
              <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="multiply" compatibility="5.3.008" expanded="true" height="112" name="Multiply" width="90" x="380" y="345"/>
          <!-- Dumps the processed example set for inspection. -->
          <operator activated="true" class="write_csv" compatibility="5.3.008" expanded="true" height="76" name="Write CSV" width="90" x="581" y="570">
            <parameter key="csv_file" value="/home/alex/Scrivania/out.csv"/>
          </operator>
          <!--
            Subset filter with the attribute list "|Link".
            NOTE(review): this hands only the Link attribute to Clustering; confirm that clustering
            on the URL column rather than on the text-derived attributes is intended.
          -->
          <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="648" y="390">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="|Link"/>
            <parameter key="numeric_condition" value="&lt;5"/>
          </operator>
          <operator activated="true" class="k_medoids" compatibility="5.3.008" expanded="true" height="76" name="Clustering" width="90" x="849" y="435">
            <parameter key="add_as_label" value="true"/>
          </operator>
          <connect from_op="Read CSV" from_port="output" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_port="result 4"/>
          <connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Multiply" from_port="output 3" to_op="Write CSV" to_port="input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Clustering" to_port="example set"/>
          <connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
          <connect from_op="Clustering" from_port="clustered set" to_port="result 3"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
          <portSpacing port="sink_result 5" spacing="0"/>
        </process>
      </operator>
    </process>

    Can You suggest a solution and attach a full working process for text Keyword clustering ?

    I thank You for Your good support Marius!

    Have a good evening.
    Alex
  • MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    It's probably rather a problem with the import. Here I create the data manually with Generate Data by User Specification and Append, and the process works like a charm.

    Best regards,
    Marius
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      RapidMiner process definition (process version 5.3.008), variant without external data.
      Wiring, as given by the <connect> elements of the inner process:
        three Generate Data by User Specification operators feed Append (example set 1 to 3),
        then Get Pages, then Select Attributes, then Process Documents from Data, then Multiply.
        Multiply output 1 goes to result 1; output 2 goes through
        Select Attributes (2) into Clustering (results 2 and 3).
        The word list of Process Documents from Data goes to result 4.
    -->
    <process version="5.3.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
        <process expanded="true">
          <!--
            Deactivated (activated="false") and not connected, so it takes no part in the run.
            The password value is masked, as a stored credential must not be published.
          -->
          <operator activated="false" class="read_database" compatibility="5.3.008" expanded="true" height="60" name="Read Database" width="90" x="179" y="345">
            <parameter key="define_connection" value="url"/>
            <parameter key="connection" value="libero"/>
            <parameter key="database_url" value="jdbc:mysql://localhost:3306/libero"/>
            <parameter key="username" value="root"/>
            <parameter key="password" value="***********************"/>
            <parameter key="define_query" value="table name"/>
            <parameter key="table_name" value="textmine"/>
            <enumeration key="parameters"/>
          </operator>
          <!-- Each of the next three operators defines one attribute, Link, whose value expression is a quoted URL. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.008" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="45" y="30">
            <list key="attribute_values">
              <parameter key="Link" value="&quot;http://www.liberoquotidiano.it/news/cronaca/1261117/Veneto--Zaia--necessario-assicurarsi-contro-eventi-catastrofici.html&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.008" expanded="true" height="60" name="Generate Data by User Specification (2)" width="90" x="45" y="120">
            <list key="attribute_values">
              <parameter key="Link" value="&quot;http://www.liberoquotidiano.it/news/sostenibilita/1257087/L-Agenzia-europea-per-l-ambiente-lancia-l-allarme-clima--rischio-permanente----.html&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.008" expanded="true" height="60" name="Generate Data by User Specification (3)" width="90" x="45" y="210">
            <list key="attribute_values">
              <parameter key="Link" value="&quot;http://www.liberoquotidiano.it/news/cronaca/1254046/Maltempo--Grosseto--sopralluogo-di-Marras-con-D-Angelis-in-zone-alluvione.html&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Combines the three generated sets; its "merged set" output feeds Get Pages. -->
          <operator activated="true" class="append" compatibility="5.3.008" expanded="true" height="112" name="Append" width="90" x="179" y="30"/>
          <!-- Web retrieval step: link_attribute names the column holding the URLs, page_attribute is set to PAGE. -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="313" y="30">
            <parameter key="link_attribute" value="Link"/>
            <parameter key="page_attribute" value="PAGE"/>
            <parameter key="random_user_agent" value="true"/>
            <parameter key="delay" value="random"/>
          </operator>
          <!--
            Filter type is no_missing_values.
            NOTE(review): the attribute/attributes values PAGEOUTPUT are presumably not used
            with this filter type; confirm.
          -->
          <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="210">
            <parameter key="attribute_filter_type" value="no_missing_values"/>
            <parameter key="attribute" value="PAGEOUTPUT"/>
            <parameter key="attributes" value="PAGEOUTPUT"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Text processing: the inner process passes each document through Extract Content (2) only. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="447" y="75">
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000" expanded="true" height="60" name="Extract Content (2)" width="90" x="447" y="210">
                <parameter key="ignore_non_html_tags" value="false"/>
              </operator>
              <connect from_port="document" to_op="Extract Content (2)" to_port="document"/>
              <connect from_op="Extract Content (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="multiply" compatibility="5.3.008" expanded="true" height="94" name="Multiply" width="90" x="380" y="345"/>
          <!-- Subset filter with the attribute list "|text". -->
          <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="648" y="390">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="|text"/>
            <parameter key="numeric_condition" value="&lt;5"/>
          </operator>
          <!-- k-medoids clustering with no parameters set explicitly in this file. -->
          <operator activated="true" class="k_medoids" compatibility="5.3.008" expanded="true" height="76" name="Clustering" width="90" x="782" y="390"/>
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
          <connect from_op="Append" from_port="merged set" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_port="result 4"/>
          <connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Clustering" to_port="example set"/>
          <connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
          <connect from_op="Clustering" from_port="clustered set" to_port="result 3"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
          <portSpacing port="sink_result 5" spacing="0"/>
        </process>
      </operator>
    </process>
  • rapidoxrapidox Member Posts: 3 Contributor I
    Marius You are Great !!!

    I succeeded using the Read CSV operator!

    Now, for a scientific research project, I need to get earthquake (= terremoto) related Italian article data from a freely available newspaper article archive search engine:

    http://sitesearch.corriere.it/archivioStoricoEngine?q=terremoto

    Searching for " terremoto " You will find 11210 articles.

    The pagination system uses a javascript script to assign value to the pageNumber input variable.

    // Submits the form with id "pagerForm" after setting its action URL and
    // the requested page number.
    // page: value written into the form field with id "pageNumber".
    function submitform(page) {
    var query = document.getElementById("queryString").value;
    var action = "archivioStoricoEngine";
    // The query text is appended as the q parameter exactly as typed; an empty
    // or missing query still produces "?q=" with no value.
    if (query != null && query != "") {
    action += "?q=" + query;
    } else {
    action += "?q=";
    }
      // The page number is not part of the action URL; it is stored in the
      // pageNumber field and travels with the form submission.
      document.getElementById("pageNumber").value = page;
      document.getElementById("pagerForm").action=action;
      document.getElementById("pagerForm").submit();
    }

    The form uses the POST method with hidden input fields instead of the GET method.

    Maybe this is a simple question for you, but I am a newbie in the data mining field, so please explain to me how I can proceed.

    What Rapid Miner operators have I to use?

    How can I set the javascript pageNumber variable to loop the article extraction?

    Is it possible to add a Referer ?

    Here is my process; it works for ordinary search engine result pages, but I don't know how to extract data from search engine results that are returned by a form POST.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      RapidMiner process definition (process version 5.3.008): paged crawl of a search result list.
      A Loop runs 317 iterations. Inside the loop, as given by the <connect> elements:
        loop input 1 passes through Generate Macro and Log to loop output 1;
        Process Documents from Web (no input connection) sends its example set
        to Write Database, whose through port goes to loop output 2.
      Only loop output 1 is connected to a process result.
    -->
    <process version="5.3.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="loop" compatibility="5.3.008" expanded="true" height="94" name="Loop" width="90" x="179" y="210">
            <parameter key="set_iteration_macro" value="true"/>
            <parameter key="iterations" value="317"/>
            <parameter key="timeout" value="120"/>
            <parameter key="parallelize_iteration" value="true"/>
            <process expanded="true">
              <!-- Pagepos is the loop's iteration macro plus one; it is inserted as pageNumber in the crawl URL. -->
              <operator activated="true" class="generate_macro" compatibility="5.3.008" expanded="true" height="76" name="Generate Macro" width="90" x="45" y="30">
                <list key="function_descriptions">
                  <parameter key="Pagepos" value="(%{iteration})+1"/>
                </list>
              </operator>
              <!--
                NOTE(review): the log entries name the operators "Crawl Web" and "Tokenize",
                which do not appear in this process; confirm whether they should name
                "Process Documents from Web" or be removed.
              -->
              <operator activated="true" class="log" compatibility="5.3.008" expanded="true" height="76" name="Log" width="90" x="512" y="30">
                <parameter key="filename" value="/home/alex/Documents/Logs/log-perfetto.txt"/>
                <list key="log">
                  <parameter key="time" value="operator.Crawl Web.value.time"/>
                  <parameter key="execution time" value="operator.Crawl Web.value.execution-time"/>
                  <parameter key="looptime" value="operator.Crawl Web.value.looptime"/>
                  <parameter key="cpu execution time" value="operator.Crawl Web.value.cpu-execution-time"/>
                  <parameter key="Max Token Length" value="operator.Tokenize.parameter.max_token_length"/>
                </list>
              </operator>
              <!--
                Crawl of one result page. In the url value each query separator is written as a single
                escaped ampersand, so the parsed URL contains plain ampersands between parameters.
                The last parameter, pageNumber, takes the Pagepos macro.
              -->
              <operator activated="true" class="web:process_web" compatibility="5.3.000" expanded="true" height="60" name="Process Documents from Web" width="90" x="112" y="300">
                <parameter key="url" value="http://sitesearch.corriere.it/archivioStoricoEngine?q=terremoto&amp;queryMode=simpleany&amp;autore=&amp;fromDay=01&amp;fromMonth=01&amp;fromYear=1992&amp;toDay=31&amp;toMonth=12&amp;toYear=2013&amp;orderBy=data&amp;sectionCorriere=true&amp;__checkbox_sectionCorriere=true&amp;__checkbox_sectionLavoro=true&amp;__checkbox_sectionEconomia=true&amp;__checkbox_sectionSalute=true&amp;__checkbox_sectionSoldi=true&amp;__checkbox_sectionViviMilano=true&amp;Ricerca=Cerca&amp;pageNumber=%{Pagepos}"/>
                <list key="crawling_rules">
                  <parameter key="follow_link_with_matching_text" value="terremoto"/>
                </list>
                <parameter key="add_pages_as_attribute" value="true"/>
                <parameter key="max_page_size" value="10000"/>
                <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1"/>
                <parameter key="really_ignore_exclusion" value="true"/>
                <parameter key="parallelize_process_webpage" value="true"/>
                <process expanded="true">
                  <!--
                    query_type is XPath, so the xpath_queries list (Article = //h:div) applies.
                    The key "string_machting_queries" is reproduced exactly as exported;
                    NOTE(review): its entry is presumably unused with this query type; confirm.
                  -->
                  <operator activated="true" class="text:cut_document" compatibility="5.3.000" expanded="true" height="60" name="Cut Document" width="90" x="205" y="30">
                    <parameter key="query_type" value="XPath"/>
                    <list key="string_machting_queries">
                      <parameter key="Article" value="&lt;div&gt; &lt;h1&gt;.&lt;/p&gt; &lt;/div&gt;"/>
                    </list>
                    <list key="regular_expression_queries"/>
                    <list key="regular_region_queries"/>
                    <list key="xpath_queries">
                      <parameter key="Article" value="//h:div"/>
                    </list>
                    <list key="namespaces"/>
                    <parameter key="ignore_CDATA" value="false"/>
                    <list key="index_queries"/>
                    <parameter key="parallelize_segment_processing" value="true"/>
                    <process expanded="true">
                      <!-- Per segment, three XPath queries are defined: Date, Article and article-link. -->
                      <operator activated="true" class="text:extract_information" compatibility="5.3.000" expanded="true" height="60" name="Extract Information" width="90" x="214" y="30">
                        <parameter key="query_type" value="XPath"/>
                        <list key="string_machting_queries"/>
                        <list key="regular_expression_queries"/>
                        <list key="regular_region_queries"/>
                        <list key="xpath_queries">
                          <parameter key="Date" value="//h:div/h:p/h:span[1]"/>
                          <parameter key="Article" value="//h:div"/>
                          <parameter key="article-link" value="//h:div/h:h1/h:a"/>
                        </list>
                        <list key="namespaces"/>
                        <parameter key="ignore_CDATA" value="false"/>
                        <list key="index_queries"/>
                      </operator>
                      <connect from_port="segment" to_op="Extract Information" to_port="document"/>
                      <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
                      <portSpacing port="source_segment" spacing="0"/>
                      <portSpacing port="sink_document 1" spacing="0"/>
                      <portSpacing port="sink_document 2" spacing="0"/>
                    </process>
                  </operator>
                  <connect from_port="document" to_op="Cut Document" to_port="document"/>
                  <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
                  <portSpacing port="source_document" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <!-- Appends the crawl result to the table "textmine" of the local MySQL database "corriere". -->
              <operator activated="true" class="write_database" compatibility="5.3.008" expanded="true" height="60" name="Write Database" width="90" x="514" y="300">
                <parameter key="define_connection" value="url"/>
                <parameter key="connection" value="italiaoggi"/>
                <parameter key="database_url" value="jdbc:mysql://localhost:3306/corriere"/>
                <parameter key="username" value="root"/>
                <parameter key="password" value="*****************"/>
                <parameter key="table_name" value="textmine"/>
                <parameter key="overwrite_mode" value="append"/>
                <parameter key="default_varchar_length" value="10000"/>
                <parameter key="db_key_attribute_name" value="Link"/>
              </operator>
              <connect from_port="input 1" to_op="Generate Macro" to_port="through 1"/>
              <connect from_op="Generate Macro" from_port="through 1" to_op="Log" to_port="through 1"/>
              <connect from_op="Log" from_port="through 1" to_port="output 1"/>
              <connect from_op="Process Documents from Web" from_port="example set" to_op="Write Database" to_port="input"/>
              <connect from_op="Write Database" from_port="through" to_port="output 2"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
              <portSpacing port="sink_output 3" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Loop" from_port="output 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

    I wait for Your kind and good suggestion.

    Have a wonderful day Marius.

    Alex
  • MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    Ciao Alex,

    the Get Page operator supports POST requests. Maybe you can play around a bit with that operator, and if you manage to retrieve one page successfully, you can probably use it in a loop to retrieve all pages.

    Just as a side note, did you check that the site policy/copyright allows you to machine-crawl the archive of the Corriere della Sera?

    Una buona giornata anche a te!
    Marius
Sign In or Register to comment.