The Altair Community is migrating to a new platform to provide a better experience for you. The RapidMiner Community will merge with the Altair Community at the same time. In preparation for the migration, both communities are on read-only mode from July 15th - July 24th, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here.
Options

Duplicate attribute name: Content-Type

rapidoxrapidox Member Posts: 3 Contributor I
edited November 2018 in Help
Hi all,
Rapid Miner is a fantastic tool I am using.

I am trying to do keyword clustering by following the web mining and text mining example at http://www.simafore.com/blog/bid/116340/ , but I get a "Duplicate attribute name: Content-Type" error.

I have to read a mysql database table and get the LINK information as attribute.

(mysql)
LINK attribute is:

http://www.liberoquotidiano.it/news/cronaca/1261117/Veneto--Zaia--necessario-assicurarsi-contro-eventi-catastrofici.html
http://www.liberoquotidiano.it/news/sostenibilita/1257087/L-Agenzia-europea-per-l-ambiente-lancia-l-allarme-clima--rischio-permanente----.html
http://www.liberoquotidiano.it/news/cronaca/1254046/Maltempo--Grosseto--sopralluogo-di-Marras-con-D-Angelis-in-zone-alluvione.html

I'd like to get keyword clusters that are based on those web pages content.

Do You know a way to get this process working ?

I attach the xml process here.

I thank You for good collaboration in advance !

Have a good day.
Alex

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner 5.3 process: read links from a database table, fetch the pages,
     process the page text and cluster the result with k-medoids. -->
<process version="5.3.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.008"
            expanded="true" name="Process">
    <process expanded="true">

      <!-- Input: table "textmine" read through a JDBC URL connection. -->
      <operator activated="true" class="read_database" compatibility="5.3.008"
                expanded="true" height="60" name="Read Database"
                width="90" x="45" y="75">
        <parameter key="define_connection" value="url"/>
        <parameter key="connection" value="libero"/>
        <parameter key="database_url" value="jdbc:mysql://localhost:3306/libero"/>
        <parameter key="username" value="root"/>
        <parameter key="password" value="***********************"/>
        <parameter key="define_query" value="table name"/>
        <parameter key="table_name" value="textmine"/>
        <enumeration key="parameters"/>
      </operator>

      <!-- Uses the "Link" attribute as the URL source and "PAGE" as the page attribute. -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000"
                expanded="true" height="60" name="Get Pages"
                width="90" x="179" y="30">
        <parameter key="link_attribute" value="Link"/>
        <parameter key="page_attribute" value="PAGE"/>
        <parameter key="random_user_agent" value="true"/>
        <parameter key="delay" value="random"/>
      </operator>

      <!-- Filter type is "no_missing_values"; special attributes are included. -->
      <operator activated="true" class="select_attributes" compatibility="5.3.008"
                expanded="true" height="76" name="Select Attributes"
                width="90" x="45" y="210">
        <parameter key="attribute_filter_type" value="no_missing_values"/>
        <parameter key="attribute" value="PAGEOUTPUT"/>
        <parameter key="attributes" value="PAGEOUTPUT"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>

      <!-- Each document passes through the nested "Extract Content (2)" operator. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.3.000"
                expanded="true" height="76" name="Process Documents from Data"
                width="90" x="380" y="75">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000"
                    expanded="true" height="60" name="Extract Content (2)"
                    width="90" x="447" y="210">
            <parameter key="ignore_non_html_tags" value="false"/>
          </operator>
          <connect from_port="document"
                   to_op="Extract Content (2)" to_port="document"/>
          <connect from_op="Extract Content (2)" from_port="document"
                   to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Output 1 goes straight to result 1, output 2 feeds the clustering branch. -->
      <operator activated="true" class="multiply" compatibility="5.3.008"
                expanded="true" height="94" name="Multiply"
                width="90" x="380" y="345"/>

      <operator activated="true" class="select_attributes" compatibility="5.3.008"
                expanded="true" height="76" name="Select Attributes (2)"
                width="90" x="648" y="390">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|text"/>
        <parameter key="numeric_condition" value="&lt;5"/>
      </operator>

      <operator activated="true" class="k_medoids" compatibility="5.3.008"
                expanded="true" height="76" name="Clustering"
                width="90" x="849" y="435"/>

      <!-- Wiring of the main flow. -->
      <connect from_op="Read Database" from_port="output"
               to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set"
               to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output"
               to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set"
               to_op="Multiply" to_port="input"/>
      <connect from_op="Process Documents from Data" from_port="word list"
               to_port="result 4"/>
      <connect from_op="Multiply" from_port="output 1"
               to_port="result 1"/>
      <connect from_op="Multiply" from_port="output 2"
               to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output"
               to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model"
               to_port="result 2"/>
      <connect from_op="Clustering" from_port="clustered set"
               to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>


Answers

  • Options
    MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    Hi,

    for me the process you supplied runs fine if I replace the Read Database operator with a data set that contains a Link attribute with the links you provided.

    Can you send me a link for which the described error occurs?

    Best regards,
    Marius
  • Options
    rapidoxrapidox Member Posts: 3 Contributor I
    Hi Marius,
    I am very happy to read your rapid reply.

    I replaced the Read Database operator, following your suggestion.

    We can't get any content from the linked web pages, I don't know why, maybe I have to change the "Get Pages" operator.

    The csv file contains now:

    Link;
    http://corrieredelveneto.corriere.it/notizie/politica/2013/28-maggio-2013/vincitori-vinti-disperati-2221364926711.shtml,
    http://www.corriere.it/sette/13_maggio_22/2013-21-gramigna-aulla_12fb6dea-c2e8-11e2-b767-d844a9f1da92.shtml,
    http://corrieredelveneto.corriere.it/notizie/cronaca/2013/23-maggio-2013/alluvione-stretta-controlli-ma-resta-nodo-bacini-2221283139532.shtml


    Here the whole process:

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- RapidMiner 5.3 process: read links from a CSV file, fetch the pages,
         process the page text, write the result to CSV and cluster with k-medoids. -->
    <process version="5.3.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
        <process expanded="true">
          <!-- The Link column is declared polynominal: the input lists more than two
               distinct URLs, which a binominal (two-valued) attribute cannot hold. -->
          <operator activated="true" class="read_csv" compatibility="5.3.008" expanded="true" height="60" name="Read CSV" width="90" x="45" y="75">
            <parameter key="csv_file" value="/home/alex/Scrivania/url.csv"/>
            <parameter key="column_separators" value=","/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="locale" value="Italian (Italy)"/>
            <parameter key="encoding" value="UTF-8"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="Link.true.polynominal.attribute"/>
            </list>
          </operator>
          <!-- user_agent holds only the agent string; the "User-Agent:" header name
               is not part of the value. -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
            <parameter key="link_attribute" value="Link"/>
            <parameter key="page_attribute" value="PAGE"/>
            <parameter key="random_user_agent" value="true"/>
            <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; rv:20.0) Gecko/20100101 Firefox/20.0"/>
            <parameter key="accept_cookies" value="all"/>
            <parameter key="delay" value="random"/>
            <parameter key="min_delay_amount" value="1000"/>
            <parameter key="max_delay_amount" value="2000"/>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="210">
            <parameter key="attribute_filter_type" value="no_missing_values"/>
            <parameter key="attribute" value="PAGEOUTPUT"/>
            <parameter key="attributes" value="PAGEOUTPUT"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Each document passes through the nested "Extract Content" operator. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="75">
            <parameter key="vector_creation" value="Term Occurrences"/>
            <parameter key="keep_text" value="true"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000" expanded="true" height="60" name="Extract Content" width="90" x="246" y="75">
                <parameter key="neglegt_span_tags" value="false"/>
                <parameter key="neglect_p_tags" value="false"/>
                <parameter key="neglect_b_tags" value="false"/>
                <parameter key="neglect_i_tags" value="false"/>
                <parameter key="neglect_br_tags" value="false"/>
                <parameter key="ignore_non_html_tags" value="false"/>
              </operator>
              <connect from_port="document" to_op="Extract Content" to_port="document"/>
              <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Three copies: result 1, the clustering branch, and the CSV export. -->
          <operator activated="true" class="multiply" compatibility="5.3.008" expanded="true" height="112" name="Multiply" width="90" x="380" y="345"/>
          <operator activated="true" class="write_csv" compatibility="5.3.008" expanded="true" height="76" name="Write CSV" width="90" x="581" y="570">
            <parameter key="csv_file" value="/home/alex/Scrivania/out.csv"/>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="648" y="390">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="|Link"/>
            <parameter key="numeric_condition" value="&lt;5"/>
          </operator>
          <operator activated="true" class="k_medoids" compatibility="5.3.008" expanded="true" height="76" name="Clustering" width="90" x="849" y="435">
            <parameter key="add_as_label" value="true"/>
          </operator>
          <connect from_op="Read CSV" from_port="output" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_port="result 4"/>
          <connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Multiply" from_port="output 3" to_op="Write CSV" to_port="input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Clustering" to_port="example set"/>
          <connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
          <connect from_op="Clustering" from_port="clustered set" to_port="result 3"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
          <portSpacing port="sink_result 5" spacing="0"/>
        </process>
      </operator>
    </process>

    Can You suggest a solution and attach a full working process for text Keyword clustering ?

    I thank You for Your good support Marius!

    Have a good evening.
    Alex
  • Options
    MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    It's probably rather a problem with the import. Here I create the data manually with Generate Data by User Specification and Append, and the process works like a charm.

    Best regards,
    Marius
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- RapidMiner 5.3 process: three hand-built one-link example sets are appended,
         the pages are fetched, processed and clustered with k-medoids. -->
    <process version="5.3.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
        <process expanded="true">
          <!-- Deactivated and not connected; kept for reference only. The stored
               password string is masked so no credential material is published. -->
          <operator activated="false" class="read_database" compatibility="5.3.008" expanded="true" height="60" name="Read Database" width="90" x="179" y="345">
            <parameter key="define_connection" value="url"/>
            <parameter key="connection" value="libero"/>
            <parameter key="database_url" value="jdbc:mysql://localhost:3306/libero"/>
            <parameter key="username" value="root"/>
            <parameter key="password" value="***********************"/>
            <parameter key="define_query" value="table name"/>
            <parameter key="table_name" value="textmine"/>
            <enumeration key="parameters"/>
          </operator>
          <!-- Each of the three generators defines a single "Link" attribute value. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.008" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="45" y="30">
            <list key="attribute_values">
              <parameter key="Link" value="&quot;http://www.liberoquotidiano.it/news/cronaca/1261117/Veneto--Zaia--necessario-assicurarsi-contro-eventi-catastrofici.html&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.008" expanded="true" height="60" name="Generate Data by User Specification (2)" width="90" x="45" y="120">
            <list key="attribute_values">
              <parameter key="Link" value="&quot;http://www.liberoquotidiano.it/news/sostenibilita/1257087/L-Agenzia-europea-per-l-ambiente-lancia-l-allarme-clima--rischio-permanente----.html&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.008" expanded="true" height="60" name="Generate Data by User Specification (3)" width="90" x="45" y="210">
            <list key="attribute_values">
              <parameter key="Link" value="&quot;http://www.liberoquotidiano.it/news/cronaca/1254046/Maltempo--Grosseto--sopralluogo-di-Marras-con-D-Angelis-in-zone-alluvione.html&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Merges the three one-link example sets into the set passed to Get Pages. -->
          <operator activated="true" class="append" compatibility="5.3.008" expanded="true" height="112" name="Append" width="90" x="179" y="30"/>
          <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="313" y="30">
            <parameter key="link_attribute" value="Link"/>
            <parameter key="page_attribute" value="PAGE"/>
            <parameter key="random_user_agent" value="true"/>
            <parameter key="delay" value="random"/>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="210">
            <parameter key="attribute_filter_type" value="no_missing_values"/>
            <parameter key="attribute" value="PAGEOUTPUT"/>
            <parameter key="attributes" value="PAGEOUTPUT"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="447" y="75">
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000" expanded="true" height="60" name="Extract Content (2)" width="90" x="447" y="210">
                <parameter key="ignore_non_html_tags" value="false"/>
              </operator>
              <connect from_port="document" to_op="Extract Content (2)" to_port="document"/>
              <connect from_op="Extract Content (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="multiply" compatibility="5.3.008" expanded="true" height="94" name="Multiply" width="90" x="380" y="345"/>
          <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="648" y="390">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="|text"/>
            <parameter key="numeric_condition" value="&lt;5"/>
          </operator>
          <operator activated="true" class="k_medoids" compatibility="5.3.008" expanded="true" height="76" name="Clustering" width="90" x="782" y="390"/>
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
          <connect from_op="Append" from_port="merged set" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_port="result 4"/>
          <connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Clustering" to_port="example set"/>
          <connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
          <connect from_op="Clustering" from_port="clustered set" to_port="result 3"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
          <portSpacing port="sink_result 5" spacing="0"/>
        </process>
      </operator>
    </process>
  • Options
    rapidoxrapidox Member Posts: 3 Contributor I
    Marius You are Great !!!

    I succeeded using the Read CSV operator!

    now for a scientific research I need to get earthquake (=terremoto) related italian article data from a freely available newspaper article archive search engine

    http://sitesearch.corriere.it/archivioStoricoEngine?q=terremoto

    Searching for " terremoto " You will find 11210 articles.

    The pagination system uses a javascript script to assign value to the pageNumber input variable.

    /**
     * Submits the pager form for the requested results page.
     *
     * Reads the search text from the "queryString" element, stores `page` in
     * the "pageNumber" field, sets the "pagerForm" action to
     * "archivioStoricoEngine?q=<query>" and submits the form.
     *
     * @param {number|string} page - value written to the "pageNumber" field.
     */
    function submitform(page) {
      var query = document.getElementById("queryString").value;
      // Percent-encode the search text so characters such as "&", "#" or "+"
      // stay inside the q parameter instead of changing the URL structure.
      var encodedQuery = (query != null && query != "") ? encodeURIComponent(query) : "";
      var pagerForm = document.getElementById("pagerForm");

      document.getElementById("pageNumber").value = page;
      // An empty search still produces "?q=", as before.
      pagerForm.action = "archivioStoricoEngine?q=" + encodedQuery;
      pagerForm.submit();
    }

    The form uses the POST method and hidden input variables instead of the GET method.

    Maybe for you it is a simple question, but I am a newbie in the data mining field, so please explain to me how I can proceed.

    What Rapid Miner operators have I to use?

    How can I set the javascript pageNumber variable to loop the article extraction?

    Is it possible to add a Referer ?

    Here my process, it works for ordinary search engine web page, but I don't know how to extract data from Form POSTing search engine results. 

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- RapidMiner 5.3 process: loops over result pages of the archive search,
         cuts each page into segments, extracts fields by XPath and appends the
         rows to a database table. -->
    <process version="5.3.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="loop" compatibility="5.3.008" expanded="true" height="94" name="Loop" width="90" x="179" y="210">
            <parameter key="set_iteration_macro" value="true"/>
            <parameter key="iterations" value="317"/>
            <parameter key="timeout" value="120"/>
            <parameter key="parallelize_iteration" value="true"/>
            <process expanded="true">
              <!-- Pagepos is derived from the loop's iteration macro and used as the
                   pageNumber value in the request URL below. -->
              <operator activated="true" class="generate_macro" compatibility="5.3.008" expanded="true" height="76" name="Generate Macro" width="90" x="45" y="30">
                <list key="function_descriptions">
                  <parameter key="Pagepos" value="(%{iteration})+1"/>
                </list>
              </operator>
              <!-- Timing values now reference "Process Documents from Web"; this
                   process has no operator named "Crawl Web".
                   NOTE(review): the process also contains no "Tokenize" operator, so
                   the "Max Token Length" entry has nothing to read from. Confirm
                   whether it should be dropped. -->
              <operator activated="true" class="log" compatibility="5.3.008" expanded="true" height="76" name="Log" width="90" x="512" y="30">
                <parameter key="filename" value="/home/alex/Documents/Logs/log-perfetto.txt"/>
                <list key="log">
                  <parameter key="time" value="operator.Process Documents from Web.value.time"/>
                  <parameter key="execution time" value="operator.Process Documents from Web.value.execution-time"/>
                  <parameter key="looptime" value="operator.Process Documents from Web.value.looptime"/>
                  <parameter key="cpu execution time" value="operator.Process Documents from Web.value.cpu-execution-time"/>
                  <parameter key="Max Token Length" value="operator.Tokenize.parameter.max_token_length"/>
                </list>
              </operator>
              <!-- Query parameters are separated by a single escaped ampersand. The
                   previous double-escaped form left a literal entity in the parsed URL. -->
              <operator activated="true" class="web:process_web" compatibility="5.3.000" expanded="true" height="60" name="Process Documents from Web" width="90" x="112" y="300">
                <parameter key="url" value="http://sitesearch.corriere.it/archivioStoricoEngine?q=terremoto&amp;queryMode=simpleany&amp;autore=&amp;fromDay=01&amp;fromMonth=01&amp;fromYear=1992&amp;toDay=31&amp;toMonth=12&amp;toYear=2013&amp;orderBy=data&amp;sectionCorriere=true&amp;__checkbox_sectionCorriere=true&amp;__checkbox_sectionLavoro=true&amp;__checkbox_sectionEconomia=true&amp;__checkbox_sectionSalute=true&amp;__checkbox_sectionSoldi=true&amp;__checkbox_sectionViviMilano=true&amp;Ricerca=Cerca&amp;pageNumber=%{Pagepos}"/>
                <list key="crawling_rules">
                  <parameter key="follow_link_with_matching_text" value="terremoto"/>
                </list>
                <parameter key="add_pages_as_attribute" value="true"/>
                <parameter key="max_page_size" value="10000"/>
                <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1"/>
                <parameter key="really_ignore_exclusion" value="true"/>
                <parameter key="parallelize_process_webpage" value="true"/>
                <process expanded="true">
                  <!-- query_type is XPath, so the xpath_queries list is the active one. -->
                  <operator activated="true" class="text:cut_document" compatibility="5.3.000" expanded="true" height="60" name="Cut Document" width="90" x="205" y="30">
                    <parameter key="query_type" value="XPath"/>
                    <list key="string_machting_queries">
                      <parameter key="Article" value="&lt;div&gt; &lt;h1&gt;.&lt;/p&gt; &lt;/div&gt;"/>
                    </list>
                    <list key="regular_expression_queries"/>
                    <list key="regular_region_queries"/>
                    <list key="xpath_queries">
                      <parameter key="Article" value="//h:div"/>
                    </list>
                    <list key="namespaces"/>
                    <parameter key="ignore_CDATA" value="false"/>
                    <list key="index_queries"/>
                    <parameter key="parallelize_segment_processing" value="true"/>
                    <process expanded="true">
                      <operator activated="true" class="text:extract_information" compatibility="5.3.000" expanded="true" height="60" name="Extract Information" width="90" x="214" y="30">
                        <parameter key="query_type" value="XPath"/>
                        <list key="string_machting_queries"/>
                        <list key="regular_expression_queries"/>
                        <list key="regular_region_queries"/>
                        <list key="xpath_queries">
                          <parameter key="Date" value="//h:div/h:p/h:span[1]"/>
                          <parameter key="Article" value="//h:div"/>
                          <parameter key="article-link" value="//h:div/h:h1/h:a"/>
                        </list>
                        <list key="namespaces"/>
                        <parameter key="ignore_CDATA" value="false"/>
                        <list key="index_queries"/>
                      </operator>
                      <connect from_port="segment" to_op="Extract Information" to_port="document"/>
                      <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
                      <portSpacing port="source_segment" spacing="0"/>
                      <portSpacing port="sink_document 1" spacing="0"/>
                      <portSpacing port="sink_document 2" spacing="0"/>
                    </process>
                  </operator>
                  <connect from_port="document" to_op="Cut Document" to_port="document"/>
                  <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
                  <portSpacing port="source_document" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <!-- Rows are appended to table "textmine" in the "corriere" database. -->
              <operator activated="true" class="write_database" compatibility="5.3.008" expanded="true" height="60" name="Write Database" width="90" x="514" y="300">
                <parameter key="define_connection" value="url"/>
                <parameter key="connection" value="italiaoggi"/>
                <parameter key="database_url" value="jdbc:mysql://localhost:3306/corriere"/>
                <parameter key="username" value="root"/>
                <parameter key="password" value="*****************"/>
                <parameter key="table_name" value="textmine"/>
                <parameter key="overwrite_mode" value="append"/>
                <parameter key="default_varchar_length" value="10000"/>
                <parameter key="db_key_attribute_name" value="Link"/>
              </operator>
              <connect from_port="input 1" to_op="Generate Macro" to_port="through 1"/>
              <connect from_op="Generate Macro" from_port="through 1" to_op="Log" to_port="through 1"/>
              <connect from_op="Log" from_port="through 1" to_port="output 1"/>
              <connect from_op="Process Documents from Web" from_port="example set" to_op="Write Database" to_port="input"/>
              <connect from_op="Write Database" from_port="through" to_port="output 2"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
              <portSpacing port="sink_output 3" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Loop" from_port="output 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

    I wait for Your kind and good suggestion.

    Have a wonderful day Marius.

    Alex
  • Options
    MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    Ciao Alex,

    the Get Page operator supports POST requests. Maybe you can play around a bit with that operator, and if you manage to retrieve one page successfully, you can probably use it in a loop to retrieve all pages.

    Just as a side note, did you check that the site policy/copyright allows you to machine-crawl the archive of the Corriere della Sera?

    Una buona giornata anche a te!
    Marius
Sign In or Register to comment.