Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Web Crawling

gunjanamit Member Posts: 28 Contributor II
edited November 2018 in Help
I am trying to read two websites, tokenize the web page data, and then find the similarity between them.

But I am not getting any results at the very first step: reading the web pages.

My process XML is:


<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition. The connections near the end chain the operators as:
  Read Excel -> Get Pages -> Data to Documents -> Process Documents -> Data to Similarity.
  The similarity output is delivered on result 1 and the word list on result 2.
-->
<process version="5.2.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.006" expanded="true" name="Process">
    <process expanded="true" height="359" width="547">
      <!-- Read Excel: imports cell range A1:A3 of read.xls with first_row_as_names disabled.
           Row 0 is annotated as "Comment", and the column meta data entry for column 0 is
           "Comments.true.file_path.attribute" (presumably name.selected.type.role; verify
           against the Read Excel documentation).
           NOTE(review): confirm that the row annotated as "Comment" does not hold one of the URLs. -->
      <operator activated="true" class="read_excel" compatibility="5.2.006" expanded="true" height="60" name="Read Excel" width="90" x="57" y="26">
        <parameter key="excel_file" value="C:\Users\guagg\Desktop\All\RapidMiner\read.xls"/>
        <parameter key="imported_cell_range" value="A1:A3"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Comment"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Comments.true.file_path.attribute"/>
        </list>
      </operator>
      <!-- Get Pages: link_attribute is set to "Comments", the same name that appears in the
           Read Excel column meta data entry above. -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="5.2.000" expanded="true" height="60" name="Get Pages" width="90" x="179" y="120">
        <parameter key="link_attribute" value="Comments"/>
      </operator>
      <!-- Data to Documents: select_attributes_and_weights is enabled while the specify_weights
           list is empty.
           NOTE(review): verify that an empty list still selects the attribute carrying the page text. -->
      <operator activated="true" class="text:data_to_documents" compatibility="5.2.003" expanded="true" height="60" name="Data to Documents" width="90" x="246" y="30">
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights"/>
      </operator>
      <!-- Process Documents: the inner process routes the incoming document through Tokenize,
           Transform Cases and Filter Stopwords (English), in that order, and hands the result
           to the inner sink port "document 1". -->
      <operator activated="true" class="text:process_documents" compatibility="5.2.003" expanded="true" height="94" name="Process Documents" width="90" x="246" y="255">
        <process expanded="true" height="360" width="695">
          <operator activated="true" class="text:tokenize" compatibility="5.2.003" expanded="true" height="60" name="Tokenize" width="90" x="193" y="106"/>
          <operator activated="true" class="text:transform_cases" compatibility="5.2.003" expanded="true" height="60" name="Transform Cases" width="90" x="306" y="104"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.2.003" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="422" y="104"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Data to Similarity: configured with measure type NumericalMeasures and numerical
           measure CosineSimilarity; its input is the example set output of Process Documents. -->
      <operator activated="true" class="data_to_similarity" compatibility="5.2.006" expanded="true" height="76" name="Data to Similarity" width="90" x="380" y="165">
        <parameter key="measure_types" value="NumericalMeasures"/>
        <parameter key="numerical_measure" value="CosineSimilarity"/>
      </operator>
      <!-- Wiring between the operators and the process result ports. -->
      <connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Data to Similarity" to_port="example set"/>
      <connect from_op="Process Documents" from_port="word list" to_port="result 2"/>
      <connect from_op="Data to Similarity" from_port="similarity" to_port="result 1"/>
      <!-- Port spacing entries; no connection above ends at result 3. -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="180"/>
      <portSpacing port="sink_result 3" spacing="18"/>
    </process>
  </operator>
</process>

Please advise on what I might be doing wrong.
Sign In or Register to comment.