IF YOU ❤️ RAPIDMINER, PLEASE HELP US GET TO #1 AGAIN - VOTE IN KDNUGGETS POLL 2019! 🙏 🙏 🙏

Dynamic Web Crawling Tripadvisor

Domi007Domi007 Member Posts: 2 Contributor I
edited November 2018 in Help
Hello everyone,

I'm trying to get the reviews from TripAdvisor to do a sentiment analysis. It works, but not in the way I want.

Here's my process for fetching the HTML pages.

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process: fetch TripAdvisor review pages for seven hotels in Are.

  Flow (see the <connect> elements at the bottom):
    1. "Set Macro" defines the macro max-pages (value 10).
    2. Seven web:crawl_web operators, one per hotel, each start from a single
       review URL of that hotel and use %{max-pages} as their max_pages value.
    3. "Append" merges the seven example sets.
    4. "Get Pages" (web:retrieve_webpages) is configured with the "Link"
       attribute; its output is delivered on result port 1.

  NOTE(review): "Set Macro" has no connections; it is presumably executed
  first because it is listed first. Confirm before reordering operators.
  NOTE(review): resultfile and output_dir are absolute, machine-specific
  Windows paths. output_dir looks unused while write_pages_into_files is
  "false"; confirm and adjust both paths for your machine.
-->
<process version="6.4.000">
  <context>
    <input/>
    <output>
      <location>../../Data/Html-Pages-Tripadvisor</location>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="6.4.000" expanded="true" name="Process">
    <parameter key="resultfile" value="C:\Users\Dominik\Documents\Studium\Projektarbeiten\Test_Data-Extraction\1_Fetch-Result-Example-Set.res"/>
    <process expanded="true">
      <!-- Shared page limit, referenced as %{max-pages} by every crawler below. -->
      <operator activated="true" class="set_macro" compatibility="6.4.000" expanded="true" height="60" name="Set Macro" width="90" x="45" y="165">
        <parameter key="macro" value="max-pages"/>
        <parameter key="value" value="10"/>
        <description align="center" color="transparent" colored="false" width="126">This macro defines the maximal number of pages fetched for each hotel in Are.</description>
      </operator>
      <!--
        One crawler per hotel. All seven share the same settings and differ
        only in operator name, start URL, and the hotel slug inside the rules:
          follow_link_with_matching_url: URLs of the hotel's ShowUserReviews
                                         pages that end in #REVIEWS
          store_with_matching_url:       ShowUserReviews pages of the hotel on
                                         tripadvisor.co.uk
        The dots of the host name are escaped so that they match literally.
        Operator names use the hotel slug from the URL (underscores, no spaces).
      -->
      <operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true" height="60" name="STF_Hotel_Are_Torg" width="90" x="246" y="75">
        <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d2400688-r154089743-STF_Hotel_Are_Torg-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*STF_Hotel_Are_Torg-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
          <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+STF_Hotel_Are_Torg-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
        </list>
        <parameter key="write_pages_into_files" value="false"/>
        <parameter key="add_pages_as_attribute" value="true"/>
        <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
        <parameter key="extension" value="html"/>
        <parameter key="max_pages" value="%{max-pages}"/>
        <parameter key="max_depth" value="100"/>
        <parameter key="domain" value="server"/>
        <parameter key="max_page_size" value="5000"/>
      </operator>
      <operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true" height="60" name="Are_Continental_Inn" width="90" x="246" y="165">
        <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d678441-r127124263-Are_Continental_Inn-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Are_Continental_Inn-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
          <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Are_Continental_Inn-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
        </list>
        <parameter key="write_pages_into_files" value="false"/>
        <parameter key="add_pages_as_attribute" value="true"/>
        <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
        <parameter key="extension" value="html"/>
        <parameter key="max_pages" value="%{max-pages}"/>
        <parameter key="max_depth" value="100"/>
        <parameter key="domain" value="server"/>
        <parameter key="max_page_size" value="5000"/>
      </operator>
      <operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true" height="60" name="Tott_Hotel_Are" width="90" x="246" y="255">
        <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d486763-r153044193-Tott_Hotel_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Tott_Hotel_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
          <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Tott_Hotel_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
        </list>
        <parameter key="write_pages_into_files" value="false"/>
        <parameter key="add_pages_as_attribute" value="true"/>
        <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
        <parameter key="extension" value="html"/>
        <parameter key="max_pages" value="%{max-pages}"/>
        <parameter key="max_depth" value="100"/>
        <parameter key="domain" value="server"/>
        <parameter key="max_page_size" value="5000"/>
      </operator>
      <operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true" height="60" name="Fjallgarden_Hotel" width="90" x="246" y="345">
        <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d565631-r137349978-Fjallgarden_Hotel-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Fjallgarden_Hotel-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
          <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Fjallgarden_Hotel-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
        </list>
        <parameter key="write_pages_into_files" value="false"/>
        <parameter key="add_pages_as_attribute" value="true"/>
        <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
        <parameter key="extension" value="html"/>
        <parameter key="max_pages" value="%{max-pages}"/>
        <parameter key="max_depth" value="100"/>
        <parameter key="domain" value="server"/>
        <parameter key="max_page_size" value="5000"/>
      </operator>
      <operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true" height="60" name="Hotel_Diplomat_Aregarden" width="90" x="246" y="435">
        <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d1157031-r148521338-Hotel_Diplomat_Aregarden-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Hotel_Diplomat_Aregarden-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
          <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Hotel_Diplomat_Aregarden-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
        </list>
        <parameter key="write_pages_into_files" value="false"/>
        <parameter key="add_pages_as_attribute" value="true"/>
        <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
        <parameter key="extension" value="html"/>
        <parameter key="max_pages" value="%{max-pages}"/>
        <parameter key="max_depth" value="100"/>
        <parameter key="domain" value="server"/>
        <parameter key="max_page_size" value="5000"/>
      </operator>
      <operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true" height="60" name="Holiday_Club_Are" width="90" x="246" y="525">
        <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d1016233-r152160882-Holiday_Club_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Holiday_Club_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
          <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Holiday_Club_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
        </list>
        <parameter key="write_pages_into_files" value="false"/>
        <parameter key="add_pages_as_attribute" value="true"/>
        <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
        <parameter key="extension" value="html"/>
        <parameter key="max_pages" value="%{max-pages}"/>
        <parameter key="max_depth" value="100"/>
        <parameter key="domain" value="server"/>
        <parameter key="max_page_size" value="5000"/>
      </operator>
      <operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true" height="60" name="Copperhill_Mountain_Lodge" width="90" x="246" y="615">
        <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d1236656-r153179437-Copperhill_Mountain_Lodge-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Copperhill_Mountain_Lodge-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
          <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Copperhill_Mountain_Lodge-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
        </list>
        <parameter key="write_pages_into_files" value="false"/>
        <parameter key="add_pages_as_attribute" value="true"/>
        <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
        <parameter key="extension" value="html"/>
        <parameter key="max_pages" value="%{max-pages}"/>
        <parameter key="max_depth" value="100"/>
        <parameter key="domain" value="server"/>
        <parameter key="max_page_size" value="5000"/>
      </operator>
      <!-- Merge the seven per-hotel example sets into one. -->
      <operator activated="true" class="append" compatibility="6.4.000" expanded="true" height="184" name="Append" width="90" x="380" y="255"/>
      <!-- Retrieve pages using the "Link" attribute of the merged example set. -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.002" expanded="true" height="60" name="Get Pages" width="90" x="514" y="255">
        <parameter key="link_attribute" value="Link"/>
      </operator>
      <!-- Wiring: each crawler feeds one Append input; Append feeds Get Pages; Get Pages feeds result 1. -->
      <connect from_op="STF_Hotel_Are_Torg" from_port="Example Set" to_op="Append" to_port="example set 1"/>
      <connect from_op="Are_Continental_Inn" from_port="Example Set" to_op="Append" to_port="example set 2"/>
      <connect from_op="Tott_Hotel_Are" from_port="Example Set" to_op="Append" to_port="example set 3"/>
      <connect from_op="Fjallgarden_Hotel" from_port="Example Set" to_op="Append" to_port="example set 4"/>
      <connect from_op="Hotel_Diplomat_Aregarden" from_port="Example Set" to_op="Append" to_port="example set 5"/>
      <connect from_op="Holiday_Club_Are" from_port="Example Set" to_op="Append" to_port="example set 6"/>
      <connect from_op="Copperhill_Mountain_Lodge" from_port="Example Set" to_op="Append" to_port="example set 7"/>
      <connect from_op="Append" from_port="merged set" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
As you can see, I have a separate Web Crawl operator for each hotel whose reviews I want to get, and the crawling rules of each operator need the latest review made for that hotel.

I would like a more dynamic process in which I don't have to handle every hotel available in a certain city individually. As a first step, the process should fetch all the available hotels in a city and, more importantly, all the reviews of those hotels.

This process just shows how to fetch html pages.

Thank you for your help!

Dominik
Sign In or Register to comment.