The RapidMiner community is on read-only mode until further notice. Technical support via cases will continue to work as is. For any urgent licensing related requests from Students/Faculty members, please use the Altair academic forum here.

Extract Information function

erocoar Member Posts: 6 Contributor II
edited November 2018 in Help
Hey,

I have some problems with using the extract information function, so I'd be super grateful for any help. :)

For my project I want to read out headline and teaser text from news sites.
E.g.:

<h4 class="headline"><a href="http://www.welt.de/politik/deutschland/article148111602/Lammert-droht-mit-Nein-zu-TTIP.html"  name="_chf_R_20W_">Lammert droht mit Nein zu TTIP</a>
    </h4>

<p class="tsrText">
Norbert Lammert hat mehr Transparenz bei den Verhandlungen zum Freihandelsabkommen TTIP gefordert und mit Ablehnung gedroht. Der bisherige begrenzte Zugang zu Dokumenten sei "indiskutabel". <a href="http://www.welt.de/politik/deutschland/article148111602/Lammert-droht-mit-Nein-zu-TTIP.html" class="more" name="_chf_R_20W_">mehr...</a>
    </p>

Theoretically it should work with
//h:p[@class='tsrText']//text()
//h:h[@class='headline']//text()
but it does not, and I assume it is because of the way my process is built up. Here is the process as XML:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner 6.5 process export.

  Pipeline (see the <connect> elements of the outer process):
    Read Excel > Get Pages > Data to Documents > Process Documents > result 1

  Inside Process Documents each document runs through:
    Extract Information > Tokenize > Filter Tokens (by Content) > Transform Cases
    > Filter Stopwords (German) > Filter Tokens (by Length) > Generate n-Grams (Terms)

  Cleanup applied to the pasted export:
    * Removed the leading "-" collapse markers that a browser XML viewer adds in
      front of expandable elements. The one in front of the root element made the
      document ill-formed; the others injected stray text nodes.
    * Corrected the two XPath queries of Extract Information (details at that operator).
    * Indented the element tree. All parameter keys and values are otherwise unchanged,
      including keys that RapidMiner itself exports with unusual spelling
      ("string_machting_queries", "neglegt_span_tags", "invert condition").
      Do not "fix" those keys.
-->
<process version="6.5.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator name="Process" expanded="true" compatibility="6.5.002" class="process" activated="true">
    <parameter value="init" key="logverbosity"/>
    <parameter value="2001" key="random_seed"/>
    <parameter value="never" key="send_mail"/>
    <parameter value="" key="notification_email"/>
    <parameter value="30" key="process_duration_for_mail"/>
    <parameter value="SYSTEM" key="encoding"/>
    <process expanded="true">
      <!-- Reads cells A1:B10 of the first sheet; the two columns are declared below as
           "Link" (file_path) and "Datum" (date_time). -->
      <operator name="Read Excel" expanded="true" compatibility="6.5.002" class="read_excel" activated="true" y="30" x="45" width="90" height="60">
        <parameter value="C:\Users\Frederik\Downloads\datatest.xlsx" key="excel_file"/>
        <parameter value="1" key="sheet_number"/>
        <parameter value="A1:B10" key="imported_cell_range"/>
        <parameter value="SYSTEM" key="encoding"/>
        <parameter value="false" key="first_row_as_names"/>
        <list key="annotations">
          <parameter value="Name" key="0"/>
        </list>
        <parameter value="EEE, d MMM yyyy HH:mm:ss Z" key="date_format"/>
        <parameter value="SYSTEM" key="time_zone"/>
        <parameter value="English (United States)" key="locale"/>
        <list key="data_set_meta_data_information">
          <parameter value="Link.true.file_path.attribute" key="0"/>
          <parameter value="Datum.true.date_time.attribute" key="1"/>
        </list>
        <parameter value="true" key="read_not_matching_values_as_missings"/>
        <parameter value="double_array" key="datamanagement"/>
      </operator>
      <!-- Fetches one web page per example, using the URL stored in the "Link" attribute. -->
      <operator name="Get Pages" expanded="true" compatibility="6.5.000" class="web:retrieve_webpages" activated="true" y="120" x="179" width="90" height="60">
        <parameter value="Link" key="link_attribute"/>
        <parameter value="false" key="random_user_agent"/>
        <parameter value="10000" key="connection_timeout"/>
        <parameter value="10000" key="read_timeout"/>
        <parameter value="true" key="follow_redirects"/>
        <parameter value="none" key="accept_cookies"/>
        <parameter value="global" key="cookie_scope"/>
        <parameter value="GET" key="request_method"/>
        <parameter value="none" key="delay"/>
        <parameter value="1000" key="delay_amount"/>
        <parameter value="0" key="min_delay_amount"/>
        <parameter value="1000" key="max_delay_amount"/>
      </operator>
      <operator name="Data to Documents" expanded="true" compatibility="6.5.000" class="text:data_to_documents" activated="true" y="30" x="380" width="90" height="60">
        <parameter value="false" key="select_attributes_and_weights"/>
        <list key="specify_weights"/>
      </operator>
      <!-- Builds a term-frequency word vector; terms are pruned by absolute document count
           (prune_method = absolute, bounds 3 and 9999). -->
      <operator name="Process Documents" expanded="true" compatibility="6.5.000" class="text:process_documents" activated="true" y="120" x="581" width="90" height="94">
        <parameter value="true" key="create_word_vector"/>
        <parameter value="Term Frequency" key="vector_creation"/>
        <parameter value="true" key="add_meta_information"/>
        <parameter value="false" key="keep_text"/>
        <parameter value="absolute" key="prune_method"/>
        <parameter value="3.0" key="prune_below_percent"/>
        <parameter value="30.0" key="prune_above_percent"/>
        <parameter value="3" key="prune_below_absolute"/>
        <parameter value="9999" key="prune_above_absolute"/>
        <parameter value="0.05" key="prune_below_rank"/>
        <parameter value="0.95" key="prune_above_rank"/>
        <parameter value="double_sparse_array" key="datamanagement"/>
        <process expanded="true">
          <!-- Deactivated and not wired into the chain below. -->
          <operator name="Stem (German)" expanded="true" compatibility="6.5.000" class="text:stem_german" activated="false" y="30" x="782" width="90" height="60"/>
          <!-- Deactivated and not wired into the chain below. -->
          <operator name="Extract Content" expanded="true" compatibility="6.5.000" class="web:extract_html_text_content" activated="false" y="30" x="112" width="90" height="60">
            <parameter value="true" key="extract_content"/>
            <parameter value="3" key="minimum_text_block_length"/>
            <parameter value="true" key="override_content_type_information"/>
            <parameter value="true" key="neglegt_span_tags"/>
            <parameter value="true" key="neglect_p_tags"/>
            <parameter value="true" key="neglect_b_tags"/>
            <parameter value="true" key="neglect_i_tags"/>
            <parameter value="true" key="neglect_br_tags"/>
            <parameter value="true" key="ignore_non_html_tags"/>
          </operator>
          <operator name="Extract Information" expanded="true" compatibility="6.5.000" class="text:extract_information" activated="true" y="30" x="246" width="90" height="60">
            <parameter value="XPath" key="query_type"/>
            <list key="string_machting_queries"/>
            <parameter value="Nominal" key="attribute_type"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <!-- In each entry, "key" is the name of the attribute to create and "value" is
                 the XPath query that fills it.
                 Two corrections against the originally posted export:
                   1. The headline query used the element name "h", which does not exist in
                      HTML and therefore matched nothing. The target pages mark the headline
                      as an h4 element with class "headline", so the query now uses h:h4.
                   2. The two queries were swapped: "headline" carried the tsrText (teaser)
                      query and "teasertext" carried the headline query.
                 NOTE(review): the "namespaces" list below is empty, so the "h" prefix is
                 presumably bound to the XHTML namespace by assume_html = true. Confirm
                 against the operator documentation. -->
            <list key="xpath_queries">
              <parameter value="//h:h4[@class='headline']//text()" key="headline"/>
              <parameter value="//h:p[@class='tsrText']//text()" key="teasertext"/>
            </list>
            <list key="namespaces"/>
            <parameter value="true" key="ignore_CDATA"/>
            <parameter value="true" key="assume_html"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
          </operator>
          <operator name="Tokenize" expanded="true" compatibility="6.5.000" class="text:tokenize" activated="true" y="30" x="380" width="90" height="60">
            <parameter value="non letters" key="mode"/>
            <parameter value=".:" key="characters"/>
            <parameter value="English" key="language"/>
            <parameter value="3" key="max_token_length"/>
          </operator>
          <!-- Keeps only tokens that start with an upper-case ASCII letter (case-sensitive match). -->
          <operator name="Filter Tokens (by Content)" expanded="true" compatibility="6.5.000" class="text:filter_tokens_by_content" activated="true" y="120" x="313" width="90" height="60">
            <parameter value="matches" key="condition"/>
            <parameter value="^[A-Z].*" key="regular_expression"/>
            <parameter value="true" key="case_sensitive"/>
            <parameter value="false" key="invert condition"/>
          </operator>
          <operator name="Transform Cases" expanded="true" compatibility="6.5.000" class="text:transform_cases" activated="true" y="120" x="447" width="90" height="60">
            <parameter value="lower case" key="transform_to"/>
          </operator>
          <operator name="Filter Stopwords (German)" expanded="true" compatibility="6.5.000" class="text:filter_stopwords_german" activated="true" y="120" x="581" width="90" height="60">
            <parameter value="Standard" key="stop_word_list"/>
          </operator>
          <operator name="Filter Tokens (by Length)" expanded="true" compatibility="6.5.000" class="text:filter_by_length" activated="true" y="30" x="514" width="90" height="60">
            <parameter value="4" key="min_chars"/>
            <parameter value="25" key="max_chars"/>
          </operator>
          <operator name="Generate n-Grams (Terms)" expanded="true" compatibility="6.5.000" class="text:generate_n_grams_terms" activated="true" y="30" x="648" width="90" height="60">
            <parameter value="3" key="max_length"/>
          </operator>
          <connect to_port="document" to_op="Extract Information" from_port="document"/>
          <connect to_port="document" to_op="Tokenize" from_port="document" from_op="Extract Information"/>
          <connect to_port="document" to_op="Filter Tokens (by Content)" from_port="document" from_op="Tokenize"/>
          <connect to_port="document" to_op="Transform Cases" from_port="document" from_op="Filter Tokens (by Content)"/>
          <connect to_port="document" to_op="Filter Stopwords (German)" from_port="document" from_op="Transform Cases"/>
          <connect to_port="document" to_op="Filter Tokens (by Length)" from_port="document" from_op="Filter Stopwords (German)"/>
          <connect to_port="document" to_op="Generate n-Grams (Terms)" from_port="document" from_op="Filter Tokens (by Length)"/>
          <connect to_port="document 1" from_port="document" from_op="Generate n-Grams (Terms)"/>
          <portSpacing spacing="0" port="source_document"/>
          <portSpacing spacing="0" port="sink_document 1"/>
          <portSpacing spacing="0" port="sink_document 2"/>
        </process>
      </operator>
      <connect to_port="file" to_op="Read Excel" from_port="input 1"/>
      <connect to_port="Example Set" to_op="Get Pages" from_port="output" from_op="Read Excel"/>
      <connect to_port="example set" to_op="Data to Documents" from_port="Example Set" from_op="Get Pages"/>
      <connect to_port="documents 1" to_op="Process Documents" from_port="documents" from_op="Data to Documents"/>
      <connect to_port="result 1" from_port="word list" from_op="Process Documents"/>
      <portSpacing spacing="0" port="source_input 1"/>
      <portSpacing spacing="0" port="source_input 2"/>
      <portSpacing spacing="0" port="sink_result 1"/>
      <portSpacing spacing="0" port="sink_result 2"/>
    </process>
  </operator>
</process>
(The Excel input is just a list of links.)

Thank you so much for your help :)

Answers

  • erocoar Member Posts: 6 Contributor II
    Maybe solved: I just save the websites beforehand, and then it seems to work with Process Documents from Files. But then it seems I cannot use the tokenizer and so on to filter the text?

    EDIT: All solved, tokenizing is possible inside the cut document vector :) Thanks
Sign In or Register to comment.