The RapidMiner community is in read-only mode until further notice. Technical support via cases will continue to work as is. For any urgent licensing-related requests from Students/Faculty members, please use the Altair academic forum here.
Extract Information function
Hey,
I have some problems with using the extract information function, so I'd be super grateful for any help.
For my project I want to read out headline and teaser text from news sites.
E.g.:
<h4 class="headline"><a href="http://www.welt.de/politik/deutschland/article148111602/Lammert-droht-mit-Nein-zu-TTIP.html" name="_chf_R_20W_">Lammert droht mit Nein zu TTIP</a>
</h4>
<p class="tsrText">
Norbert Lammert hat mehr Transparenz bei den Verhandlungen zum Freihandelsabkommen TTIP gefordert und mit Ablehnung gedroht. Der bisherige begrenzte Zugang zu Dokumenten sei "indiskutabel". <a href="http://www.welt.de/politik/deutschland/article148111602/Lammert-droht-mit-Nein-zu-TTIP.html" class="more" name="_chf_R_20W_">mehr...</a>
</p>
Theoretically it should work with
//h:p[@class='tsrText']//text()
//h:h[@class='headline']//text()
but it does not, and I assume that is because of the way my process is built. Here is the process as XML:
Thank you so much for your help
I have some problems with using the extract information function, so I'd be super grateful for any help.
For my project I want to read out headline and teaser text from news sites.
E.g.:
<h4 class="headline"><a href="http://www.welt.de/politik/deutschland/article148111602/Lammert-droht-mit-Nein-zu-TTIP.html" name="_chf_R_20W_">Lammert droht mit Nein zu TTIP</a>
</h4>
<p class="tsrText">
Norbert Lammert hat mehr Transparenz bei den Verhandlungen zum Freihandelsabkommen TTIP gefordert und mit Ablehnung gedroht. Der bisherige begrenzte Zugang zu Dokumenten sei "indiskutabel". <a href="http://www.welt.de/politik/deutschland/article148111602/Lammert-droht-mit-Nein-zu-TTIP.html" class="more" name="_chf_R_20W_">mehr...</a>
</p>
Theoretically it should work with
//h:p[@class='tsrText']//text()
//h:h[@class='headline']//text()
but it does not, and I assume that is because of the way my process is built. Here is the process as XML:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Note: the Excel input is just a list of links. -->
<process version="6.5.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator name="Process" expanded="true" compatibility="6.5.002" class="process" activated="true">
<parameter value="init" key="logverbosity"/>
<parameter value="2001" key="random_seed"/>
<parameter value="never" key="send_mail"/>
<parameter value="" key="notification_email"/>
<parameter value="30" key="process_duration_for_mail"/>
<parameter value="SYSTEM" key="encoding"/>
<process expanded="true">
<!--
  Read Excel: loads cells A1:B10 of sheet 1 from the workbook below.
  Row 0 is annotated as "Name", and the two columns are declared as
  Link (file_path) and Datum (date_time).
  The stray "-" characters that a browser XML tree view had inserted in
  front of the opening tags were removed so the operator contains only
  element children.
  NOTE(review): excel_file is an absolute path on the author's machine;
  adjust it before running the process elsewhere.
-->
<operator name="Read Excel" expanded="true" compatibility="6.5.002" class="read_excel" activated="true" y="30" x="45" width="90" height="60">
  <parameter value="C:\Users\Frederik\Downloads\datatest.xlsx" key="excel_file"/>
  <parameter value="1" key="sheet_number"/>
  <parameter value="A1:B10" key="imported_cell_range"/>
  <parameter value="SYSTEM" key="encoding"/>
  <parameter value="false" key="first_row_as_names"/>
  <!-- Row annotations: row index 0 is marked as "Name". -->
  <list key="annotations">
    <parameter value="Name" key="0"/>
  </list>
  <parameter value="EEE, d MMM yyyy HH:mm:ss Z" key="date_format"/>
  <parameter value="SYSTEM" key="time_zone"/>
  <parameter value="English (United States)" key="locale"/>
  <!-- Column meta data; presumably "name.selected.value_type.role" (confirm against the operator docs). -->
  <list key="data_set_meta_data_information">
    <parameter value="Link.true.file_path.attribute" key="0"/>
    <parameter value="Datum.true.date_time.attribute" key="1"/>
  </list>
  <parameter value="true" key="read_not_matching_values_as_missings"/>
  <parameter value="double_array" key="datamanagement"/>
</operator>
<!--
  Get Pages: retrieves one web page per example, taking the URL from the
  "Link" attribute. Requests use GET, follow redirects, accept no cookies
  and apply no delay between requests.
  The stray "-" viewer marker in front of the opening tag was removed.
  NOTE(review): the timeouts of 10000 are presumably milliseconds; confirm.
-->
<operator name="Get Pages" expanded="true" compatibility="6.5.000" class="web:retrieve_webpages" activated="true" y="120" x="179" width="90" height="60">
  <parameter value="Link" key="link_attribute"/>
  <parameter value="false" key="random_user_agent"/>
  <parameter value="10000" key="connection_timeout"/>
  <parameter value="10000" key="read_timeout"/>
  <parameter value="true" key="follow_redirects"/>
  <parameter value="none" key="accept_cookies"/>
  <parameter value="global" key="cookie_scope"/>
  <parameter value="GET" key="request_method"/>
  <!-- delay is "none", so the three delay amounts below are presumably unused. -->
  <parameter value="none" key="delay"/>
  <parameter value="1000" key="delay_amount"/>
  <parameter value="0" key="min_delay_amount"/>
  <parameter value="1000" key="max_delay_amount"/>
</operator>
<!--
  Data to Documents: converts the example set delivered by Get Pages into
  documents. No attribute selection or weighting is configured
  (select_attributes_and_weights is false and specify_weights is empty).
  The stray "-" viewer marker in front of the opening tag was removed.
-->
<operator name="Data to Documents" expanded="true" compatibility="6.5.000" class="text:data_to_documents" activated="true" y="30" x="380" width="90" height="60">
  <parameter value="false" key="select_attributes_and_weights"/>
  <list key="specify_weights"/>
</operator>
<!--
  Process Documents: builds a term-frequency word vector from the incoming
  documents. The inner subprocess runs, in connection order:
    Extract Information, Tokenize, Filter Tokens (by Content),
    Transform Cases, Filter Stopwords (German), Filter Tokens (by Length),
    Generate n-Grams (Terms).
  "Stem (German)" and "Extract Content" are deactivated and not connected.

  Fixes applied in this block:
    1. The headline XPath used the element name "h", which does not exist
       in HTML; the sample markup uses an h4 element with class "headline",
       so the query now targets h:h4.
    2. The two query keys were swapped: "headline" held the teaser query and
       "teasertext" held the headline query. Each key now holds its own query.
    3. Stray "-" characters inserted by a browser XML tree view in front of
       opening tags were removed.
-->
<operator name="Process Documents" expanded="true" compatibility="6.5.000" class="text:process_documents" activated="true" y="120" x="581" width="90" height="94">
  <parameter value="true" key="create_word_vector"/>
  <parameter value="Term Frequency" key="vector_creation"/>
  <parameter value="true" key="add_meta_information"/>
  <parameter value="false" key="keep_text"/>
  <!-- prune_method is "absolute", so presumably only the *_absolute limits below take effect (confirm). -->
  <parameter value="absolute" key="prune_method"/>
  <parameter value="3.0" key="prune_below_percent"/>
  <parameter value="30.0" key="prune_above_percent"/>
  <parameter value="3" key="prune_below_absolute"/>
  <parameter value="9999" key="prune_above_absolute"/>
  <parameter value="0.05" key="prune_below_rank"/>
  <parameter value="0.95" key="prune_above_rank"/>
  <parameter value="double_sparse_array" key="datamanagement"/>
  <process expanded="true">
    <!-- Deactivated and unconnected. -->
    <operator name="Stem (German)" expanded="true" compatibility="6.5.000" class="text:stem_german" activated="false" y="30" x="782" width="90" height="60"/>
    <!-- Deactivated and unconnected. -->
    <operator name="Extract Content" expanded="true" compatibility="6.5.000" class="web:extract_html_text_content" activated="false" y="30" x="112" width="90" height="60">
      <parameter value="true" key="extract_content"/>
      <parameter value="3" key="minimum_text_block_length"/>
      <parameter value="true" key="override_content_type_information"/>
      <parameter value="true" key="neglegt_span_tags"/>
      <parameter value="true" key="neglect_p_tags"/>
      <parameter value="true" key="neglect_b_tags"/>
      <parameter value="true" key="neglect_i_tags"/>
      <parameter value="true" key="neglect_br_tags"/>
      <parameter value="true" key="ignore_non_html_tags"/>
    </operator>
    <!--
      Extract Information: evaluates XPath queries against each document
      (assume_html is true) and stores each result under the query's key.
      NOTE(review): the extracted values are presumably attached as document
      meta data, while the tokenizing operators after this one still work on
      the full page text; confirm this is the intended behaviour.
    -->
    <operator name="Extract Information" expanded="true" compatibility="6.5.000" class="text:extract_information" activated="true" y="30" x="246" width="90" height="60">
      <parameter value="XPath" key="query_type"/>
      <list key="string_machting_queries"/>
      <parameter value="Nominal" key="attribute_type"/>
      <list key="regular_expression_queries"/>
      <list key="regular_region_queries"/>
      <list key="xpath_queries">
        <!-- Headline text sits inside the h4 element with class "headline". -->
        <parameter value="//h:h4[@class='headline']//text()" key="headline"/>
        <!-- Teaser text sits inside the p element with class "tsrText". -->
        <parameter value="//h:p[@class='tsrText']//text()" key="teasertext"/>
      </list>
      <list key="namespaces"/>
      <parameter value="true" key="ignore_CDATA"/>
      <parameter value="true" key="assume_html"/>
      <list key="index_queries"/>
      <list key="jsonpath_queries"/>
    </operator>
    <!-- Tokenize in "non letters" mode; the remaining parameters presumably apply to other modes only. -->
    <operator name="Tokenize" expanded="true" compatibility="6.5.000" class="text:tokenize" activated="true" y="30" x="380" width="90" height="60">
      <parameter value="non letters" key="mode"/>
      <parameter value=".:" key="characters"/>
      <parameter value="English" key="language"/>
      <parameter value="3" key="max_token_length"/>
    </operator>
    <!-- Keeps tokens matching ^[A-Z].* (case sensitive, condition not inverted), i.e. tokens starting with an upper-case ASCII letter. -->
    <operator name="Filter Tokens (by Content)" expanded="true" compatibility="6.5.000" class="text:filter_tokens_by_content" activated="true" y="120" x="313" width="90" height="60">
      <parameter value="matches" key="condition"/>
      <parameter value="^[A-Z].*" key="regular_expression"/>
      <parameter value="true" key="case_sensitive"/>
      <parameter value="false" key="invert condition"/>
    </operator>
    <!-- Lower-casing happens after the upper-case filter above, so that filter sees the original casing. -->
    <operator name="Transform Cases" expanded="true" compatibility="6.5.000" class="text:transform_cases" activated="true" y="120" x="447" width="90" height="60">
      <parameter value="lower case" key="transform_to"/>
    </operator>
    <operator name="Filter Stopwords (German)" expanded="true" compatibility="6.5.000" class="text:filter_stopwords_german" activated="true" y="120" x="581" width="90" height="60">
      <parameter value="Standard" key="stop_word_list"/>
    </operator>
    <!-- Keeps tokens of 4 to 25 characters. -->
    <operator name="Filter Tokens (by Length)" expanded="true" compatibility="6.5.000" class="text:filter_by_length" activated="true" y="30" x="514" width="90" height="60">
      <parameter value="4" key="min_chars"/>
      <parameter value="25" key="max_chars"/>
    </operator>
    <!-- Generates term n-grams with max_length 3. -->
    <operator name="Generate n-Grams (Terms)" expanded="true" compatibility="6.5.000" class="text:generate_n_grams_terms" activated="true" y="30" x="648" width="90" height="60">
      <parameter value="3" key="max_length"/>
    </operator>
    <!-- Wiring of the subprocess: source document, then the active operators in sequence, then sink "document 1". -->
    <connect to_port="document" to_op="Extract Information" from_port="document"/>
    <connect to_port="document" to_op="Tokenize" from_port="document" from_op="Extract Information"/>
    <connect to_port="document" to_op="Filter Tokens (by Content)" from_port="document" from_op="Tokenize"/>
    <connect to_port="document" to_op="Transform Cases" from_port="document" from_op="Filter Tokens (by Content)"/>
    <connect to_port="document" to_op="Filter Stopwords (German)" from_port="document" from_op="Transform Cases"/>
    <connect to_port="document" to_op="Filter Tokens (by Length)" from_port="document" from_op="Filter Stopwords (German)"/>
    <connect to_port="document" to_op="Generate n-Grams (Terms)" from_port="document" from_op="Filter Tokens (by Length)"/>
    <connect to_port="document 1" from_port="document" from_op="Generate n-Grams (Terms)"/>
    <portSpacing spacing="0" port="source_document"/>
    <portSpacing spacing="0" port="sink_document 1"/>
    <portSpacing spacing="0" port="sink_document 2"/>
  </process>
</operator>
<!--
  Top-level wiring, written source first and target second:
  process input 1, Read Excel, Get Pages, Data to Documents,
  Process Documents, whose word list goes to process result 1.
-->
<connect from_port="input 1" to_op="Read Excel" to_port="file"/>
<connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_op="Data to Documents" to_port="example set"/>
<connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
<connect from_op="Process Documents" from_port="word list" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Thank you so much for your help
0
Answers
EDIT: All solved; tokenizing is possible inside the cut document vector. Thanks!