Implement pairwise_count with execute R

TobiasNehrigTobiasNehrig Member Posts: 41 Guru
edited November 2018 in Help

Hi Experts,

 

I'd like to use the widyr function pairwise_count() in an Execute R operator, as in https://www.tidytextmining.com/nasa.html#word-co-ocurrences-and-correlations. For this I crawl some pages and process them, but somehow it doesn't work. I get this error message:

Dec 22, 2017 3:45:59 PM INFO: [1] "Failed to execute the script."

Dec 22, 2017 3:45:59 PM INFO: [1] "replacement has 0 rows, data has 2"

 

This is what my process looks like:

 

 

<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="subprocess" compatibility="8.0.001" expanded="true" height="82" name="Crawler Spon" width="90" x="45" y="34">
<process expanded="true">
<operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="112" y="34">
<parameter key="url" value="http://www.spiegel.de"/>
<list key="crawling_rules">
<parameter key="store_with_matching_url" value=".+www.spiegel.+"/>
<parameter key="follow_link_with_matching_url" value=".+spiegel.+|.+de.+"/>
</list>
<parameter key="max_crawl_depth" value="10"/>
<parameter key="retrieve_as_html" value="true"/>
<parameter key="add_content_as_attribute" value="true"/>
<parameter key="max_pages" value="5"/>
<parameter key="delay" value="100"/>
<parameter key="max_concurrent_connections" value="200"/>
<parameter key="max_connections_per_host" value="100"/>
<parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"/>
</operator>
<operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="246" y="34">
<parameter key="link_attribute" value="Link"/>
<parameter key="page_attribute" value="link"/>
<parameter key="random_user_agent" value="true"/>
</operator>
<connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_port="out 1"/>
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data Spon" width="90" x="179" y="34">
<parameter key="vector_creation" value="Term Frequency"/>
<parameter key="add_meta_information" value="false"/>
<parameter key="keep_text" value="true"/>
<parameter key="prune_method" value="absolute"/>
<parameter key="prune_below_absolute" value="2"/>
<parameter key="prune_above_absolute" value="99999"/>
<parameter key="data_management" value="memory-optimized"/>
<parameter key="select_attributes_and_weights" value="true"/>
<list key="specify_weights">
<parameter key="link" value="1.0"/>
</list>
<process expanded="true">
<operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="45" y="34">
<parameter key="minimum_text_block_length" value="2"/>
</operator>
<operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize Token" width="90" x="179" y="34">
<parameter key="mode" value="linguistic tokens"/>
<parameter key="language" value="German"/>
</operator>
<operator activated="true" class="text:filter_tokens_by_content" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens a-zA-Z" width="90" x="313" y="34">
<parameter key="condition" value="matches"/>
<parameter key="regular_expression" value="[a-zA-Z]+"/>
</operator>
<operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="34"/>
<connect from_port="document" to_op="Extract Content" to_port="document"/>
<connect from_op="Extract Content" from_port="document" to_op="Tokenize Token" to_port="document"/>
<connect from_op="Tokenize Token" from_port="document" to_op="Filter Tokens a-zA-Z" to_port="document"/>
<connect from_op="Filter Tokens a-zA-Z" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="313" y="34">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="text"/>
</operator>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Execute R" width="90" x="447" y="34">
<!-- Script summary: rm_main(data) loads dplyr, tidytext and widyr, seeds the RNG with 2017,
     pipes the input example set straight into pairwise_count(word, text, sort = TRUE),
     prints the result to the log and returns it wrapped in a one-element list.
     NOTE(review): "Select Attributes (2)" upstream keeps only the `text` attribute, so the
     input presumably has no `word` column and no document id; the text is never split into
     one-token-per-row form before pairwise_count is called. This is presumably what triggers
     the reported "replacement has 0 rows" error. TODO confirm. -->
<parameter key="script" value="# rm_main is a mandatory function, &#10;# the number of arguments has to be the number of input ports (can be none)&#10;rm_main = function(data)&#10;{&#10;library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;set.seed(2017)&#10;&#10;test &lt;- data %&gt;%&#10;pairwise_count(word, text, sort= TRUE)&#10;print(test)&#10; return(list(test))&#10;}&#10;"/>
</operator>
<connect from_op="Crawler Spon" from_port="out 1" to_op="Process Documents from Data Spon" to_port="example set"/>
<connect from_op="Process Documents from Data Spon" from_port="example set" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Execute R" to_port="input 1"/>
<connect from_op="Select Attributes (2)" from_port="original" to_port="result 2"/>
<connect from_op="Execute R" from_port="output 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>

 

Maybe there is someone who can help me to tackle that problem.

 

Regards

Tobias

 

Tagged:

Best Answer

  • SGolbertSGolbert RapidMiner Certified Analyst, Member Posts: 344 Unicorn
    Solution Accepted

    Hi Tobias,

     

    If I understood correctly, you want to pass the result of pairwise_count() to RapidMiner. That is easy:

     

    dt <- as.data.table(pairwise_count(. . .))

    return(list(dt))

    I hope that's what you are looking for, and sorry for the delayed response.

    sgenzer

Answers

  • TobiasNehrigTobiasNehrig Member Posts: 41 Guru

    Hi,

    I expanded my script with the unnest_tokens function; I thought it might help with the pairwise_count function:

     

      test %>%

      unnest_tokens(word, text, token="words") %>%

      print(test)

      test <- data.frame(test)

     

    On the console I can see each word in its own row, but in the result tab all words of a document are in a single row again.

     

    Now the script runs without an error, but the pairwise_count function delivers no results.

     

    This is what my process looks like now:

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="true" class="subprocess" compatibility="8.0.001" expanded="true" height="82" name="Crawler Spon" width="90" x="45" y="34">
    <process expanded="true">
    <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="112" y="34">
    <parameter key="url" value="http://www.spiegel.de"/>
    <list key="crawling_rules">
    <parameter key="store_with_matching_url" value=".+www.spiegel.+"/>
    <parameter key="follow_link_with_matching_url" value=".+spiegel.+|.+de.+"/>
    </list>
    <parameter key="max_crawl_depth" value="10"/>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="5"/>
    <parameter key="delay" value="100"/>
    <parameter key="max_concurrent_connections" value="200"/>
    <parameter key="max_connections_per_host" value="100"/>
    <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"/>
    </operator>
    <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="246" y="34">
    <parameter key="link_attribute" value="Link"/>
    <parameter key="page_attribute" value="link"/>
    <parameter key="random_user_agent" value="true"/>
    </operator>
    <connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
    <connect from_op="Get Pages" from_port="Example Set" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data Spon" width="90" x="45" y="136">
    <parameter key="create_word_vector" value="false"/>
    <parameter key="add_meta_information" value="false"/>
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_method" value="by ranking"/>
    <parameter key="prune_below_absolute" value="10"/>
    <parameter key="prune_above_absolute" value="3000"/>
    <parameter key="data_management" value="memory-optimized"/>
    <parameter key="select_attributes_and_weights" value="true"/>
    <list key="specify_weights">
    <parameter key="link" value="1.0"/>
    </list>
    <process expanded="true">
    <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="45" y="34">
    <parameter key="minimum_text_block_length" value="2"/>
    </operator>
    <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize Token" width="90" x="179" y="34">
    <parameter key="mode" value="linguistic tokens"/>
    <parameter key="language" value="German"/>
    </operator>
    <operator activated="true" class="text:filter_tokens_by_content" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens a-zA-Z" width="90" x="313" y="34">
    <parameter key="condition" value="matches"/>
    <parameter key="regular_expression" value="[a-zA-Z]+"/>
    </operator>
    <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="34"/>
    <connect from_port="document" to_op="Extract Content" to_port="document"/>
    <connect from_op="Extract Content" from_port="document" to_op="Tokenize Token" to_port="document"/>
    <connect from_op="Tokenize Token" from_port="document" to_op="Filter Tokens a-zA-Z" to_port="document"/>
    <connect from_op="Filter Tokens a-zA-Z" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="238">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attribute" value="text"/>
    <parameter key="attributes" value="text|id"/>
    <parameter key="include_special_attributes" value="true"/>
    </operator>
    <operator activated="true" class="generate_id" compatibility="8.0.001" expanded="true" height="82" name="Generate ID (2)" width="90" x="45" y="340"/>
    <operator activated="true" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role (2)" width="90" x="179" y="34">
    <parameter key="attribute_name" value="text"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles">
    <parameter key="text" value="label"/>
    <parameter key="id" value="id"/>
    </list>
    </operator>
    <operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="103" name="Execute R" width="90" x="380" y="34">
    <!-- Script summary: rm_main(data) copies the input into `test`, then pipes `test` through
         unnest_tokens(word, text, token = "words") into print(). The piped result is only printed,
         never assigned, so `test` is still the untokenized input when data.frame(test) runs.
         pairwise_count(text, text, sort = TRUE) is then called with `text` as both the item and
         the feature column. Returns list(test, pair_test); both outputs are wired to result ports. -->
    <parameter key="script" value="rm_main = function(data)&#10;{&#10;library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;set.seed(2017)&#10;&#10;test &lt;- data &#10;test %&gt;%&#10; unnest_tokens(word, text, token=&quot;words&quot;) %&gt;%&#10; print(test)&#10; test &lt;- data.frame(test)&#10;&#10;pair_test &lt;- test %&gt;%&#10; pairwise_count(text, text,sort=TRUE)&#10; print(pair_test)&#10; pair_test &lt;- data.frame(pair_test) &#10;&#10;return(list(test, pair_test))&#10;}&#10;"/>
    </operator>
    <connect from_op="Crawler Spon" from_port="out 1" to_op="Process Documents from Data Spon" to_port="example set"/>
    <connect from_op="Process Documents from Data Spon" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="original" to_op="Generate ID (2)" to_port="example set input"/>
    <connect from_op="Generate ID (2)" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
    <connect from_op="Set Role (2)" from_port="example set output" to_op="Execute R" to_port="input 1"/>
    <connect from_op="Execute R" from_port="output 1" to_port="result 1"/>
    <connect from_op="Execute R" from_port="output 2" to_port="result 2"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    </process>
    </operator>
    </process>

     

    Regards

    Tobias

  • SGolbertSGolbert RapidMiner Certified Analyst, Member Posts: 344 Unicorn

    The problem lies entirely in your R code. You are passing a table with a single column to the script, whereas you actually need to have the data in tidy form. You have to work either in RM or in R to get the data into this shape:

     

    Document                          Word

    1                                            house

    1                                            dog

    2                                            house

    2                                            cat

    3                                            house

    3                                            dog

     

    Then the script will determine that the combination (house, dog) appears 2 times and (house, cat) once. In your script there are also variables that are undefined (word, text). If you choose to work it out in R, I recommend saving the intermediate results as CSV and then trying to solve it interactively. You can also do everything in RapidMiner using n-grams.

     

    Best,

    Sebastian

    sgenzeryyhuang
  • TobiasNehrigTobiasNehrig Member Posts: 41 Guru

    Hi @SGolbert

     

    Many thanks for your hint. I solved the problem with pairwise_count across all documents. Next, I'll have to find a way to run pairwise_count on each single document.

    But I also have the problem that I can only see the result on the console, and I would like to have the results in the RapidMiner results table. May I ask whether you have any advice for this problem?

     

    This is my code:

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.1.000">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.1.000" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="false" class="retrieve" compatibility="8.1.000" expanded="true" height="68" name="Retrieve 2102-Rohseiten-Spiegel" width="90" x="45" y="34">
    <parameter key="repository_entry" value="//Local Repository/data/2102-Rohseiten-Spiegel"/>
    </operator>
    <operator activated="false" class="concurrency:loop_attributes" compatibility="8.1.000" expanded="true" height="103" name="Loop Attributes" width="90" x="782" y="544">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="id"/>
    <parameter key="include_special_attributes" value="true"/>
    <process expanded="true">
    <operator activated="true" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="103" name="Execute R (2)" width="90" x="380" y="238">
    <!-- Script summary: libraries are loaded once at script level. rm_main(data) builds the tibble
         `korpus` from data$id and data$text, then
         (1) woerter: one row per word with its count over all rows, sorted by count;
         (2) cooccurre: pairwise_count(word, id) on the tokenized text.
         Both are printed, converted to data frames and returned as list(woerter, cooccurre).
         NOTE(review): in as.data.frame(cooccurre, item1, item2, n) the extra positional arguments
         are not column selectors; plain as.data.frame(cooccurre), as in "Execute R (3)", is
         presumably what was intended. TODO confirm. -->
    <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;&#10;rm_main = function(data)&#10;{&#10;korpus &lt;- data_frame(id =data$id, text = data$text)&#10;woerter &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; count(word, sort=TRUE)&#10; #pairwise_count(word, title)&#10; print(woerter)&#10;woerter &lt;- data.frame(woerter)&#10;&#10;cooccurre &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; pairwise_count(word, id)&#10;&#10; print(cooccurre)&#10; cooccurre &lt;- as.data.frame(cooccurre, item1, item2, n)&#10;&#10; return(list(woerter, cooccurre))&#10;}&#10;"/>
    </operator>
    <connect from_port="input 1" to_op="Execute R (2)" to_port="input 1"/>
    <connect from_op="Execute R (2)" from_port="output 1" to_port="output 1"/>
    <connect from_op="Execute R (2)" from_port="output 2" to_port="output 2"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="source_input 2" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    <portSpacing port="sink_output 3" spacing="0"/>
    </process>
    </operator>
    <operator activated="false" class="collect" compatibility="8.1.000" expanded="true" height="68" name="Collect" width="90" x="648" y="391"/>
    <operator activated="false" class="loop_collection" compatibility="8.1.000" expanded="true" height="103" name="Loop Collection" width="90" x="782" y="391">
    <parameter key="unfold" value="true"/>
    <process expanded="true">
    <operator activated="false" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="82" name="Split Text in Words (2)" width="90" x="45" y="136">
    <!-- Script summary: when the input is a data frame, rm_main(data) groups it by `id` and splits
         `text` into two-word n-grams stored in a `bigram` column (unnest_tokens with token = "ngrams", n = 2).
         The result `spon_words` is printed and returned as a one-element list.
         NOTE(review): `spon_words` is assigned only inside the is.data.frame() branch but is printed
         and returned unconditionally, so any other input type would fail with an undefined object. -->
    <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;&#10;rm_main = function(data)&#10;{&#10; if(is.data.frame(data)){&#10;&#9;spon_words &lt;- data %&gt;%&#10;&#9;group_by(id)%&gt;%&#10;&#9; unnest_tokens(bigram, text, token = &quot;ngrams&quot;, n = 2)&#10;&#10;&#9; }&#10;&#9;print(spon_words)&#10;&#10; return(list(spon_words)) &#10;}&#10;"/>
    </operator>
    <operator activated="false" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="82" name="Seperat (2)" width="90" x="45" y="238">
    <!-- Script summary: rm_main(data) splits the `bigram` column at the single space into the columns
         `word1` and `word2` (separate), prints the result and returns it as a one-element list.
         The group_by(id) step is commented out. -->
    <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(tidyr)&#10;library(tokenizers)&#10;&#10;rm_main = function(data)&#10;{&#10;devided_bigrams &lt;-data %&gt;%&#10;&#9;#group_by(id)%&gt;%&#10;&#9;separate(bigram, c(&quot;word1&quot;, &quot;word2&quot;), sep = &quot; &quot;)&#10;&#9;print(devided_bigrams)&#10;&#9;&#10; return(list(devided_bigrams))&#10;}&#10;"/>
    </operator>
    <operator activated="false" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="82" name="Count all Bigrams (2)" width="90" x="45" y="340">
    <!-- Script summary: rm_main(data) counts each (word1, word2) combination, sorted by count,
         prints the counts and converts them to a data frame. The filter(n >= 10) step is commented out.
         NOTE(review): the data frame is returned directly via return(counted_bigrams), whereas every
         other script in this process returns list(...). Presumably this should be wrapped in list()
         as well. TODO confirm against the Execute R operator documentation. -->
    <parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#9;library(tidyr)&#10;&#10;&#9;count_bigrams &lt;- data %&gt;%&#10;&#9; count(word1, word2, sort = TRUE)&#10;&#9; &#10;&#9;print(count_bigrams)&#10; #filter(n &gt;= 10)&#10;&#9;counted_bigrams &lt;- data.frame(count_bigrams)&#10; &#10; return(counted_bigrams)&#10;}&#10;"/>
    </operator>
    <operator activated="false" class="multiply" compatibility="8.1.000" expanded="true" height="82" name="Multiply" width="90" x="246" y="238"/>
    <operator activated="false" class="split_data" compatibility="8.1.000" expanded="true" height="82" name="Split Data" width="90" x="380" y="238">
    <enumeration key="partitions">
    <parameter key="ratio" value="0.05"/>
    </enumeration>
    </operator>
    <operator activated="false" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="82" name="draw graph (2)" width="90" x="514" y="238">
    <!-- Script summary: rm_main(data) keeps rows with n >= 8, turns them into a graph object with
         graph_from_data_frame and prints it. The graph is drawn with ggraph (layout "fr": edge links,
         node points, node labels taken from `name`), the working directory is switched to
         /home/knecht and ggsave writes foo300.png (5 x 4 in, 300 dpi). Returns list(graph1).
         NOTE(review): the output directory is hard-coded, and ggsave is called without a plot
         argument, so it presumably relies on ggplot2's last-plot default rather than on `graph1`.
         TODO confirm. The graph.write and data.frame lines are commented out. -->
    <parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#9;library(tidyr)&#10; library(igraph)&#10; &#9;library(ggraph)&#10; &#9;set.seed(2017)&#10; &#9;&#10; bigram_graph &lt;- data %&gt;%&#10; filter(n &gt;= 8) %&gt;%&#10; graph_from_data_frame&#10; print(bigram_graph)&#10; &#9;# bigram_graph &lt;- data.frame(bigram_graph)&#10;&#10; &#9;graph1 &lt;- ggraph(bigram_graph, layout = &quot;fr&quot;) +&#10; &#9; geom_edge_link() +&#10; &#9; geom_node_point() +&#10; &#9; geom_node_text(aes(label = name), vjust = 1, hjust =1)&#10;&#10; setwd(&quot;/home/knecht&quot;)&#10;&#9;#graph.write(graph1, &quot;/home/knecht/graph01.txt&quot;, format=&quot;edgelist&quot;)&#10; &#9;ggsave(filename = &quot;foo300.png&quot;, width = 5, height = 4, dpi = 300, units = &quot;in&quot;, device='png')&#10; &#9; &#9;&#10; return(list(graph1))&#10;}&#10;"/>
    </operator>
    <operator activated="true" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="103" name="Execute R" width="90" x="313" y="34">
    <!-- Script summary: fed from the "single" port of the enclosing Loop Collection. Libraries are
         loaded at script level. rm_main(data) builds the tibble `korpus` from data$id and data$text, then
         (1) woerter: one row per word with its count over all rows, sorted by count;
         (2) cooccurre: pairwise_count(word, id) on the tokenized text.
         Both are printed, converted to data frames and returned as list(woerter, cooccurre).
         NOTE(review): in as.data.frame(cooccurre, item1, item2, n) the extra positional arguments
         are not column selectors; plain as.data.frame(cooccurre) is presumably what was intended.
         TODO confirm. -->
    <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;&#10;rm_main = function(data)&#10;{&#10;korpus &lt;- data_frame(id =data$id, text = data$text)&#10;woerter &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; count(word, sort=TRUE)&#10; #pairwise_count(word, title)&#10; print(woerter)&#10;woerter &lt;- data.frame(woerter)&#10;&#10;cooccurre &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; pairwise_count(word, id)&#10;&#10; print(cooccurre)&#10; cooccurre &lt;- as.data.frame(cooccurre, item1, item2, n)&#10;&#10; return(list(woerter, cooccurre))&#10;}&#10;"/>
    </operator>
    <operator activated="false" class="sort" compatibility="8.1.000" expanded="true" height="82" name="Sort" width="90" x="514" y="595"/>
    <connect from_port="single" to_op="Execute R" to_port="input 1"/>
    <connect from_op="Split Text in Words (2)" from_port="output 1" to_op="Seperat (2)" to_port="input 1"/>
    <connect from_op="Seperat (2)" from_port="output 1" to_op="Count all Bigrams (2)" to_port="input 1"/>
    <connect from_op="Multiply" from_port="output 1" to_op="Split Data" to_port="example set"/>
    <connect from_op="Split Data" from_port="partition 1" to_op="draw graph (2)" to_port="input 1"/>
    <connect from_op="Execute R" from_port="output 1" to_port="output 1"/>
    <connect from_op="Execute R" from_port="output 2" to_port="output 2"/>
    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    <portSpacing port="sink_output 3" spacing="0"/>
    <description align="center" color="yellow" colored="false" height="105" resized="false" width="180" x="833" y="168">Type your comment</description>
    </process>
    </operator>
    <operator activated="true" class="subprocess" compatibility="8.1.000" expanded="true" height="82" name="Crawler Spon" width="90" x="45" y="340">
    <process expanded="true">
    <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="112" y="34">
    <parameter key="url" value="http://www.spiegel.de"/>
    <list key="crawling_rules">
    <parameter key="store_with_matching_url" value=".+www.spiegel.+"/>
    <parameter key="follow_link_with_matching_url" value=".+spiegel.+|.+de.+"/>
    </list>
    <parameter key="max_crawl_depth" value="10"/>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="4650"/>
    <parameter key="delay" value="100"/>
    <parameter key="max_concurrent_connections" value="200"/>
    <parameter key="max_connections_per_host" value="100"/>
    <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"/>
    </operator>
    <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="246" y="34">
    <parameter key="link_attribute" value="Link"/>
    <parameter key="page_attribute" value="link"/>
    <parameter key="random_user_agent" value="true"/>
    </operator>
    <connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
    <connect from_op="Get Pages" from_port="Example Set" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="179" y="340">
    <parameter key="create_word_vector" value="false"/>
    <parameter key="keep_text" value="true"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="179" y="34">
    <parameter key="ignore_non_html_tags" value="false"/>
    </operator>
    <connect from_port="document" to_op="Extract Content" to_port="document"/>
    <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="subprocess" compatibility="8.1.000" expanded="true" height="82" name="Prepare Data" width="90" x="179" y="34">
    <process expanded="true">
    <operator activated="true" class="set_role" compatibility="8.1.000" expanded="true" height="82" name="Set Role (2)" width="90" x="45" y="34">
    <parameter key="attribute_name" value="text"/>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_id" compatibility="8.1.000" expanded="true" height="82" name="Generate ID" width="90" x="246" y="34"/>
    <operator activated="true" class="order_attributes" compatibility="8.1.000" expanded="true" height="82" name="Reorder Attributes" width="90" x="447" y="34">
    <parameter key="attribute_ordering" value="Title|text"/>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="8.1.000" expanded="true" height="82" name="Select Attributes" width="90" x="648" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="Title|text"/>
    </operator>
    <operator activated="true" class="filter_examples" compatibility="8.1.000" expanded="true" height="103" name="Filter Examples" width="90" x="782" y="34">
    <list key="filters_list">
    <parameter key="filters_entry_key" value="Title.is_not_missing."/>
    </list>
    <parameter key="filters_logic_and" value="false"/>
    <parameter key="filters_check_metadata" value="false"/>
    </operator>
    <connect from_port="in 1" to_op="Set Role (2)" to_port="example set input"/>
    <connect from_op="Set Role (2)" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
    <connect from_op="Generate ID" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
    <connect from_op="Reorder Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
    <connect from_op="Filter Examples" from_port="example set output" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="source_in 2" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="34">
    <parameter key="create_word_vector" value="false"/>
    <parameter key="keep_text" value="true"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:cut_document" compatibility="8.1.000" expanded="true" height="68" name="Cut Document" width="90" x="112" y="34">
    <parameter key="query_type" value="Regular Region"/>
    <list key="string_machting_queries"/>
    <list key="regular_expression_queries"/>
    <list key="regular_region_queries">
    <parameter key="sentences" value="\\\.\\s[A-Z]| \\!\\s[A-Z]|\\?\\s[A-Z].\\\.|\\!|\\?"/>
    </list>
    <list key="xpath_queries"/>
    <list key="namespaces"/>
    <list key="index_queries"/>
    <list key="jsonpath_queries"/>
    <process expanded="true">
    <connect from_port="segment" to_port="document 1"/>
    <portSpacing port="source_segment" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <connect from_port="document" to_op="Cut Document" to_port="document"/>
    <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="multiply" compatibility="8.1.000" expanded="true" height="103" name="Multiply (2)" width="90" x="447" y="34"/>
    <operator activated="true" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="103" name="Execute R (3)" width="90" x="648" y="85">
    <!-- Script summary: libraries are loaded at script level. rm_main(data) builds the tibble `korpus`
         from data$id and data$text, then
         (1) woerter: word counts per id (unnest_tokens, group_by(id), count(word));
         (2) cooccurre: pairwise_count(word, id) on the tokenized text (the trailing group_by is commented out).
         Both are printed, converted with as.data.frame and returned as list(woerter, cooccurre),
         which this process wires to result 2 and result 3. -->
    <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;&#10;rm_main = function(data)&#10;{&#10;korpus &lt;- data_frame(id =data$id, text = data$text)&#10;&#10;woerter &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; group_by(id)%&gt;%&#10; count(word)&#10; print(woerter)&#10;woerter &lt;- as.data.frame(woerter)&#10;&#10;cooccurre &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; pairwise_count(word, id)&#10; #group_by(id)%&gt;%&#10; print(cooccurre)&#10; cooccurre &lt;- as.data.frame(cooccurre)&#10;&#10; return(list(woerter, cooccurre))&#10;}&#10;"/>
    </operator>
    <connect from_op="Crawler Spon" from_port="out 1" to_op="Process Documents from Data (2)" to_port="example set"/>
    <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Prepare Data" to_port="in 1"/>
    <connect from_op="Prepare Data" from_port="out 1" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply (2)" to_port="input"/>
    <connect from_op="Multiply (2)" from_port="output 1" to_port="result 1"/>
    <connect from_op="Multiply (2)" from_port="output 2" to_op="Execute R (3)" to_port="input 1"/>
    <connect from_op="Execute R (3)" from_port="output 1" to_port="result 2"/>
    <connect from_op="Execute R (3)" from_port="output 2" to_port="result 3"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    </process>
    </operator>
    </process>

     

    Kind regards

     

    Tobias

    sgenzer
  • TobiasNehrigTobiasNehrig Member Posts: 41 Guru

    Hi Sebastian,

    @SGolbert,

     

    thank you for your response, and yes, your understanding is right. I think I found the problem. It seems that there is no problem with my script, because the same process works on a different web page that has significantly fewer sub pages, sentences and words. So I checked again, and this log message seems to point to the problem:


    Mar 23, 2018 10:17:13 AM INFO: Written 48.6% of 73326128 rows in 2 secs using 8 threads. anyBufferGrown=yes; maxBuffUsed=30%. Finished in 2 secs.      
    Mar 23, 2018 10:17:13 AM INFO: Written 79.4% of 73326128 rows in 3 secs using 8 threads. anyBufferGrown=yes; maxBuffUsed=30%. Finished in 0 secs.      
    Mar 23, 2018 10:17:13 AM INFO:                                                                                                                                      
    Mar 23, 2018 10:18:43 AM INFO: Saving results.
    Mar 23, 2018 10:18:43 AM INFO: Process //Local Repository/processes/18-03-23-test-pairwise_count finished successfully after 2:08

     

    Process that won't work:

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.1.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="false" class="retrieve" compatibility="8.1.001" expanded="true" height="68" name="Retrieve 2102-Rohseiten-Spiegel" width="90" x="45" y="34">
    <parameter key="repository_entry" value="//Local Repository/data/2102-Rohseiten-Spiegel"/>
    </operator>
    <operator activated="true" class="subprocess" compatibility="8.1.001" expanded="true" height="82" name="Crawler Spon" width="90" x="45" y="340">
    <process expanded="true">
    <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="112" y="34">
    <parameter key="url" value="http://www.spiegel.de"/>
    <list key="crawling_rules">
    <parameter key="store_with_matching_url" value=".+www.spiegel.+"/>
    <parameter key="follow_link_with_matching_url" value=".+spiegel.+|.+de.+"/>
    </list>
    <parameter key="max_crawl_depth" value="10"/>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="4650"/>
    <parameter key="delay" value="100"/>
    <parameter key="max_concurrent_connections" value="200"/>
    <parameter key="max_connections_per_host" value="100"/>
    <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"/>
    </operator>
    <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="246" y="34">
    <parameter key="link_attribute" value="Link"/>
    <parameter key="page_attribute" value="link"/>
    <parameter key="random_user_agent" value="true"/>
    </operator>
    <connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
    <connect from_op="Get Pages" from_port="Example Set" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="179" y="340">
    <parameter key="create_word_vector" value="false"/>
    <parameter key="keep_text" value="true"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="179" y="34">
    <parameter key="ignore_non_html_tags" value="false"/>
    </operator>
    <connect from_port="document" to_op="Extract Content" to_port="document"/>
    <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="subprocess" compatibility="8.1.001" expanded="true" height="82" name="Prepare Data" width="90" x="246" y="34">
    <process expanded="true">
    <operator activated="true" class="set_role" compatibility="8.1.001" expanded="true" height="82" name="Set Role (2)" width="90" x="45" y="34">
    <parameter key="attribute_name" value="text"/>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_id" compatibility="8.1.001" expanded="true" height="82" name="Generate ID" width="90" x="246" y="34"/>
    <operator activated="true" class="order_attributes" compatibility="8.1.001" expanded="true" height="82" name="Reorder Attributes" width="90" x="447" y="34">
    <parameter key="attribute_ordering" value="Title|text"/>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="8.1.001" expanded="true" height="82" name="Select Attributes" width="90" x="648" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="Title|text"/>
    </operator>
    <operator activated="true" class="filter_examples" compatibility="8.1.001" expanded="true" height="103" name="Filter Examples" width="90" x="782" y="34">
    <list key="filters_list">
    <parameter key="filters_entry_key" value="Title.is_not_missing."/>
    </list>
    <parameter key="filters_logic_and" value="false"/>
    <parameter key="filters_check_metadata" value="false"/>
    </operator>
    <connect from_port="in 1" to_op="Set Role (2)" to_port="example set input"/>
    <connect from_op="Set Role (2)" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
    <connect from_op="Generate ID" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
    <connect from_op="Reorder Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
    <connect from_op="Filter Examples" from_port="example set output" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="source_in 2" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="447" y="34">
    <parameter key="create_word_vector" value="false"/>
    <parameter key="keep_text" value="true"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:cut_document" compatibility="8.1.000" expanded="true" height="68" name="Cut Document" width="90" x="112" y="34">
    <parameter key="query_type" value="Regular Region"/>
    <list key="string_machting_queries"/>
    <list key="regular_expression_queries"/>
    <list key="regular_region_queries">
    <parameter key="sentences" value="\\\.\\s[A-Z]| \\!\\s[A-Z]|\\?\\s[A-Z].\\\.|\\!|\\?"/>
    </list>
    <list key="xpath_queries"/>
    <list key="namespaces"/>
    <list key="index_queries"/>
    <list key="jsonpath_queries"/>
    <process expanded="true">
    <connect from_port="segment" to_port="document 1"/>
    <portSpacing port="source_segment" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <connect from_port="document" to_op="Cut Document" to_port="document"/>
    <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="multiply" compatibility="8.1.001" expanded="true" height="103" name="Multiply (2)" width="90" x="648" y="34"/>
    <operator activated="true" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="103" name="Execute R (3)" width="90" x="849" y="85">
    <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;&#10;# Entry point called by the Execute R operator.&#10;# data: table from input port 1; only its columns id and text are read.&#10;# Returns list(woerter, cooccurre): word counts per id and pairwise word&#10;# co-occurrence counts, one list entry per connected output port.&#10;rm_main = function(data)&#10;{&#10;korpus &lt;- data_frame(id =data$id, text = data$text)&#10;&#10;print(korpus)&#10;&#10;# Count each word within every id.&#10;woerter &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; group_by(id)%&gt;%&#10; count(word)&#10; print(woerter)&#10;# Base R conversion, consistent with cooccurre below; as.data.table is not&#10;# provided by any package this script loads.&#10;woerter &lt;- as.data.frame(woerter)&#10;&#10;# Count how often two words occur under the same id.&#10;cooccurre &lt;- korpus %&gt;%&#10; #for( i in unique(id))&#10; #{&#10; unnest_tokens(word, text)%&gt;%&#10; pairwise_count(word, id)&#10; #}&#10; print(cooccurre)&#10; cooccurre &lt;- as.data.frame(cooccurre)&#10;&#10; return(list(woerter, cooccurre))&#10;}&#10;"/>
    </operator>
    <connect from_op="Crawler Spon" from_port="out 1" to_op="Process Documents from Data (2)" to_port="example set"/>
    <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Prepare Data" to_port="in 1"/>
    <connect from_op="Prepare Data" from_port="out 1" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply (2)" to_port="input"/>
    <connect from_op="Multiply (2)" from_port="output 1" to_port="result 1"/>
    <connect from_op="Multiply (2)" from_port="output 2" to_op="Execute R (3)" to_port="input 1"/>
    <connect from_op="Execute R (3)" from_port="output 1" to_port="result 2"/>
    <connect from_op="Execute R (3)" from_port="output 2" to_port="result 3"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    </process>
    </operator>
    </process>

     

    Process that works:

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.1.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
    <operator activated="true" class="subprocess" compatibility="8.1.001" expanded="true" height="82" name="Crawler Rapp" width="90" x="45" y="34">
    <process expanded="true">
    <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="112" y="34">
    <parameter key="url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml/main/main.html"/>
    <list key="crawling_rules">
    <parameter key="store_with_matching_url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml/.*"/>
    <parameter key="follow_link_with_matching_url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml.*"/>
    </list>
    <parameter key="max_crawl_depth" value="10"/>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="enable_basic_auth" value="false"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="write_pages_to_disk" value="false"/>
    <parameter key="include_binary_content" value="false"/>
    <parameter key="output_file_extension" value="txt"/>
    <parameter key="max_pages" value="4650"/>
    <parameter key="max_page_size" value="1000"/>
    <parameter key="delay" value="100"/>
    <parameter key="max_concurrent_connections" value="200"/>
    <parameter key="max_connections_per_host" value="100"/>
    <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"/>
    <parameter key="ignore_robot_exclusion" value="false"/>
    </operator>
    <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="246" y="34">
    <parameter key="link_attribute" value="Link"/>
    <parameter key="page_attribute" value="link"/>
    <parameter key="random_user_agent" value="true"/>
    <parameter key="connection_timeout" value="10000"/>
    <parameter key="read_timeout" value="10000"/>
    <parameter key="follow_redirects" value="true"/>
    <parameter key="accept_cookies" value="none"/>
    <parameter key="cookie_scope" value="global"/>
    <parameter key="request_method" value="GET"/>
    <parameter key="delay" value="none"/>
    <parameter key="delay_amount" value="1000"/>
    <parameter key="min_delay_amount" value="0"/>
    <parameter key="max_delay_amount" value="1000"/>
    </operator>
    <connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
    <connect from_op="Get Pages" from_port="Example Set" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="179" y="34">
    <parameter key="create_word_vector" value="false"/>
    <parameter key="vector_creation" value="TF-IDF"/>
    <parameter key="add_meta_information" value="true"/>
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_method" value="none"/>
    <parameter key="prune_below_percent" value="3.0"/>
    <parameter key="prune_above_percent" value="30.0"/>
    <parameter key="prune_below_rank" value="0.05"/>
    <parameter key="prune_above_rank" value="0.95"/>
    <parameter key="datamanagement" value="double_sparse_array"/>
    <parameter key="data_management" value="auto"/>
    <parameter key="select_attributes_and_weights" value="false"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="179" y="34">
    <parameter key="extract_content" value="true"/>
    <parameter key="minimum_text_block_length" value="5"/>
    <parameter key="override_content_type_information" value="true"/>
    <parameter key="neglegt_span_tags" value="true"/>
    <parameter key="neglect_p_tags" value="true"/>
    <parameter key="neglect_b_tags" value="true"/>
    <parameter key="neglect_i_tags" value="true"/>
    <parameter key="neglect_br_tags" value="true"/>
    <parameter key="ignore_non_html_tags" value="false"/>
    </operator>
    <connect from_port="document" to_op="Extract Content" to_port="document"/>
    <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="subprocess" compatibility="8.1.001" expanded="true" height="82" name="Prepare Data" width="90" x="313" y="34">
    <process expanded="true">
    <operator activated="true" class="set_role" compatibility="8.1.001" expanded="true" height="82" name="Set Role (2)" width="90" x="45" y="34">
    <parameter key="attribute_name" value="text"/>
    <parameter key="target_role" value="regular"/>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_id" compatibility="8.1.001" expanded="true" height="82" name="Generate ID" width="90" x="246" y="34">
    <parameter key="create_nominal_ids" value="false"/>
    <parameter key="offset" value="0"/>
    </operator>
    <operator activated="true" class="order_attributes" compatibility="8.1.001" expanded="true" height="82" name="Reorder Attributes" width="90" x="447" y="34">
    <parameter key="sort_mode" value="user specified"/>
    <parameter key="attribute_ordering" value="Title|text"/>
    <parameter key="use_regular_expressions" value="false"/>
    <parameter key="handle_unmatched" value="append"/>
    <parameter key="sort_direction" value="ascending"/>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="8.1.001" expanded="true" height="82" name="Select Attributes" width="90" x="648" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attribute" value=""/>
    <parameter key="attributes" value="Title|text"/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="attribute_value"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="time"/>
    <parameter key="block_type" value="attribute_block"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="value_matrix_row_start"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    </operator>
    <operator activated="true" class="filter_examples" compatibility="8.1.001" expanded="true" height="103" name="Filter Examples" width="90" x="782" y="34">
    <parameter key="parameter_expression" value=""/>
    <parameter key="condition_class" value="custom_filters"/>
    <parameter key="invert_filter" value="false"/>
    <list key="filters_list">
    <parameter key="filters_entry_key" value="Title.is_not_missing."/>
    </list>
    <parameter key="filters_logic_and" value="false"/>
    <parameter key="filters_check_metadata" value="false"/>
    </operator>
    <connect from_port="in 1" to_op="Set Role (2)" to_port="example set input"/>
    <connect from_op="Set Role (2)" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
    <connect from_op="Generate ID" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
    <connect from_op="Reorder Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
    <connect from_op="Filter Examples" from_port="example set output" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="source_in 2" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="447" y="34">
    <parameter key="create_word_vector" value="false"/>
    <parameter key="vector_creation" value="TF-IDF"/>
    <parameter key="add_meta_information" value="true"/>
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_method" value="none"/>
    <parameter key="prune_below_percent" value="3.0"/>
    <parameter key="prune_above_percent" value="30.0"/>
    <parameter key="prune_below_rank" value="0.05"/>
    <parameter key="prune_above_rank" value="0.95"/>
    <parameter key="datamanagement" value="double_sparse_array"/>
    <parameter key="data_management" value="auto"/>
    <parameter key="select_attributes_and_weights" value="false"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:cut_document" compatibility="8.1.000" expanded="true" height="68" name="Cut Document" width="90" x="112" y="34">
    <parameter key="query_type" value="Regular Region"/>
    <list key="string_machting_queries"/>
    <parameter key="attribute_type" value="Nominal"/>
    <list key="regular_expression_queries"/>
    <list key="regular_region_queries">
    <parameter key="sentences" value="\\\.\\s[A-Z]| \\!\\s[A-Z]|\\?\\s[A-Z].\\\.|\\!|\\?"/>
    </list>
    <list key="xpath_queries"/>
    <list key="namespaces"/>
    <parameter key="ignore_CDATA" value="true"/>
    <parameter key="assume_html" value="true"/>
    <list key="index_queries"/>
    <list key="jsonpath_queries"/>
    <process expanded="true">
    <connect from_port="segment" to_port="document 1"/>
    <portSpacing port="source_segment" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <connect from_port="document" to_op="Cut Document" to_port="document"/>
    <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="multiply" compatibility="8.1.001" expanded="true" height="103" name="Multiply (2)" width="90" x="581" y="34"/>
    <operator activated="true" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="103" name="Execute R (3)" width="90" x="715" y="85">
    <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;&#10;# Entry point called by the Execute R operator.&#10;# data: table from input port 1; only its columns id and text are read.&#10;# Returns list(woerter, cooccurre): word counts per id and pairwise word&#10;# co-occurrence counts, one list entry per connected output port.&#10;rm_main = function(data)&#10;{&#10;korpus &lt;- data_frame(id =data$id, text = data$text)&#10;&#10;print(korpus)&#10;&#10;# Count each word within every id.&#10;woerter &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; group_by(id)%&gt;%&#10; count(word)&#10; print(woerter)&#10;# Base R conversion, consistent with cooccurre below; as.data.table is not&#10;# provided by any package this script loads.&#10;woerter &lt;- as.data.frame(woerter)&#10;&#10;# Count how often two words occur under the same id.&#10;cooccurre &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; pairwise_count(word, id)&#10; print(cooccurre)&#10; cooccurre &lt;- as.data.frame(cooccurre)&#10;&#10; return(list(woerter, cooccurre))&#10;}&#10;"/>
    </operator>
    <connect from_op="Crawler Rapp" from_port="out 1" to_op="Process Documents from Data (2)" to_port="example set"/>
    <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Prepare Data" to_port="in 1"/>
    <connect from_op="Prepare Data" from_port="out 1" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply (2)" to_port="input"/>
    <connect from_op="Multiply (2)" from_port="output 1" to_port="result 1"/>
    <connect from_op="Multiply (2)" from_port="output 2" to_op="Execute R (3)" to_port="input 1"/>
    <connect from_op="Execute R (3)" from_port="output 1" to_port="result 2"/>
    <connect from_op="Execute R (3)" from_port="output 2" to_port="result 3"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    </process>
    </operator>
    </process>

     

  • TobiasNehrigTobiasNehrig Member Posts: 41 Guru

    Hi,

     

    I solved the output problem by filtering my counted words with n>=10, and now all valid results are shown. However, filtering the results is not an option for me.

    To get my counts without filtering, I'm trying to cluster by the ID of my pages and use the Loop Clusters operator. My problem now is that I'd like to see the counting results for each ID. I tried the Collect operator inside and outside of Loop Clusters, but I always get only the result of the last iteration.

    Is there a way to see all results and compare them afterwards?

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.1.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="false" class="retrieve" compatibility="8.1.001" expanded="true" height="68" name="Retrieve 2102-Rohseiten-Spiegel" width="90" x="45" y="34">
    <parameter key="repository_entry" value="//Local Repository/data/2102-Rohseiten-Spiegel"/>
    </operator>
    <operator activated="true" class="subprocess" compatibility="8.1.001" expanded="true" height="82" name="Crawler Spon" width="90" x="45" y="544">
    <process expanded="true">
    <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="112" y="34">
    <parameter key="url" value="http://www.spiegel.de"/>
    <list key="crawling_rules">
    <parameter key="store_with_matching_url" value=".+www.spiegel.+"/>
    <parameter key="follow_link_with_matching_url" value=".+spiegel.+|.+de.+"/>
    </list>
    <parameter key="max_crawl_depth" value="10"/>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="4650"/>
    <parameter key="delay" value="100"/>
    <parameter key="max_concurrent_connections" value="200"/>
    <parameter key="max_connections_per_host" value="100"/>
    <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"/>
    </operator>
    <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="246" y="34">
    <parameter key="link_attribute" value="Link"/>
    <parameter key="page_attribute" value="link"/>
    <parameter key="random_user_agent" value="true"/>
    </operator>
    <connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
    <connect from_op="Get Pages" from_port="Example Set" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="313" y="544">
    <parameter key="create_word_vector" value="false"/>
    <parameter key="keep_text" value="true"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="179" y="34">
    <parameter key="ignore_non_html_tags" value="false"/>
    </operator>
    <connect from_port="document" to_op="Extract Content" to_port="document"/>
    <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="subprocess" compatibility="8.1.001" expanded="true" height="82" name="Prepare Data" width="90" x="179" y="34">
    <process expanded="true">
    <operator activated="true" class="set_role" compatibility="8.1.001" expanded="true" height="82" name="Set Role (2)" width="90" x="45" y="34">
    <parameter key="attribute_name" value="text"/>
    <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_id" compatibility="8.1.001" expanded="true" height="82" name="Generate ID" width="90" x="246" y="34"/>
    <operator activated="true" class="order_attributes" compatibility="8.1.001" expanded="true" height="82" name="Reorder Attributes" width="90" x="447" y="34">
    <parameter key="attribute_ordering" value="Title|text"/>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="8.1.001" expanded="true" height="82" name="Select Attributes" width="90" x="648" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="Title|text"/>
    </operator>
    <operator activated="true" class="filter_examples" compatibility="8.1.001" expanded="true" height="103" name="Filter Examples" width="90" x="782" y="34">
    <list key="filters_list">
    <parameter key="filters_entry_key" value="Title.is_not_missing."/>
    </list>
    <parameter key="filters_logic_and" value="false"/>
    <parameter key="filters_check_metadata" value="false"/>
    </operator>
    <connect from_port="in 1" to_op="Set Role (2)" to_port="example set input"/>
    <connect from_op="Set Role (2)" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
    <connect from_op="Generate ID" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
    <connect from_op="Reorder Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
    <connect from_op="Filter Examples" from_port="example set output" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="source_in 2" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <!-- Process Documents from Data: word-vector creation is switched off and the
         original text is kept; the inner process cuts every document into segments. -->
    <operator activated="true"
              class="text:process_document_from_data"
              compatibility="8.1.000"
              expanded="true"
              height="82"
              name="Process Documents from Data"
              width="90"
              x="313"
              y="34">
      <parameter key="create_word_vector" value="false"/>
      <parameter key="keep_text" value="true"/>
      <list key="specify_weights"/>
      <process expanded="true">
        <!-- Cut Document: only the "Regular Region" query list is populated; every
             other query list is left empty. -->
        <operator activated="true"
                  class="text:cut_document"
                  compatibility="8.1.000"
                  expanded="true"
                  height="68"
                  name="Cut Document"
                  width="90"
                  x="112"
                  y="34">
          <parameter key="query_type" value="Regular Region"/>
          <!-- Key spelling is reproduced exactly as stored in the process file. -->
          <list key="string_machting_queries"/>
          <list key="regular_expression_queries"/>
          <list key="regular_region_queries">
            <!-- Presumably a start pattern and an end pattern packed into one value;
                 TODO confirm the separator and escaping rules against the operator docs.
                 NOTE(review): the second alternative starts with a literal space, unlike
                 the other alternatives; confirm that this is intended. -->
            <parameter key="sentences" value="\\\.\\s[A-Z]| \\!\\s[A-Z]|\\?\\s[A-Z].\\\.|\\!|\\?"/>
          </list>
          <list key="xpath_queries"/>
          <list key="namespaces"/>
          <list key="index_queries"/>
          <list key="jsonpath_queries"/>
          <!-- Each segment is handed straight to the first document output. -->
          <process expanded="true">
            <connect from_port="segment" to_port="document 1"/>
            <portSpacing port="source_segment" spacing="0"/>
            <portSpacing port="sink_document 1" spacing="0"/>
            <portSpacing port="sink_document 2" spacing="0"/>
          </process>
        </operator>
        <connect from_port="document" to_op="Cut Document" to_port="document"/>
        <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
        <portSpacing port="source_document" spacing="0"/>
        <portSpacing port="sink_document 1" spacing="0"/>
        <portSpacing port="sink_document 2" spacing="0"/>
      </process>
    </operator>
    <!-- Multiply: fan-out point; the enclosing process wires its "output 1" and
         "output 2" ports to different targets. -->
    <operator activated="true"
              class="multiply"
              compatibility="8.1.001"
              expanded="true"
              height="103"
              name="Multiply"
              width="90"
              x="447"
              y="34"/>
    <!-- Set Role: gives the "Title" attribute the "cluster" role; no additional
         roles are configured. -->
    <operator activated="true"
              class="set_role"
              compatibility="8.1.001"
              expanded="true"
              height="82"
              name="Set Role"
              width="90"
              x="581"
              y="85">
      <parameter key="attribute_name" value="Title"/>
      <parameter key="target_role" value="cluster"/>
      <list key="set_additional_roles"/>
    </operator>
    <!-- Loop Clusters: feeds each "cluster subset" to the inner process and exposes
         the collected results on "out 1". -->
    <operator activated="true" class="loop_clusters" compatibility="8.1.001" expanded="true" height="82" name="Loop Clusters" width="90" x="782" y="85">
      <process expanded="true">
        <!-- Execute R (2): the script's rm_main(data) returns two data frames:
               1. woerter   : per-id word counts, keeping words with n of at least 10
               2. cooccurre : pairwise word co-occurrence counts across ids
             Changes compared with the earlier script:
               * the pipe after pairwise_count() no longer runs on into print()
               * both results are converted with as.data.frame(), so data.table is not needed
               * the text is tokenised once and reused for both results
               * missing "id" or "text" input columns are reported with a clear error
             Line breaks in the script must stay encoded as character references, because
             literal line breaks inside an attribute value are normalised to spaces. -->
        <operator activated="true" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="103" name="Execute R (2)" width="90" x="179" y="34">
          <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;&#10;# Entry point called by Execute R: returns per-id word counts and word co-occurrence counts.&#10;rm_main = function(data)&#10;{&#10;  # Fail early with a clear message when an expected column is absent.&#10;  missing_cols &lt;- setdiff(c('id', 'text'), names(data))&#10;  if (length(missing_cols) &gt; 0) {&#10;    stop(paste('Input is missing required column(s):', paste(missing_cols, collapse = ', ')))&#10;  }&#10;&#10;  # Coerce the text so tokenisation also works if the column arrives as a factor.&#10;  korpus &lt;- data_frame(id = data$id, text = as.character(data$text))&#10;  print(korpus)&#10;&#10;  # Tokenise once and reuse the tokens for both outputs.&#10;  tokens &lt;- korpus %&gt;%&#10;    unnest_tokens(word, text)&#10;&#10;  woerter &lt;- tokens %&gt;%&#10;    group_by(id) %&gt;%&#10;    count(word, sort = TRUE) %&gt;%&#10;    filter(n &gt;= 10)&#10;  print(woerter)&#10;  woerter &lt;- as.data.frame(woerter)&#10;&#10;  # The pipe ends here; a trailing pipe would feed the result into print() below.&#10;  cooccurre &lt;- tokens %&gt;%&#10;    pairwise_count(word, id, sort = TRUE)&#10;  # cooccurre &lt;- filter(cooccurre, n &gt;= 10)&#10;  print(cooccurre)&#10;  cooccurre &lt;- as.data.frame(cooccurre)&#10;&#10;  return(list(woerter, cooccurre))&#10;}&#10;"/>
        </operator>
        <!-- Collect: gathers both script outputs into one collection; "unfold" is enabled. -->
        <operator activated="true" class="collect" compatibility="8.1.001" expanded="true" height="103" name="Collect" width="90" x="447" y="34">
          <parameter key="unfold" value="true"/>
        </operator>
        <connect from_port="cluster subset" to_op="Execute R (2)" to_port="input 1"/>
        <connect from_op="Execute R (2)" from_port="output 1" to_op="Collect" to_port="input 1"/>
        <connect from_op="Execute R (2)" from_port="output 2" to_op="Collect" to_port="input 2"/>
        <connect from_op="Collect" from_port="collection" to_port="out 1"/>
        <portSpacing port="source_cluster subset" spacing="0"/>
        <portSpacing port="source_in 1" spacing="0"/>
        <portSpacing port="sink_out 1" spacing="0"/>
        <portSpacing port="sink_out 2" spacing="0"/>
      </process>
    </operator>
    <!-- Main chain: Crawler Spon, then Process Documents from Data (2), then
         Prepare Data, then Process Documents from Data, then Multiply. -->
    <connect from_op="Crawler Spon" from_port="out 1" to_op="Process Documents from Data (2)" to_port="example set"/>
    <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Prepare Data" to_port="in 1"/>
    <connect from_op="Prepare Data" from_port="out 1" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
    <!-- Multiply output 1 is delivered unchanged as result 1. -->
    <connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
    <!-- Multiply output 2 goes through Set Role and Loop Clusters and becomes result 2. -->
    <connect from_op="Multiply" from_port="output 2" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="Loop Clusters" to_port="example set"/>
    <connect from_op="Loop Clusters" from_port="out 1" to_port="result 2"/>
    <!-- Port spacing entries; presumably layout hints only (TODO confirm). -->
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <!-- Canvas annotation; its text content is escaped markup and must stay as written. -->
    <description align="center" color="yellow" colored="false" height="163" resized="true" width="444" x="10" y="480">Crawler &lt;br/&gt;</description>
    </process>
    </operator>
    </process>
Sign In or Register to comment.