Problems with an execution of an r-script

TobiasNehrigTobiasNehrig Member Posts: 41 Guru
edited November 2018 in Help

Hi,

I 've some problems to integrate my r-script in RapidMiner. My script is running in RStudio with the data from RapidMiner process, which I have wrote in an res-file.

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
<parameter key="logfile" value="/home/knecht/Master2017/Rapp/Logfile.log"/>
<parameter key="resultfile" value="/home/knecht/Master2017/Rapp/resultfile.res"/>
<process expanded="true">
<operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
<parameter key="url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml/main/main.html"/>
<list key="crawling_rules">
<parameter key="store_with_matching_url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml/.*"/>
<parameter key="follow_link_with_matching_url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml.*"/>
</list>
<parameter key="max_crawl_depth" value="10"/>
<parameter key="retrieve_as_html" value="true"/>
<parameter key="add_content_as_attribute" value="true"/>
<parameter key="write_pages_to_disk" value="true"/>
<parameter key="output_dir" value="/home/knecht/Crawler"/>
<parameter key="max_pages" value="1000"/>
<parameter key="max_page_size" value="500"/>
<parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"/>
<parameter key="ignore_robot_exclusion" value="true"/>
</operator>
<operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="45" y="136">
<parameter key="link_attribute" value="Link"/>
<parameter key="page_attribute" value="link"/>
<parameter key="random_user_agent" value="true"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="45" y="238">
<parameter key="keep_text" value="true"/>
<list key="specify_weights">
<parameter key="link" value="1.0"/>
</list>
<process expanded="true">
<operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="45" y="34">
<parameter key="minimum_text_block_length" value="2"/>
</operator>
<operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize Token" width="90" x="45" y="136">
<parameter key="mode" value="linguistic tokens"/>
<parameter key="language" value="German"/>
</operator>
<operator activated="true" class="text:filter_stopwords_german" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (German)" width="90" x="45" y="238"/>
<operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="85">
<parameter key="mode" value="regular expression"/>
<parameter key="expression" value="[-!&quot;#$%&amp;'()*+,./:;&lt;=&gt;?@\[\\\]_`{|}~]([a-z]+)[-!&quot;#$%&amp;'()*+,./:;&lt;=&gt;?@\[\\\]_`{|}~] ^[0-9]+[-!&quot;#$%&amp;'()*+,./:;&lt;=&gt;?@\[\\\]_`{|}~]^[0-9] "/>
</operator>
<operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="34"/>
<connect from_port="document" to_op="Extract Content" to_port="document"/>
<connect from_op="Extract Content" from_port="document" to_op="Tokenize Token" to_port="document"/>
<connect from_op="Tokenize Token" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
<connect from_op="Filter Stopwords (German)" from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="45" y="340">
<parameter key="select_attributes_and_weights" value="true"/>
<list key="specify_weights">
<parameter key="text" value="1.0"/>
</list>
</operator>
<operator activated="true" class="multiply" compatibility="7.6.001" expanded="true" height="103" name="Data to Document" width="90" x="179" y="136"/>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Execute R" width="90" x="380" y="187">
<parameter key="script" value="# rm_main is a mandatory function, &#10;# the number of arguments has to be the number of input ports (can be none)&#10;rm_main = function(data)&#10;{&#10;library(readr)&#10;library(dplyr)&#10;library(tidytext)&#10;library(tm)&#10;library(tidyr)&#10;library(stringr)&#10;library(widyr)&#10;library(ggraph)&#10;library(igraph)&#10;set.seed(2017)&#10;&#10;#KorpusMitZahlen &lt;- read_file(&quot;/home/knecht/Master2017/Korpus/17-12-03-Rapp-Korpus.res&quot;)&#10;KorpusOhneZahlen &lt;- removeNumbers(data)&#10;&#10;Korpus_DF &lt;- data_frame(text=KorpusOhneZahlen)&#10;&#10;GesamtTermAnzahl &lt;- Korpus_DF %&gt;%&#10; unnest_tokens(word, text)&#10;GesamtTermAnzahl&#10;write.csv(GesamtTermAnzahl, '/home/knecht/Master2017/Wortlisten/17-12-10-Rapp-GesamtTermAnzahl-Liste')&#10;&#10;TermHaeufigkeit &lt;- Korpus_DF %&gt;%&#10; unnest_tokens(word, text) %&gt;%&#10; count (word, sort=TRUE)%&gt;%&#10; ungroup()&#10; &#10;TermHaeufigkeit&#10;write.csv(TermHaeufigkeit, '/home/knecht/Master2017/Wortlisten/17-12-10-Rapp-TermHaeufigkeit-Liste')&#10;&#10;#WortRang &lt;- TermHaeufigkeit %&gt;%&#10;# group_by(text) %&gt;%&#10;# mutate(rank = row_number(),&#10;# 'term frequenz' = /GesamtTermAnzahl)&#10;#WortRang&#10;&#10;NGramKorpus &lt;-Korpus_DF %&gt;%&#10; unnest_tokens(ngram, text, token = &quot;ngrams&quot;, n = 2)&#10;NGramKorpus %&gt;%&#10; count(ngram)&#10;NGramKorpusTeilen &lt;- NGramKorpus %&gt;%&#10; separate(ngram, c(&quot;word1&quot;, &quot;word2&quot;))&#10;NGramZaehlen &lt;- NGramKorpusTeilen %&gt;%&#10; count(word1, word2, sort=TRUE)&#10;NGramZaehlen&#10;#write.csv(NGramZaehlen, '/home/knecht/Master2017/N-Gramme-Listen/17-12-06-Spon-NGram-Liste')&#10;write.csv(NGramZaehlen, '/home/knecht/Master2017/N-Gramme-Listen/17-12-10-Rapp-NGram-Liste')&#10;&#10;NGramZaehlen%&gt;%&#10; filter(n&gt;= 20) %&gt;%&#10; filter(n&lt;= 750) %&gt;%&#10; graph_from_data_frame() %&gt;%&#10; ggraph(layout = &quot;igraph&quot;, algorithm= 'fr') +&#10; geom_edge_link(aes(alpha = n, width = n)) +&#10; geom_node_point(size = 2, color = &quot;lightblue&quot;) +&#10; geom_node_text(aes(label = name), repel = TRUE) +&#10; theme_void()&#10;&#10;}&#10;"/>
</operator>
<operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Korpus" width="90" x="380" y="34">
<parameter key="result_file" value="/home/knecht/Master2017/Korpus/17-12-11-Rapp-Korpus.res"/>
</operator>
<connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Data to Documents" to_port="example set"/>
<connect from_op="Data to Documents" from_port="documents" to_op="Data to Document" to_port="input"/>
<connect from_op="Data to Document" from_port="output 1" to_op="Write Korpus" to_port="input 1"/>
<connect from_op="Data to Document" from_port="output 2" to_op="Execute R" to_port="input 1"/>
<connect from_op="Execute R" from_port="output 1" to_port="result 2"/>
<connect from_op="Write Korpus" from_port="input 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>

 

R-Script (RStudio):

library(readr)
library(dplyr)
library(tidytext)
library(tm)
library(tidyr)
library(stringr)
library(widyr)
library(ggraph)
library(igraph)
set.seed(2017)

KorpusMitZahlen <- read_file("/home/knecht/Korpus/17-12-03-Rapp-Korpus.res")
KorpusOhneZahlen <- removeNumbers(KorpusMitZahlen)

Korpus_DF <- data_frame(text=KorpusOhneZahlen)

GesamtTermAnzahl <- Korpus_DF %>%
unnest_tokens(word, text)
GesamtTermAnzahl
write.csv(GesamtTermAnzahl, '/home/knecht/Wortlisten/17-12-10-Rapp-GesamtTermAnzahl-Liste')

TermHaeufigkeit <- Korpus_DF %>%
unnest_tokens(word, text) %>%
count (word, sort=TRUE)%>%
ungroup()

TermHaeufigkeit
write.csv(TermHaeufigkeit, '/home/knecht/Wortlisten/17-12-10-Rapp-TermHaeufigkeit-Liste')

NGramKorpus <-Korpus_DF %>%
unnest_tokens(ngram, text, token = "ngrams", n = 2)
NGramKorpus %>%
count(ngram)
NGramKorpusTeilen <- NGramKorpus %>%
separate(ngram, c("word1", "word2"))
NGramZaehlen <- NGramKorpusTeilen %>%
count(word1, word2, sort=TRUE)
NGramZaehlen
#write.csv(NGramZaehlen, '/home/knecht/N-Gramme-Listen/17-12-06-Spon-NGram-Liste')
write.csv(NGramZaehlen, '/home/knecht/N-Gramme-Listen/17-12-10-Rapp-NGram-Liste')

NGramZaehlen%>%
filter(n>= 20) %>%
filter(n<= 750) %>%
graph_from_data_frame() %>%
ggraph(layout = "igraph", algorithm= 'fr') +
geom_edge_link(aes(alpha = n, width = n)) +
geom_node_point(size = 2, color = "lightblue") +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()

But when I paste my script the Execute-R operator, it won't work with the same input data. I've got the error massage wrong data at port. I thought the Execute-R runs only the r-script.

# rm_main is a mandatory function, 
# the number of arguments has to be the number of input ports (can be none)
rm_main = function(data)
{
library(readr)
library(dplyr)
library(tidytext)
library(tm)
library(tidyr)
library(stringr)
library(widyr)
library(ggraph)
library(igraph)
set.seed(2017)

#KorpusMitZahlen <- read_file("/home/knecht/Korpus/17-12-03-Rapp-Korpus.res")
KorpusOhneZahlen <- removeNumbers(data)

Korpus_DF <- data_frame(text=KorpusOhneZahlen)

GesamtTermAnzahl <- Korpus_DF %>%
unnest_tokens(word, text)
GesamtTermAnzahl
write.csv(GesamtTermAnzahl, '/home/knecht/Wortlisten/17-12-10-Rapp-GesamtTermAnzahl-Liste')

TermHaeufigkeit <- Korpus_DF %>%
unnest_tokens(word, text) %>%
count (word, sort=TRUE)%>%
ungroup()

TermHaeufigkeit
write.csv(TermHaeufigkeit, '/home/knecht/Wortlisten/17-12-10-Rapp-TermHaeufigkeit-Liste')

#WortRang <- TermHaeufigkeit %>%
# group_by(text) %>%
# mutate(rank = row_number(),
# 'term frequenz' = /GesamtTermAnzahl)
#WortRang

NGramKorpus <-Korpus_DF %>%
unnest_tokens(ngram, text, token = "ngrams", n = 2)
NGramKorpus %>%
count(ngram)
NGramKorpusTeilen <- NGramKorpus %>%
separate(ngram, c("word1", "word2"))
NGramZaehlen <- NGramKorpusTeilen %>%
count(word1, word2, sort=TRUE)
NGramZaehlen
#write.csv(NGramZaehlen, '/home/knecht/N-Gramme-Listen/17-12-06-Spon-NGram-Liste')
write.csv(NGramZaehlen, '/home/knecht/N-Gramme-Listen/17-12-10-Rapp-NGram-Liste')

NGramZaehlen%>%
filter(n>= 20) %>%
filter(n<= 750) %>%
graph_from_data_frame() %>%
ggraph(layout = "igraph", algorithm= 'fr') +
geom_edge_link(aes(alpha = n, width = n)) +
geom_node_point(size = 2, color = "lightblue") +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
}

Maybe someone has an idea.

regards

Tobias

Answers

  • Pavithra_RaoPavithra_Rao Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 123 RM Data Scientist

    Hi Tobias,

     

    When I tried to run the following process in my RM Studio I found that you are converting Data to documents and then inputting more than one outputs to Execute R script; whereas your execute R is coded to take only one data input "rm_main = function(data)"

    Either you could skip data to documents operator and input example set output directly to execute R operator OR define R script code to take all the document input.

     

    Also direct copy paste of R code from "R Studio" would need some modifications in Execute R Script. For eg: you would need a return function in Execute R script to get results at the output port.

     

    You could check the help section of the Execute R operator for few sample process on how to handle input and output of data here.

     

    Hope this helps!

     

    Cheers,

     
  • TobiasNehrigTobiasNehrig Member Posts: 41 Guru

    Hi Pavithra,

     

    Thanks for your help. I've tried what you wrote and skiped the data to documents operator and insert the select attribute operator to hand over only the column 'text'.  But unfourtunatly now the script stops with the Message:

    Dec 13, 2017 4:31:08 AM INFO: [1] "Failed to execute the script."
    Dec 13, 2017 4:31:08 AM INFO: [1] "unnest_tokens expects all columns of input to be atomic vectors (not lists)"

    In RStudio the skript runs without an error message and now it terminates.

    This ist my changed processes:

    <?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
            <parameter key="url" value="http://www.spiegel.de"/>
            <list key="crawling_rules">
              <parameter key="store_with_matching_url" value=".+www.spiegel.+"/>
              <parameter key="follow_link_with_matching_url" value=".+spiegel.+|.+de.+"/>
            </list>
            <parameter key="max_crawl_depth" value="10"/>
            <parameter key="retrieve_as_html" value="true"/>
            <parameter key="add_content_as_attribute" value="true"/>
            <parameter key="write_pages_to_disk" value="true"/>
            <parameter key="output_dir" value="/home/knecht/Master2017/Crawler/Spiegel"/>
            <parameter key="max_pages" value="5"/>
            <parameter key="max_page_size" value="300"/>
            <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"/>
            <parameter key="ignore_robot_exclusion" value="true"/>
          </operator>
          <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="179" y="34">
            <parameter key="link_attribute" value="Link"/>
            <parameter key="page_attribute" value="link"/>
            <parameter key="random_user_agent" value="true"/>
          </operator>
          <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="34">
            <parameter key="keep_text" value="true"/>
            <list key="specify_weights">
              <parameter key="link" value="1.0"/>
            </list>
            <process expanded="true">
              <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="45" y="34">
                <parameter key="minimum_text_block_length" value="2"/>
              </operator>
              <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize Token" width="90" x="45" y="136">
                <parameter key="mode" value="linguistic tokens"/>
                <parameter key="language" value="German"/>
              </operator>
              <operator activated="true" class="text:filter_tokens_by_content" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens a-zA-Z" width="90" x="45" y="238">
                <parameter key="condition" value="matches"/>
                <parameter key="regular_expression" value="[a-zA-Z]+"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="313" y="34"/>
              <connect from_port="document" to_op="Extract Content" to_port="document"/>
              <connect from_op="Extract Content" from_port="document" to_op="Tokenize Token" to_port="document"/>
              <connect from_op="Tokenize Token" from_port="document" to_op="Filter Tokens a-zA-Z" to_port="document"/>
              <connect from_op="Filter Tokens a-zA-Z" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="text"/>
          </operator>
          <operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Execute R" width="90" x="447" y="187">
            <parameter key="script" value="# rm_main is a mandatory function, &#10;# the number of arguments has to be the number of input ports (can be none)&#10;rm_main = function(data)&#10;{&#10;library(readr)&#10;library(dplyr)&#10;library(tidytext)&#10;library(tm)&#10;library(tidyr)&#10;library(stringr)&#10;library(widyr)&#10;library(ggraph)&#10;library(igraph)&#10;set.seed(2017)&#10;&#10;KorpusMitZahlen &lt;- readDataframe(data, &quot;de&quot;)&#10;#KorpusOhneZahlen &lt;- removeNumbers(KorpusMitZahlen)&#10;&#10;Korpus_DF &lt;- data_frame(text=KorpusMitZahlen)&#10;&#10;GesamtTermAnzahl &lt;- Korpus_DF %&gt;%&#10;  unnest_tokens(word, text)&#10;GesamtTermAnzahl&#10;&#10;return(GesamtTermAnzahl)&#10;&#10;}&#10;"/>
          </operator>
          <connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Execute R" to_port="input 1"/>
          <connect from_op="Execute R" from_port="output 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

    For the test i've also shorten the r-script and inserted a return:

     

    # rm_main is a mandatory function, 
    # the number of arguments has to be the number of input ports (can be none)
    rm_main = function(data)
    {
    library(readr)
    library(dplyr)
    library(tidytext)
    library(tm)
    library(tidyr)
    library(stringr)
    library(widyr)
    library(ggraph)
    library(igraph)
    set.seed(2017)

    KorpusMitZahlen <- readDataframe(data, "de")
    #KorpusOhneZahlen <- removeNumbers(KorpusMitZahlen)

    Korpus_DF <- data_frame(text=KorpusMitZahlen)

    GesamtTermAnzahl <- Korpus_DF %>%
    unnest_tokens(word, text)
    GesamtTermAnzahl

    return(GesamtTermAnzahl)

    }

    Regards,

    Tobias

Sign In or Register to comment.