Options

Create a co-occurrence graph

TobiasNehrigTobiasNehrig Member Posts: 41 Guru
edited November 2018 in Help

Hello Experts,

 

I have to create a co-occurrence graph with RapidMiner. I have already crawled a given web page and processed the files to create the text corpus, but now I have no idea how to create the co-occurrence graph.

 

My Question is:

Is there a way to create a co-occurrence graph directly in RapidMiner from a text corpus, or is there a better way using an R script? If so, which approach would that be?

 

This is my code so far:

 

 

<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process definition.
  Pipeline as wired below: Crawl Web, Get Pages, Process Documents from Data,
  then two branches (word list and example set) whose intermediate results are
  written to text files and forwarded to the process result ports.
-->
<process version="7.6.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <parameter key="logfile" value="/home/knecht/Master2017/Rapp/Logfile.log"/>
    <parameter key="resultfile" value="/home/knecht/Master2017/Rapp/resultfile.res"/>
    <process expanded="true">

      <!-- Crawl starting at main.html; store and follow only URLs under .../disshtml.
           Limits: depth 10, 1000 pages. Pages are also written to output_dir.
           Note: ignore_robot_exclusion is enabled. -->
      <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
        <parameter key="url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml/main/main.html"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml/.*"/>
          <parameter key="follow_link_with_matching_url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml.*"/>
        </list>
        <parameter key="max_crawl_depth" value="10"/>
        <parameter key="retrieve_as_html" value="true"/>
        <parameter key="add_content_as_attribute" value="true"/>
        <parameter key="write_pages_to_disk" value="true"/>
        <parameter key="output_dir" value="/home/knecht/Crawler"/>
        <parameter key="max_pages" value="1000"/>
        <parameter key="max_page_size" value="500"/>
        <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"/>
        <parameter key="ignore_robot_exclusion" value="true"/>
      </operator>

      <!-- NOTE(review): link_attribute ("Link") and page_attribute ("link") differ only in case; confirm this is intended. -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="45" y="136">
        <parameter key="link_attribute" value="Link"/>
        <parameter key="page_attribute" value="link"/>
        <parameter key="random_user_agent" value="true"/>
      </operator>

      <!-- Text preprocessing. Inner chain as connected below:
           Extract Content, Tokenize non letters, Tokenize Sätze, Tokenize linguistic token,
           Filter Stopwords (German), Transform Cases. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="45" y="289">
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="percentual"/>
        <list key="specify_weights">
          <parameter key="link" value="1.0"/>
        </list>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="45" y="34">
            <parameter key="minimum_text_block_length" value="2"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize non letters" width="90" x="45" y="136"/>
          <!-- NOTE(review): named "Sätze" (sentences) but configured with the same mode as the
               following operator ("linguistic tokens"); confirm whether a sentence mode was intended. -->
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize Sätze" width="90" x="45" y="238">
            <parameter key="mode" value="linguistic tokens"/>
            <parameter key="language" value="German"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize linguistic token" width="90" x="45" y="340">
            <parameter key="mode" value="linguistic tokens"/>
            <parameter key="language" value="German"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_german" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (German)" width="90" x="45" y="437"/>
          <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="34"/>

          <!-- Inner wiring: a single linear chain from the source document to sink "document 1". -->
          <connect from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_op="Tokenize non letters" to_port="document"/>
          <connect from_op="Tokenize non letters" from_port="document" to_op="Tokenize Sätze" to_port="document"/>
          <connect from_op="Tokenize Sätze" from_port="document" to_op="Tokenize linguistic token" to_port="document"/>
          <connect from_op="Tokenize linguistic token" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Word-list branch: convert the word list to data and write both outputs to files. -->
      <operator activated="true" class="text:wordlist_to_data" compatibility="7.5.000" expanded="true" height="82" name="WordList to Data" width="90" x="179" y="289"/>
      <operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Wordlist exa" width="90" x="581" y="442">
        <parameter key="result_file" value="/home/knecht/Korpus/Rapp_Wordlist_exa.res"/>
      </operator>
      <operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Wordlist" width="90" x="581" y="340">
        <parameter key="result_file" value="/home/knecht/Korpus/Rapp_Wordlist.res"/>
      </operator>

      <!-- Example-set branch: the processed example set is split by "Multiply". -->
      <operator activated="true" class="multiply" compatibility="7.6.001" expanded="true" height="103" name="Multiply" width="90" x="179" y="136"/>
      <operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Process Documents" width="90" x="581" y="238">
        <parameter key="result_file" value="/home/knecht/Korpus/Rapp_Proccess_Documents.res"/>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="179" y="34">
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights">
          <parameter key="text" value="1.0"/>
        </list>
      </operator>
      <!-- NOTE(review): this operator is a Multiply despite its name, which is easy to
           confuse with the "Data to Documents" operator directly above. -->
      <operator activated="true" class="multiply" compatibility="7.6.001" expanded="true" height="103" name="Data to Document" width="90" x="313" y="34"/>
      <!-- NOTE(review): text_attribute and label_attribute are both set to "text"; confirm this is intended. -->
      <operator activated="true" class="text:documents_to_data" compatibility="7.5.000" expanded="true" height="82" name="Documents to Data" width="90" x="447" y="136">
        <parameter key="text_attribute" value="text"/>
        <parameter key="label_attribute" value="text"/>
        <parameter key="data_management" value="memory-optimized"/>
      </operator>
      <operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Documents to Data" width="90" x="581" y="136">
        <parameter key="result_file" value="/home/knecht/Korpus/Rapp_Document_to_Data.res"/>
      </operator>
      <operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Korpus" width="90" x="581" y="34">
        <parameter key="result_file" value="/home/knecht/Korpus/Rapp_Corpus.res"/>
      </operator>

      <!-- Top-level wiring. Each writer forwards its "input 1" port to one of the result ports 1 to 5. -->
      <connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="word list" to_op="Write Wordlist" to_port="input 1"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Write Wordlist exa" to_port="input 1"/>
      <connect from_op="Write Wordlist exa" from_port="input 1" to_port="result 5"/>
      <connect from_op="Write Wordlist" from_port="input 1" to_port="result 4"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Write Process Documents" to_port="input 1"/>
      <connect from_op="Write Process Documents" from_port="input 1" to_port="result 3"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Data to Document" to_port="input"/>
      <connect from_op="Data to Document" from_port="output 1" to_op="Write Korpus" to_port="input 1"/>
      <connect from_op="Data to Document" from_port="output 2" to_op="Documents to Data" to_port="documents 1"/>
      <connect from_op="Documents to Data" from_port="example set" to_op="Write Documents to Data" to_port="input 1"/>
      <connect from_op="Write Documents to Data" from_port="input 1" to_port="result 2"/>
      <connect from_op="Write Korpus" from_port="input 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
      <portSpacing port="sink_result 6" spacing="0"/>
    </process>
  </operator>
</process>

 

 

Thanks for the help!

 

Tobias

 

 

Best Answer

Answers

  • Options
    sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager

    Hello @TobiasNehrig - that's a good question.  Not to my knowledge.  However, you can easily take your R script and use the Execute R operator to create the graph (download the R extension from the marketplace).


    Scott

     

  • Options
    TobiasNehrigTobiasNehrig Member Posts: 41 Guru

    Hi,

    thanks for the rapid reply. I thought so but I hoped for an easier way.

     

    Tobias

Sign In or Register to comment.