Cleaning twitter data

ilzeilze Member Posts: 2 Contributor I
edited November 2018 in Help

I'm new to RapidMiner, and I am struggling to understand how the Filter operators can be used to clean up Twitter feeds. I am importing the tweets from a CSV file and am trying to create sub-processes within the Process Documents operator to remove Twitter handles (@), RT markers and hashtags. For example, I have tried using Filter Tokens (by Content) with the condition set to "contains" and the string set to @. Although the process runs without errors, I cannot see in the results that the Twitter handles were removed. Can anybody please advise on how to go about cleaning up the data?

Tagged:

Answers

  • Thomas_OttThomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn

    When you load the tweets from a CSV file, they come in as a Nominal data type. To use Filter Tokens (by Content), you need to convert those tweets to the Text data type with a Nominal to Text operator.

     

    Here's a sample using the Search Twitter operator that does some cleaning.

     

    <?xml version="1.0" encoding="UTF-8"?>
    <process version="7.5.000">
      <!--
        Sample RapidMiner process: search Twitter for the "keywords" macro,
        mark hashtags, build a word list from the tweet text and write the
        word list, sorted by the "total" attribute, to an Excel file.
      -->
      <context>
        <input/>
        <output/>
        <macros>
          <!-- Referenced below as %{keywords} in the Twitter query and in the Excel file name. -->
          <macro>
            <key>keywords</key>
            <value>Donald Trump</value>
          </macro>
        </macros>
      </context>
      <operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Step 1: fetch tweets. "connection" names a saved Twitter connection; presumably it must be replaced with one of your own. -->
          <operator activated="true" class="social_media:search_twitter" compatibility="7.3.000" expanded="true" height="68" name="Search Twitter" width="90" x="45" y="34">
            <parameter key="connection" value="ThomasOtt"/>
            <parameter key="query" value="%{keywords}"/>
            <parameter key="limit" value="1000"/>
            <parameter key="language" value="en"/>
          </operator>
          <!-- Step 2: keep only the Text, Id and Retweet-Count attributes. -->
          <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="Text|Id|Retweet-Count"/>
          </operator>
          <!--
            Step 3: rewrite "#tag" as "hashtag_tag".
            The capture group is limited to word characters so that every
            hashtag in a tweet is rewritten individually. The earlier pattern
            #(.*) is greedy: it matched from the first '#' to the end of the
            line, so only the first hashtag received the prefix.
          -->
          <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="Replace" width="90" x="313" y="34">
            <parameter key="replace_what" value="#(\w+)"/>
            <parameter key="replace_by" value="hashtag_$1"/>
          </operator>
          <!-- Step 4: convert the Text attribute from Nominal to Text so the text operators below can process it. -->
          <operator activated="true" class="nominal_to_text" compatibility="7.5.000" expanded="true" height="82" name="Nominal to Text" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Text"/>
          </operator>
          <!-- Step 5: text processing; uses percentual pruning with prune_above_percent set to 50.0. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="34">
            <parameter key="prune_method" value="percentual"/>
            <parameter key="prune_above_percent" value="50.0"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <!-- Tokenize, Transform Cases and Filter Tokens (by Length) set no parameters here, so they run with their defaults. -->
              <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
              <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
              <operator activated="true" class="text:filter_by_length" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="313" y="34"/>
              <!-- Map the URL scheme tokens "https" and "http" to the placeholder "link". -->
              <operator activated="true" class="text:replace_tokens" compatibility="7.4.001" expanded="true" height="68" name="Replace Tokens" width="90" x="447" y="34">
                <list key="replace_dictionary">
                  <parameter key="https" value="link"/>
                  <parameter key="http" value="link"/>
                </list>
              </operator>
              <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.4.001" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="581" y="34"/>
              <!-- Inverted content filter on the string "link"; presumably this drops the "link" placeholder tokens (the condition parameter is left at its default; verify). -->
              <operator activated="true" class="text:filter_tokens_by_content" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Content)" width="90" x="715" y="34">
                <parameter key="string" value="link"/>
                <parameter key="invert condition" value="true"/>
              </operator>
              <operator activated="true" class="text:filter_stopwords_english" compatibility="7.4.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="849" y="34"/>
              <!-- Inner wiring: the operators above are chained in the order listed. -->
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Replace Tokens" to_port="document"/>
              <connect from_op="Replace Tokens" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
              <connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
              <connect from_op="Filter Tokens (by Content)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Step 6: turn the word list into an example set and sort it by "total", decreasing. -->
          <operator activated="true" class="text:wordlist_to_data" compatibility="7.4.001" expanded="true" height="82" name="WordList to Data" width="90" x="715" y="85"/>
          <operator activated="true" class="sort" compatibility="7.5.000" expanded="true" height="82" name="Sort" width="90" x="849" y="85">
            <parameter key="attribute_name" value="total"/>
            <parameter key="sorting_direction" value="decreasing"/>
          </operator>
          <!-- Step 7: write the sorted word list to Excel. The path is specific to the author's machine; adjust it before running. -->
          <operator activated="true" class="write_excel" compatibility="7.5.000" expanded="true" height="82" name="Write Excel" width="90" x="983" y="85">
            <parameter key="excel_file" value="C:\Users\ThomasOtt\Desktop\Important Twitter Words for %{keywords}.xlsx"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <!-- Outer wiring: result 1 is the processed example set, result 2 is the sorted word list passed through Write Excel. -->
          <connect from_op="Search Twitter" from_port="output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Replace" to_port="example set input"/>
          <connect from_op="Replace" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
          <connect from_op="WordList to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
          <connect from_op="Sort" from_port="example set output" to_op="Write Excel" to_port="input"/>
          <connect from_op="Write Excel" from_port="through" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>

     

     

Sign In or Register to comment.