RSS Feeds Classification Process

montaqi Member Posts: 10 Contributor II
edited August 2019 in Help
I am working on a process that reads BBC World News from its RSS URL and then predicts a positive/negative label for each news item, as well as for the news overall.

I have built two text files containing positive and negative words. My XML is below, but the result is not what I want: the process ends up predicting labels for only 4 words, which is very strange. What am I doing wrong?

<process version="5.1.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
    <process expanded="true" height="460" width="614">
      <operator activated="true" class="text:process_document_from_file" compatibility="5.1.001" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="positive" value="C:\Documents and Settings\TU001YU\Desktop\positive"/>
          <parameter key="negative" value="C:\Documents and Settings\TU001YU\Desktop\negative"/>
        </list>
        <process expanded="true" height="524" width="806">
          <operator activated="true" class="text:transform_cases" compatibility="5.1.001" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="75"/>
          <operator activated="true" class="text:tokenize" compatibility="5.1.001" expanded="true" height="60" name="Tokenize" width="90" x="313" y="75">
            <parameter key="mode" value="linguistic tokens"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="web:read_rss" compatibility="5.1.000" expanded="true" height="60" name="Read RSS Feed" width="90" x="45" y="345">
        <parameter key="url" value="http://feeds.bbci.co.uk/news/world/rss.xml"/>
        <parameter key="random_user_agent" value="true"/>
      </operator>
      <operator activated="true" class="text:process_document_from_data" compatibility="5.1.001" expanded="true" height="76" name="Process Documents from Data" width="90" x="246" y="255">
        <list key="specify_weights"/>
        <process expanded="true" height="524" width="806">
          <operator activated="true" class="text:transform_cases" compatibility="5.1.001" expanded="true" height="60" name="Transform Cases (2)" width="90" x="45" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.1.001" expanded="true" height="60" name="Tokenize (2)" width="90" x="179" y="30"/>
          <operator activated="true" class="text:stem_porter" compatibility="5.1.001" expanded="true" height="60" name="Stem (Porter)" width="90" x="313" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.1.001" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="447" y="30"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.1.001" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="581" y="30">
            <parameter key="min_chars" value="2"/>
            <parameter key="max_chars" value="99"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
          <connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:wordlist_to_data" compatibility="5.1.001" expanded="true" height="76" name="WordList to Data" width="90" x="380" y="165"/>
      <operator activated="true" class="naive_bayes_kernel" compatibility="5.1.006" expanded="true" height="76" name="Naive Bayes (Kernel)" width="90" x="246" y="30"/>
      <operator activated="true" class="apply_model" compatibility="5.1.006" expanded="true" height="76" name="Apply Model" width="90" x="447" y="30">
        <list key="application_parameters"/>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Naive Bayes (Kernel)" to_port="training set"/>
      <connect from_op="Process Documents from Files" from_port="word list" to_op="Process Documents from Data" to_port="word list"/>
      <connect from_op="Read RSS Feed" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 3"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Naive Bayes (Kernel)" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
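
For readers less familiar with the Text Processing operators: the inner chain of the Process Documents from Data operator (Transform Cases → Tokenize → Stem (Porter) → Filter Stopwords (English) → Filter Tokens (by Length)) corresponds roughly to the following Python sketch. NLTK is used here only as an illustrative stand-in; the RapidMiner operators may differ in detail.

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

nltk.download("punkt", quiet=True)      # tokenizer model
nltk.download("stopwords", quiet=True)  # English stopword list

stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    # Transform Cases + Tokenize
    tokens = nltk.word_tokenize(text.lower())
    # Stem (Porter) -- note the process stems *before* removing
    # stopwords, so the same order is kept here
    tokens = [stemmer.stem(t) for t in tokens]
    # Filter Stopwords (English)
    tokens = [t for t in tokens if t not in stop_words]
    # Filter Tokens (by Length): min_chars=2, max_chars=99
    return [t for t in tokens if 2 <= len(t) <= 99]

print(preprocess("Markets are falling sharply after the announcement."))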

Answers

  • land RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 2,531 Unicorn
    Hi,
    Sorry, but I think this process does not make sense as it stands...

    You are building a Naive Bayes model that will essentially just weigh positive words against negative ones. That's fine if you only want to classify according to these prepared word lists...

    But then you retrieve an RSS feed and process it according to the word list built from your two word-list files, only to throw away the resulting ExampleSet and classify the incoming word list instead. That is pointless; use the ExampleSet instead.

    Anyway, this is not how data mining is usually applied. It would be much easier for you to write a script that simply counts the occurrences of your words (see the sketch at the end of this post). Normally you would label a small sample of posts as positive or negative; the system then learns automatically which words are positive and which are negative. Much more flexible...

    Greetings,
      Sebastian
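
    A minimal Python sketch of the counting approach described above (the tiny inline word lists are placeholders; in practice you would load them from files like the ones used in the process):

    POSITIVE = {"hope", "peace", "win", "growth", "success"}
    NEGATIVE = {"crisis", "war", "loss", "fail", "attack"}

    def score(text):
        # Count matches against each lexicon and compare.
        tokens = text.lower().split()
        pos = sum(t in POSITIVE for t in tokens)
        neg = sum(t in NEGATIVE for t in tokens)
        if pos > neg:
            return "positive"
        if neg > pos:
            return "negative"
        return "neutral"

    for headline in ["Peace deal brings hope to region",
                     "Crisis deepens as talks fail"]:
        print(headline, "->", score(headline))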
  • montaqi Member Posts: 10 Contributor II
    Thank you for your reply, Sebastian. I am new to text mining, but I am trying to learn. I built this process following another post on this forum:
    http://rapid-i.com/rapidforum/index.php/topic,3078.0.html

    If I can classify text files using this model, why can't I classify RSS feeds using the same model?
  • Marco_Boeck Administrator, Moderator, Employee, Member, University Professor Posts: 1,995 RM Engineering
    Hi,

    You are using the word list for classification instead of the ExampleSet produced by your lower Process Documents operator.
    I corrected your process:

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="5.1.008">
     <context>
       <input/>
       <output/>
       <macros/>
     </context>
     <operator activated="true" class="process" compatibility="5.1.008" expanded="true" name="Process">
       <process expanded="true" height="346" width="567">
         <operator activated="true" class="text:process_document_from_file" compatibility="5.1.001" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="30">
           <list key="text_directories">
             <parameter key="positive" value="C:\Documents and Settings\TU001YU\Desktop\positive"/>
             <parameter key="negative" value="C:\Documents and Settings\TU001YU\Desktop\negative"/>
           </list>
           <process expanded="true" height="524" width="806">
             <operator activated="true" class="text:transform_cases" compatibility="5.1.001" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="75"/>
             <operator activated="true" class="text:tokenize" compatibility="5.1.001" expanded="true" height="60" name="Tokenize" width="90" x="313" y="75">
               <parameter key="mode" value="linguistic tokens"/>
             </operator>
             <connect from_port="document" to_op="Transform Cases" to_port="document"/>
             <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
             <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
             <portSpacing port="source_document" spacing="0"/>
             <portSpacing port="sink_document 1" spacing="0"/>
             <portSpacing port="sink_document 2" spacing="0"/>
           </process>
         </operator>
         <operator activated="true" class="web:read_rss" compatibility="5.1.000" expanded="true" height="60" name="Read RSS Feed" width="90" x="45" y="165">
           <parameter key="url" value="http://feeds.bbci.co.uk/news/world/rss.xml"/>
           <parameter key="random_user_agent" value="true"/>
         </operator>
         <operator activated="true" class="text:process_document_from_data" compatibility="5.1.001" expanded="true" height="76" name="Process Documents from Data" width="90" x="246" y="120">
           <list key="specify_weights"/>
           <process expanded="true" height="524" width="806">
             <operator activated="true" class="text:transform_cases" compatibility="5.1.001" expanded="true" height="60" name="Transform Cases (2)" width="90" x="45" y="30"/>
             <operator activated="true" class="text:tokenize" compatibility="5.1.001" expanded="true" height="60" name="Tokenize (2)" width="90" x="179" y="30"/>
             <operator activated="true" class="text:stem_porter" compatibility="5.1.001" expanded="true" height="60" name="Stem (Porter)" width="90" x="313" y="30"/>
             <operator activated="true" class="text:filter_stopwords_english" compatibility="5.1.001" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="447" y="30"/>
             <operator activated="true" class="text:filter_by_length" compatibility="5.1.001" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="581" y="30">
               <parameter key="min_chars" value="2"/>
               <parameter key="max_chars" value="99"/>
             </operator>
             <connect from_port="document" to_op="Transform Cases (2)" to_port="document"/>
             <connect from_op="Transform Cases (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
             <connect from_op="Tokenize (2)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
             <connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
             <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
             <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
             <portSpacing port="source_document" spacing="0"/>
             <portSpacing port="sink_document 1" spacing="0"/>
             <portSpacing port="sink_document 2" spacing="0"/>
           </process>
         </operator>
         <operator activated="true" class="naive_bayes_kernel" compatibility="5.1.008" expanded="true" height="76" name="Naive Bayes (Kernel)" width="90" x="246" y="30"/>
         <operator activated="true" class="apply_model" compatibility="5.1.008" expanded="true" height="76" name="Apply Model" width="90" x="447" y="30">
           <list key="application_parameters"/>
         </operator>
         <connect from_op="Process Documents from Files" from_port="example set" to_op="Naive Bayes (Kernel)" to_port="training set"/>
         <connect from_op="Process Documents from Files" from_port="word list" to_op="Process Documents from Data" to_port="word list"/>
         <connect from_op="Read RSS Feed" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
         <connect from_op="Process Documents from Data" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
         <connect from_op="Naive Bayes (Kernel)" from_port="model" to_op="Apply Model" to_port="model"/>
         <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
         <connect from_op="Apply Model" from_port="model" to_port="result 2"/>
         <portSpacing port="source_input 1" spacing="0"/>
         <portSpacing port="sink_result 1" spacing="0"/>
         <portSpacing port="sink_result 2" spacing="0"/>
         <portSpacing port="sink_result 3" spacing="0"/>
       </process>
     </operator>
    </process>
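
    The important detail in the corrected process is the connection from Process Documents from Data straight into Apply Model: the unseen RSS data is vectorized with the same word list the model was trained on, so the features line up. The same pattern in Python terms, as a rough scikit-learn sketch (MultinomialNB stands in for the kernel Naive Bayes here, and the example texts are made up):

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB

    train_texts = ["great wonderful success", "terrible awful failure"]
    train_labels = ["positive", "negative"]

    # Learn the vocabulary from the training documents
    # (the analogue of the "word list" output).
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train_texts)
    model = MultinomialNB().fit(X_train, train_labels)

    # New documents are transformed with the SAME fitted vectorizer,
    # mirroring the word list fed into Process Documents from Data.
    new_texts = ["wonderful news of success", "awful day of failure"]
    X_new = vectorizer.transform(new_texts)  # reuse the vocabulary, don't refit
    print(model.predict(X_new))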
    If you're interested in text mining, you may want to check out our webinars, for example this one: http://rapid-i.com/component/page,shop.product_details/flypage,garden_flypage.tpl/product_id,63/category_id,16/option,com_virtuemart/Itemid,180/vmcchk,1/lang,en/

    Regards,
    Marco