unable to classify/learn text "sentences"

lavramulavramu Member Posts: 16 Contributor II
HI,

I am trying to classify some text using a learning algorithm. My input is a set of text files. I would like to use the sentences of the texts as the unit (not words), hence I use the wordlist output of "Process Documents from Files". While loading the files I give the label/class name and then tokenize using linguistic sentences. The example set retains the label attribute when I use the example set output. But with the wordlist output I get all the sentences in the rows and the label attribute is lost, so my process errors out before running with "Input example set must have special attribute 'label'" when I use the validation operator. Even when I feed the wordlist to the WordList to Data operator, it does not help.

The wordlist output has the sentences, a newly created attribute called "attribute name" with the same sentences as the data, the total occurrences, the document occurrences, and a 0 or 1 for each of the two labels/classes I gave, indicating presence. (My classes are low/high.) So there is no longer an attribute named label with the values low/high; instead I have two columns named low and high with values 0 or 1.

Hence it is erroring out. How do I retain the label attribute indicating the class to which each sentence belongs, so that I can use the validation operator for classifying?

Please can you help.
Basically I want to classify sentences by reading files, tokenizing them into sentences, and feeding them into a learning algorithm. Simple. Not words but whole sentences.


<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process from the question: reads text files from two directories,
  tokenizes them into linguistic sentences, converts the resulting word list to
  data and feeds that into a 5-fold cross-validation with Naive Bayes.
  According to the accompanying post, this wiring fails because the data
  derived from the word list has no special 'label' attribute.
-->
<process version="5.3.013">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
   <process expanded="true">
     <!-- Loads documents from the directories listed below and runs the nested text-processing chain on each document. -->
     <operator activated="true" class="text:process_document_from_file" compatibility="5.3.001" expanded="true" height="76" name="Process Documents from Files (2)" width="90" x="45" y="30">
       <!-- One entry per directory; the key is presumably the class name assigned to its files (the post says the class name is given while loading). -->
       <list key="text_directories">
         <parameter key="verylow" value="C:\Users\Uma\Desktop\nvivo\VeryLow"/>
         <parameter key="veryhigh" value="C:\Users\Uma\Desktop\nvivo\VeryHigh"/>
       </list>
       <parameter key="keep_text" value="true"/>
       <!-- Per-document chain: Transform Cases, Replace Tokens, Tokenize, Filter Stopwords (English), Stem (Porter). -->
       <process expanded="true">
         <operator activated="true" class="text:transform_cases" compatibility="5.3.001" expanded="true" height="60" name="Transform Cases" width="90" x="45" y="30"/>
         <!-- Each pattern listed in the dictionary is replaced by a single space. -->
         <operator activated="true" class="text:replace_tokens" compatibility="5.3.001" expanded="true" height="60" name="Replace Tokens" width="90" x="180" y="30">
           <list key="replace_dictionary">
             <parameter key="reference.*coverage" value=" "/>
             <parameter key="&lt;internals.*]" value=" "/>
             <parameter key="&lt;page&gt;" value=" "/>
           </list>
         </operator>
         <!-- Tokenizes in "linguistic sentences" mode, so each token is a sentence rather than a word. -->
         <operator activated="true" class="text:tokenize" compatibility="5.3.001" expanded="true" height="60" name="Tokenize" width="90" x="315" y="30">
           <parameter key="mode" value="linguistic sentences"/>
         </operator>
         <!-- NOTE(review): tokens here are whole sentences, so stopword filtering and stemming presumably have little effect; confirm. -->
         <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.001" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="450" y="30"/>
         <operator activated="true" class="text:stem_porter" compatibility="5.3.001" expanded="true" height="60" name="Stem (Porter)" width="90" x="514" y="30"/>
         <connect from_port="document" to_op="Transform Cases" to_port="document"/>
         <connect from_op="Transform Cases" from_port="document" to_op="Replace Tokens" to_port="document"/>
         <connect from_op="Replace Tokens" from_port="document" to_op="Tokenize" to_port="document"/>
         <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
         <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
         <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <!-- Converts the word list (not the example set) into an example set. -->
     <operator activated="true" class="text:wordlist_to_data" compatibility="5.3.001" expanded="true" height="76" name="WordList to Data" width="90" x="179" y="30"/>
     <!-- Cross-validation with 5 folds: the first subprocess trains Naive Bayes, the second applies the model and measures classification performance. -->
     <operator activated="true" class="x_validation" compatibility="5.3.013" expanded="true" height="112" name="Validation" width="90" x="447" y="210">
       <parameter key="number_of_validations" value="5"/>
       <!-- Training subprocess. -->
       <process expanded="true">
         <operator activated="true" class="naive_bayes" compatibility="5.3.013" expanded="true" height="76" name="Naive Bayes" width="90" x="45" y="30"/>
         <connect from_port="training" to_op="Naive Bayes" to_port="training set"/>
         <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
         <portSpacing port="source_training" spacing="0"/>
         <portSpacing port="sink_model" spacing="0"/>
         <portSpacing port="sink_through 1" spacing="0"/>
       </process>
       <!-- Testing subprocess. -->
       <process expanded="true">
         <operator activated="true" class="apply_model" compatibility="5.3.013" expanded="true" height="76" name="Apply Model" width="90" x="112" y="30">
           <list key="application_parameters"/>
         </operator>
         <operator activated="true" class="performance_classification" compatibility="5.3.013" expanded="true" height="76" name="Performance" width="90" x="179" y="210">
           <parameter key="skip_undefined_labels" value="false"/>
           <parameter key="use_example_weights" value="false"/>
           <list key="class_weights"/>
         </operator>
         <connect from_port="model" to_op="Apply Model" to_port="model"/>
         <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
         <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
         <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
         <portSpacing port="source_model" spacing="0"/>
         <portSpacing port="source_test set" spacing="0"/>
         <portSpacing port="source_through 1" spacing="0"/>
         <portSpacing port="sink_averagable 1" spacing="0"/>
         <portSpacing port="sink_averagable 2" spacing="0"/>
       </process>
     </operator>
     <!-- Wiring reported as failing: the "word list" port (not "example set") feeds WordList to Data, whose output becomes the Validation training input. -->
     <connect from_op="Process Documents from Files (2)" from_port="word list" to_op="WordList to Data" to_port="word list"/>
     <connect from_op="WordList to Data" from_port="example set" to_op="Validation" to_port="training"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
   </process>
 </operator>
</process>

Answers

  • MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    Hi,

    the wordlist cannot be used to solve the classification problem. It's merely a list of the words/tokens that appear in the document.

    If you want to classify the sentences, what you need to do is create a document from each single sentence. You can use the Cut Document operator for this, for example as done in the process below.

    Best regards,
    Marius
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      RapidMiner process from the answer: retrieves a stored data set, converts
      its "text" attribute to text type, and uses Cut Document inside Process
      Documents from Data to cut each document into sentence segments.
    -->
    <process version="5.3.013">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
        <description>Cuts documents into sentences.</description>
        <process expanded="true">
          <!-- Loads the example data from the local repository; replace the entry with your own data. -->
          <operator activated="true" class="retrieve" compatibility="5.3.013" expanded="true" height="60" name="Retrieve 00 - HotelCustomerSatisfaction_en" width="90" x="45" y="165">
            <parameter key="repository_entry" value="//LocalRepository/Training/Text Mining/Data/00 - HotelCustomerSatisfaction_en"/>
          </operator>
          <!-- Converts only the single attribute named "text". -->
          <operator activated="true" class="nominal_to_text" compatibility="5.3.013" expanded="true" height="76" name="Nominal to Text" width="90" x="179" y="120">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="text"/>
          </operator>
          <!-- Runs the nested subprocess on the documents built from the example set. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="30">
            <parameter key="keep_text" value="true"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <!-- Cuts the incoming document into segments using the regular expression query below. -->
              <operator activated="true" class="text:cut_document" compatibility="5.3.000" expanded="true" height="60" name="Cut Document" width="90" x="179" y="30">
                <parameter key="query_type" value="Regular Expression"/>
                <!-- NOTE(review): query_type is "Regular Expression", so this string-matching entry is presumably unused; confirm. The key spelling is reproduced as given; check the operator's parameter names before changing it. -->
                <list key="string_machting_queries">
                  <parameter key="test" value="\..\."/>
                </list>
                <!-- "sentence": one or more characters other than . ! ? : (captured), followed by one of those four characters. -->
                <list key="regular_expression_queries">
                  <parameter key="sentence" value="([^\.!\?:]+)[\.!\?:]"/>
                </list>
                <list key="regular_region_queries"/>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
                <list key="index_queries"/>
                <!-- The segment subprocess passes each segment through unchanged. -->
                <process expanded="true">
                  <connect from_port="segment" to_port="document 1"/>
                  <portSpacing port="source_segment" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="document" to_op="Cut Document" to_port="document"/>
              <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Retrieve 00 - HotelCustomerSatisfaction_en" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <!-- The "example set" output (not the word list) is delivered as the process result. -->
          <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
Sign In or Register to comment.