Options

[SOLVED] kNN Text Classification - Warning for missing regular attributes

Merlot Member Posts: 12 Contributor II
edited July 2019 in Help
Hi all,

To classify texts, I use "Process Documents" to create TF-IDF vectors. These are used to train a k-NN model. When I apply the k-NN model to new data, I receive thousands of warnings like
Dec 7, 2011 6:37:15 PM com.rapidminer.tools.WrapperLoggingHandler logWarning
WARNING: KNNClassification: The given example set does not contain a regular attribute with name 'XXX'. This might cause problems for some models depending on this particular attribute.
where "XXX" stands for any word used within my texts.

Is there something wrong with my model?

Create Model:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.1.014">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.1.014" expanded="true" name="Process">
   <process expanded="true" height="579" width="708">
     <!-- Deactivated Retrieve operator (activated="false") pointing at the repository entry data/v1_class_blogs.
          None of the top-level connect elements of this process reference it. -->
     <operator activated="false" class="retrieve" compatibility="5.1.014" expanded="true" height="60" name="Retrieve" width="90" x="112" y="255">
       <parameter key="repository_entry" value="data/v1_class_blogs"/>
     </operator>
     <!-- Deactivated grid optimization of the parameter k-NN.k.
          The grid spec "[1.0;3.0;10;linear]" is presumably min;max;steps;scale (TODO: confirm against the RapidMiner docs).
          Inner wiring: input 1 goes to Validation, its averagable 1 output goes through Log to the performance sink. -->
     <operator activated="false" class="optimize_parameters_grid" compatibility="5.1.014" expanded="true" height="94" name="Optimize Parameters (Grid)" width="90" x="581" y="30">
       <list key="parameters">
         <parameter key="k-NN.k" value="[1.0;3.0;10;linear]"/>
       </list>
       <process expanded="true" height="579" width="951">
         <!-- Cross validation with number_of_validations=3; first subprocess trains, second subprocess tests. -->
         <operator activated="false" class="x_validation" compatibility="5.1.014" expanded="true" height="112" name="Validation" width="90" x="246" y="30">
           <parameter key="number_of_validations" value="3"/>
           <process expanded="true">
             <!-- Only k-NN is wired to the training port; the Naive Bayes operator has no connections. -->
             <operator activated="false" class="naive_bayes" compatibility="5.1.014" expanded="true" name="Naive Bayes"/>
             <operator activated="false" class="k_nn" compatibility="5.1.014" expanded="true" name="k-NN">
               <parameter key="k" value="3"/>
             </operator>
             <connect from_port="training" to_op="k-NN" to_port="training set"/>
             <connect from_op="k-NN" from_port="model" to_port="model"/>
             <portSpacing port="source_training" spacing="0"/>
             <portSpacing port="sink_model" spacing="0"/>
             <portSpacing port="sink_through 1" spacing="0"/>
           </process>
           <process expanded="true">
             <!-- Applies the fold model to the test set and passes the labelled data to Performance. -->
             <operator activated="false" class="apply_model" compatibility="5.1.014" expanded="true" name="Apply Model">
               <list key="application_parameters"/>
             </operator>
             <operator activated="false" class="performance_classification" compatibility="5.1.014" expanded="true" name="Performance">
               <list key="class_weights"/>
             </operator>
             <connect from_port="model" to_op="Apply Model" to_port="model"/>
             <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
             <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
             <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
             <portSpacing port="source_model" spacing="0"/>
             <portSpacing port="source_test set" spacing="0"/>
             <portSpacing port="source_through 1" spacing="0"/>
             <portSpacing port="sink_averagable 1" spacing="0"/>
             <portSpacing port="sink_averagable 2" spacing="0"/>
           </process>
         </operator>
         <!-- Logs two columns: the k parameter of operator k-NN and the accuracy value of operator Performance. -->
         <operator activated="false" class="log" compatibility="5.1.014" expanded="true" height="76" name="Log" width="90" x="434" y="66">
           <list key="log">
             <parameter key="k" value="operator.k-NN.parameter.k"/>
             <parameter key="performance" value="operator.Performance.value.accuracy"/>
           </list>
         </operator>
         <connect from_port="input 1" to_op="Validation" to_port="training"/>
         <connect from_op="Validation" from_port="averagable 1" to_op="Log" to_port="through 1"/>
         <connect from_op="Log" from_port="through 1" to_port="performance"/>
         <portSpacing port="source_input 1" spacing="0"/>
         <portSpacing port="source_input 2" spacing="0"/>
         <portSpacing port="sink_performance" spacing="0"/>
         <portSpacing port="sink_result 1" spacing="0"/>
       </process>
     </operator>
     <!-- Active text vectorization step for the training data.
          Pruning is absolute: prune_below_absolute=2 and prune_above_absolute=999.
          vector_creation is not set here, so the operator default applies (the post text says TF-IDF; verify).
          Only the "example set" output is used by the top-level connections of this process;
          the word list produced here is not stored alongside the model. -->
     <operator activated="true" class="text:process_document_from_data" compatibility="5.1.003" expanded="true" height="76" name="Process Documents from Data" width="90" x="246" y="75">
       <parameter key="prune_method" value="absolute"/>
       <parameter key="prune_below_absolute" value="2"/>
       <parameter key="prune_above_absolute" value="999"/>
       <list key="specify_weights"/>
       <process expanded="true" height="581" width="955">
         <!-- Per-document chain, in connection order: Transform Cases, Tokenize, Filter Stopwords (German),
              Filter Stopwords (English), Stem (Snowball, language German), Filter Tokens (by Length, max_chars 999). -->
         <operator activated="true" class="text:transform_cases" compatibility="5.1.003" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="30"/>
         <operator activated="true" class="text:tokenize" compatibility="5.1.003" expanded="true" height="60" name="Tokenize" width="90" x="246" y="30"/>
         <operator activated="true" class="text:filter_stopwords_german" compatibility="5.1.003" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="380" y="30"/>
         <operator activated="true" class="text:filter_stopwords_english" compatibility="5.1.003" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="514" y="30"/>
         <operator activated="true" class="text:stem_snowball" compatibility="5.1.003" expanded="true" height="60" name="Stem (Snowball)" width="90" x="648" y="30">
           <parameter key="language" value="German"/>
         </operator>
         <operator activated="true" class="text:filter_by_length" compatibility="5.1.003" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="782" y="30">
           <parameter key="max_chars" value="999"/>
         </operator>
         <connect from_port="document" to_op="Transform Cases" to_port="document"/>
         <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
         <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
         <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
         <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
         <connect from_op="Stem (Snowball)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
         <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <!-- Deactivated cross validation (number_of_validations=3) that is not referenced by the top-level connections.
          Training side: k-NN (2) with k=2 is wired; Naive Bayes (2) has no connections.
          Testing side: Apply Model (2) feeds Performance (2). -->
     <operator activated="false" class="x_validation" compatibility="5.1.014" expanded="true" height="112" name="Validation (2)" width="90" x="380" y="75">
       <parameter key="number_of_validations" value="3"/>
       <process expanded="true" height="597" width="459">
         <operator activated="false" class="naive_bayes" compatibility="5.1.014" expanded="true" height="76" name="Naive Bayes (2)" width="90" x="179" y="120"/>
         <operator activated="false" class="k_nn" compatibility="5.1.014" expanded="true" height="76" name="k-NN (2)" width="90" x="179" y="30">
           <parameter key="k" value="2"/>
         </operator>
         <connect from_port="training" to_op="k-NN (2)" to_port="training set"/>
         <connect from_op="k-NN (2)" from_port="model" to_port="model"/>
         <portSpacing port="source_training" spacing="0"/>
         <portSpacing port="sink_model" spacing="0"/>
         <portSpacing port="sink_through 1" spacing="0"/>
       </process>
       <process expanded="true" height="597" width="459">
         <operator activated="false" class="apply_model" compatibility="5.1.014" expanded="true" height="76" name="Apply Model (2)" width="90" x="45" y="30">
           <list key="application_parameters"/>
         </operator>
         <operator activated="false" class="performance_classification" compatibility="5.1.014" expanded="true" height="76" name="Performance (2)" width="90" x="246" y="165">
           <list key="class_weights"/>
         </operator>
         <connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
         <connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/>
         <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
         <connect from_op="Performance (2)" from_port="performance" to_port="averagable 1"/>
         <portSpacing port="source_model" spacing="0"/>
         <portSpacing port="source_test set" spacing="0"/>
         <portSpacing port="source_through 1" spacing="0"/>
         <portSpacing port="sink_averagable 1" spacing="0"/>
         <portSpacing port="sink_averagable 2" spacing="0"/>
       </process>
     </operator>
     <!-- Active learner with k=2. Per the top-level connections it is trained on the example set from
          "Process Documents from Data" and its model is delivered to result 1. -->
     <operator activated="true" class="k_nn" compatibility="5.1.014" expanded="true" height="76" name="k-NN (3)" width="90" x="581" y="165">
       <parameter key="k" value="2"/>
     </operator>
     <!-- Deactivated second text vectorization operator with the same pruning parameters and the same
          chain of inner operator classes as "Process Documents from Data". It has no top-level connections. -->
     <operator activated="false" class="text:process_document_from_data" compatibility="5.1.003" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="313" y="210">
       <parameter key="prune_method" value="absolute"/>
       <parameter key="prune_below_absolute" value="2"/>
       <parameter key="prune_above_absolute" value="999"/>
       <list key="specify_weights"/>
       <process expanded="true">
         <operator activated="false" class="text:transform_cases" compatibility="5.1.003" expanded="true" name="Transform Cases (2)"/>
         <operator activated="false" class="text:tokenize" compatibility="5.1.003" expanded="true" name="Tokenize (2)"/>
         <operator activated="false" class="text:filter_stopwords_german" compatibility="5.1.003" expanded="true" name="Filter Stopwords (2)"/>
         <operator activated="false" class="text:filter_stopwords_english" compatibility="5.1.003" expanded="true" name="Filter Stopwords (3)"/>
         <operator activated="false" class="text:stem_snowball" compatibility="5.1.003" expanded="true" name="Stem (2)">
           <parameter key="language" value="German"/>
         </operator>
         <operator activated="false" class="text:filter_by_length" compatibility="5.1.003" expanded="true" name="Filter Tokens (2)">
           <parameter key="max_chars" value="999"/>
         </operator>
         <connect from_port="document" to_op="Transform Cases (2)" to_port="document"/>
         <connect from_op="Transform Cases (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
         <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
         <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Stopwords (3)" to_port="document"/>
         <connect from_op="Filter Stopwords (3)" from_port="document" to_op="Stem (2)" to_port="document"/>
         <connect from_op="Stem (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
         <connect from_op="Filter Tokens (2)" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <!-- Deactivated Apply Model operator with an empty application_parameters list; it has no connections in this process. -->
     <operator activated="false" class="apply_model" compatibility="5.1.014" expanded="true" height="76" name="Apply Model (3)" width="90" x="552" y="312">
       <list key="application_parameters"/>
     </operator>
     <!-- Top-level data flow: process input 1 goes to Process Documents from Data, its example set trains k-NN (3),
          and the resulting model is the only process result (result 1). No word list is exported. -->
     <connect from_port="input 1" to_op="Process Documents from Data" to_port="example set"/>
     <connect from_op="Process Documents from Data" from_port="example set" to_op="k-NN (3)" to_port="training set"/>
     <connect from_op="k-NN (3)" from_port="model" to_port="result 1"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="source_input 2" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
   </process>
 </operator>
</process>
Apply Model:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.1.014">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.1.014" expanded="true" name="Process">
   <process expanded="true" height="579" width="708">
     <!-- Vectorizes the NEW documents with the same pruning parameters and inner chain as the training process.
          NOTE(review): no connect element in this process feeds a stored word list into this operator, so the
          term attributes are derived from the new documents alone. The reply in this thread identifies this
          vocabulary mismatch as the cause of the "does not contain a regular attribute" warnings. -->
     <operator activated="true" class="text:process_document_from_data" compatibility="5.1.003" expanded="true" height="76" name="Process Documents from Data" width="90" x="179" y="165">
       <parameter key="prune_method" value="absolute"/>
       <parameter key="prune_below_absolute" value="2"/>
       <parameter key="prune_above_absolute" value="999"/>
       <list key="specify_weights"/>
       <process expanded="true" height="581" width="955">
         <!-- Per-document chain, in connection order: Transform Cases, Tokenize, Filter Stopwords (German),
              Filter Stopwords (English), Stem (Snowball, language German), Filter Tokens (by Length, max_chars 999). -->
         <operator activated="true" class="text:transform_cases" compatibility="5.1.003" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="30"/>
         <operator activated="true" class="text:tokenize" compatibility="5.1.003" expanded="true" height="60" name="Tokenize" width="90" x="246" y="30"/>
         <operator activated="true" class="text:filter_stopwords_german" compatibility="5.1.003" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="380" y="30"/>
         <operator activated="true" class="text:filter_stopwords_english" compatibility="5.1.003" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="514" y="30"/>
         <operator activated="true" class="text:stem_snowball" compatibility="5.1.003" expanded="true" height="60" name="Stem (Snowball)" width="90" x="648" y="30">
           <parameter key="language" value="German"/>
         </operator>
         <operator activated="true" class="text:filter_by_length" compatibility="5.1.003" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="782" y="30">
           <parameter key="max_chars" value="999"/>
         </operator>
         <connect from_port="document" to_op="Transform Cases" to_port="document"/>
         <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
         <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
         <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
         <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
         <connect from_op="Stem (Snowball)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
         <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <!-- Loads the trained model from a plain file at an absolute path. The reply in this thread recommends
          using the repository (Store and Retrieve operators) instead of plain files. -->
     <operator activated="true" class="read_model" compatibility="5.1.014" expanded="true" height="60" name="Read Model" width="90" x="179" y="75">
       <parameter key="model_file" value="/MOUNT/BTRFS/CRAWLER/1und1/model.mod"/>
     </operator>
     <!-- Applies the model from Read Model to the example set from Process Documents from Data
          (see the connect elements of this process); application_parameters is empty. -->
     <operator activated="true" class="apply_model" compatibility="5.1.014" expanded="true" height="76" name="Apply Model" width="90" x="447" y="120">
       <list key="application_parameters"/>
     </operator>
     <!-- Top-level data flow: process input 1 is vectorized, the result is scored with the model read from file,
          and the labelled data is returned as result 1. Nothing is wired into a word list input. -->
     <connect from_port="input 1" to_op="Process Documents from Data" to_port="example set"/>
     <connect from_op="Process Documents from Data" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
     <connect from_op="Read Model" from_port="output" to_op="Apply Model" to_port="model"/>
     <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="source_input 2" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
   </process>
 </operator>
</process>
Regards
Merlot

Answers

  • Options
    MariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    Hi, first of all: k-NN is probably not the best operator for text classification, since it has a problem with high dimensional data - and word vectors are usually very high dimensional. Try something like an SVM.
    However, the log warnings you see are not related to k-NN; they mean that the documents to which you apply your model contain different words than the ones on which you trained it. Thus the word vectors differ, and k-NN sees unknown attributes. What you have to do is store the word list (the "word list" output of Process Documents, saved under the name "word vector" in the example below) in addition to the model and pass it into the second Process Documents operator. See the processes below for an example.
    Additionally you have to make sure that both Process Documents operators use the same parameters and the same subprocess to process the documents.

    One more thing: we strongly recommend using the repository to store your models, data etc. (with Store and Retrieve operators) instead of plain files (as with the Read Model operator).

    Kind regards,
    Marius
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="5.1.014">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.1.014" expanded="true" name="Process">
        <process expanded="true" height="579" width="708">
          <!-- Deactivated Retrieve operator pointing at the repository entry data/v1_class_blogs.
               None of the top-level connect elements of this process reference it. -->
          <operator activated="false" class="retrieve" compatibility="5.1.014" expanded="true" height="60" name="Retrieve" width="90" x="112" y="255">
            <parameter key="repository_entry" value="data/v1_class_blogs"/>
          </operator>
          <!-- Deactivated grid optimization of the parameter k-NN.k.
               The grid spec "[1.0;3.0;10;linear]" is presumably min;max;steps;scale (TODO: confirm against the RapidMiner docs).
               Inner wiring: input 1 goes to Validation, its averagable 1 output goes through Log to the performance sink. -->
          <operator activated="false" class="optimize_parameters_grid" compatibility="5.1.014" expanded="true" height="94" name="Optimize Parameters (Grid)" width="90" x="447" y="255">
            <list key="parameters">
              <parameter key="k-NN.k" value="[1.0;3.0;10;linear]"/>
            </list>
            <process expanded="true" height="579" width="951">
              <!-- Cross validation with number_of_validations=3; first subprocess trains, second subprocess tests. -->
              <operator activated="false" class="x_validation" compatibility="5.1.014" expanded="true" height="112" name="Validation" width="90" x="246" y="30">
                <parameter key="number_of_validations" value="3"/>
                <process expanded="true">
                  <!-- Only k-NN is wired to the training port; the Naive Bayes operator has no connections. -->
                  <operator activated="false" class="naive_bayes" compatibility="5.1.014" expanded="true" name="Naive Bayes"/>
                  <operator activated="false" class="k_nn" compatibility="5.1.014" expanded="true" name="k-NN">
                    <parameter key="k" value="3"/>
                  </operator>
                  <connect from_port="training" to_op="k-NN" to_port="training set"/>
                  <connect from_op="k-NN" from_port="model" to_port="model"/>
                  <portSpacing port="source_training" spacing="0"/>
                  <portSpacing port="sink_model" spacing="0"/>
                  <portSpacing port="sink_through 1" spacing="0"/>
                </process>
                <process expanded="true">
                  <!-- Applies the fold model to the test set and passes the labelled data to Performance. -->
                  <operator activated="false" class="apply_model" compatibility="5.1.014" expanded="true" name="Apply Model">
                    <list key="application_parameters"/>
                  </operator>
                  <operator activated="false" class="performance_classification" compatibility="5.1.014" expanded="true" name="Performance">
                    <list key="class_weights"/>
                  </operator>
                  <connect from_port="model" to_op="Apply Model" to_port="model"/>
                  <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
                  <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
                  <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
                  <portSpacing port="source_model" spacing="0"/>
                  <portSpacing port="source_test set" spacing="0"/>
                  <portSpacing port="source_through 1" spacing="0"/>
                  <portSpacing port="sink_averagable 1" spacing="0"/>
                  <portSpacing port="sink_averagable 2" spacing="0"/>
                </process>
              </operator>
              <!-- Logs two columns: the k parameter of operator k-NN and the accuracy value of operator Performance. -->
              <operator activated="false" class="log" compatibility="5.1.014" expanded="true" height="76" name="Log" width="90" x="434" y="66">
                <list key="log">
                  <parameter key="k" value="operator.k-NN.parameter.k"/>
                  <parameter key="performance" value="operator.Performance.value.accuracy"/>
                </list>
              </operator>
              <connect from_port="input 1" to_op="Validation" to_port="training"/>
              <connect from_op="Validation" from_port="averagable 1" to_op="Log" to_port="through 1"/>
              <connect from_op="Log" from_port="through 1" to_port="performance"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_performance" spacing="0"/>
              <portSpacing port="sink_result 1" spacing="0"/>
            </process>
          </operator>
          <!-- Active text vectorization step for the training data.
               Pruning is absolute: prune_below_absolute=2 and prune_above_absolute=999.
               Both outputs are used by the top-level connections: "example set" trains k-NN (3),
               and "word list" is saved by Store (2) so the apply process can reuse the same vocabulary. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="5.1.003" expanded="true" height="76" name="Process Documents from Data" width="90" x="246" y="75">
            <parameter key="prune_method" value="absolute"/>
            <parameter key="prune_below_absolute" value="2"/>
            <parameter key="prune_above_absolute" value="999"/>
            <list key="specify_weights"/>
            <process expanded="true" height="581" width="955">
              <!-- Per-document chain, in connection order: Transform Cases, Tokenize, Filter Stopwords (German),
                   Filter Stopwords (English), Stem (Snowball, language German), Filter Tokens (by Length, max_chars 999). -->
              <operator activated="true" class="text:transform_cases" compatibility="5.1.003" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="30"/>
              <operator activated="true" class="text:tokenize" compatibility="5.1.003" expanded="true" height="60" name="Tokenize" width="90" x="246" y="30"/>
              <operator activated="true" class="text:filter_stopwords_german" compatibility="5.1.003" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="380" y="30"/>
              <operator activated="true" class="text:filter_stopwords_english" compatibility="5.1.003" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="514" y="30"/>
              <operator activated="true" class="text:stem_snowball" compatibility="5.1.003" expanded="true" height="60" name="Stem (Snowball)" width="90" x="648" y="30">
                <parameter key="language" value="German"/>
              </operator>
              <operator activated="true" class="text:filter_by_length" compatibility="5.1.003" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="782" y="30">
                <parameter key="max_chars" value="999"/>
              </operator>
              <connect from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
              <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
              <connect from_op="Stem (Snowball)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Saves the "word list" output of Process Documents from Data to the repository.
               Despite the entry name "word vector", the stored object is the word list (see the connect elements below).
               The entry name is relative; presumably resolved against the process location (verify). -->
          <operator activated="true" class="store" compatibility="5.1.014" expanded="true" height="60" name="Store (2)" width="90" x="514" y="165">
            <parameter key="repository_entry" value="word vector"/>
          </operator>
          <!-- Deactivated cross validation (number_of_validations=3) that is not referenced by the top-level connections.
               Training side: k-NN (2) with k=2 is wired; Naive Bayes (2) has no connections.
               Testing side: Apply Model (2) feeds Performance (2). -->
          <operator activated="false" class="x_validation" compatibility="5.1.014" expanded="true" height="112" name="Validation (2)" width="90" x="246" y="300">
            <parameter key="number_of_validations" value="3"/>
            <process expanded="true" height="597" width="459">
              <operator activated="false" class="naive_bayes" compatibility="5.1.014" expanded="true" height="76" name="Naive Bayes (2)" width="90" x="179" y="120"/>
              <operator activated="false" class="k_nn" compatibility="5.1.014" expanded="true" height="76" name="k-NN (2)" width="90" x="179" y="30">
                <parameter key="k" value="2"/>
              </operator>
              <connect from_port="training" to_op="k-NN (2)" to_port="training set"/>
              <connect from_op="k-NN (2)" from_port="model" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <process expanded="true" height="597" width="459">
              <operator activated="false" class="apply_model" compatibility="5.1.014" expanded="true" height="76" name="Apply Model (2)" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <operator activated="false" class="performance_classification" compatibility="5.1.014" expanded="true" height="76" name="Performance (2)" width="90" x="246" y="165">
                <list key="class_weights"/>
              </operator>
              <connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/>
              <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
              <connect from_op="Performance (2)" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <!-- Active learner with k=2. Per the top-level connections it is trained on the example set from
               "Process Documents from Data" and its model is passed to the Store operator. -->
          <operator activated="true" class="k_nn" compatibility="5.1.014" expanded="true" height="76" name="k-NN (3)" width="90" x="380" y="30">
            <parameter key="k" value="2"/>
          </operator>
          <!-- Saves the model produced by k-NN (3) under the repository entry "model"; its "through" output
               is forwarded to result 1. The entry name is relative; presumably resolved against the process location (verify). -->
          <operator activated="true" class="store" compatibility="5.1.014" expanded="true" height="60" name="Store" width="90" x="510" y="72">
            <parameter key="repository_entry" value="model"/>
          </operator>
          <!-- Deactivated second text vectorization operator with the same pruning parameters and the same
               chain of inner operator classes as "Process Documents from Data". It has no top-level connections. -->
          <operator activated="false" class="text:process_document_from_data" compatibility="5.1.003" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="313" y="210">
            <parameter key="prune_method" value="absolute"/>
            <parameter key="prune_below_absolute" value="2"/>
            <parameter key="prune_above_absolute" value="999"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="false" class="text:transform_cases" compatibility="5.1.003" expanded="true" name="Transform Cases (2)"/>
              <operator activated="false" class="text:tokenize" compatibility="5.1.003" expanded="true" name="Tokenize (2)"/>
              <operator activated="false" class="text:filter_stopwords_german" compatibility="5.1.003" expanded="true" name="Filter Stopwords (2)"/>
              <operator activated="false" class="text:filter_stopwords_english" compatibility="5.1.003" expanded="true" name="Filter Stopwords (3)"/>
              <operator activated="false" class="text:stem_snowball" compatibility="5.1.003" expanded="true" name="Stem (2)">
                <parameter key="language" value="German"/>
              </operator>
              <operator activated="false" class="text:filter_by_length" compatibility="5.1.003" expanded="true" name="Filter Tokens (2)">
                <parameter key="max_chars" value="999"/>
              </operator>
              <connect from_port="document" to_op="Transform Cases (2)" to_port="document"/>
              <connect from_op="Transform Cases (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
              <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Stopwords (3)" to_port="document"/>
              <connect from_op="Filter Stopwords (3)" from_port="document" to_op="Stem (2)" to_port="document"/>
              <connect from_op="Stem (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
              <connect from_op="Filter Tokens (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Deactivated Apply Model operator with an empty application_parameters list; it has no connections in this process. -->
          <operator activated="false" class="apply_model" compatibility="5.1.014" expanded="true" height="76" name="Apply Model (3)" width="90" x="552" y="312">
            <list key="application_parameters"/>
          </operator>
          <!-- Top-level data flow: process input 1 is vectorized; the example set trains k-NN (3) and the word list
               goes to Store (2). The trained model goes to Store, whose through port delivers it as result 1. -->
          <connect from_port="input 1" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="k-NN (3)" to_port="training set"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_op="Store (2)" to_port="input"/>
          <connect from_op="k-NN (3)" from_port="model" to_op="Store" to_port="input"/>
          <connect from_op="Store" from_port="through" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Application process: retrieves a stored model and a stored word list, runs the
         incoming example set through "Process Documents from Data" with that word
         list, and applies the model to the resulting example set. -->
    <process version="5.1.014">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.1.014" expanded="true" name="Process">
        <process expanded="true" height="579" width="708">
          <!-- Loads the repository entry "model"; its output is wired to the model port
               of "Apply Model" below. -->
          <operator activated="true" class="retrieve" compatibility="5.1.014" expanded="true" height="60" name="Retrieve Model" width="90" x="179" y="30">
            <parameter key="repository_entry" value="model"/>
          </operator>
          <!-- Loads the repository entry "word vector"; its output is wired to the
               word list port of "Process Documents from Data" below. -->
          <operator activated="true" class="retrieve" compatibility="5.1.014" expanded="true" height="60" name="Retrieve Word Vector" width="90" x="45" y="255">
            <parameter key="repository_entry" value="word vector"/>
          </operator>
          <!-- Converts the process input ("input 1") into the example set handed to
               "Apply Model", using the retrieved word list.
               NOTE(review): absolute pruning (below 2, above 999) is still configured
               here even though a word list is supplied. If pruning is evaluated on the
               new documents it could drop attributes the model was trained on and
               bring back the "does not contain a regular attribute" warnings;
               confirm, and consider prune_method "none" for application. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="5.1.003" expanded="true" height="76" name="Process Documents from Data" width="90" x="179" y="165">
            <parameter key="prune_method" value="absolute"/>
            <parameter key="prune_below_absolute" value="2"/>
            <parameter key="prune_above_absolute" value="999"/>
            <list key="specify_weights"/>
            <!-- Per-document chain, in connection order: Transform Cases, Tokenize,
                 Filter Stopwords (German), Filter Stopwords (English),
                 Stem (Snowball), Filter Tokens (by Length).
                 NOTE(review): presumably this chain has to mirror the one used when
                 the model and word list were created; verify against the training
                 process. -->
            <process expanded="true" height="581" width="955">
              <operator activated="true" class="text:transform_cases" compatibility="5.1.003" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="30"/>
              <operator activated="true" class="text:tokenize" compatibility="5.1.003" expanded="true" height="60" name="Tokenize" width="90" x="246" y="30"/>
              <operator activated="true" class="text:filter_stopwords_german" compatibility="5.1.003" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="380" y="30"/>
              <operator activated="true" class="text:filter_stopwords_english" compatibility="5.1.003" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="514" y="30"/>
              <!-- Stemming language is set to German, while both German and English
                   stopword filters run before it. -->
              <operator activated="true" class="text:stem_snowball" compatibility="5.1.003" expanded="true" height="60" name="Stem (Snowball)" width="90" x="648" y="30">
                <parameter key="language" value="German"/>
              </operator>
              <!-- Only max_chars (999) is set explicitly; any lower bound is whatever
                   the operator defaults to. -->
              <operator activated="true" class="text:filter_by_length" compatibility="5.1.003" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="782" y="30">
                <parameter key="max_chars" value="999"/>
              </operator>
              <connect from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
              <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
              <connect from_op="Stem (Snowball)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Applies the retrieved model to the processed example set; the labelled
               data is the only output wired to a process result ("result 1"). -->
          <operator activated="true" class="apply_model" compatibility="5.1.014" expanded="true" height="76" name="Apply Model" width="90" x="447" y="120">
            <list key="application_parameters"/>
          </operator>
          <!-- Wiring: the data to classify enters through the process port "input 1";
               no Retrieve operator for it exists in this process. -->
          <connect from_port="input 1" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Retrieve Model" from_port="output" to_op="Apply Model" to_port="model"/>
          <connect from_op="Retrieve Word Vector" from_port="output" to_op="Process Documents from Data" to_port="word list"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • Options
    MerlotMerlot Member Posts: 12 Contributor II
    Hi,

    thank you very much. Now, everything works fine. :-)

    Cheers
    Merlot
Sign In or Register to comment.