Options

Count of the non-zero values

KathiKathi Member Posts: 11 Contributor I
Hi everyone,
I am working with a n-Gram process and want RapidMiner to count just the non-zero values. I tried the 'generate aggregation' operator with the numeric condition > 0. But the output is still the count of all the values (including the zero values). Do I have the wrong parameters? Or do I need a different operator?

Regards and thanks for the help!

Best Answer

  • Options
    kaymankayman Member Posts: 662 Unicorn
    edited October 2019 Solution Accepted
    Hi @Kathi, I wanted to leave some of the fun for you :-)

    If you only need to know the unique values per document you can change the vector creation to binary, than it just checks if it exists or not so it will state 1 even if the same n-gram appears 20 times.

    If you need both total and unique try this : 
    <?xml version="1.0" encoding="UTF-8"?><process version="9.4.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.4.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <operator activated="true" class="retrieve" compatibility="9.4.001" expanded="true" height="68" name="Retrieve Input Analyse" width="90" x="45" y="34">
            <parameter key="repository_entry" value="Input Analyse"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="9.4.001" expanded="true" height="82" name="Multiply" width="90" x="179" y="34"/>
          <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="34">
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="Term Occurrences"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="absolute"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_absolute" value="2"/>
            <parameter key="prune_above_absolute" value="50"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="8.2.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="8.2.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <operator activated="true" class="text:generate_n_grams_terms" compatibility="8.2.000" expanded="true" height="68" name="Generate n-Grams (Terms) (2)" width="90" x="313" y="34">
                <parameter key="max_length" value="4"/>
              </operator>
              <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.2.000" expanded="true" height="68" name="Filter Tokens (by Content)" width="90" x="514" y="34">
                <parameter key="condition" value="matches"/>
                <parameter key="regular_expression" value="^\w+_\w+_\w+_\w+"/>
                <parameter key="case_sensitive" value="false"/>
                <parameter key="invert condition" value="false"/>
                <description align="center" color="transparent" colored="false" width="126">only keep 4-gram</description>
              </operator>
              <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
              <connect from_op="Transform Cases (2)" from_port="document" to_op="Generate n-Grams (Terms) (2)" to_port="document"/>
              <connect from_op="Generate n-Grams (Terms) (2)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
              <connect from_op="Filter Tokens (by Content)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="concurrency:loop_attributes" compatibility="9.4.001" expanded="true" height="82" name="Loop Attributes" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="attribute_name_macro" value="la"/>
            <parameter key="reuse_results" value="false"/>
            <parameter key="enable_parallel_execution" value="false"/>
            <process expanded="true">
              <operator activated="true" class="select_attributes" compatibility="9.4.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="%{la}"/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="attribute_value"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="time"/>
                <parameter key="block_type" value="attribute_block"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="value_matrix_row_start"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
              </operator>
              <operator activated="true" class="rename" compatibility="9.4.001" expanded="true" height="82" name="Rename" width="90" x="179" y="34">
                <parameter key="old_name" value="%{la}"/>
                <parameter key="new_name" value="ngram"/>
                <list key="rename_additional_attributes"/>
              </operator>
              <operator activated="true" class="generate_attributes" compatibility="9.4.001" expanded="true" height="82" name="Generate Attributes" width="90" x="313" y="34">
                <list key="function_descriptions">
                  <parameter key="value" value="%{la}"/>
                </list>
                <parameter key="keep_all" value="true"/>
              </operator>
              <connect from_port="input 1" to_op="Select Attributes" to_port="example set input"/>
              <connect from_op="Select Attributes" from_port="example set output" to_op="Rename" to_port="example set input"/>
              <connect from_op="Rename" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
              <connect from_op="Generate Attributes" from_port="example set output" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="operator_toolbox:advanced_append" compatibility="2.2.000" expanded="true" height="82" name="Append (Superset)" width="90" x="581" y="34"/>
          <operator activated="true" class="blending:pivot" compatibility="9.4.001" expanded="true" height="82" name="Pivot (3)" origin="GENERATED_TURBOPREP" width="90" x="715" y="34">
            <parameter key="group_by_attributes" value="value"/>
            <parameter key="column_grouping_attribute" value="ID"/>
            <list key="aggregation_attributes">
              <parameter key="ngram" value="sum"/>
            </list>
            <parameter key="use_default_aggregation" value="false"/>
            <parameter key="default_aggregation_function" value="first"/>
          </operator>
          <operator activated="true" class="rename_by_replacing" compatibility="9.4.001" expanded="true" height="82" name="Rename by Replacing" width="90" x="849" y="34">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="replace_what" value="sum\(ngram\)_(.*)$"/>
            <parameter key="replace_by" value="$1"/>
          </operator>
          <operator activated="true" class="set_role" compatibility="9.4.001" expanded="true" height="82" name="Set Role" width="90" x="983" y="34">
            <parameter key="attribute_name" value="value"/>
            <parameter key="target_role" value="id"/>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_aggregation" compatibility="9.4.001" expanded="true" height="82" name="Generate Aggregation" width="90" x="1117" y="34">
            <parameter key="attribute_name" value="sum_of_all"/>
            <parameter key="attribute_filter_type" value="value_type"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="numeric"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="aggregation_function" value="sum"/>
            <parameter key="concatenation_separator" value="|"/>
            <parameter key="keep_all" value="true"/>
            <parameter key="ignore_missings" value="true"/>
            <parameter key="ignore_missing_attributes" value="false"/>
          </operator>
          <operator activated="true" class="concurrency:loop_attributes" compatibility="9.4.001" expanded="true" height="82" name="Loop Attributes (2)" width="90" x="1251" y="187">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value="%{la}"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="attribute_name_macro" value="la"/>
            <parameter key="reuse_results" value="true"/>
            <parameter key="enable_parallel_execution" value="true"/>
            <process expanded="true">
              <operator activated="true" class="select_attributes" compatibility="9.4.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="45" y="34">
                <parameter key="attribute_filter_type" value="all"/>
                <parameter key="attribute" value=""/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="attribute_value"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="time"/>
                <parameter key="block_type" value="attribute_block"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="value_matrix_row_start"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
              </operator>
              <operator activated="true" class="numerical_to_polynominal" compatibility="9.4.001" expanded="true" height="82" name="Numerical to Polynominal" width="90" x="179" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="%{la}"/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="numeric"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="real"/>
                <parameter key="block_type" value="value_series"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="value_series_end"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
              </operator>
              <operator activated="true" class="replace" compatibility="9.4.001" expanded="true" height="82" name="Replace" width="90" x="313" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="%{la}"/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="nominal"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="file_path"/>
                <parameter key="block_type" value="single_value"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="single_value"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
                <parameter key="replace_what" value="^[^0].*$"/>
                <parameter key="replace_by" value="1"/>
              </operator>
              <operator activated="true" class="parse_numbers" compatibility="9.4.001" expanded="true" height="82" name="Parse Numbers" width="90" x="447" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="%{la}"/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="nominal"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="file_path"/>
                <parameter key="block_type" value="single_value"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="single_value"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
                <parameter key="decimal_character" value="."/>
                <parameter key="grouped_digits" value="false"/>
                <parameter key="grouping_character" value=","/>
                <parameter key="infinity_representation" value=""/>
                <parameter key="unparsable_value_handling" value="fail"/>
              </operator>
              <connect from_port="input 1" to_op="Select Attributes (2)" to_port="example set input"/>
              <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Numerical to Polynominal" to_port="example set input"/>
              <connect from_op="Numerical to Polynominal" from_port="example set output" to_op="Replace" to_port="example set input"/>
              <connect from_op="Replace" from_port="example set output" to_op="Parse Numbers" to_port="example set input"/>
              <connect from_op="Parse Numbers" from_port="example set output" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="generate_aggregation" compatibility="9.4.001" expanded="true" height="82" name="Generate Aggregation (2)" width="90" x="1385" y="187">
            <parameter key="attribute_name" value="sum_of_unique"/>
            <parameter key="attribute_filter_type" value="value_type"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="numeric"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="aggregation_function" value="sum"/>
            <parameter key="concatenation_separator" value="|"/>
            <parameter key="keep_all" value="true"/>
            <parameter key="ignore_missings" value="true"/>
            <parameter key="ignore_missing_attributes" value="false"/>
          </operator>
          <operator activated="true" class="concurrency:join" compatibility="9.4.001" expanded="true" height="82" name="Join" width="90" x="1519" y="34">
            <parameter key="remove_double_attributes" value="true"/>
            <parameter key="join_type" value="inner"/>
            <parameter key="use_id_attribute_as_key" value="true"/>
            <list key="key_attributes">
              <parameter key="value" value="value"/>
            </list>
            <parameter key="keep_both_join_attributes" value="false"/>
          </operator>
          <operator activated="true" class="generate_attributes" compatibility="9.4.001" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="1653" y="34">
            <list key="function_descriptions">
              <parameter key="delta" value="[sum_of_all]-[sum_of_unique]"/>
            </list>
            <parameter key="keep_all" value="true"/>
          </operator>
          <connect from_op="Retrieve Input Analyse" from_port="output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Loop Attributes" to_port="input 1"/>
          <connect from_op="Loop Attributes" from_port="output 1" to_op="Append (Superset)" to_port="example set 1"/>
          <connect from_op="Append (Superset)" from_port="merged set" to_op="Pivot (3)" to_port="input"/>
          <connect from_op="Pivot (3)" from_port="output" to_op="Rename by Replacing" to_port="example set input"/>
          <connect from_op="Rename by Replacing" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Generate Aggregation" to_port="example set input"/>
          <connect from_op="Generate Aggregation" from_port="example set output" to_op="Join" to_port="left"/>
          <connect from_op="Generate Aggregation" from_port="original" to_op="Loop Attributes (2)" to_port="input 1"/>
          <connect from_op="Loop Attributes (2)" from_port="output 1" to_op="Generate Aggregation (2)" to_port="example set input"/>
          <connect from_op="Generate Aggregation (2)" from_port="example set output" to_op="Join" to_port="right"/>
          <connect from_op="Join" from_port="join" to_op="Generate Attributes (2)" to_port="example set input"/>
          <connect from_op="Generate Attributes (2)" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    


Answers

  • Options
    kaymankayman Member Posts: 662 Unicorn
    Is it possible to share your input data set also (or part of it)
  • Options
    kaymankayman Member Posts: 662 Unicorn
    edited October 2019
    Hi @Kathi, You apply your filter on an attribute called 'Anzahl der Dokumente', but this attribute doesn't exist anymore as all of your attributes are renamed after the Transpose to a generic name. Also your source isn't having this attribute (only ID and Beschreibung) so it cannot apply this logic either.

    Generating ID's is not really an option here either, as the ID's for your top flow will be representing the vectors created by this loop, and the second ID will be linked to the vector attributes of the second group, so no relation with he first group so the set minus operator will be looking at 2 different data sets. 

    Using the transpose you lose basically all relations between your output and input, so this might be unwanted also.
    What would you like to achieve? As one flow contains 4 grams and the other one contains 3 grams, is the idea to keep all that have 4 grams? In general this would be the difference between Flow 1 and 2, up to 3 grams they will be equal but Flow 1 will also contain 4 grams.


  • Options
    KathiKathi Member Posts: 11 Contributor I
    edited October 2019
    Hi kayman,
    thanks for your answer and ckecking my process!
    My input file contains different documents (each row of the table contains one document). Every document has an id (1. column of my excel file) and a text decription (2. column). 
    You are right with the ID. It is even better to use my own id, so I removed this operators ans uploaded the new process. Thanks!
    I use the transpose operator to represent the 4-grams in the rows and the id of the documents in the columns of my table (picture Output 1).
    Now I want to know in how many documents is every 4-gram included? So I would like RapidMiner to count the cells that do not contain a zero in each row individually.I prepared a excel file with an example (example output). I hope I could explain my idea.
    Regards
  • Options
    kaymankayman Member Posts: 662 Unicorn
    If the only requirement is to understand how many times a given word or ngram is used then you could use the wordlist output of the process data from documents operator. This tells exactly the count per attribute over documents.

    Adding the wordlist 2 exampleset operator allows you to do further actions on that one.
  • Options
    KathiKathi Member Posts: 11 Contributor I
    edited October 2019
    Unfortunately I can't use the output of the process data from document operator because my output is too big for excel. That's why I want to aggregate my output.
    Regards
  • Options
    kaymankayman Member Posts: 662 Unicorn
    Hi @Kathi, I think I get it now :-)

    Something like below ?
    Note that I pruned the set and filtered to have only actual 4 grams. This speeds the thing up but can be adjusted as you wish.

    So we get only the 4 grams, we transform from multi column to key - value logic and then we pivot it to get something pretty close to your excel example.

    <?xml version="1.0" encoding="UTF-8"?><process version="9.4.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.4.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <operator activated="true" class="retrieve" compatibility="9.4.001" expanded="true" height="68" name="Retrieve Input Analyse" width="90" x="45" y="34">
            <parameter key="repository_entry" value="Input Analyse"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="9.4.001" expanded="true" height="103" name="Multiply" width="90" x="179" y="34"/>
          <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="34">
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="Term Occurrences"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="absolute"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_absolute" value="2"/>
            <parameter key="prune_above_absolute" value="10"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="8.2.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="8.2.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="246" y="34">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <operator activated="true" class="text:generate_n_grams_terms" compatibility="8.2.000" expanded="true" height="68" name="Generate n-Grams (Terms) (2)" width="90" x="380" y="34">
                <parameter key="max_length" value="4"/>
              </operator>
              <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.2.000" expanded="true" height="68" name="Filter Tokens (by Content)" width="90" x="514" y="34">
                <parameter key="condition" value="matches"/>
                <parameter key="regular_expression" value="^\w+_\w+_\w+_\w+"/>
                <parameter key="case_sensitive" value="false"/>
                <parameter key="invert condition" value="false"/>
                <description align="center" color="transparent" colored="false" width="126">only keep 4-gram</description>
              </operator>
              <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
              <connect from_op="Transform Cases (2)" from_port="document" to_op="Generate n-Grams (Terms) (2)" to_port="document"/>
              <connect from_op="Generate n-Grams (Terms) (2)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
              <connect from_op="Filter Tokens (by Content)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="concurrency:loop_attributes" compatibility="9.4.001" expanded="true" height="82" name="Loop Attributes" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="attribute_name_macro" value="la"/>
            <parameter key="reuse_results" value="false"/>
            <parameter key="enable_parallel_execution" value="false"/>
            <process expanded="true">
              <operator activated="true" class="select_attributes" compatibility="9.4.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="%{la}"/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="attribute_value"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="time"/>
                <parameter key="block_type" value="attribute_block"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="value_matrix_row_start"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
              </operator>
              <operator activated="true" class="rename" compatibility="9.4.001" expanded="true" height="82" name="Rename" width="90" x="179" y="34">
                <parameter key="old_name" value="%{la}"/>
                <parameter key="new_name" value="ngram"/>
                <list key="rename_additional_attributes"/>
              </operator>
              <operator activated="true" class="generate_attributes" compatibility="9.4.001" expanded="true" height="82" name="Generate Attributes" width="90" x="313" y="34">
                <list key="function_descriptions">
                  <parameter key="value" value="%{la}"/>
                </list>
                <parameter key="keep_all" value="true"/>
              </operator>
              <connect from_port="input 1" to_op="Select Attributes" to_port="example set input"/>
              <connect from_op="Select Attributes" from_port="example set output" to_op="Rename" to_port="example set input"/>
              <connect from_op="Rename" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
              <connect from_op="Generate Attributes" from_port="example set output" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="operator_toolbox:advanced_append" compatibility="2.2.000" expanded="true" height="82" name="Append (Superset)" width="90" x="581" y="34"/>
          <operator activated="true" class="blending:pivot" compatibility="9.4.001" expanded="true" height="82" name="Pivot (3)" origin="GENERATED_TURBOPREP" width="90" x="715" y="34">
            <parameter key="group_by_attributes" value="value"/>
            <parameter key="column_grouping_attribute" value="ID"/>
            <list key="aggregation_attributes">
              <parameter key="ngram" value="sum"/>
            </list>
            <parameter key="use_default_aggregation" value="false"/>
            <parameter key="default_aggregation_function" value="first"/>
          </operator>
          <operator activated="true" class="rename_by_replacing" compatibility="9.4.001" expanded="true" height="82" name="Rename by Replacing" width="90" x="849" y="34">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="replace_what" value="sum\(ngram\)_(.*)$"/>
            <parameter key="replace_by" value="$1"/>
          </operator>
          <operator activated="true" class="generate_aggregation" compatibility="9.4.001" expanded="true" height="82" name="Generate Aggregation" width="90" x="983" y="34">
            <parameter key="attribute_name" value="sum_of_all"/>
            <parameter key="attribute_filter_type" value="value_type"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="numeric"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="aggregation_function" value="sum"/>
            <parameter key="concatenation_separator" value="|"/>
            <parameter key="keep_all" value="true"/>
            <parameter key="ignore_missings" value="true"/>
            <parameter key="ignore_missing_attributes" value="false"/>
          </operator>
          <connect from_op="Retrieve Input Analyse" from_port="output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Loop Attributes" to_port="input 1"/>
          <connect from_op="Loop Attributes" from_port="output 1" to_op="Append (Superset)" to_port="example set 1"/>
          <connect from_op="Append (Superset)" from_port="merged set" to_op="Pivot (3)" to_port="input"/>
          <connect from_op="Pivot (3)" from_port="output" to_op="Rename by Replacing" to_port="example set input"/>
          <connect from_op="Rename by Replacing" from_port="example set output" to_op="Generate Aggregation" to_port="example set input"/>
          <connect from_op="Generate Aggregation" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    


  • Options
    KathiKathi Member Posts: 11 Contributor I
    edited October 2019
    Hi kayman,
    thanks for you perseverance :)
    Almost :) Now the process shows how often the respective n-gram is used in total. But I would like to know in how many documents the respective n-gram is used. So if a n-gram is used twice in a document, it should still be counted only once.
    Does my explanation make sence to you?
    Regards.
  • Options
    KathiKathi Member Posts: 11 Contributor I
    edited October 2019
    Hi kayman,
    thanks for your help!!! Thats the output I need :) Now I'm gonna try to understand the process :smiley:
    Regards

Sign In or Register to comment.