Getting count of data dictionary occurrences

b00122599 Member Posts: 26 Contributor II
edited June 2020 in Help
Hey folks, I have a process that successfully loops through a collection of text files and gets the 10 most frequently occurring words in each file. Now I'm trying to add a data dictionary so that I only count words that appear in the dictionary; I read the dictionary in from a text file containing a list of words. However, in the results every dictionary word now gets a count of just 1 whenever it appears. I know, for example, that without the data dictionary one word occurs 213 times in one text file, but with the new process it is only counted once. I've tried a number of options but can't get it to count properly as before when using the data dictionary. Any pointers would be much appreciated.

Cheers,

Neil. 
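
In plain terms, what I'm trying to do is this (a minimal Python sketch of the intended logic; the folder and dictionary paths are placeholders, and the stopword step is omitted for brevity):

import re
from collections import Counter
from pathlib import Path

DOC_DIR = Path("docs")               # placeholder: folder of text files
DICT_FILE = Path("dictionary.txt")   # placeholder: one dictionary word per line

# Load the data dictionary as a set of lowercase words.
dictionary = {w.strip().lower()
              for w in DICT_FILE.read_text().splitlines() if w.strip()}

for doc in sorted(DOC_DIR.glob("*.txt")):
    # Tokenize on non-letters, lowercase, keep tokens of 4-25 characters.
    tokens = [t.lower() for t in re.split(r"[^A-Za-z]+", doc.read_text())
              if 4 <= len(t) <= 25]
    # Count only tokens found in the dictionary, then take the top 10.
    counts = Counter(t for t in tokens if t in dictionary)
    print(doc.name, counts.most_common(10))

The full process XML is below: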
<?xml version="1.0" encoding="UTF-8"?><process version="9.7.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.7.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="concurrency:loop_files" compatibility="9.7.000" expanded="true" height="82" name="Loop Files" width="90" x="112" y="85">
        <parameter key="directory" value="D:/OneDrive/College/Blanch/Year2/Project/docs/Complete/Mufc/arabic"/>
        <parameter key="filter_type" value="glob"/>
        <parameter key="recursive" value="false"/>
        <parameter key="enable_macros" value="true"/>
        <parameter key="macro_for_file_name" value="file_name"/>
        <parameter key="macro_for_file_type" value="file_type"/>
        <parameter key="macro_for_folder_name" value="folder_name"/>
        <parameter key="reuse_results" value="false"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true" class="text:read_document" compatibility="9.3.001" expanded="true" height="68" name="Read Document" width="90" x="45" y="136">
            <parameter key="extract_text_only" value="true"/>
            <parameter key="use_file_extension_as_type" value="true"/>
            <parameter key="content_type" value="txt"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <operator activated="true" class="text:process_document_from_file" compatibility="9.3.001" expanded="true" height="82" name="Process Documents from Files" width="90" x="179" y="34">
            <list key="text_directories">
              <parameter key="list" value="C:/Users/Neil/Desktop/test"/>
            </list>
            <parameter key="file_pattern" value="*"/>
            <parameter key="extract_text_only" value="true"/>
            <parameter key="use_file_extension_as_type" value="true"/>
            <parameter key="content_type" value="txt"/>
            <parameter key="encoding" value="SYSTEM"/>
            <parameter key="create_word_vector" value="false"/>
            <parameter key="vector_creation" value="Term Occurrences"/>
            <parameter key="add_meta_information" value="false"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="none"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize" width="90" x="179" y="85">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="9.3.001" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="85">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <operator activated="true" class="text:filter_stopwords_english" compatibility="9.3.001" expanded="true" height="68" name="Filter Stopwords (English) (2)" width="90" x="715" y="85"/>
              <operator activated="true" class="text:filter_by_length" compatibility="9.3.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="983" y="85">
                <parameter key="min_chars" value="4"/>
                <parameter key="max_chars" value="25"/>
              </operator>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English) (2)" to_port="document"/>
              <connect from_op="Filter Stopwords (English) (2)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="text:process_documents" compatibility="9.3.001" expanded="true" height="103" name="Process Documents (2)" width="90" x="246" y="187">
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="Term Occurrences"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="true"/>
            <parameter key="prune_method" value="none"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_rank" value="5.0"/>
            <parameter key="prune_above_rank" value="5.0"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="85">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="9.3.001" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="85">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <operator activated="true" class="text:filter_stopwords_english" compatibility="9.3.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="313" y="85"/>
              <operator activated="true" class="text:filter_by_length" compatibility="9.3.001" expanded="true" height="68" name="Filter Tokens (by Length) (2)" width="90" x="514" y="85">
                <parameter key="min_chars" value="4"/>
                <parameter key="max_chars" value="25"/>
              </operator>
              <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
              <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length) (2)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length) (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="text:wordlist_to_data" compatibility="9.3.001" expanded="true" height="82" name="WordList to Data" width="90" x="380" y="85"/>
          <operator activated="true" class="generate_attributes" compatibility="6.4.000" expanded="true" height="82" name="Generate Attributes" width="90" x="514" y="34">
            <list key="function_descriptions">
              <parameter key="filename" value="macro(&quot;file_name&quot;)"/>
            </list>
            <parameter key="keep_all" value="true"/>
          </operator>
          <operator activated="true" class="sort" compatibility="9.7.000" expanded="true" height="82" name="Sort" width="90" x="447" y="238">
            <parameter key="attribute_name" value="total"/>
            <parameter key="sorting_direction" value="decreasing"/>
          </operator>
          <operator activated="true" class="filter_example_range" compatibility="9.7.000" expanded="true" height="82" name="Filter Example Range" width="90" x="648" y="136">
            <parameter key="first_example" value="1"/>
            <parameter key="last_example" value="10"/>
            <parameter key="invert_filter" value="false"/>
          </operator>
          <connect from_port="file object" to_op="Read Document" to_port="file"/>
          <connect from_op="Read Document" from_port="output" to_op="Process Documents (2)" to_port="documents 1"/>
          <connect from_op="Process Documents from Files" from_port="word list" to_op="Process Documents (2)" to_port="word list"/>
          <connect from_op="Process Documents (2)" from_port="word list" to_op="WordList to Data" to_port="word list"/>
          <connect from_op="WordList to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Sort" to_port="example set input"/>
          <connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
          <connect from_op="Filter Example Range" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_file object" spacing="0"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <connect from_port="input 1" to_op="Loop Files" to_port="input 1"/>
      <connect from_op="Loop Files" from_port="output 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Answers

  • Telcontar120 Moderator, RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,635 Unicorn
    You should simply be able to do an inner join with your data dictionary, using the Join operator, after you have generated the list of words and counts. This will leave you with only the words in the dictionary plus their counts (see the sketch below).
    Brian T.
    Lindon Ventures 
    Data Science Consulting from Certified RapidMiner Experts
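    To illustrate the idea outside RapidMiner, here is a small pandas sketch of that join (the data frames are made-up examples; "word" is the key column that WordList to Data produces):

    import pandas as pd

    # Word list generated from a document: one row per word with its count.
    counts = pd.DataFrame({"word": ["goal", "match", "team", "weather"],
                           "total": [213, 57, 41, 3]})

    # Data dictionary: just the words of interest.
    dictionary = pd.DataFrame({"word": ["goal", "team", "transfer"]})

    # Inner join keeps only dictionary words, with their real counts intact.
    result = counts.merge(dictionary, on="word", how="inner")
    print(result)
    #    word  total
    # 0  goal    213
    # 1  team     41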
  • b00122599 Member Posts: 26 Contributor II
    Thanks very much. I have tried the inner join below, but I'm afraid it gives me the same result: I just get the words from the data dictionary that appear, each with a count of 1. Thanks again for the help.

    <?xml version="1.0" encoding="UTF-8"?><process version="9.7.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.7.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <operator activated="true" class="concurrency:loop_files" compatibility="9.7.000" expanded="true" height="82" name="Loop Files" width="90" x="112" y="85">
            <parameter key="directory" value="D:/OneDrive/College/Blanch/Year2/Project/docs/Complete/Mufc/arabic"/>
            <parameter key="filter_type" value="glob"/>
            <parameter key="recursive" value="false"/>
            <parameter key="enable_macros" value="true"/>
            <parameter key="macro_for_file_name" value="file_name"/>
            <parameter key="macro_for_file_type" value="file_type"/>
            <parameter key="macro_for_folder_name" value="folder_name"/>
            <parameter key="reuse_results" value="false"/>
            <parameter key="enable_parallel_execution" value="true"/>
            <process expanded="true">
              <operator activated="true" class="text:read_document" compatibility="9.3.001" expanded="true" height="68" name="Read Document" width="90" x="45" y="136">
                <parameter key="extract_text_only" value="true"/>
                <parameter key="use_file_extension_as_type" value="true"/>
                <parameter key="content_type" value="txt"/>
                <parameter key="encoding" value="SYSTEM"/>
              </operator>
              <operator activated="true" class="text:process_document_from_file" compatibility="9.3.001" expanded="true" height="82" name="Process Documents from Files" width="90" x="179" y="34">
                <list key="text_directories">
                  <parameter key="list" value="C:/Users/Neil/Desktop/test"/>
                </list>
                <parameter key="file_pattern" value="*"/>
                <parameter key="extract_text_only" value="true"/>
                <parameter key="use_file_extension_as_type" value="true"/>
                <parameter key="content_type" value="txt"/>
                <parameter key="encoding" value="SYSTEM"/>
                <parameter key="create_word_vector" value="true"/>
                <parameter key="vector_creation" value="Term Occurrences"/>
                <parameter key="add_meta_information" value="true"/>
                <parameter key="keep_text" value="true"/>
                <parameter key="prune_method" value="none"/>
                <parameter key="prune_below_percent" value="3.0"/>
                <parameter key="prune_above_percent" value="30.0"/>
                <parameter key="prune_below_rank" value="0.05"/>
                <parameter key="prune_above_rank" value="0.95"/>
                <parameter key="datamanagement" value="double_sparse_array"/>
                <parameter key="data_management" value="auto"/>
                <process expanded="true">
                  <operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize" width="90" x="179" y="85">
                    <parameter key="mode" value="non letters"/>
                    <parameter key="characters" value=".:"/>
                    <parameter key="language" value="English"/>
                    <parameter key="max_token_length" value="3"/>
                  </operator>
                  <operator activated="true" class="text:transform_cases" compatibility="9.3.001" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="85">
                    <parameter key="transform_to" value="lower case"/>
                  </operator>
                  <operator activated="true" class="text:filter_stopwords_english" compatibility="9.3.001" expanded="true" height="68" name="Filter Stopwords (English) (2)" width="90" x="715" y="85"/>
                  <operator activated="true" class="text:filter_by_length" compatibility="9.3.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="983" y="85">
                    <parameter key="min_chars" value="4"/>
                    <parameter key="max_chars" value="25"/>
                  </operator>
                  <connect from_port="document" to_op="Tokenize" to_port="document"/>
                  <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
                  <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English) (2)" to_port="document"/>
                  <connect from_op="Filter Stopwords (English) (2)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
                  <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_document" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <operator activated="true" class="text:wordlist_to_data" compatibility="9.3.001" expanded="true" height="82" name="WordList to Data (2)" width="90" x="380" y="34"/>
              <operator activated="true" class="text:process_documents" compatibility="9.3.001" expanded="true" height="103" name="Process Documents (2)" width="90" x="246" y="238">
                <parameter key="create_word_vector" value="true"/>
                <parameter key="vector_creation" value="Term Occurrences"/>
                <parameter key="add_meta_information" value="true"/>
                <parameter key="keep_text" value="true"/>
                <parameter key="prune_method" value="none"/>
                <parameter key="prune_below_percent" value="3.0"/>
                <parameter key="prune_above_percent" value="30.0"/>
                <parameter key="prune_below_rank" value="5.0"/>
                <parameter key="prune_above_rank" value="5.0"/>
                <parameter key="datamanagement" value="double_sparse_array"/>
                <parameter key="data_management" value="auto"/>
                <process expanded="true">
                  <operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="85">
                    <parameter key="mode" value="non letters"/>
                    <parameter key="characters" value=".:"/>
                    <parameter key="language" value="English"/>
                    <parameter key="max_token_length" value="3"/>
                  </operator>
                  <operator activated="true" class="text:transform_cases" compatibility="9.3.001" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="85">
                    <parameter key="transform_to" value="lower case"/>
                  </operator>
                  <operator activated="true" class="text:filter_stopwords_english" compatibility="9.3.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="313" y="85"/>
                  <operator activated="true" class="text:filter_by_length" compatibility="9.3.001" expanded="true" height="68" name="Filter Tokens (by Length) (2)" width="90" x="514" y="85">
                    <parameter key="min_chars" value="4"/>
                    <parameter key="max_chars" value="25"/>
                  </operator>
                  <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
                  <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
                  <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
                  <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length) (2)" to_port="document"/>
                  <connect from_op="Filter Tokens (by Length) (2)" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_document" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <operator activated="true" class="text:wordlist_to_data" compatibility="9.3.001" expanded="true" height="82" name="WordList to Data" width="90" x="380" y="136"/>
              <operator activated="true" class="concurrency:join" compatibility="9.7.000" expanded="true" height="82" name="Join" width="90" x="581" y="34">
                <parameter key="remove_double_attributes" value="true"/>
                <parameter key="join_type" value="inner"/>
                <parameter key="use_id_attribute_as_key" value="false"/>
                <list key="key_attributes">
                  <parameter key="word" value="word"/>
                </list>
                <parameter key="keep_both_join_attributes" value="false"/>
              </operator>
              <operator activated="true" class="generate_attributes" compatibility="6.4.000" expanded="true" height="82" name="Generate Attributes" width="90" x="581" y="187">
                <list key="function_descriptions">
                  <parameter key="filename" value="macro(&quot;file_name&quot;)"/>
                </list>
                <parameter key="keep_all" value="true"/>
              </operator>
              <operator activated="true" class="sort" compatibility="9.7.000" expanded="true" height="82" name="Sort" width="90" x="581" y="340">
                <parameter key="attribute_name" value="total"/>
                <parameter key="sorting_direction" value="decreasing"/>
              </operator>
              <operator activated="true" class="filter_example_range" compatibility="9.7.000" expanded="true" height="82" name="Filter Example Range" width="90" x="715" y="187">
                <parameter key="first_example" value="1"/>
                <parameter key="last_example" value="10"/>
                <parameter key="invert_filter" value="false"/>
              </operator>
              <connect from_port="file object" to_op="Read Document" to_port="file"/>
              <connect from_op="Read Document" from_port="output" to_op="Process Documents (2)" to_port="documents 1"/>
              <connect from_op="Process Documents from Files" from_port="word list" to_op="WordList to Data (2)" to_port="word list"/>
              <connect from_op="WordList to Data (2)" from_port="example set" to_op="Join" to_port="left"/>
              <connect from_op="Process Documents (2)" from_port="word list" to_op="WordList to Data" to_port="word list"/>
              <connect from_op="WordList to Data" from_port="example set" to_op="Join" to_port="right"/>
              <connect from_op="Join" from_port="join" to_op="Generate Attributes" to_port="example set input"/>
              <connect from_op="Generate Attributes" from_port="example set output" to_op="Sort" to_port="example set input"/>
              <connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
              <connect from_op="Filter Example Range" from_port="example set output" to_port="output 1"/>
              <portSpacing port="source_file object" spacing="0"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="input 1" to_op="Loop Files" to_port="input 1"/>
          <connect from_op="Loop Files" from_port="output 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
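
    One thing worth checking, going by the XML above (a guess, not a confirmed diagnosis): both Join inputs come from WordList to Data, so each carries its own "total" column, and with remove_double_attributes enabled the duplicate from one side is dropped. The left input here is the word list built from the dictionary file, where every word occurs exactly once; if its "total" is the one that survives, then sorting on "total" will show 1 for every word. A pandas sketch of the pitfall (made-up numbers):

    import pandas as pd

    # Left input of the Join: word list from the dictionary file.
    # Every word appears once, so its 'total' is always 1.
    dict_counts = pd.DataFrame({"word": ["goal", "team"], "total": [1, 1]})

    # Right input: the real counts computed from the documents.
    doc_counts = pd.DataFrame({"word": ["goal", "team"], "total": [213, 41]})

    # If the right-hand duplicate is the one that gets dropped/renamed,
    # the surviving 'total' is the dictionary side's constant 1.
    joined = dict_counts.merge(doc_counts, on="word", how="inner",
                               suffixes=("", "_docs"))
    print(joined[["word", "total"]])       # shows 1 for every word
    print(joined[["word", "total_docs"]])  # the real counts are here

    If that is what is happening, swapping the left and right inputs of the Join, or enabling keep_both_join_attributes and sorting on the document-side total, should bring back the real counts.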