Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

LDA results (number of topics and words per topic)

ayaRizk Member Posts: 6 Contributor II
edited November 2022 in Help
Hi, I am trying to extract topics from files and have set the number of topics to be extracted to 50 and the top words per topic to 20 (defaults are 10 and 5 respectively). But when I run the process I get about 90 topics (that also varies with different runs) and more than 50 words per topic. 

I am not sure what I'm doing wrong. Attaching the process here - appreciate the help!

Edit: I have now realized the source of the problem. The parent operator, "Optimize Parameters (Grid)", has an option to set the max number of topics and words per topic as its chosen parameters. I was setting their values in the LDA operator and forgetting the ones in the parent operator, which seem to override the LDA values.

Regards,
Aya

<?xml version="1.0" encoding="UTF-8"?><process version="9.10.011">
  <!-- Process context: no inputs, outputs or macros are declared for this process. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.10.011" expanded="true" name="Process">
    <!-- Root process settings: log verbosity "init", global random seed 2001,
         mail notification switched off (send_mail = never, empty address),
         and the system default encoding. -->
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="concurrency:loop_files" compatibility="9.10.011" expanded="true" height="82" name="Loop Files" width="90" x="112" y="34">
        <parameter key="directory" value="/Users/ayari88/Documents/Research/AFA/ROBOT/Transcriptions"/>
        <parameter key="filter_type" value="regex"/>
        <parameter key="filter_by_regex" value=".*\.docx$"/>
        <parameter key="recursive" value="true"/>
        <parameter key="skip_inaccessible" value="true"/>
        <parameter key="enable_macros" value="false"/>
        <parameter key="macro_for_file_name" value="file_name"/>
        <parameter key="macro_for_file_type" value="file_type"/>
        <parameter key="macro_for_folder_name" value="folder_name"/>
        <parameter key="reuse_results" value="false"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true" class="operator_toolbox:read_word_files" compatibility="2.14.000" expanded="true" height="68" name="Read Office File" width="90" x="380" y="34">
            <parameter key="detect_file_type" value="false"/>
            <parameter key="file_extension" value="docx"/>
          </operator>
          <connect from_port="file object" to_op="Read Office File" to_port="file"/>
          <connect from_op="Read Office File" from_port="doc" to_port="output 1"/>
          <portSpacing port="source_file object" spacing="0"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Loop Collection: text preprocessing applied to the collection produced by
           Loop Files. The active chain, per the connections at the bottom of this
           block, is: Tokenize, then Transform Cases, then Filter Stopwords (Dictionary),
           then Stem (Snowball), whose document is delivered on "output 1". -->
      <operator activated="true" class="loop_collection" compatibility="9.10.011" expanded="true" height="82" name="Loop Collection" width="90" x="313" y="34">
        <parameter key="set_iteration_macro" value="false"/>
        <parameter key="macro_name" value="iteration"/>
        <parameter key="macro_start_value" value="1"/>
        <parameter key="unfold" value="false"/>
        <process expanded="true">
          <!-- Splits the document on non-letter characters.
               NOTE(review): language is "English" here while the stemmer below uses
               "Swedish"; confirm whether the language setting matters in "non letters" mode. -->
          <operator activated="true" class="text:tokenize" compatibility="9.4.000" expanded="true" height="68" name="Tokenize" width="90" x="112" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <!-- Lower-cases all tokens. -->
          <operator activated="true" class="text:transform_cases" compatibility="9.4.000" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="34">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <!-- The next four operators (Documents to Data, JSON To Data, Read CSV, Retrieve)
               are deactivated and none of them appears in the connections below. -->
          <operator activated="false" class="text:documents_to_data" compatibility="9.4.000" expanded="true" height="68" name="Documents to Data" width="90" x="313" y="442">
            <parameter key="add_meta_information" value="true"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="use_processed_text" value="false"/>
          </operator>
          <operator activated="false" class="text:json_to_data" compatibility="9.4.000" expanded="true" height="68" name="JSON To Data" width="90" x="179" y="442">
            <parameter key="ignore_arrays" value="false"/>
            <parameter key="limit_attributes" value="false"/>
            <parameter key="skip_invalid_documents" value="false"/>
            <parameter key="guess_data_types" value="true"/>
            <parameter key="keep_missing_attributes" value="false"/>
            <parameter key="missing_values_aliases" value=", null, NaN, missing"/>
          </operator>
          <operator activated="false" class="read_csv" compatibility="9.10.011" expanded="true" height="68" name="Read CSV" width="90" x="447" y="442">
            <parameter key="column_separators" value=";"/>
            <parameter key="trim_lines" value="false"/>
            <parameter key="use_quotes" value="true"/>
            <parameter key="quotes_character" value="&quot;"/>
            <parameter key="escape_character" value="\"/>
            <parameter key="skip_comments" value="false"/>
            <parameter key="comment_characters" value="#"/>
            <parameter key="starting_row" value="1"/>
            <parameter key="parse_numbers" value="true"/>
            <parameter key="decimal_character" value="."/>
            <parameter key="grouped_digits" value="false"/>
            <parameter key="grouping_character" value=","/>
            <parameter key="infinity_representation" value=""/>
            <parameter key="date_format" value=""/>
            <parameter key="first_row_as_names" value="true"/>
            <list key="annotations"/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="locale" value="English (United States)"/>
            <parameter key="encoding" value="SYSTEM"/>
            <parameter key="read_all_values_as_polynominal" value="false"/>
            <list key="data_set_meta_data_information"/>
            <parameter key="read_not_matching_values_as_missings" value="true"/>
          </operator>
          <operator activated="false" class="retrieve" compatibility="9.10.011" expanded="true" height="68" name="Retrieve" width="90" x="581" y="442">
            <parameter key="repository_entry" value="data/ROBOT/sv_stopwords_114"/>
          </operator>
          <!-- Opens the stopword CSV from an absolute local path; its "file" output feeds
               the "file" port of Filter Stopwords (Dictionary). -->
          <operator activated="true" class="open_file" compatibility="9.10.011" expanded="true" height="68" name="Open File" width="90" x="246" y="136">
            <parameter key="resource_type" value="file"/>
            <parameter key="filename" value="/Users/ayari88/Documents/Research/AFA/ROBOT/RapidMiner/sv_stopwords_418.csv"/>
          </operator>
          <!-- Removes tokens found in the stopword file, ignoring case. -->
          <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="9.4.000" expanded="true" height="82" name="Filter Stopwords (Dictionary)" width="90" x="380" y="34">
            <parameter key="case_sensitive" value="false"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <!-- Snowball stemming with the Swedish stemmer. -->
          <operator activated="true" class="text:stem_snowball" compatibility="9.4.000" expanded="true" height="68" name="Stem (Snowball)" width="90" x="514" y="34">
            <parameter key="language" value="Swedish"/>
          </operator>
          <!-- Wiring of the active chain described at the top of this block. -->
          <connect from_port="single" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
          <connect from_op="Open File" from_port="file" to_op="Filter Stopwords (Dictionary)" to_port="file"/>
          <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="concurrency:optimize_parameters_grid" compatibility="9.10.011" expanded="true" height="166" name="Optimize Parameters (Grid)" width="90" x="514" y="34">
        <list key="parameters">
          <parameter key="Extract Topics from Documents (LDA).number_of_topics" value="[1.0;100.0;10;linear]"/>
          <parameter key="Extract Topics from Documents (LDA).iterations" value="[1.0;100.0;10;linear]"/>
          <parameter key="Extract Topics from Documents (LDA).top_words_per_topic" value="[1.0;100.0;10;linear]"/>
        </list>
        <parameter key="error_handling" value="fail on error"/>
        <parameter key="log_performance" value="true"/>
        <parameter key="log_all_criteria" value="false"/>
        <parameter key="synchronize" value="false"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true" class="operator_toolbox:lda" compatibility="2.14.000" expanded="true" height="124" name="Extract Topics from Documents (LDA)" width="90" x="179" y="34">
            <parameter key="number_of_topics" value="50"/>
            <parameter key="show_optimization_settings" value="true"/>
            <parameter key="use_alpha_heuristics" value="true"/>
            <parameter key="alpha_sum" value="0.1"/>
            <parameter key="use_beta_heuristics" value="true"/>
            <parameter key="beta" value="0.01"/>
            <parameter key="optimize_hyperparameters" value="true"/>
            <parameter key="optimize_interval_for_hyperparameters" value="10"/>
            <parameter key="iterations" value="1000"/>
            <parameter key="top_words_per_topic" value="20"/>
            <parameter key="stopword language" value="swedish"/>
            <parameter key="reproducible" value="false"/>
            <parameter key="enable_logging" value="false"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="include_meta_data" value="true"/>
          </operator>
          <connect from_port="input 1" to_op="Extract Topics from Documents (LDA)" to_port="col"/>
          <connect from_op="Extract Topics from Documents (LDA)" from_port="exa" to_port="output 1"/>
          <connect from_op="Extract Topics from Documents (LDA)" from_port="top" to_port="output 2"/>
          <connect from_op="Extract Topics from Documents (LDA)" from_port="per" to_port="performance"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_performance" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
          <portSpacing port="sink_output 3" spacing="0"/>
        </process>
      </operator>
      <!-- Top-level wiring: Loop Files output 1 feeds Loop Collection, whose output 1
           feeds Optimize Parameters (Grid). The optimizer's performance, parameter set,
           output 1 and output 2 are delivered as process results 1 to 4. -->
      <connect from_op="Loop Files" from_port="output 1" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Loop Collection" from_port="output 1" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
      <connect from_op="Optimize Parameters (Grid)" from_port="performance" to_port="result 1"/>
      <connect from_op="Optimize Parameters (Grid)" from_port="parameter set" to_port="result 2"/>
      <connect from_op="Optimize Parameters (Grid)" from_port="output 1" to_port="result 3"/>
      <connect from_op="Optimize Parameters (Grid)" from_port="output 2" to_port="result 4"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>

Sign In or Register to comment.