Options

Extract topics from data (LDA)

Lottew44Lottew44 Member Posts: 2 Learner I
Hi

Via Process documents from files (subprocess: transform cases > tokenize > filter stopwords > filter tokens by length) I have created an example set based on 6 documents where I want to extract the topics from via the Extract topics from Data (DLA) operator and everything works just fine but I still have words of less than 4 tokens emerging in the wordlist that belong to the topics, as well as stopwords (the, and...). Does anyone knows what else I can do to solve this? I already used the filter stopwords and filter tokens by length (4 - 25) operator in the first step but I'm apparently doing something wrong since I still have those meaningless words in the topic list. @mschmitz can you maybe help?

This the XML file - Thank you very much!

<?xml version="1.0" encoding="UTF-8"?><process version="9.6.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.6.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="text:process_document_from_file" compatibility="9.3.001" expanded="true" height="82" name="Process Documents from Files" width="90" x="112" y="34">
        <list key="text_directories">
          <parameter key="MembersCoops" value="F:\2019-2020 Thesis MBA\Bylaws Coops\Labour-members"/>
        </list>
        <parameter key="file_pattern" value="*"/>
        <parameter key="extract_text_only" value="true"/>
        <parameter key="use_file_extension_as_type" value="true"/>
        <parameter key="content_type" value="txt"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_absolute" value="3"/>
        <parameter key="prune_above_absolute" value="9999"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <process expanded="true">
          <operator activated="true" class="text:transform_cases" compatibility="9.3.001" expanded="true" height="68" name="Transform Cases" width="90" x="45" y="34">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize" width="90" x="179" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="9.3.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="313" y="34"/>
          <operator activated="true" class="text:filter_by_length" compatibility="9.3.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="447" y="34">
            <parameter key="min_chars" value="4"/>
            <parameter key="max_chars" value="20"/>
          </operator>
          <operator activated="true" class="text:stem_porter" compatibility="9.3.001" expanded="true" height="68" name="Stem (Porter)" width="90" x="581" y="34"/>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
          <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="operator_toolbox:lda_exampleset" compatibility="2.4.000" expanded="true" height="124" name="Extract Topics from Data (LDA)" width="90" x="380" y="136">
        <parameter key="text_attribute" value="text"/>
        <parameter key="number_of_topics" value="10"/>
        <parameter key="use_alpha_heuristics" value="true"/>
        <parameter key="alpha_sum" value="0.1"/>
        <parameter key="use_beta_heuristics" value="true"/>
        <parameter key="beta" value="0.01"/>
        <parameter key="optimize_hyperparameters" value="true"/>
        <parameter key="optimize_interval_for_hyperparameters" value="10"/>
        <parameter key="top_words_per_topic" value="5"/>
        <parameter key="iterations" value="1000"/>
        <parameter key="reproducible" value="false"/>
        <parameter key="enable_logging" value="false"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Extract Topics from Data (LDA)" to_port="exa"/>
      <connect from_op="Process Documents from Files" from_port="word list" to_port="result 1"/>
      <connect from_op="Extract Topics from Data (LDA)" from_port="exa" to_port="result 2"/>
      <connect from_op="Extract Topics from Data (LDA)" from_port="top" to_port="result 3"/>
      <connect from_op="Extract Topics from Data (LDA)" from_port="mod" to_port="result 4"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>

Answers

  • Options
    Telcontar120Telcontar120 Moderator, RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,635 Unicorn
    I seem to remember having a similar issue before with this operator, but I don't recall the solution.
    @mschmitz should be able to shed some light.
    In the meantime what happens if you store the data as a new exampleset after text data processing and then retrieve it as a fresh set before you run the LDA extract topics?  That should cut off its access to the various stopwords and short tokens.
    Brian T.
    Lindon Ventures 
    Data Science Consulting from Certified RapidMiner Experts
  • Options
    MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,510 RM Data Scientist
    Hi,

    i am frankly suprised that this even works at all? Isnt Process Documents creating a TF-IDF vector?

    Best,
    Martin
    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany
Sign In or Register to comment.