Add custom Stopwords Dictionary

ezouliasezoulias Member Posts: 28 Maven
I have used in previous versions the feature of custom stopword dictionary. I tried the same stop word file with version 9.2.001 and by the results seems that it does not work.
Is there any change;

Best regards,
Manolis

Answers

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager
    hi @ezoulias can you please post your XML and your data set / documents? It is the only way we can debug.

    Scott

  • ezouliasezoulias Member Posts: 28 Maven
    Please find attached some data and the stop word file as well as the XML code below.
    <?xml version="1.0" encoding="UTF-8"?><process version="9.2.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <operator activated="true" class="read_excel" compatibility="6.0.003" expanded="true" height="68" name="Read Excel" width="90" x="45" y="30">
            <parameter key="excel_file" value="C:\Data\Tsimaras\data_cleaned.xlsx"/>
            <parameter key="sheet_selection" value="sheet number"/>
            <parameter key="sheet_number" value="1"/>
            <parameter key="imported_cell_range" value="A1"/>
            <parameter key="encoding" value="SYSTEM"/>
            <parameter key="first_row_as_names" value="true"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="date_format" value=""/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="locale" value="English (United States)"/>
            <parameter key="read_all_values_as_polynominal" value="false"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="α/α.true.integer.attribute"/>
              <parameter key="1" value="Σχόλια για συγκεκριμένες πτυχές της δράσης.true.polynominal.attribute"/>
              <parameter key="2" value="Σχόλια για τους εκπαιδευτές.true.polynominal.attribute"/>
              <parameter key="3" value="Πώς θα χαρακτηρίζατε τη συνολική εμπειρία σας στη Δράση;.true.integer.attribute"/>
              <parameter key="4" value="Σύνοψη της συνολικής εμπειρίας σας &amp; σχόλια.true.polynominal.attribute"/>
              <parameter key="5" value="Θα προτείνατε αυτή τη δράση σε κάποιον/αν συνάδελφό σας; Τι θα του/της λέγατε;.true.polynominal.attribute"/>
              <parameter key="6" value="Τι θα προτείνατε για τη βελτίωση της Δράσης;.true.polynominal.attribute"/>
            </list>
            <parameter key="read_not_matching_values_as_missings" value="false"/>
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
          </operator>
          <operator activated="true" class="nominal_to_text" compatibility="6.0.003" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="30">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="136">
            <parameter key="create_word_vector" value="false"/>
            <parameter key="vector_creation" value="TF-IDF"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="absolute"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_absolute" value="2"/>
            <parameter key="prune_above_absolute" value="9999"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="30">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="165">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="8.1.000" expanded="true" height="82" name="Filter Stopwords (Dictionary)" width="90" x="313" y="34">
                <parameter key="file" value="C:\Data\Tsimaras\filter.txt"/>
                <parameter key="case_sensitive" value="true"/>
                <parameter key="encoding" value="UTF-8"/>
              </operator>
              <operator activated="false" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="581" y="30">
                <parameter key="min_chars" value="2"/>
                <parameter key="max_chars" value="9999"/>
              </operator>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
              <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="input 1" to_op="Read Excel" to_port="file"/>
          <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager
    edited April 2019
    hi @ezoulias ok thank you for all this. I loaded your process and data and this seems to be something odd with lower case Greek letters. My hunch is that this is an encoding issue but cannot be sure (and I don't know Greek) but if you convert everything to upper case, the tokens are filtered as expected. Try this process and the attached filter.txt file.

    Scott

    <?xml version="1.0" encoding="UTF-8"?><process version="9.2.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <operator activated="true" class="read_excel" compatibility="6.0.003" expanded="true" height="68" name="Read Excel" width="90" x="45" y="30">
            <parameter key="excel_file" value="data_sent.xlsx"/>
            <parameter key="sheet_selection" value="sheet number"/>
            <parameter key="sheet_number" value="1"/>
            <parameter key="imported_cell_range" value="A1"/>
            <parameter key="encoding" value="SYSTEM"/>
            <parameter key="first_row_as_names" value="true"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="date_format" value=""/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="locale" value="English (United States)"/>
            <parameter key="read_all_values_as_polynominal" value="false"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="α/α.true.integer.attribute"/>
              <parameter key="1" value="Σχόλια για τους εκπαιδευτές.true.polynominal.attribute"/>
              <parameter key="2" value="Πώς θα χαρακτηρίζατε τη συνολική εμπειρία σας στη Δράση;.true.integer.attribute"/>
              <parameter key="3" value="Σύνοψη της συνολικής εμπειρίας σας &amp; σχόλια.true.polynominal.attribute"/>
              <parameter key="4" value="Θα προτείνατε αυτή τη δράση σε κάποιον/αν συνάδελφό σας; Τι θα του/της λέγατε;.true.polynominal.attribute"/>
              <parameter key="5" value="Τι θα προτείνατε για τη βελτίωση της Δράσης;.true.polynominal.attribute"/>
            </list>
            <parameter key="read_not_matching_values_as_missings" value="false"/>
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
          </operator>
          <operator activated="true" class="nominal_to_text" compatibility="6.0.003" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="30">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="34">
            <parameter key="create_word_vector" value="false"/>
            <parameter key="vector_creation" value="TF-IDF"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="absolute"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_absolute" value="2"/>
            <parameter key="prune_above_absolute" value="9999"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34">
                <parameter key="transform_to" value="upper case"/>
              </operator>
              <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="8.1.000" expanded="true" height="82" name="Filter Stopwords (2)" width="90" x="313" y="34">
                <parameter key="file" value="filter.txt"/>
                <parameter key="case_sensitive" value="false"/>
                <parameter key="encoding" value="SYSTEM"/>
              </operator>
              <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
              <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
              <connect from_op="Filter Stopwords (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>
    


Sign In or Register to comment.