Fragen zu Textmining

ZaramotZaramot Member Posts: 3 Newbie
Hallo Leute. Ich benutze Rapidminer für Textmining. Jedoch hab ich paar Schwierigkeiten. Ich hoffe ihr könnt mir dabei weiterhelfen

1. Ich möchte in einem Dokument nach bestimmten Wörtern und Wortketten suchen. Zum Beispiel nach dem Wort "Baum" und "Bäume, Äste". Bei den Wortketten soll er mir eben einen Satz ausgeben, in dem die Wörter Bäume und Äste vorkommen.
2. Nachdem ich das nun gemacht habe, wie kriege ich den Output in ein Wortdokument?

Danke für die Hilfe

Answers

  • MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,503 RM Data Scientist
    Hi,
    anbei eine Variante das zu lösen (es gibt viele).
    LG,
    Martin

    <?xml version="1.0" encoding="UTF-8"?><process version="9.7.002">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.7.002" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <operator activated="true" class="utility:create_exampleset" compatibility="9.7.002" expanded="true" height="68" name="Create ExampleSet" width="90" x="246" y="34">
            <parameter key="generator_type" value="comma separated text"/>
            <parameter key="number_of_examples" value="100"/>
            <parameter key="use_stepsize" value="false"/>
            <list key="function_descriptions"/>
            <parameter key="add_id_attribute" value="false"/>
            <list key="numeric_series_configuration"/>
            <list key="date_series_configuration"/>
            <list key="date_series_configuration (interval)"/>
            <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="input_csv_text" value="Text&#10;Ein Text mit Ästen&#10;Ein Text mit Bäumen&#10;Ein Text mit Ästen und Bäumen&#10;Ein anderer Text"/>
            <parameter key="column_separator" value=","/>
            <parameter key="parse_all_as_nominal" value="true"/>
            <parameter key="decimal_point_character" value="."/>
            <parameter key="trim_attribute_names" value="true"/>
            <description align="center" color="transparent" colored="false" width="126">Wahrscheinlich hier eher Read CSV</description>
          </operator>
          <operator activated="true" class="nominal_to_text" compatibility="9.7.002" expanded="true" height="82" name="Nominal to Text" width="90" x="380" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Text"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <operator activated="true" class="text:process_document_from_data" compatibility="9.3.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="514" y="34">
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="Term Occurrences"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="true"/>
            <parameter key="prune_method" value="none"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize" width="90" x="246" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:stem_snowball" compatibility="9.3.001" expanded="true" height="68" name="Stem (Snowball)" width="90" x="581" y="34">
                <parameter key="language" value="German"/>
              </operator>
              <operator activated="false" class="text:stem_german" compatibility="9.3.001" expanded="true" height="68" name="Stem (German)" width="90" x="380" y="34"/>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
              <connect from_op="Stem (Snowball)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="generate_aggregation" compatibility="9.7.002" expanded="true" height="82" name="Generate Aggregation" width="90" x="782" y="34">
            <parameter key="attribute_name" value="Indicator"/>
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="ast|baum"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="aggregation_function" value="sum"/>
            <parameter key="concatenation_separator" value="|"/>
            <parameter key="keep_all" value="true"/>
            <parameter key="ignore_missings" value="true"/>
            <parameter key="ignore_missing_attributes" value="false"/>
            <description align="center" color="transparent" colored="false" width="126">Auch wenn das Attribut nicht in der liste auftaucht, kannst du das trotzdem hinzuf&amp;#252;gen!</description>
          </operator>
          <operator activated="true" class="filter_examples" compatibility="9.7.002" expanded="true" height="103" name="Filter Examples" width="90" x="916" y="34">
            <parameter key="parameter_expression" value=""/>
            <parameter key="condition_class" value="custom_filters"/>
            <parameter key="invert_filter" value="false"/>
            <list key="filters_list">
              <parameter key="filters_entry_key" value="Indicator.ge.2"/>
            </list>
            <parameter key="filters_logic_and" value="true"/>
            <parameter key="filters_check_metadata" value="true"/>
          </operator>
          <operator activated="false" class="generate_attributes" compatibility="9.7.002" expanded="true" height="82" name="Generate Attributes" width="90" x="514" y="289">
            <list key="function_descriptions"/>
            <parameter key="keep_all" value="true"/>
          </operator>
          <connect from_op="Create ExampleSet" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Generate Aggregation" to_port="example set input"/>
          <connect from_op="Generate Aggregation" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>




    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany
  • MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,503 RM Data Scientist
    mir ist gerade aufgefallen, dass es einen viel einfacheren Weg dazu gibt.. Einfach Filter Examples, sie angehangens Beispiel.

    LG,
    Martin

    <?xml version="1.0" encoding="UTF-8"?><process version="9.7.002">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.7.002" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <operator activated="true" class="utility:create_exampleset" compatibility="9.7.002" expanded="true" height="68" name="Create ExampleSet" width="90" x="179" y="136">
            <parameter key="generator_type" value="comma separated text"/>
            <parameter key="number_of_examples" value="100"/>
            <parameter key="use_stepsize" value="false"/>
            <list key="function_descriptions"/>
            <parameter key="add_id_attribute" value="false"/>
            <list key="numeric_series_configuration"/>
            <list key="date_series_configuration"/>
            <list key="date_series_configuration (interval)"/>
            <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="input_csv_text" value="Text&#10;Ein Satz mit Ästen&#10;Ein Satz mit Bäumen&#10;Ein Satz ohne etwas"/>
            <parameter key="column_separator" value=","/>
            <parameter key="parse_all_as_nominal" value="false"/>
            <parameter key="decimal_point_character" value="."/>
            <parameter key="trim_attribute_names" value="true"/>
          </operator>
          <operator activated="true" class="filter_examples" compatibility="9.7.002" expanded="true" height="103" name="Filter Examples" width="90" x="313" y="136">
            <parameter key="parameter_expression" value=""/>
            <parameter key="condition_class" value="custom_filters"/>
            <parameter key="invert_filter" value="false"/>
            <list key="filters_list">
              <parameter key="filters_entry_key" value="Text.contains.Äste"/>
            </list>
            <parameter key="filters_logic_and" value="true"/>
            <parameter key="filters_check_metadata" value="true"/>
          </operator>
          <connect from_op="Create ExampleSet" from_port="output" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>


    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany
Sign In or Register to comment.