Extract all Uppercase words from document into new attributes

Hyperrick · October 2020

Hello RM forum,

I'd like to extract all matches from an example set to a new attribute or even to multiple new attributes.

Example text:

We love DOGS. But CATS are cooler than BROWN FOXES.

This is my RegEx which finds all occurences (operator "generate extract":

The Problem is, that it will only generate the first match as a new attribute:

I want to get a new attribute containing all matches like:

test
DOGS;CATS;BROWN;FOXES

<?xml version="1.0" encoding="UTF-8"?><process version="9.7.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.7.002" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="utility:create_exampleset" compatibility="9.7.002" expanded="true" height="68" name="Create ExampleSet" width="90" x="246" y="238">
        <parameter key="generator_type" value="comma separated text"/>
        <parameter key="number_of_examples" value="100"/>
        <parameter key="use_stepsize" value="false"/>
        <list key="function_descriptions"/>
        <parameter key="add_id_attribute" value="false"/>
        <list key="numeric_series_configuration"/>
        <list key="date_series_configuration"/>
        <list key="date_series_configuration (interval)"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="input_csv_text" value="text&#10;We love DOGS. But CATS are cooler than BROWN FOXES."/>
        <parameter key="column_separator" value=","/>
        <parameter key="parse_all_as_nominal" value="false"/>
        <parameter key="decimal_point_character" value="."/>
        <parameter key="trim_attribute_names" value="true"/>
      </operator>
      <operator activated="true" class="text:generate_extract" compatibility="9.3.001" expanded="true" height="68" name="Generate Extract" width="90" x="447" y="238">
        <parameter key="source_attribute" value="text"/>
        <parameter key="query_type" value="Regular Expression"/>
        <list key="string_machting_queries"/>
        <parameter key="attribute_type" value="Nominal"/>
        <list key="regular_expression_queries">
          <parameter key="test" value="[A-Z]{1}[A-Z]{3,}"/>
        </list>
        <list key="regular_region_queries"/>
        <list key="xpath_queries"/>
        <list key="namespaces"/>
        <parameter key="ignore_CDATA" value="true"/>
        <parameter key="assume_html" value="true"/>
        <list key="index_queries"/>
        <list key="jsonpath_queries"/>
      </operator>
      <connect from_op="Create ExampleSet" from_port="output" to_op="Generate Extract" to_port="Example Set"/>
      <connect from_op="Generate Extract" from_port="Example Set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Is there a way to do this in Rapidminer?

Kind regards,

Patrick

MarcoBarradas · October 2020

@Hyperrick you could play with tokenization on the text minning extension

Its not the final result but it would give you an idea.

<?xml version="1.0" encoding="UTF-8"?><process version="9.7.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.7.002" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="text:create_document" compatibility="9.3.001" expanded="true" height="68" name="Create Document" width="90" x="179" y="136">
        <parameter key="text" value="We love DOGS. But CATS are cooler than BROWN FOXES."/>
        <parameter key="add label" value="false"/>
        <parameter key="label_type" value="nominal"/>
      </operator>
      <operator activated="true" class="text:process_documents" compatibility="9.3.001" expanded="true" height="103" name="Process Documents" width="90" x="313" y="34">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize" width="90" x="313" y="34">
            <parameter key="mode" value="regular expression"/>
            <parameter key="characters" value=".:"/>
            <parameter key="expression" value="[a-z]{1}[a-z]{3,}"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Create Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Hyperrick · October 2020

Hi Marco. Thank you for your approach. That is a good way, I'll try this one!

Patrick

JEdward · October 2020

@Hyperrick
I would take a different approach to Marco, should achieve a similar result, but puts all the upper case words into a single attribute.
Try this one out and see what you think.

<?xml version="1.0" encoding="UTF-8"?><process version="9.7.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="proccan be separated ess" compatibility="9.7.002" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="utility:create_exampleset" compatibility="9.7.002" expanded="true" height="68" name="Create ExampleSet" width="90" x="112" y="34">
        <parameter key="generator_type" value="comma separated text"/>
        <parameter key="number_of_examples" value="100"/>
        <parameter key="use_stepsize" value="false"/>
        <list key="function_descriptions"/>
        <parameter key="add_id_attribute" value="false"/>
        <list key="numeric_series_configuration"/>
        <list key="date_series_configuration"/>
        <list key="date_series_configuration (interval)"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="input_csv_text" value="text&#10;We love DOGS. But CATS are cooler than BROWN FOXES.&#10;They are QUICK but we are QUICKER. "/>
        <parameter key="column_separator" value=","/>
        <parameter key="parse_all_as_nominal" value="false"/>
        <parameter key="decimal_point_character" value="."/>
        <parameter key="trim_attribute_names" value="true"/>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="9.7.002" expanded="true" height="82" name="Nominal to Text" width="90" x="246" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="text"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="9.3.001" expanded="true" height="68" name="Data to Documents" width="90" x="380" y="34">
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <description align="center" color="transparent" colored="false" width="126">Turn each record into a collection of documents.</description>
      </operator>
      <operator activated="true" class="loop_collection" compatibility="9.7.002" expanded="true" height="103" name="Loop Collection" width="90" x="514" y="34">
        <parameter key="set_iteration_macro" value="true"/>
        <parameter key="macro_name" value="iteration"/>
        <parameter key="macro_start_value" value="1"/>
        <parameter key="unfold" value="false"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize" width="90" x="112" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
            <description align="center" color="transparent" colored="false" width="126">Use whatever tokenisation pattern suits your document.</description>
          </operator>
          <operator activated="true" class="converters:extract_tokens" compatibility="0.7.000" expanded="true" height="103" name="Extract Tokens" width="90" x="246" y="34">
            <description align="center" color="transparent" colored="false" width="126">Take the tokens and turn them into an example set.</description>
          </operator>
          <operator activated="true" class="text:documents_to_data" compatibility="9.3.001" expanded="true" height="82" name="Documents to Data" width="90" x="514" y="238">
            <parameter key="text_attribute" value="text"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="use_processed_text" value="false"/>
            <description align="center" color="transparent" colored="false" width="126">Turn the document back into data.</description>
          </operator>
          <operator activated="true" class="generate_attributes" compatibility="9.7.002" expanded="true" height="82" name="Generate Attributes" width="90" x="916" y="187">
            <list key="function_descriptions">
              <parameter key="_JOINID" value="%{iteration}"/>
            </list>
            <parameter key="keep_all" value="true"/>
            <description align="center" color="transparent" colored="false" width="126">Add a join ID</description>
          </operator>
          <operator activated="true" class="filter_examples" compatibility="9.7.002" expanded="true" height="103" name="Filter Examples" width="90" x="514" y="34">
            <parameter key="parameter_expression" value=""/>
            <parameter key="condition_class" value="custom_filters"/>
            <parameter key="invert_filter" value="false"/>
            <list key="filters_list">
              <parameter key="filters_entry_key" value="Token.matches.[A-Z]*"/>
            </list>
            <parameter key="filters_logic_and" value="true"/>
            <parameter key="filters_check_metadata" value="true"/>
            <description align="center" color="transparent" colored="false" width="126">Keep only the UPPER CASE tokens.</description>
          </operator>
          <operator activated="true" class="aggregate" compatibility="9.7.002" expanded="true" height="82" name="Aggregate" width="90" x="648" y="34">
            <parameter key="use_default_aggregation" value="false"/>
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="default_aggregation_function" value="average"/>
            <list key="aggregation_attributes">
              <parameter key="Token" value="concatenation"/>
            </list>
            <parameter key="group_by_attributes" value=""/>
            <parameter key="count_all_combinations" value="false"/>
            <parameter key="only_distinct" value="false"/>
            <parameter key="ignore_missings" value="true"/>
            <description align="center" color="transparent" colored="false" width="126">This aggregates the tokens into one record.</description>
          </operator>
          <operator activated="true" class="generate_attributes" compatibility="9.7.002" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="782" y="34">
            <list key="function_descriptions">
              <parameter key="_JOINID" value="%{iteration}"/>
            </list>
            <parameter key="keep_all" value="true"/>
            <description align="center" color="transparent" colored="false" width="126">Add a join ID</description>
          </operator>
          <operator activated="true" class="rename" compatibility="9.7.002" expanded="true" height="82" name="Rename" width="90" x="916" y="34">
            <parameter key="old_name" value="concat(Token)"/>
            <parameter key="new_name" value="Token"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <connect from_port="single" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Extract Tokens" to_port="doc"/>
          <connect from_op="Extract Tokens" from_port="exa" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Extract Tokens" from_port="ori" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_port="output 2"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
          <connect from_op="Aggregate" from_port="example set output" to_op="Generate Attributes (2)" to_port="example set input"/>
          <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
          <portSpacing port="sink_output 3" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">Set an iteration macro which is used later as a Join key.</description>
      </operator>
      <operator activated="true" class="append" compatibility="9.7.002" expanded="true" height="82" name="Append (2)" width="90" x="648" y="136">
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="merge_type" value="all"/>
      </operator>
      <operator activated="true" class="append" compatibility="9.7.002" expanded="true" height="82" name="Append" width="90" x="648" y="34">
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="merge_type" value="all"/>
      </operator>
      <operator activated="true" class="concurrency:join" compatibility="9.7.002" expanded="true" height="82" name="Join" width="90" x="782" y="34">
        <parameter key="remove_double_attributes" value="true"/>
        <parameter key="join_type" value="inner"/>
        <parameter key="use_id_attribute_as_key" value="false"/>
        <list key="key_attributes">
          <parameter key="_JOINID" value="_JOINID"/>
        </list>
        <parameter key="keep_both_join_attributes" value="false"/>
        <description align="center" color="transparent" colored="false" width="126">Join the datasets together using the Join ID</description>
      </operator>
      <connect from_op="Create ExampleSet" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Loop Collection" from_port="output 1" to_op="Append" to_port="example set 1"/>
      <connect from_op="Loop Collection" from_port="output 2" to_op="Append (2)" to_port="example set 1"/>
      <connect from_op="Append (2)" from_port="merged set" to_op="Join" to_port="right"/>
      <connect from_op="Append" from_port="merged set" to_op="Join" to_port="left"/>
      <connect from_op="Join" from_port="join" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Hyperrick · October 2020

Hi JEdward,
thanks for your response - unfortunately I can't import the process. I keep gettigng an error while importing:

Image: https://us.v-cdn.net/6030995/uploads/editor/m7/atfmn3mwlmqb.png

Patrick

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

Extract all Uppercase words from document into new attributes

Best Answer

Answers