Options

Extracting the Tokenized Result Data

maxfaxmaxfax Member Posts: 17 Contributor II
edited November 2018 in Help
Hi,

First I read a txt file, and afterwards I would like to do some simple text-processing steps; then I would like to export the newly stemmed and processed data to a txt file.

I got all the steps working, but I just can't export the results. I don't know if it's clear, but I would like to have a txt file which contains the processed data as it is shown in the result table.
      <operator activated="true" class="text:read_document" compatibility="5.2.004" expanded="true" height="60" name="Read Document" width="90" x="45" y="120">
        <parameter key="file" value="C:\mystring.txt"/>
        <parameter key="extract_text_only" value="true"/>
        <parameter key="use_file_extension_as_type" value="true"/>
        <parameter key="content_type" value="txt"/>
        <parameter key="encoding" value="SYSTEM"/>
      </operator>
      <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize" width="90" x="196" y="138">
        <parameter key="mode" value="non letters"/>
        <parameter key="characters" value=".:"/>
        <parameter key="language" value="English"/>
        <parameter key="max_token_length" value="3"/>
      </operator>
      <operator activated="true" class="text:transform_cases" compatibility="5.2.004" expanded="true" height="60" name="Transform Cases" width="90" x="311" y="69">
        <parameter key="transform_to" value="lower case"/>
      </operator>
      <operator activated="true" class="text:filter_stopwords_german" compatibility="5.2.004" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="313" y="210">
        <parameter key="stop_word_list" value="Standard"/>
      </operator>
      <operator activated="true" class="text:stem_german" compatibility="5.2.004" expanded="true" height="60" name="Stem (German)" width="90" x="447" y="30"/>
      <operator activated="true" class="text:filter_by_length" compatibility="5.2.004" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="210">
        <parameter key="min_chars" value="2"/>
        <parameter key="max_chars" value="25"/>
      </operator>
      <connect from_op="Read Document" from_port="output" to_op="Tokenize" to_port="document"/>
      <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
      <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
      <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Stem (German)" to_port="document"/>
      <connect from_op="Stem (German)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
      <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="108"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Answers

  • Options
    SkirzynskiSkirzynski Member Posts: 164 Maven
    Indeed this is not very intuitive. Of course you need the "Write Document" operator, but unfortunately this operator does not write the processed tokens. To do this you have to add the "Combine Documents" operator, which not only concatenates multiple documents, but also creates a new document from the tokens instead of the original text. Thus, adding this operator before the write operator yields your desired output.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- RapidMiner 5.3 process: reads a text file, tokenizes and normalizes it
         (lower-casing, German stopword removal, German stemming, length filter),
         then writes the processed tokens back out as a new text file. -->
    <process version="5.3.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.000" expanded="true" name="Process">
        <process expanded="true" height="431" width="815">
          <!-- Load the raw document from disk using the system default encoding. -->
          <operator activated="true" class="text:read_document" compatibility="5.2.005" expanded="true" height="60" name="Read Document" width="90" x="45" y="120">
            <parameter key="file" value="/home/marcin/mystring.txt"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <!-- Split the document into tokens (no parameters set, so operator defaults apply). -->
          <operator activated="true" class="text:tokenize" compatibility="5.2.005" expanded="true" height="60" name="Tokenize" width="90" x="196" y="138"/>
          <!-- Normalize token case. -->
          <operator activated="true" class="text:transform_cases" compatibility="5.2.005" expanded="true" height="60" name="Transform Cases" width="90" x="311" y="69"/>
          <!-- Remove German stopwords. -->
          <operator activated="true" class="text:filter_stopwords_german" compatibility="5.2.005" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="313" y="210"/>
          <!-- Apply the German stemmer to the remaining tokens. -->
          <operator activated="true" class="text:stem_german" compatibility="5.2.005" expanded="true" height="60" name="Stem (German)" width="90" x="447" y="30"/>
          <!-- Keep only tokens with at least 2 characters (min_chars). -->
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.005" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="210">
            <parameter key="min_chars" value="2"/>
          </operator>
          <!-- Combine Documents rebuilds a document from the processed tokens;
               without it, Write Document would emit the original (unprocessed) text. -->
          <operator activated="true" class="text:combine_documents" compatibility="5.2.005" expanded="true" height="76" name="Combine Documents" width="90" x="581" y="210"/>
          <!-- Write the combined (token-based) document to disk. -->
          <operator activated="true" class="text:write_document" compatibility="5.2.005" expanded="true" height="76" name="Write Document" width="90" x="581" y="30">
            <parameter key="file" value="/home/marcin/mystring-doc.txt"/>
          </operator>
          <!-- Pipeline wiring: Read -> Tokenize -> Transform Cases -> Filter Stopwords
               -> Stem -> Filter by Length -> Combine -> Write -> result port 1. -->
          <connect from_op="Read Document" from_port="output" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Stem (German)" to_port="document"/>
          <connect from_op="Stem (German)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Combine Documents" to_port="documents 1"/>
          <connect from_op="Combine Documents" from_port="document" to_op="Write Document" to_port="document"/>
          <connect from_op="Write Document" from_port="document" to_port="result 1"/>
          <!-- GUI layout hints only; no effect on execution. -->
          <portSpacing port="source_input 1" spacing="108"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • Options
    maxfaxmaxfax Member Posts: 17 Contributor II
    Thank you very much :)  :)


    Marcin wrote:

    Indeed this is not very intuitive. Of course you need the "Write Document" operator, but unfortunately this operator does not write the processed tokens. To do this you have to add the "Combine Documents" operator, which not only concatenates multiple documents, but also creates a new document from the tokens instead of the original text. Thus, adding this operator before the write operator yields your desired output.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- RapidMiner 5.3 process: reads a text file, tokenizes and normalizes it
         (lower-casing, German stopword removal, German stemming, length filter),
         then writes the processed tokens back out as a new text file. -->
    <process version="5.3.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.000" expanded="true" name="Process">
        <process expanded="true" height="431" width="815">
          <!-- Load the raw document from disk using the system default encoding. -->
          <operator activated="true" class="text:read_document" compatibility="5.2.005" expanded="true" height="60" name="Read Document" width="90" x="45" y="120">
            <parameter key="file" value="/home/marcin/mystring.txt"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <!-- Split the document into tokens (no parameters set, so operator defaults apply). -->
          <operator activated="true" class="text:tokenize" compatibility="5.2.005" expanded="true" height="60" name="Tokenize" width="90" x="196" y="138"/>
          <!-- Normalize token case. -->
          <operator activated="true" class="text:transform_cases" compatibility="5.2.005" expanded="true" height="60" name="Transform Cases" width="90" x="311" y="69"/>
          <!-- Remove German stopwords. -->
          <operator activated="true" class="text:filter_stopwords_german" compatibility="5.2.005" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="313" y="210"/>
          <!-- Apply the German stemmer to the remaining tokens. -->
          <operator activated="true" class="text:stem_german" compatibility="5.2.005" expanded="true" height="60" name="Stem (German)" width="90" x="447" y="30"/>
          <!-- Keep only tokens with at least 2 characters (min_chars). -->
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.005" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="210">
            <parameter key="min_chars" value="2"/>
          </operator>
          <!-- Combine Documents rebuilds a document from the processed tokens;
               without it, Write Document would emit the original (unprocessed) text. -->
          <operator activated="true" class="text:combine_documents" compatibility="5.2.005" expanded="true" height="76" name="Combine Documents" width="90" x="581" y="210"/>
          <!-- Write the combined (token-based) document to disk. -->
          <operator activated="true" class="text:write_document" compatibility="5.2.005" expanded="true" height="76" name="Write Document" width="90" x="581" y="30">
            <parameter key="file" value="/home/marcin/mystring-doc.txt"/>
          </operator>
          <!-- Pipeline wiring: Read -> Tokenize -> Transform Cases -> Filter Stopwords
               -> Stem -> Filter by Length -> Combine -> Write -> result port 1. -->
          <connect from_op="Read Document" from_port="output" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Stem (German)" to_port="document"/>
          <connect from_op="Stem (German)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Combine Documents" to_port="documents 1"/>
          <connect from_op="Combine Documents" from_port="document" to_op="Write Document" to_port="document"/>
          <connect from_op="Write Document" from_port="document" to_port="result 1"/>
          <!-- GUI layout hints only; no effect on execution. -->
          <portSpacing port="source_input 1" spacing="108"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
Sign In or Register to comment.