Options

Write as text operator writing original text

ayaRizkayaRizk Member Posts: 6 Contributor II
I have a couple of loop files operators to conduct a series of preprocessing steps on a corpus of annual reports documents (including tokenization, stemming, etc.). The last step is to write the preprocessed documents in text files using the "Write as text" operator. However, it writes the original text rather than the tokenized version (seen on the top half of my results view - see attached screenshot).

Thanks for the help!
/Aya
<?xml version="1.0" encoding="UTF-8"?><process version="10.0.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="10.0.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="concurrency:loop_files" compatibility="10.0.000" expanded="true" height="82" name="Loop Files" width="90" x="45" y="34">
        <parameter key="directory" value="/Users/ayari88/Documents/Research/AFA/ROBOT/Kommuners AR"/>
        <parameter key="filter_type" value="glob"/>
        <parameter key="filter_by_regex" value=".*\.docx$"/>
        <parameter key="recursive" value="true"/>
        <parameter key="skip_inaccessible" value="true"/>
        <parameter key="enable_macros" value="false"/>
        <parameter key="macro_for_file_name" value="file_name"/>
        <parameter key="macro_for_file_type" value="file_type"/>
        <parameter key="macro_for_folder_name" value="folder_name"/>
        <parameter key="reuse_results" value="false"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true" class="handle_exception" compatibility="10.0.000" expanded="true" height="82" name="Handle Exception" width="90" x="179" y="34">
            <parameter key="add_details_to_log" value="true"/>
            <process expanded="true">
              <operator activated="true" class="text:read_document" compatibility="10.0.000" expanded="true" height="68" name="Read Document" width="90" x="112" y="34">
                <parameter key="extract_text_only" value="true"/>
                <parameter key="use_file_extension_as_type" value="true"/>
                <parameter key="content_type" value="pdf"/>
                <parameter key="encoding" value="SYSTEM"/>
              </operator>
              <connect from_port="in 1" to_op="Read Document" to_port="file"/>
              <connect from_op="Read Document" from_port="output" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
            <process expanded="true">
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="file object" to_op="Handle Exception" to_port="in 1"/>
          <connect from_op="Handle Exception" from_port="out 1" to_port="output 1"/>
          <portSpacing port="source_file object" spacing="0"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="loop_collection" compatibility="10.0.000" expanded="true" height="82" name="Loop Collection" width="90" x="179" y="34">
        <parameter key="set_iteration_macro" value="false"/>
        <parameter key="macro_name" value="iteration"/>
        <parameter key="macro_start_value" value="1"/>
        <parameter key="unfold" value="false"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="10.0.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="10.0.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <operator activated="true" class="text:filter_by_length" compatibility="10.0.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="313" y="34">
            <parameter key="min_chars" value="3"/>
            <parameter key="max_chars" value="30"/>
          </operator>
          <operator activated="true" class="open_file" compatibility="10.0.000" expanded="true" height="68" name="Open File" width="90" x="313" y="289">
            <parameter key="resource_type" value="file"/>
            <parameter key="filename" value="/Users/ayari88/Documents/Research/AFA/ROBOT/RapidMiner/Custom_stopwords_ar.csv"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="10.0.000" expanded="true" height="82" name="Filter Stopwords (Dictionary)" width="90" x="447" y="187">
            <parameter key="case_sensitive" value="false"/>
            <parameter key="encoding" value="UTF-8"/>
          </operator>
          <operator activated="true" class="text:stem_snowball" compatibility="10.0.000" expanded="true" height="68" name="Stem (Snowball)" width="90" x="581" y="34">
            <parameter key="language" value="Swedish"/>
          </operator>
          <connect from_port="single" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
          <connect from_op="Open File" from_port="file" to_op="Filter Stopwords (Dictionary)" to_port="file"/>
          <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="loop_collection" compatibility="10.0.000" expanded="true" height="82" name="Write files" width="90" x="313" y="34">
        <parameter key="set_iteration_macro" value="false"/>
        <parameter key="macro_name" value="iteration"/>
        <parameter key="macro_start_value" value="1"/>
        <parameter key="unfold" value="false"/>
        <process expanded="true">
          <operator activated="false" class="text:write_document" compatibility="10.0.000" expanded="true" height="82" name="Write Document" width="90" x="112" y="238">
            <parameter key="file" value="/Users/ayari88/Documents/Research/AFA/ROBOT/Kommuners AR preprocessed/%{a}.txt"/>
            <parameter key="overwrite" value="true"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <operator activated="true" class="write_as_text" compatibility="10.0.000" expanded="true" height="82" name="Write as Text" width="90" x="380" y="34">
            <parameter key="result_file" value="/Users/ayari88/Documents/Research/AFA/ROBOT/Kommuners AR preprocessed/%{a}.txt"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <connect from_port="single" to_op="Write as Text" to_port="input 1"/>
          <connect from_op="Write as Text" from_port="input 1" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Loop Files" from_port="output 1" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Loop Collection" from_port="output 1" to_op="Write files" to_port="collection"/>
      <connect from_op="Write files" from_port="output 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>


Best Answer

  • Options
    jwpfaujwpfau Employee, Member Posts: 280 RM Engineering
    Solution Accepted
    Hi,

    you can try to use a combination of "Documents to Data" with use processed data checked → Data to Documents  Loop Collection with Write Document to store the tokenized version.

    Greetings,
    Jonas

Answers

Sign In or Register to comment.