I want to filter tokens by content with two-word terms such as GLOBAL-WARMING|HARM-REDUCTION

Linhnm_178 Member Posts: 4 Learner I
I genuinely appreciate your suggestions. I am writing a thesis for which I built my own set of words (using Filter Tokens (by Content) with a pasted regular expression), and I want to find how many times those words appear in a PDF file.
However, I do not think RapidMiner can read my two-word terms such as global-warming and harm-reduction. These are separate words that only make sense when they stand together.
Can someone please give me advice?

<?xml version="1.0" encoding="UTF-8"?><process version="9.10.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.10.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="concurrency:loop_files" compatibility="9.10.001" expanded="true" height="82" name="Loop Files" width="90" x="179" y="34">
        <parameter key="directory" value="C:/Master Thesis/file"/>
        <parameter key="filter_type" value="glob"/>
        <parameter key="filter_by_glob" value="*.pdf"/>
        <parameter key="recursive" value="false"/>
        <parameter key="enable_macros" value="false"/>
        <parameter key="macro_for_file_name" value="file_name"/>
        <parameter key="macro_for_file_type" value="file_type"/>
        <parameter key="macro_for_folder_name" value="folder_name"/>
        <parameter key="reuse_results" value="false"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true" class="text:read_document" compatibility="9.4.000" expanded="true" height="68" name="Read Document" width="90" x="313" y="34">
            <parameter key="extract_text_only" value="true"/>
            <parameter key="use_file_extension_as_type" value="true"/>
            <parameter key="content_type" value="pdf"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <connect from_port="file object" to_op="Read Document" to_port="file"/>
          <connect from_op="Read Document" from_port="output" to_port="output 1"/>
          <portSpacing port="source_file object" spacing="0"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="false" class="text:combine_documents" compatibility="9.4.000" expanded="true" height="68" name="Combine Documents" width="90" x="179" y="136"/>
      <operator activated="true" class="text:process_documents" compatibility="9.4.000" expanded="true" height="103" name="Process Documents" width="90" x="447" y="85">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="9.4.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="9.4.000" expanded="true" height="68" name="Transform Cases" width="90" x="380" y="34">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <operator activated="true" class="text:filter_by_length" compatibility="9.4.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="514" y="34">
            <parameter key="min_chars" value="4"/>
            <parameter key="max_chars" value="25"/>
          </operator>
          <operator activated="true" class="text:filter_tokens_by_content" compatibility="9.4.000" expanded="true" height="68" name="Filter Tokens (by Content)" width="90" x="715" y="34">
            <parameter key="condition" value="matches"/>
            <parameter key="regular_expression" value="ANTIBIOTIC|BARRIER|BIOMASS|BREEDING|CHILD|CHILDREN|CHOLESTEROL|CLAIM|COMPLIANCE|CONCERN|CORRECTIVE|DECREASE|DEFENSE|DEPLETION|DIETARY|DISCHARGE|DISCLOSE|DISCLOSURE|DONATION|EMISSION|FOOTPRINT|FOSSIL|GAP|GLOBAL WARMING|GMO|GRAZING|GREENHOUSE GAS|HARM-REDUCTION|HORMONE|HYDRO|IMPACT|INCREASE|INTENSIVE|LIABILITIES|LIMITATION|LOSS|LOSSES|NEGATIVE|NICOTINE|OBESITY|OPPORTUNITIES|OPPORTUNITY|QUALITY|REDUCE|REDUCTION|REMANUFACTURED|REPROCESSED|REPUTATIONAL|RESPONSIBILITY|RESTRICTION|RISK|SAFETY|SCARCITY|SETTLEMENT|STANDARD|TURNOVER|WASTE|WELFARE  "/>
            <parameter key="case_sensitive" value="false"/>
            <parameter key="invert condition" value="false"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="9.4.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="916" y="34"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="9.4.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="1050" y="34">
            <parameter key="max_length" value="2"/>
          </operator>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
          <connect from_op="Filter Tokens (by Content)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:wordlist_to_data" compatibility="9.4.000" expanded="true" height="82" name="WordList to Data" width="90" x="715" y="238"/>
      <connect from_op="Loop Files" from_port="output 1" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="word list" to_port="result 3"/>
      <connect from_op="WordList to Data" from_port="example set" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
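For what it's worth, the cause is visible in the process itself. Tokenize in "non letters" mode splits on every non-letter character, so harm-reduction becomes the two tokens harm and reduction before Filter Tokens (by Content) ever runs, and a whole-token match against GLOBAL WARMING or HARM-REDUCTION can never succeed. (Also note the trailing spaces after WELFARE in the pasted expression: that last alternative is the literal string "WELFARE  ", which never matches a token either.) A minimal Python sketch of the tokenization behavior, using made-up sample text:

import re

text = "Harm-reduction policies and global warming were discussed."

# Splitting on every non-letter character, as Tokenize's "non letters"
# mode does, breaks the compound terms apart before any filtering.
tokens = [t for t in re.split(r"[^a-zA-Z]+", text.lower()) if t]
print(tokens)
# ['harm', 'reduction', 'policies', 'and', 'global', 'warming',
#  'were', 'discussed']

# A whole-token match against the compound terms then never succeeds:
pattern = re.compile(r"global warming|harm-reduction")
print([t for t in tokens if pattern.fullmatch(t)])  # []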


Answers

  • ceaperez Member Posts: 517 Unicorn
    Hi @Linhnm_178
    I don't know if I fully understand your problem, but you can create a regex expression to capture both single words and compound words.
    Something like this may help you (see the quick check of what it matches at the end of the thread):
    (([a-zA-Z])\w+\-?([a-z])\w+)

    Best, 

    Cesar


  • Linhnm_178 Member Posts: 4 Learner I
    Thank you @ceaperez for your answer. Let me elaborate a bit more: basically, I want to find how many times the phrase "harm reduction" appears in a given PDF file (an annual report), rather than the words "harm" or "reduction" separately.
    Do you have any suggestions?
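As for the pattern suggested above, a quick Python check of what it accepts (the sample tokens are made up): it matches hyphenated compounds, but also any plain word of four or more letters, so it may be too broad for a curated term list.

import re

pattern = re.compile(r"(([a-zA-Z])\w+\-?([a-z])\w+)")
for sample in ["harm-reduction", "global-warming", "warming", "gap"]:
    print(sample, "->", bool(pattern.fullmatch(sample)))
# harm-reduction -> True
# global-warming -> True
# warming -> True   (any plain word of 4+ letters also matches)
# gap -> False      (shorter than the pattern requires)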
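One workaround, sketched here in Python rather than RapidMiner, is to count the phrases directly on the extracted text instead of on single tokens. The file name and term list below are placeholders; each pattern allows a space, hyphen, or line break between the two words:

import re

# Hypothetical plain-text extract of the annual report PDF.
text = open("annual_report.txt", encoding="utf-8").read().lower()

# Each phrase may appear with a space, hyphen, or line break inside it.
terms = {
    "harm reduction": r"harm[\s-]+reduction",
    "global warming": r"global[\s-]+warming",
}
for name, pattern in terms.items():
    print(name, len(re.findall(pattern, text)))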

     