Text Mining

RhmanigRhmanig Member Posts: 9 Contributor I
edited November 2019 in Help
Hi everyone

I have a huge number of text documents that I am currently working with. I have the data cleaned and preprocessed, e.g. tokenized, with stop words filtered, both with and without n-grams. So I have the data ready, and I would like to build a model that helps me extract job-related words from it, e.g. job, cv, employee and more.

I don't have labelled data, which makes this hard to do. So I gathered some job-related words from the internet. Now I would like to extract/display all the words from my dataset that match the job-related words.

Can you please advise how to do this? If you have any other solutions, please let me know.



  • MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,453 RM Data Scientist

    there is a small trick you can use

    Let's say you have two tables. The first is the bag of words like this:

    word1 word2 word3
    1 1 3
    and the dictionary. Let's say word2 and word3 are in your dictionary. Then you have a table like this

    What we now want is to create a model like this: a*word2+b*word3, where word2 and word3 are the occurrences (from the first dataset).
    Therefore we can create a table like this

    label word2 word3
    1 1 1
    Then we can train a vector linear regression on this, which creates the appropriate model.

    Attached is a process doing this. The resulting data set looks like this:

    Note that you can easily set an "importance" (weight) for each of the words.



    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      RapidMiner process: scores a bag-of-words example against a dictionary.
        1. "Generate Data by User Specification" builds one example with
           word1=1, word2=1, word3=3 (the bag of words to be scored).
        2. "Retrieve textstuff" loads the dictionary from the repository.
        3. "Preprocess Dict" turns the dictionary into a single training example
           in which every word attribute is 1 and the label is 1.
        4. "Vector Linear Regression" (no bias) is trained on that example and
           the model is applied to the bag-of-words example.
      NOTE(review): closing tags were missing from the posted snippet and have
      been restored from the indentation; verify by importing into RapidMiner.
    -->
    <process version="6.3.000">
     <operator activated="true" class="process" compatibility="6.3.000" expanded="true" name="Process">
       <process expanded="true">
         <!-- Example data to score: one row of word occurrence counts. -->
         <operator activated="true" class="generate_data_user_specification" compatibility="6.3.000" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="112" y="255">
           <list key="attribute_values">
             <parameter key="word1" value="1"/>
             <parameter key="word2" value="1"/>
             <parameter key="word3" value="3"/>
           </list>
           <list key="set_additional_roles"/>
         </operator>
         <!-- Dictionary source; adjust repository_entry to your own repository path. -->
         <operator activated="true" class="retrieve" compatibility="6.3.000" expanded="true" height="60" name="Retrieve textstuff" width="90" x="112" y="75">
           <parameter key="repository_entry" value="//Local Repository/Fourm/textstuff"/>
         </operator>
         <operator activated="true" class="subprocess" compatibility="6.3.000" expanded="true" height="76" name="Preprocess Dict" width="90" x="380" y="75">
           <process expanded="true">
             <operator activated="true" class="nominal_to_text" compatibility="6.3.000" expanded="true" height="76" name="Nominal to Text" width="90" x="112" y="30"/>
             <!-- Build a term-occurrence word vector from the dictionary text. -->
             <operator activated="true" class="text:process_document_from_data" compatibility="6.1.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="246" y="30">
               <parameter key="vector_creation" value="Term Occurrences"/>
               <list key="specify_weights"/>
               <process expanded="true">
                 <operator activated="true" class="text:tokenize" compatibility="6.1.000" expanded="true" height="60" name="Tokenize" width="90" x="380" y="30">
                   <parameter key="mode" value="specify characters"/>
                   <parameter key="characters" value="\s"/>
                 </operator>
                 <connect from_port="document" to_op="Tokenize" to_port="document"/>
                 <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
                 <portSpacing port="source_document" spacing="0"/>
                 <portSpacing port="sink_document 1" spacing="0"/>
                 <portSpacing port="sink_document 2" spacing="0"/>
               </process>
             </operator>
             <!-- Keep only the first example; a single training row is enough. -->
             <operator activated="true" class="filter_example_range" compatibility="6.3.000" expanded="true" height="76" name="Filter Example Range" width="90" x="380" y="30">
               <parameter key="first_example" value="1"/>
               <parameter key="last_example" value="1"/>
             </operator>
             <!-- Overwrite every attribute with the constant 1; change the value per word to weight it. -->
             <operator activated="true" class="loop_attributes" compatibility="6.3.000" expanded="true" height="76" name="Loop Attributes" width="90" x="514" y="30">
               <process expanded="true">
                 <operator activated="true" class="generate_attributes" compatibility="6.3.000" expanded="true" height="76" name="Generate Attributes (2)" width="90" x="313" y="30">
                   <list key="function_descriptions">
                     <parameter key="%{loop_attribute}" value="1"/>
                   </list>
                 </operator>
                 <connect from_port="example set" to_op="Generate Attributes (2)" to_port="example set input"/>
                 <connect from_op="Generate Attributes (2)" from_port="example set output" to_port="example set"/>
                 <portSpacing port="source_example set" spacing="0"/>
                 <portSpacing port="sink_example set" spacing="0"/>
                 <portSpacing port="sink_result 1" spacing="0"/>
               </process>
             </operator>
             <!-- Add a constant label column (1) and mark it as the label for training. -->
             <operator activated="true" class="generate_attributes" compatibility="6.3.000" expanded="true" height="76" name="Generate Attributes (3)" width="90" x="648" y="30">
               <list key="function_descriptions">
                 <parameter key="label" value="1"/>
               </list>
             </operator>
             <operator activated="true" class="set_role" compatibility="6.3.000" expanded="true" height="76" name="Set Role" width="90" x="782" y="30">
               <parameter key="attribute_name" value="label"/>
               <parameter key="target_role" value="label"/>
               <list key="set_additional_roles"/>
             </operator>
             <connect from_port="in 1" to_op="Nominal to Text" to_port="example set input"/>
             <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
             <connect from_op="Process Documents from Data" from_port="example set" to_op="Filter Example Range" to_port="example set input"/>
             <connect from_op="Filter Example Range" from_port="example set output" to_op="Loop Attributes" to_port="example set"/>
             <connect from_op="Loop Attributes" from_port="example set" to_op="Generate Attributes (3)" to_port="example set input"/>
             <connect from_op="Generate Attributes (3)" from_port="example set output" to_op="Set Role" to_port="example set input"/>
             <connect from_op="Set Role" from_port="example set output" to_port="out 1"/>
             <portSpacing port="source_in 1" spacing="0"/>
             <portSpacing port="source_in 2" spacing="0"/>
             <portSpacing port="sink_out 1" spacing="0"/>
             <portSpacing port="sink_out 2" spacing="0"/>
           </process>
         </operator>
         <!-- Bias is disabled, so the model is a pure weighted sum of the word attributes. -->
         <operator activated="true" class="vector_linear_regression" compatibility="6.3.000" expanded="true" height="76" name="Vector Linear Regression" width="90" x="514" y="75">
           <parameter key="use_bias" value="false"/>
         </operator>
         <operator activated="true" class="apply_model" compatibility="6.3.000" expanded="true" height="76" name="Apply Model" width="90" x="581" y="165">
           <list key="application_parameters"/>
         </operator>
         <connect from_op="Generate Data by User Specification" from_port="output" to_op="Apply Model" to_port="unlabelled data"/>
         <connect from_op="Retrieve textstuff" from_port="output" to_op="Preprocess Dict" to_port="in 1"/>
         <connect from_op="Preprocess Dict" from_port="out 1" to_op="Vector Linear Regression" to_port="training set"/>
         <connect from_op="Vector Linear Regression" from_port="model" to_op="Apply Model" to_port="model"/>
         <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
         <portSpacing port="source_input 1" spacing="0"/>
         <portSpacing port="sink_result 1" spacing="0"/>
         <portSpacing port="sink_result 2" spacing="0"/>
       </process>
     </operator>
    </process>
    - Sr. Director Data Solutions, Altair RapidMiner -
    Dortmund, Germany
Sign In or Register to comment.