To classify customer queries using RapidMiner

Akshay_GuptaAkshay_Gupta Member Posts: 3 Contributor I
edited November 2018 in Help

I have a huge excel file, which has general attributes like BILLID, CLAIM ID, ITERATION ID, QUERY TEXT etc. I want to classify these queries to identify major bulk of problem areas. I tried employing Decision Tree, but it did not render any insights. Most of what I researched I stumbled upon using filter keywords, Tokenizing, Stemming to identify keyword frequency, But I couldn't use it for excel attribute, irrespective of that I am not sure that would help me in finding clusters/buckets of complete query statement rather than just keywords. Any help in finding right direction would be of much help, Thank You.

Answers

  • Thomas_OttThomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn

    You need to set a label to classify your dataset and if you have some sort of text to process, you'd need the Text Processing extension too. 

     

    Attached is a sample process that I used to classify some HR data that also used open ended survey questions. 

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.5.000">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
    <operator activated="true" breakpoints="after" class="retrieve" compatibility="7.5.000" expanded="true" height="68" name="Retrieve EmployeeData" width="90" x="45" y="30">
    <parameter key="repository_entry" value="../data/EmployeeData"/>
    </operator>
    <operator activated="true" class="replace_missing_values" compatibility="7.5.000" expanded="true" height="103" name="Replace Missing Values" width="90" x="179" y="30">
    <parameter key="value_type" value="nominal"/>
    <list key="columns"/>
    <parameter key="replenishment_value" value="MISSING"/>
    </operator>
    <operator activated="true" class="nominal_to_text" compatibility="6.0.003" expanded="true" height="82" name="Nominal to Text" width="90" x="313" y="30">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="CanDoBetter"/>
    </operator>
    <operator activated="true" class="set_role" compatibility="7.5.000" expanded="true" height="82" name="Set Role" width="90" x="447" y="30">
    <parameter key="attribute_name" value="ID"/>
    <parameter key="target_role" value="id"/>
    <list key="set_additional_roles">
    <parameter key="Status" value="label"/>
    </list>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="34">
    <parameter key="vector_creation" value="Binary Term Occurrences"/>
    <parameter key="prune_method" value="percentual"/>
    <parameter key="prune_above_percent" value="50.0"/>
    <parameter key="prune_below_absolute" value="5"/>
    <parameter key="prune_above_absolute" value="2000"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
    <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
    <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.4.001" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="313" y="34">
    <parameter key="max_length" value="3"/>
    </operator>
    <operator activated="true" class="text:stem_porter" compatibility="7.4.001" expanded="true" height="68" name="Stem (Porter)" width="90" x="447" y="34"/>
    <operator activated="true" class="text:filter_stopwords_english" compatibility="7.4.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="581" y="34"/>
    <operator activated="true" class="text:filter_by_length" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="715" y="34"/>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
    <connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
    <connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
    <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
    <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="120">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="ID|Status|Benefit|DriveTime|Education|Est_Income|Facility|Marital|PositionType|SpousalBenefit|StockPlan|Training|TrainingRecieved|length_of_hire|bottled_wat|benefit|event|pay_mor|listen|pension|salari|social"/>
    </operator>
    <operator activated="true" class="real_to_integer" compatibility="6.0.003" expanded="true" height="82" name="Real to Integer" width="90" x="581" y="210">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="length_of_hire|Est_Income"/>
    </operator>
    <operator activated="true" class="numerical_to_binominal" compatibility="6.0.003" expanded="true" height="82" name="Numerical to Binominal" width="90" x="581" y="300">
    <parameter key="attribute_filter_type" value="value_type"/>
    <parameter key="value_type" value="real"/>
    </operator>
    <operator activated="true" class="multiply" compatibility="7.5.000" expanded="true" height="103" name="Multiply" width="90" x="581" y="390"/>
    <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.5.000" expanded="true" height="82" name="Decision Tree (2)" width="90" x="782" y="85">
    <parameter key="maximal_depth" value="5"/>
    <parameter key="apply_prepruning" value="false"/>
    </operator>
    <operator activated="true" class="nominal_to_numerical" compatibility="7.1.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="581" y="525">
    <parameter key="attributes" value="Benefit|SpousalBenefit|StockPlan"/>
    <list key="comparison_groups"/>
    </operator>
    <operator activated="true" class="remap_binominals" compatibility="7.3.000" expanded="true" height="82" name="Remap Binominals" width="90" x="715" y="525">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="Status"/>
    <parameter key="include_special_attributes" value="true"/>
    <parameter key="negative_value" value="past"/>
    <parameter key="positive_value" value="current"/>
    </operator>
    <operator activated="true" class="weight_by_svm" compatibility="7.5.000" expanded="true" height="82" name="Weight by SVM" width="90" x="849" y="525">
    <parameter key="normalize_weights" value="true"/>
    </operator>
    <operator activated="true" class="weights_to_data" compatibility="7.5.000" expanded="true" height="68" name="Weights to Data" width="90" x="782" y="210"/>
    <operator activated="true" class="sort" compatibility="7.5.000" expanded="true" height="82" name="Sort" width="90" x="916" y="210">
    <parameter key="attribute_name" value="Weight"/>
    <parameter key="sorting_direction" value="decreasing"/>
    </operator>
    <operator activated="true" class="filter_example_range" compatibility="7.5.000" expanded="true" height="82" name="Filter Example Range" width="90" x="1050" y="210">
    <parameter key="first_example" value="1"/>
    <parameter key="last_example" value="10"/>
    </operator>
    <connect from_op="Retrieve EmployeeData" from_port="output" to_op="Replace Missing Values" to_port="example set input"/>
    <connect from_op="Replace Missing Values" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
    <connect from_op="Nominal to Text" from_port="example set output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Real to Integer" to_port="example set input"/>
    <connect from_op="Real to Integer" from_port="example set output" to_op="Numerical to Binominal" to_port="example set input"/>
    <connect from_op="Numerical to Binominal" from_port="example set output" to_op="Multiply" to_port="input"/>
    <connect from_op="Multiply" from_port="output 1" to_op="Nominal to Numerical" to_port="example set input"/>
    <connect from_op="Multiply" from_port="output 2" to_op="Decision Tree (2)" to_port="training set"/>
    <connect from_op="Decision Tree (2)" from_port="model" to_port="result 3"/>
    <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Remap Binominals" to_port="example set input"/>
    <connect from_op="Remap Binominals" from_port="example set output" to_op="Weight by SVM" to_port="example set"/>
    <connect from_op="Weight by SVM" from_port="weights" to_op="Weights to Data" to_port="attribute weights"/>
    <connect from_op="Weights to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
    <connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
    <connect from_op="Filter Example Range" from_port="example set output" to_port="result 2"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    </process>
    </operator>
    </process>

     

     

  • Akshay_GuptaAkshay_Gupta Member Posts: 3 Contributor I

    I tried to replicate your process, but

    1. It doesn't show any attribute in Remap Binomials operator,

    2. The decision tree generated individual branches for each example, no relevant insights, I think I am not using it properly.

    3. Word frquency looks irregular, For example at times it is showing count for "approv" "provide" or "support" instead of supportings.

     

    Thank you for such a prompt reply, This is my first process @ Rapidminer. My expectation is to classify textual queries(thousands) into some handful of classes, For example "Tax related", "Incorrect/Empty details", "Supportings not Attached". Queries are remarks put up by officer to intiator or QTI(Query to Initiator). I am looking a way to automate these query handling proccesses in order to improve efficiency and productivity. Hence I am starting with the first step of identifying major problem/Query areas - classification. 

     

    I am sorry, If I sound immature, I have taken 3 weeks of internet scouring before posting here. This field is quite new in my environment. Hence I couldn't get much help from around, It would help in great deal, if you can share resources about some example processes from where I can learn by comparison. 

  • Thomas_OttThomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn

    Do you have some sample data to share too? Would make troubleshooting this easier. 

     

    Thanks!

     

  • Akshay_GuptaAkshay_Gupta Member Posts: 3 Contributor I

    Hi Thank you for your time and attention, PFA the data file. I hope it helps the context of my problem. These are queries related to payable, i want to automate query handling processes, hence I am trying to classify these queries into handful of focus areas and think about solutions from there.

  • Thomas_OttThomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn

    So your data set has some data quality issues. I just Filtered them out by selecting only ER- records.  Then I made some tweaks to the process and got a results.

    Customer Class.png

     

     

     

    See below for the XML.

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.5.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.5.001" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
    <operator activated="true" class="retrieve" compatibility="7.5.001" expanded="true" height="68" name="Retrieve QRY_ITERATION_T_Mini" width="90" x="45" y="34">
    <parameter key="repository_entry" value="../data/QRY_ITERATION_T_Mini"/>
    </operator>
    <operator activated="true" class="filter_examples" compatibility="7.5.001" expanded="true" height="103" name="Filter Examples" width="90" x="45" y="136">
    <list key="filters_list">
    <parameter key="filters_entry_key" value="CLAIMID.contains.ER-"/>
    </list>
    </operator>
    <operator activated="true" class="replace_missing_values" compatibility="7.5.001" expanded="true" height="103" name="Replace Missing Values" width="90" x="179" y="136">
    <parameter key="value_type" value="nominal"/>
    <list key="columns"/>
    <parameter key="replenishment_value" value="MISSING"/>
    </operator>
    <operator activated="true" class="nominal_to_text" compatibility="6.0.003" expanded="true" height="82" name="Nominal to Text" width="90" x="313" y="136">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attribute" value="QUERYTEXT"/>
    <parameter key="attributes" value="REPLYTEXT|QUERYTEXT"/>
    </operator>
    <operator activated="true" class="set_role" compatibility="7.5.001" expanded="true" height="82" name="Set Role" width="90" x="447" y="34">
    <parameter key="attribute_name" value="BILLID"/>
    <parameter key="target_role" value="id"/>
    <list key="set_additional_roles">
    <parameter key="QUERYCATEGORY" value="label"/>
    </list>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="34">
    <parameter key="vector_creation" value="Binary Term Occurrences"/>
    <parameter key="prune_method" value="percentual"/>
    <parameter key="prune_above_percent" value="50.0"/>
    <parameter key="prune_below_absolute" value="5"/>
    <parameter key="prune_above_absolute" value="2000"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
    <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
    <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.4.001" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="313" y="34">
    <parameter key="max_length" value="3"/>
    </operator>
    <operator activated="true" class="text:stem_porter" compatibility="7.4.001" expanded="true" height="68" name="Stem (Porter)" width="90" x="447" y="34"/>
    <operator activated="true" class="text:filter_stopwords_english" compatibility="7.4.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="581" y="34"/>
    <operator activated="true" class="text:filter_by_length" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="715" y="34"/>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
    <connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
    <connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
    <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
    <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="select_attributes" compatibility="7.5.001" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="238">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="att7|REPLYBY|QUERYDATE|QUERYBY|ITERATIONID|CLAIMID|BILLID"/>
    <parameter key="invert_selection" value="true"/>
    </operator>
    <operator activated="true" class="nominal_to_date" compatibility="7.5.001" expanded="true" height="82" name="Nominal to Date" width="90" x="447" y="340">
    <parameter key="attribute_name" value="CREATEDDATE"/>
    <parameter key="date_format" value="yyyy-MM-dd"/>
    </operator>
    <operator activated="true" class="real_to_integer" compatibility="6.0.003" expanded="true" height="82" name="Real to Integer" width="90" x="246" y="340">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="length_of_hire|Est_Income"/>
    </operator>
    <operator activated="true" class="numerical_to_binominal" compatibility="6.0.003" expanded="true" height="82" name="Numerical to Binominal" width="90" x="179" y="493">
    <parameter key="attribute_filter_type" value="value_type"/>
    <parameter key="value_type" value="real"/>
    </operator>
    <operator activated="true" class="multiply" compatibility="7.5.001" expanded="true" height="103" name="Multiply" width="90" x="380" y="544"/>
    <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.5.001" expanded="true" height="82" name="Decision Tree (2)" width="90" x="782" y="85">
    <parameter key="maximal_depth" value="5"/>
    <parameter key="apply_prepruning" value="false"/>
    </operator>
    <operator activated="true" class="nominal_to_numerical" compatibility="7.1.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="581" y="525">
    <parameter key="attributes" value="Benefit|SpousalBenefit|StockPlan"/>
    <list key="comparison_groups"/>
    </operator>
    <operator activated="true" class="remap_binominals" compatibility="7.3.000" expanded="true" height="82" name="Remap Binominals" width="90" x="715" y="525">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="Status"/>
    <parameter key="include_special_attributes" value="true"/>
    <parameter key="negative_value" value="past"/>
    <parameter key="positive_value" value="current"/>
    </operator>
    <operator activated="true" class="weight_by_svm" compatibility="7.5.001" expanded="true" height="82" name="Weight by SVM" width="90" x="849" y="525">
    <parameter key="normalize_weights" value="true"/>
    </operator>
    <operator activated="true" class="weights_to_data" compatibility="7.5.001" expanded="true" height="68" name="Weights to Data" width="90" x="782" y="210"/>
    <operator activated="true" class="sort" compatibility="7.5.001" expanded="true" height="82" name="Sort" width="90" x="916" y="210">
    <parameter key="attribute_name" value="Weight"/>
    <parameter key="sorting_direction" value="decreasing"/>
    </operator>
    <operator activated="true" class="filter_example_range" compatibility="7.5.001" expanded="true" height="82" name="Filter Example Range" width="90" x="1050" y="210">
    <parameter key="first_example" value="1"/>
    <parameter key="last_example" value="10"/>
    </operator>
    <connect from_op="Retrieve QRY_ITERATION_T_Mini" from_port="output" to_op="Filter Examples" to_port="example set input"/>
    <connect from_op="Filter Examples" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
    <connect from_op="Replace Missing Values" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
    <connect from_op="Nominal to Text" from_port="example set output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Date" to_port="example set input"/>
    <connect from_op="Nominal to Date" from_port="example set output" to_op="Real to Integer" to_port="example set input"/>
    <connect from_op="Real to Integer" from_port="original" to_op="Numerical to Binominal" to_port="example set input"/>
    <connect from_op="Numerical to Binominal" from_port="example set output" to_op="Multiply" to_port="input"/>
    <connect from_op="Multiply" from_port="output 1" to_op="Nominal to Numerical" to_port="example set input"/>
    <connect from_op="Multiply" from_port="output 2" to_op="Decision Tree (2)" to_port="training set"/>
    <connect from_op="Decision Tree (2)" from_port="model" to_port="result 3"/>
    <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Remap Binominals" to_port="example set input"/>
    <connect from_op="Remap Binominals" from_port="example set output" to_op="Weight by SVM" to_port="example set"/>
    <connect from_op="Weight by SVM" from_port="weights" to_op="Weights to Data" to_port="attribute weights"/>
    <connect from_op="Weights to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
    <connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
    <connect from_op="Filter Example Range" from_port="example set output" to_port="result 2"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    </process>
    </operator>
    </process>
Sign In or Register to comment.