Classification with LibSVM

ArmMiner · October 2012

Hi

I am facing a problem with the LibSVM operator. My general problem is the classification of customers' reviews from an online shop (in German). Here is what I have done so far:
1. Collected the reviews (200) in an Excel sheet.
2. I built a text processing model only for editing my training data (tokenization, stemming, filtering, etc.) and saved the result in another Excel sheet. So it is fewer than 200 reviews, and I gave them labels. To begin with, I want to do a 2-class classification (fast delivery and xxx).
3. I built another model for the classification with LibSVM. LibSVM requires numeric values, so before this operator I use the NominalToNumerical operator.
4. When I apply the test data, the result is bad. A lot of examples are misclassified.
I also put a breakpoint before LibSVM in order to see what exactly the NominalToNumerical operator does. It just assigns unique numbers to the training examples, so it can't work.
Now I'm wondering how to solve this problem, or whether I have to use another operator. By the way, all the data is in German.
Thanks in advance.

Best regards
Armen

Classification Model

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Classification process: trains a LibSVM model on labelled reviews read from an
     Excel file and applies that model to rows read from a database table. -->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <process expanded="true" height="386" width="815">
      <!-- Data to be classified: every column of table test_schnell. Its output is wired
           directly into Apply Model (see the connect elements below), so it passes through
           none of the operators that the training data goes through. -->
      <operator activated="true" class="read_database" compatibility="5.2.008" expanded="true" height="60" name="Read Database (2)" width="90" x="45" y="165">
        <parameter key="connection" value="sqlserver"/>
        <parameter key="query" value="SELECT *&#10;FROM `test_schnell`"/>
        <enumeration key="parameters"/>
      </operator>
      <!-- Training data: cells A1:B123 of Klein.xls. Column 0 is declared as the text
           attribute "Bewertung" and column 1 as the text label "Label". -->
      <operator activated="true" class="read_excel" compatibility="5.2.008" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <parameter key="excel_file" value="C:\Users\MP-TEST\Desktop\Rapid_Test\Klein.xls"/>
        <parameter key="imported_cell_range" value="A1:B123"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations"/>
        <parameter key="locale" value="German (Germany)"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Bewertung.true.text.attribute"/>
          <parameter key="1" value="Label.true.text.label"/>
        </list>
      </operator>
      <!-- No target role is given for "Bewertung", so the operator default applies
           (presumably "regular"; confirm). "Label" is additionally given the label role. -->
      <operator activated="true" class="set_role" compatibility="5.2.008" expanded="true" height="76" name="Set Role" width="90" x="313" y="30">
        <parameter key="name" value="Bewertung"/>
        <list key="set_additional_roles">
          <parameter key="Label" value="label"/>
        </list>
      </operator>
      <!-- NOTE(review): this is the step the surrounding discussion identifies as the
           problem. It is reported to map each distinct review string to its own number,
           which gives the SVM no word-level features to learn from. -->
      <operator activated="true" class="nominal_to_numerical" compatibility="5.2.008" expanded="true" height="94" name="Nominal to Numerical" width="90" x="447" y="75">
        <list key="comparison_groups"/>
      </operator>
      <!-- LibSVM learner; only an empty class_weights list is set, all other parameters
           are left at their defaults. -->
      <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.2.008" expanded="true" height="76" name="SVM" width="90" x="648" y="30">
        <list key="class_weights"/>
      </operator>
      <operator activated="true" class="apply_model" compatibility="5.2.008" expanded="true" height="76" name="Apply Model" width="90" x="648" y="165">
        <list key="application_parameters"/>
      </operator>
      <!-- Wiring: the database rows feed Apply Model as unlabelled data; the Excel rows
           run through Set Role and Nominal to Numerical into the SVM, whose model feeds
           Apply Model. The labelled data and the model are the two process results. -->
      <connect from_op="Read Database (2)" from_port="output" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Read Excel" from_port="output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="SVM" to_port="training set"/>
      <connect from_op="SVM" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Text Processing Model

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Text processing process: reads labelled reviews from an Excel file, runs them through
     Process Documents from Data, converts the resulting word list to an example set and
     writes that to Klein.xls. -->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <process expanded="true" height="375" width="756">
      <!-- Source data: cells A1:B201. Row 0 carries the annotation "Name" (presumably
           marking it as the header row; confirm). Column 0 is the text attribute
           "Bewertung", column 1 the text label "Label". -->
      <operator activated="true" class="read_excel" compatibility="5.2.008" expanded="true" height="60" name="Read Excel" width="90" x="45" y="75">
        <parameter key="excel_file" value="C:\Users\MP-TEST\Desktop\Rapid_Test\Training Data - Schnell.xls"/>
        <parameter key="imported_cell_range" value="A1:B201"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Bewertung.true.text.attribute"/>
          <parameter key="1" value="Label.true.text.label"/>
        </list>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="5.2.008" expanded="true" height="76" name="Nominal to Text" width="90" x="179" y="120"/>
      <!-- NOTE(review): the key below is spelled "prunde_below_percent". This looks like a
           typo but appears to be the spelling the Text Processing extension expects;
           confirm before renaming. No prune method is set here, so verify that the two
           percentages actually take effect. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.004" expanded="true" height="76" name="Process Documents from Data" width="90" x="313" y="30">
        <parameter key="prunde_below_percent" value="5.0"/>
        <parameter key="prune_above_percent" value="100.0"/>
        <list key="specify_weights"/>
        <!-- Token pipeline, in connection order: Tokenize, Filter Tokens (by Length),
             Stem (Dictionary), Filter Stopwords (German), Filter Tokens (by Content). -->
        <process expanded="true" height="386" width="774">
          <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30">
            <parameter key="mode" value="specify characters"/>
            <parameter key="characters" value=".:,:;:!:?:|:+-="/>
          </operator>
          <!-- Only the upper bound is set; the minimum length is left at its default. -->
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.004" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="45" y="120">
            <parameter key="max_chars" value="9999"/>
          </operator>
          <operator activated="true" class="text:stem_dictionary" compatibility="5.2.004" expanded="true" height="76" name="Stem (Dictionary)" width="90" x="45" y="210">
            <parameter key="file" value="C:\Users\MP-TEST\Desktop\Rapid_Test\Wörterbuch.TXT"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_german" compatibility="5.2.004" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="246" y="120"/>
          <!-- Deactivated, and not referenced by any connect element in this sub-process. -->
          <operator activated="false" class="text:stem_german" compatibility="5.2.004" expanded="true" height="60" name="Stem (German)" width="90" x="313" y="30"/>
          <!-- NOTE(review): the regular expression ends with "|", i.e. an empty alternative.
               With a contains-style match that alternative may match every token, which
               would make this filter keep everything; verify. The "string" value also has
               a trailing space. -->
          <operator activated="true" class="text:filter_tokens_by_content" compatibility="5.2.004" expanded="true" height="60" name="Filter Tokens (by Content)" width="90" x="380" y="210">
            <parameter key="condition" value="contains match"/>
            <parameter key="string" value="schnell "/>
            <parameter key="regular_expression" value=".*schnell.*|.*liefer.*|.*gern.*|.*wieder.*|.*versand.*|.*ware.*|.*ordnung.*|"/>
          </operator>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Dictionary)" to_port="document"/>
          <connect from_op="Stem (Dictionary)" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
          <connect from_op="Filter Tokens (by Content)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:wordlist_to_data" compatibility="5.2.004" expanded="true" height="76" name="WordList to Data" width="90" x="313" y="210"/>
      <operator activated="true" class="write_excel" compatibility="5.2.008" expanded="true" height="76" name="Write Excel" width="90" x="514" y="165">
        <parameter key="excel_file" value="C:\Users\MP-TEST\Desktop\Rapid_Test\Klein.xls"/>
      </operator>
      <!-- Wiring: the processed example set from Process Documents goes only to result 2.
           What reaches Write Excel is the word list, converted by WordList to Data. -->
      <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 2"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

A part of my training Data after text processing xxxxxxxxxxxxschnelle lieferungschnelle lieferungschnelle lieferungxxxxxxxxxxxxxxxschnelle lieferungschnelle lieferungxxxxxxxxxxxxschnelle lieferungxxxxxxxxxxxxschnelle lieferungschnelle lieferungschnelle lieferungschnelle lieferungschnelle lieferungschnelle lieferungxxxschnelle lieferungschnelle lieferungschnelle lieferungxxxschnelle lieferungschnelle lieferungschnelle lieferungschnelle lieferungschnelle lieferungxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxschnelle lieferungschnelle lieferungschnelle lieferungxxxschnelle lieferungschnelle lieferung

alles in ordnung
alles perfekt und schnell nur zu empfehlen
alles schnell und problemlos verlaufen
anfragen werden sehr schnell beantwortet
auch bei retouren sehr schnell
die lieferung ging schnell und die verpackung ist im sehr guten zustand
die ware wurde schnell und ordentlich verpackt geliefert
einwandfreie ware
gerne bei gelegenheit wieder mal
gerne wieder
gerne wieder
gut verpackt und gute ware
guter versand
heute geliefert
ich würde jederzeit wieder bei "mein paket" bestellen
immer wieder gerne
jederzeit wieder
kontaktanfragen waren nicht nötig
lieferung und service perfekt
lieferzeit war 1 woche
mit dem service zufrieden hätte auf ne schnellere e
preis voll und ganz in ordnung
preiswerte ware (terra 1
rasche lieferung einwandfreier ware
schnell versandt
schnelle lieferung
schnelle lieferung und guter rucksack
schnelle versendung der ware
schneller versand
sehr gerne wieder
sehr schnell geliefert
sehr schnelle lieferung
sehr schnelle versand
sichere lieferung mit dhl
sogar schneller als versandbenachrichtigung
super schneller versand
super schneller versand (schneller geht es kaum)
und schneller versand
vom schnellen versand bis hin zum super leckerem getränk
ware einwandfrei
ware i
ware ist wie beschrieben
ware kam innerhalb weniger tage
ware ok
ware orginalverpackt
ware und versand 1
ware wie beschrieben
werde wieder hier kaufen
wir waren mit dem anbieter sehr zufrieden
alles in ordnung
alles super immer wieder gerne
alles waren in ordnung
bestellung wurde sofort bearbeitet und wir hatten die lieferung innerhalb von 3 tagen bei uns
blitzlieferung
der artikel wurde zufriedenstellend und schnell versendet
der preis für die ware ist ok
die lieferung auch
die ware ist schnell geliefert wurden

The part of test data after text processing

bin sehr zufrieden
danke
das zelt ist einfach spitze
dass das produkt eklig ist
gern wieder
gerne wieder
gerne wieder
jede meiner vielen anfragen vor der kauf wurden durch kuhnshop schnell und zu meiner zufriedenheit beantwortet
jederzeit wieder
kann ich weiterempfehlen
lieferzeit top
schnelle lieferung
sehr schnelle lieferung
sowohl bei der lieferung als auch bei der rücknahme
super
top preis
verpackung
versand war gut und angemessen schnell
versand war sehr schnell
vielen dank
vorbildlicher service
ware ok
alles bestens
alles hat hervorragend geklappt
alles ohne probleme
alles super
alles super bin zufrieden
alles war bestens
alles wunderbar geklappt
bin sehr zufrieden
der anbieter kann ja nix dafür
für den preis kann mann nichts falsch machen
hat alles super geklappt
keine bemängelung nur positiv zu meiner zufriedenheit
kuhnshop zum zweiten mal bereits spitze
liefert gute ergebnisse

ArmMiner · October 2012

Actually, I don't want anybody to provide the solution. I'm just asking for some hints.
In my opinion the problem is in the type converter.
Please help!

Best regards
Armen

Skirzynski · October 2012

Hey,

The "Nominal To Numerical" operator is the wrong one. As you already noticed, it simply maps each distinct string to its own unique number. Try to learn and validate a model on the same data via a cross-validation first.

Below this posting you can find a process which does a simple cross-validation. In this example I read a CSV file, but you can use your Excel operator as well. But please note: the output of the reading operator has to have 2 attributes: one where the unprocessed text is stored (a regular attribute with the value type "text") and a binominal label (a special attribute "label" with the value type "binominal"). Please use the wizard of the reading operator to output this kind of data. The "Process Documents" operator does the tokenization, stemming, and filtering of stop words and creates an ExampleSet which already has the correct format for learning (i.e. it has numbers). This data will be the input for the cross-validation, whose output tells you how well your learner (for instance the LibSVM) performs. If your performance is bad, you should play with the parameters of the learner and/or the pre-processing steps (the operators inside the "Process Documents" operator), or look at your data. Maybe you do not provide enough data, or your labeling is not very good. In the data snippet you posted, I can see that the first examples talk about fast delivery (in a way) but are not classified as such.

If your cross-validation indicates a good performance, you can apply your model to new data. But I would test my process with a cross-validation first.

Good luck
Marcin



<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Cross-validation process: reads labelled text from a CSV file, turns it into an
     example set with Process Documents from Data, and cross-validates a LibSVM learner
     on that example set. -->
<process version="5.3.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.000" expanded="true" name="Process">
    <process expanded="true" height="520" width="620">
      <!-- Input: column 0 is the text attribute "text", column 1 the binominal label
           "label". The file is read as UTF-8 and the first row is not used for names. -->
      <operator activated="true" class="read_csv" compatibility="5.3.000" expanded="true" height="60" name="Training" width="90" x="45" y="30">
        <parameter key="csv_file" value="/home/marcin/temp/training.csv"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations"/>
        <parameter key="encoding" value="UTF-8"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="text.true.text.attribute"/>
          <parameter key="1" value="label.true.binominal.label"/>
        </list>
      </operator>
      <!-- Token pipeline, in connection order: Tokenize, Filter Stopwords (German),
           Stem (German). All three operators run with default parameters. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.005" expanded="true" height="76" name="Process Documents from Data" width="90" x="179" y="30">
        <list key="specify_weights"/>
        <process expanded="true" height="538" width="620">
          <operator activated="true" class="text:tokenize" compatibility="5.2.005" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
          <operator activated="true" class="text:filter_stopwords_german" compatibility="5.2.005" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="179" y="30"/>
          <operator activated="true" class="text:stem_german" compatibility="5.2.005" expanded="true" height="60" name="Stem (German)" width="90" x="313" y="30"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Stem (German)" to_port="document"/>
          <connect from_op="Stem (German)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- No validation parameters are set, so the operator defaults apply. -->
      <operator activated="true" class="x_validation" compatibility="5.3.000" expanded="true" height="112" name="Validation" width="90" x="380" y="30">
        <!-- First sub-process (training side): trains the LibSVM learner on the "training"
             port and hands the model on. -->
        <process expanded="true" height="538" width="351">
          <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.3.000" expanded="true" height="76" name="SVM" width="90" x="112" y="30">
            <list key="class_weights"/>
          </operator>
          <connect from_port="training" to_op="SVM" to_port="training set"/>
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Second sub-process (testing side): applies the model to the "test set" port
             and feeds the labelled data into Performance. -->
        <process expanded="true" height="538" width="351">
          <operator activated="true" class="apply_model" compatibility="5.3.000" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.3.000" expanded="true" height="76" name="Performance" width="90" x="179" y="30"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- Wiring: CSV rows go through Process Documents into Validation; the process
           results are the model (result 1) and the performance output (result 2). -->
      <connect from_op="Training" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

ArmMiner · October 2012

Hi

Thank you for the help!
I will do so and try my best.
Thanks again.

Best regards
Armen

ArmMiner · October 2012

Hey

It works normally I think - 92 % accuracy.
Thanks a lot.

Best regards
Armen

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

Classification with LibSVM

Answers