"Example - Classify Text Language"

B_ · January 2011

This process assigns a language to documents and RSS feeds. After tokenizing the text, it creates trigrams, which are matched against the training labels. The model then scores new text and assigns a language label.

Text that has a mixture of languages (e.g., Spanish and English) can end up marked as either language, depending on how many training examples you use. You may need a large number of examples for your preferred language.

To mark text categories or sentiment, remove the n-gram operator and use topics (Finance, Sports, Entertainment) as the training labels instead.


<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Language classification process.

  Training branch: reads labelled rows from the "textfile" table, marks "id" as the
  id attribute and "lang_train" as the label, converts "title" to text, builds a
  document vector from character n-grams and trains a linear-kernel LibSVM model.

  Apply branch: reads "id" and "title" from the same table, runs the same text
  preprocessing using the word list produced by the training branch, applies the
  model and writes the labelled result to the "langupdate" table.
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.0" expanded="true" name="Root">
    <description>Classifies text language using a linear SVM (LibSVM) trained on character n-grams.</description>
    <process expanded="true" height="611" width="949">
      <!-- Training data: only rows that already carry a language label. -->
      <operator activated="true" class="read_database" compatibility="5.0.10" expanded="true" height="60" name="Read DB - Train" width="90" x="45" y="30">
        <list key="data_set_meta_data_information"/>
        <parameter key="attribute_names_already_defined" value="true"/>
        <parameter key="connection" value="rsstext"/>
        <parameter key="query" value="SELECT &quot;id&quot;, &quot;title&quot;, &quot;lang_train&quot;&#10;FROM &quot;textfile&quot;&#10;WHERE &quot;lang_train&quot; IS NOT NULL"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="5.0.10" expanded="true" height="76" name="ID Train" width="90" x="179" y="30">
        <parameter key="name" value="id"/>
        <parameter key="target_role" value="id"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="5.0.10" expanded="true" height="76" name="Label Train" width="90" x="45" y="120">
        <parameter key="name" value="lang_train"/>
        <parameter key="target_role" value="label"/>
      </operator>
      <!-- NOTE(review): "attributes" looks like a leftover; with filter type "single" only "attribute" should apply. Confirm before removing. -->
      <operator activated="true" class="nominal_to_text" compatibility="5.0.10" expanded="true" height="76" name="NomText Train" width="90" x="179" y="120">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="title"/>
        <parameter key="attributes" value="posttitle|postdesc"/>
      </operator>
      <!-- Text preprocessing for training: case transformation, tokenization, character n-grams. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.0.6" expanded="true" height="76" name="ProcessDocs Train" width="90" x="313" y="210">
        <list key="specify_weights"/>
        <process expanded="true" height="565" width="827">
          <operator activated="true" class="text:transform_cases" compatibility="5.0.6" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.0.6" expanded="true" height="60" name="Tokenize" width="90" x="246" y="30"/>
          <operator activated="true" class="text:generate_n_grams_characters" compatibility="5.0.7" expanded="true" height="60" name="Generate n-Grams (Characters)" width="90" x="380" y="30"/>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Generate n-Grams (Characters)" to_port="document"/>
          <connect from_op="Generate n-Grams (Characters)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Learner: LibSVM with a linear kernel. -->
      <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.0.10" expanded="true" height="76" name="SVM" width="90" x="447" y="210">
        <parameter key="kernel_type" value="linear"/>
        <list key="class_weights"/>
      </operator>
      <!-- Data to classify: every row of the table (no filter on the label column). -->
      <operator activated="true" class="read_database" compatibility="5.0.10" expanded="true" height="60" name="Read DB - Apply" width="90" x="45" y="345">
        <list key="data_set_meta_data_information"/>
        <parameter key="attribute_names_already_defined" value="true"/>
        <parameter key="connection" value="rsstext"/>
        <parameter key="query" value="SELECT &quot;id&quot;, &quot;title&quot;&#10;FROM &quot;textfile&quot;"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="5.0.10" expanded="true" height="76" name="ID Apply" width="90" x="45" y="435">
        <parameter key="name" value="id"/>
        <parameter key="target_role" value="id"/>
      </operator>
      <!-- NOTE(review): "attributes" looks like a leftover here as well; see the matching note on "NomText Train". -->
      <operator activated="true" class="nominal_to_text" compatibility="5.0.10" expanded="true" height="76" name="NomText Apply" width="90" x="179" y="435">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="title"/>
        <parameter key="attributes" value="posttitle|postdesc"/>
      </operator>
      <!-- Same preprocessing chain as the training branch; receives the training word list via its "word list" port. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.0.6" expanded="true" height="76" name="ProcessDocs Apply" width="90" x="380" y="345">
        <list key="specify_weights"/>
        <process expanded="true" height="657" width="827">
          <operator activated="true" class="text:transform_cases" compatibility="5.0.6" expanded="true" height="60" name="Transform Cases (2)" width="90" x="45" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.0.6" expanded="true" height="60" name="Tokenize (2)" width="90" x="180" y="30"/>
          <operator activated="true" class="text:generate_n_grams_characters" compatibility="5.0.7" expanded="true" height="60" name="Generate n-Grams (2)" width="90" x="503" y="30"/>
          <connect from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Generate n-Grams (2)" to_port="document"/>
          <connect from_op="Generate n-Grams (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="apply_model" compatibility="5.0.10" expanded="true" height="76" name="Apply Model" width="90" x="648" y="255">
        <list key="application_parameters"/>
      </operator>
      <!-- Output: the labelled example set is written to "langupdate" in overwrite mode. -->
      <operator activated="true" class="write_database" compatibility="5.0.10" expanded="true" height="60" name="Write Database" width="90" x="782" y="255">
        <parameter key="connection" value="rsstext"/>
        <parameter key="table_name" value="langupdate"/>
        <parameter key="overwrite_mode" value="overwrite"/>
      </operator>
      <!-- Training branch wiring. -->
      <connect from_op="Read DB - Train" from_port="output" to_op="ID Train" to_port="example set input"/>
      <connect from_op="ID Train" from_port="example set output" to_op="Label Train" to_port="example set input"/>
      <connect from_op="Label Train" from_port="example set output" to_op="NomText Train" to_port="example set input"/>
      <connect from_op="NomText Train" from_port="example set output" to_op="ProcessDocs Train" to_port="example set"/>
      <connect from_op="ProcessDocs Train" from_port="example set" to_op="SVM" to_port="training set"/>
      <!-- The training word list is passed on so the apply branch produces the same attributes. -->
      <connect from_op="ProcessDocs Train" from_port="word list" to_op="ProcessDocs Apply" to_port="word list"/>
      <connect from_op="SVM" from_port="model" to_op="Apply Model" to_port="model"/>
      <!-- Apply branch wiring. -->
      <connect from_op="Read DB - Apply" from_port="output" to_op="ID Apply" to_port="example set input"/>
      <connect from_op="ID Apply" from_port="example set output" to_op="NomText Apply" to_port="example set input"/>
      <connect from_op="NomText Apply" from_port="example set output" to_op="ProcessDocs Apply" to_port="example set"/>
      <connect from_op="ProcessDocs Apply" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Apply Model" from_port="labelled data" to_op="Write Database" to_port="input"/>
      <connect from_op="Write Database" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="216"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

land · February 2011

Hi,
that's cool! Did you share this on myExperiment, too? You could use the CommunityExtension for that.

Greetings,
Sebastian

rakirk · February 2011

Very clever approach. Out of curiosity, would you be willing to share the accuracy levels of your process?

rk

B_ · February 2011

Sebastian,

I just wanted to post a simple example to help people get started.

Rakirk,

Accuracy depends on how many training examples you use and how many categories you classify. I use it to classify text as English or NotEnglish. I have about 1000 entries split between the two categories - some pure English, some in another language, and some mixed English/other language. Some very short text records are misclassified because of English abbreviations or mixed languages, but it works well enough for my application.

If you import text from the web, you may have problems with character encoding, such as conversion to/from UTF-8. You will need to preprocess the text to improve results.

rakirk · February 2011

I guess I was wondering more about comparative accuracy, primarily, how would the NBN compare to an SVM. The SVM may help account for smaller text files, but could also lead to overfitting.

B_ · February 2011

Haven't done a formal comparison. It works well enough for my tasks.

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

"Example - Classify Text Language"

Answers