Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Classification of multi label dataset using SVM

cazzi123cazzi123 Member Posts: 3 Contributor I
edited November 2019 in Help
I am trying to apply SVM to the 20 newsgroups dataset without success.  I have applied some preprocessing such as tokenizing, stemming, and case transformation. The process nests the SVM operator inside a Polynominal by Binominal Classification operator. It runs for hours before finally failing because it runs out of memory.

I have applied Naive Bayes and K-NN without an issue and both complete pretty quickly.

Can you please take a look at the process below and make any suggestions on how I could speed up the classification using SVM?

The dataset has twenty labels.

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
  <!-- Repository bindings for the root process's input and output ports. -->
  <context>
    <input>
      <!-- NOTE(review): the main process below has no connection from "source_input 1",
           so this entry appears to be unused; confirm before relying on it. -->
      <location>//RM_Repository/Project/Newsgroups_TrainVector</location>
    </input>
    <output>
      <!-- Presumably one location per connected result port, in port order
           ("result 1" first, then "result 2"); TODO confirm. -->
      <location>Newsgroups_TrainVectorTH</location>
      <location>Newsgroups_TrainVector</location>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files (2)" width="90" x="112" y="75">
        <list key="text_directories">
          <parameter key="alt.atheism" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\alt.atheism"/>
          <parameter key="comp.graphics" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\comp.graphics"/>
          <parameter key="misc.forsale" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\misc.forsale"/>
          <parameter key="rec.autos" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\rec.autos"/>
          <parameter key="comp.os.ms-windows.misc" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\comp.os.ms-windows.misc"/>
          <parameter key="comp.sys.ibm.pc.hardware" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\comp.sys.ibm.pc.hardware"/>
          <parameter key="comp.windows.x" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\comp.windows.x"/>
          <parameter key="rec.motorcycles" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\rec.motorcycles"/>
          <parameter key="rec.sport.baseball" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\rec.sport.baseball"/>
          <parameter key="rec.sport.hockey" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\rec.sport.hockey"/>
          <parameter key="sci.crypt" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\sci.crypt"/>
          <parameter key="sci.electronics" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\sci.electronics"/>
          <parameter key="sci.med" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\sci.med"/>
          <parameter key="sci.space" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\sci.space"/>
          <parameter key="soc.religion.christian" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\soc.religion.christian"/>
          <parameter key="talk.politics.guns" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\talk.politics.guns"/>
          <parameter key="talk.politics.mideast" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\talk.politics.mideast"/>
          <parameter key="talk.politics.misc" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\talk.politics.misc"/>
          <parameter key="talk.religion.misc" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\talk.religion.misc"/>
          <parameter key="comp.sys.mac.hardware" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\comp.sys.mac.hardware"/>
        </list>
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="1000"/>
        <process expanded="true">
          <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (2)" width="90" x="45" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="180" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="313" y="30"/>
          <operator activated="true" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (2)" width="90" x="447" y="30"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="581" y="30">
            <parameter key="min_chars" value="2"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
          <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Stem (2)" to_port="document"/>
          <connect from_op="Stem (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
          <connect from_op="Filter Tokens (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="x_validation" compatibility="5.3.015" expanded="true" height="112" name="Validation" width="90" x="380" y="75">
        <parameter key="number_of_validations" value="5"/>
        <process expanded="true">
          <operator activated="false" class="k_nn" compatibility="5.3.015" expanded="true" height="76" name="k-NN" width="90" x="179" y="165">
            <parameter key="k" value="5"/>
          </operator>
          <operator activated="false" class="naive_bayes" compatibility="5.3.015" expanded="true" height="76" name="Naive Bayes" width="90" x="179" y="300"/>
          <operator activated="true" class="polynomial_by_binomial_classification" compatibility="5.3.015" expanded="true" height="76" name="Polynominal by Binominal Classification" width="90" x="112" y="30">
            <parameter key="classification_strategies" value="exhaustive code (ECOC)"/>
            <process expanded="true">
              <operator activated="true" class="support_vector_machine" compatibility="5.3.015" expanded="true" height="112" name="SVM (2)" width="90" x="313" y="75"/>
              <connect from_port="training set" to_op="SVM (2)" to_port="training set"/>
              <connect from_op="SVM (2)" from_port="model" to_port="model"/>
              <portSpacing port="source_training set" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
            </process>
          </operator>
          <operator activated="false" class="support_vector_machine_libsvm" compatibility="5.3.015" expanded="true" height="76" name="SVM" width="90" x="179" y="435">
            <list key="class_weights"/>
          </operator>
          <connect from_port="training" to_op="Polynominal by Binominal Classification" to_port="training set"/>
          <connect from_op="Polynominal by Binominal Classification" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="Apply Model" width="90" x="112" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_classification" compatibility="5.3.015" expanded="true" height="76" name="Performance" width="90" x="313" y="30">
            <parameter key="main_criterion" value="accuracy"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files (3)" width="90" x="112" y="345">
        <list key="text_directories">
          <parameter key="alt.atheism" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\alt.atheism"/>
          <parameter key="comp.graphics" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\comp.graphics"/>
          <parameter key="misc.forsale" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\misc.forsale"/>
          <parameter key="rec.autos" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\rec.autos"/>
          <parameter key="comp.os.ms-windows.misc" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\comp.os.ms-windows.misc"/>
          <parameter key="comp.sys.ibm.pc.hardware" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\comp.sys.ibm.pc.hardware"/>
          <parameter key="comp.windows.x" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-train\comp.windows.x"/>
          <parameter key="rec.motorcycles" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\rec.motorcycles"/>
          <parameter key="rec.sport.baseball" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\rec.sport.baseball"/>
          <parameter key="rec.sport.hockey" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\rec.sport.hockey"/>
          <parameter key="sci.crypt" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\sci.crypt"/>
          <parameter key="sci.electronics" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\sci.electronics"/>
          <parameter key="sci.med" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\sci.med"/>
          <parameter key="sci.space" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\sci.space"/>
          <parameter key="soc.religion.christian" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\soc.religion.christian"/>
          <parameter key="talk.politics.guns" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\talk.politics.guns"/>
          <parameter key="talk.politics.mideast" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\talk.politics.mideast"/>
          <parameter key="talk.politics.misc" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\talk.politics.misc"/>
          <parameter key="talk.religion.misc" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\talk.religion.misc"/>
          <parameter key="comp.sys.mac.hardware" value="E:\Year 2\Text Mining\Practical\20news-bydate\20news-bydate-test\comp.sys.mac.hardware"/>
        </list>
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="1000"/>
        <process expanded="true">
          <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (3)" width="90" x="45" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (3)" width="90" x="180" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (3)" width="90" x="315" y="30"/>
          <operator activated="true" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (3)" width="90" x="450" y="30"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (3)" width="90" x="571" y="30">
            <parameter key="min_chars" value="2"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases (3)" to_port="document"/>
          <connect from_op="Transform Cases (3)" from_port="document" to_op="Tokenize (3)" to_port="document"/>
          <connect from_op="Tokenize (3)" from_port="document" to_op="Filter Stopwords (3)" to_port="document"/>
          <connect from_op="Filter Stopwords (3)" from_port="document" to_op="Stem (3)" to_port="document"/>
          <connect from_op="Stem (3)" from_port="document" to_op="Filter Tokens (3)" to_port="document"/>
          <connect from_op="Filter Tokens (3)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="Apply Model (2)" width="90" x="514" y="300">
        <list key="application_parameters"/>
      </operator>
      <connect from_op="Process Documents from Files (2)" from_port="example set" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Validation" from_port="training" to_port="result 2"/>
      <connect from_op="Process Documents from Files (3)" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Sign In or Register to comment.