Due to recent updates, all users are required to create an Altair One account to login to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to login to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you login, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
Bayes Learner and Text Classification - Index Out of Bounds
I am trying to classify text with W-Naive Bayes Multinomial, but keep getting an error 263 Array Index Out of Bounds error.
I use a sample of 200 text records from a database as a training set, save the model, then read 2000 text records to classify. I know there are additional terms in the full set that are not in the test set. Is this causing the index out of bounds error?
What will correct this?
Thanks
B.
training code
I use a sample of 200 text records from a database as a training set, save the model, then read 2000 text records to classify. I know there are additional terms in the full set that are not in the test set. Is this causing the index out of bounds error?
What will correct this?
Thanks
B.
training code
model applier
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.0">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.0.0" expanded="true" name="Root">
<description>Using a simple Naive Bayes classifier.</description>
<process expanded="true" height="584" width="962">
<operator activated="true" class="retrieve" compatibility="5.0.10" expanded="true" height="60" name="Retrieve" width="90" x="45" y="75">
<parameter key="repository_entry" value="//RMRepository/train_source"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.0.10" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="165">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="trainentry"/>
<parameter key="invert_selection" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.0.10" expanded="true" height="76" name="Set Role" width="90" x="45" y="300">
<parameter key="name" value="trainlabel"/>
<parameter key="target_role" value="label"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.0.10" expanded="true" height="76" name="Set Role (2)" width="90" x="45" y="390">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="5.0.10" expanded="true" height="76" name="Nominal to Text" width="90" x="246" y="345">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="posttitle"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.0.6" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="255">
<list key="specify_weights"/>
<process expanded="true" height="565" width="827">
<operator activated="true" class="text:transform_cases" compatibility="5.0.6" expanded="true" height="60" name="Transform Cases" width="90" x="179" y="165"/>
<operator activated="true" class="text:tokenize" compatibility="5.0.6" expanded="true" height="60" name="Tokenize" width="90" x="313" y="120"/>
<connect from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="weka:W-NaiveBayesMultinomialUpdateable" compatibility="5.0.1" expanded="true" height="76" name="W-NaiveBayesMultinomialUpdateable" width="90" x="447" y="120">
<parameter key="D" value="true"/>
</operator>
<operator activated="true" class="write_model" compatibility="5.0.10" expanded="true" height="60" name="Write Model" width="90" x="455" y="30">
<parameter key="model_file" value="M:\RM\mdl_test_01.mod"/>
<parameter key="output_type" value="XML"/>
</operator>
<connect from_op="Retrieve" from_port="output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
<connect from_op="Set Role (2)" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="W-NaiveBayesMultinomialUpdateable" to_port="training set"/>
<connect from_op="W-NaiveBayesMultinomialUpdateable" from_port="model" to_op="Write Model" to_port="input"/>
<connect from_op="W-NaiveBayesMultinomialUpdateable" from_port="exampleSet" to_port="result 2"/>
<connect from_op="Write Model" from_port="through" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.0">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.0.10" expanded="true" name="Process">
<process expanded="true" height="341" width="681">
<operator activated="true" class="read_model" compatibility="5.0.10" expanded="true" height="60" name="Read Model" width="90" x="376" y="34">
<parameter key="model_file" value="M:\RM\mdl_test_01.mod"/>
</operator>
<operator activated="true" class="retrieve" compatibility="5.0.10" expanded="true" height="60" name="Retrieve" width="90" x="44" y="22">
<parameter key="repository_entry" value="assignlabel_01"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.0.10" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="120">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="entrylabel"/>
<parameter key="invert_selection" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.0.10" expanded="true" height="76" name="Set Role" width="90" x="45" y="255">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="5.0.10" expanded="true" height="76" name="Nominal to Text" width="90" x="179" y="75">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="posttitle"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.0.6" expanded="true" height="76" name="Process Documents from Data" width="90" x="313" y="165">
<list key="specify_weights"/>
<process expanded="true" height="583" width="845">
<operator activated="true" class="text:transform_cases" compatibility="5.0.6" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="51"/>
<operator activated="true" class="text:tokenize" compatibility="5.0.6" expanded="true" height="60" name="Tokenize" width="90" x="313" y="75"/>
<connect from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="apply_model" compatibility="5.0.10" expanded="true" height="76" name="Apply Model" width="90" x="581" y="75">
<list key="application_parameters"/>
</operator>
<connect from_op="Read Model" from_port="output" to_op="Apply Model" to_port="model"/>
<connect from_op="Retrieve" from_port="output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
0
Answers
could you paste the stack trace from the log?
Best,
Simon
the W-Naive Bayes did not show a stack trace, only the error messages in the log below.
W-NaiveBayesMultinomialUpdateable: Exception occured while classifying example:263 [class java.lang.ArrayIndexOutOfBoundsException]
This is from using the Naive Bayes process to create a model.
Exception: java.lang.ArrayIndexOutOfBoundsException
Message: 262
Stack trace:
com.rapidminer.operator.learner.bayes.SimpleDistributionModel.performPrediction(SimpleDistributionModel.java:384)
com.rapidminer.operator.learner.PredictionModel.apply(PredictionModel.java:76)
com.rapidminer.operator.ModelApplier.doWork(ModelApplier.java:100)
com.rapidminer.operator.Operator.execute(Operator.java:771)
com.rapidminer.operator.execution.SimpleUnitExecutor.execute(SimpleUnitExecutor.java:51)
com.rapidminer.operator.ExecutionUnit.execute(ExecutionUnit.java:709)
com.rapidminer.operator.OperatorChain.doWork(OperatorChain.java:368)
com.rapidminer.operator.Operator.execute(Operator.java:771)
com.rapidminer.Process.run(Process.java:899)
com.rapidminer.Process.run(Process.java:795)
com.rapidminer.Process.run(Process.java:790)
com.rapidminer.Process.run(Process.java:780)
com.rapidminer.gui.ProcessThread.run(ProcessThread.java:62)
I opened a bug report. You can help by attaching a process and data to it.
http://bugs.rapid-i.com/show_bug.cgi?id=403
Best,
Simon
Problem is closed due to invalid classification process. The word list was missing and so the example set was incompatible with the model.
Best,
Simon
Can you provide a simple example of how to set up Bayes text classification? there isn't one in the samples.
Thanks
B.
the Naive Bayes operator can be used like any other learning algorithm. Try with an example without text first.
The problem is not the Naive Bayes operator. The problem is that the classification process does not have the same word list as the training process. For that reason, the example sets have different attributes during training and application. The whole process does not make sense, it won't give correct results for any learner. In order to fix the setup, you must store the word list generated during training and feed it back into the classification step. The document processing operator has an input for that. Consider the word list as a part of the model.
Best,
Simon
How to use word input wasn't clear - I assumed the word attributes were stored in the model. Now it's working.
thanks Simon