Due to recent updates, all users are required to create an Altair One account to login to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to login to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you login, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
"Sentiment Analysis Problem"
Hello,
I'm trying to run a sentiment analysis using a linear SVM. I'm reading my data from a SQL Server database. However, at the Apply Model (2) operator in the XML below I receive the following error each time. Do you have any idea why this is occurring?
the input example set does not match the training example set. missing attribute aaaa
Paul
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.005">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_database" compatibility="5.3.005" expanded="true" height="60" name="Read Database" width="90" x="45" y="30">
<parameter key="connection" value="sqlserver"/>
<parameter key="query" value="SELECT top 1000000 ID, Feed, Sentiment FROM Twitter_Training_Data WHERE Sentiment = 'Positive' UNION ALL SELECT top 1000000 ID, Feed, Sentiment FROM Twitter_Training_Data WHERE Sentiment = 'Negative'"/>
<parameter key="table_name" value="Sample_Feeds"/>
<enumeration key="parameters"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.005" expanded="true" height="76" name="Set Role (3)" width="90" x="179" y="30">
<parameter key="name" value="ID"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="313" y="30">
<parameter key="keep_text" value="true"/>
<parameter key="prune_above_percent" value="90.0"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="5.3.000" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.000" expanded="true" height="60" name="Transform Cases" width="90" x="179" y="30"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.000" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="313" y="30"/>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.000" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="30">
<parameter key="min_chars" value="3"/>
<parameter key="max_chars" value="999"/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.005" expanded="true" height="76" name="Set Role" width="90" x="447" y="30">
<parameter key="name" value="Sentiment"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="x_validation" compatibility="5.3.005" expanded="true" height="112" name="Validation" width="90" x="581" y="30">
<parameter key="number_of_validations" value="5"/>
<process expanded="true">
<operator activated="true" class="select_attributes" compatibility="5.3.005" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="30">
<parameter key="attribute_filter_type" value="no_missing_values"/>
<parameter key="attribute" value="text"/>
</operator>
<operator activated="true" class="nominal_to_binominal" compatibility="5.3.005" expanded="true" height="94" name="Nominal to Binominal" width="90" x="179" y="30">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Sentiment"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="support_vector_machine_linear" compatibility="5.3.005" expanded="true" height="76" name="SVM (Linear)" width="90" x="179" y="210"/>
<connect from_port="training" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Binominal" to_port="example set input"/>
<connect from_op="Nominal to Binominal" from_port="example set output" to_op="SVM (Linear)" to_port="training set"/>
<connect from_op="SVM (Linear)" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="5.3.005" expanded="true" height="76" name="Apply Model" width="90" x="45" y="75">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance" compatibility="5.3.005" expanded="true" height="76" name="Performance" width="90" x="179" y="120"/>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<operator activated="false" class="read_excel" compatibility="5.3.005" expanded="true" height="60" name="Read Excel" width="90" x="45" y="255">
<parameter key="excel_file" value="C:\Users\dingopole\Desktop\Twitter_Data.xlsx"/>
<list key="annotations"/>
<list key="data_set_meta_data_information"/>
</operator>
<operator activated="false" class="nominal_to_text" compatibility="5.3.005" expanded="true" height="76" name="Nominal to Text" width="90" x="179" y="255"/>
<operator activated="true" class="read_database" compatibility="5.3.005" expanded="true" height="60" name="Read Database (2)" width="90" x="45" y="120">
<!-- Reads the id and feed columns of the first 100 rows of dbo.Twitter_Test_Data over the "sqlserver" connection. -->
<parameter key="connection" value="sqlserver"/>
<!-- Double quotes inside an attribute value are written as &quot; so the value is not terminated early and the document stays well-formed. -->
<parameter key="query" value="SELECT top 100 id, feed as feed FROM &quot;dbo&quot;.&quot;Twitter_Test_Data&quot;"/>
<enumeration key="parameters"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.005" expanded="true" height="76" name="Set Role (4)" width="90" x="179" y="120">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="313" y="120">
<parameter key="keep_text" value="true"/>
<parameter key="prune_above_percent" value="90.0"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="5.3.000" expanded="true" height="60" name="Tokenize (2)" width="90" x="112" y="30"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.000" expanded="true" height="60" name="Transform Cases (2)" width="90" x="246" y="30"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.000" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="380" y="30"/>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.000" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="514" y="30">
<parameter key="min_chars" value="3"/>
<parameter key="max_chars" value="999"/>
</operator>
<connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
<connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
<connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
<connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
<connect from_op="Filter Tokens (2)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.005" expanded="true" height="76" name="Set Role (2)" width="90" x="447" y="120">
<parameter key="name" value="text"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="apply_model" compatibility="5.3.005" expanded="true" height="76" name="Apply Model (2)" width="90" x="648" y="255">
<list key="application_parameters"/>
</operator>
<connect from_op="Read Database" from_port="output" to_op="Set Role (3)" to_port="example set input"/>
<connect from_op="Set Role (3)" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Validation" to_port="training"/>
<connect from_op="Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
<connect from_op="Validation" from_port="training" to_port="result 1"/>
<connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
<connect from_op="Read Database (2)" from_port="output" to_op="Set Role (4)" to_port="example set input"/>
<connect from_op="Set Role (4)" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
<connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Set Role (2)" to_port="example set input"/>
<connect from_op="Set Role (2)" from_port="example set output" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>
I'm trying to run a sentiment analysis using a linear SVM. I'm reading my data from a SQL Server database. However, at the Apply Model (2) operator in the XML below I receive the following error each time. Do you have any idea why this is occurring?
the input example set does not match the training example set. missing attribute aaaa
Paul
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.005">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_database" compatibility="5.3.005" expanded="true" height="60" name="Read Database" width="90" x="45" y="30">
<parameter key="connection" value="sqlserver"/>
<parameter key="query" value="SELECT top 1000000 ID, Feed, Sentiment FROM Twitter_Training_Data WHERE Sentiment = 'Positive' UNION ALL SELECT top 1000000 ID, Feed, Sentiment FROM Twitter_Training_Data WHERE Sentiment = 'Negative'"/>
<parameter key="table_name" value="Sample_Feeds"/>
<enumeration key="parameters"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.005" expanded="true" height="76" name="Set Role (3)" width="90" x="179" y="30">
<parameter key="name" value="ID"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="313" y="30">
<parameter key="keep_text" value="true"/>
<parameter key="prune_above_percent" value="90.0"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="5.3.000" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.000" expanded="true" height="60" name="Transform Cases" width="90" x="179" y="30"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.000" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="313" y="30"/>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.000" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="30">
<parameter key="min_chars" value="3"/>
<parameter key="max_chars" value="999"/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.005" expanded="true" height="76" name="Set Role" width="90" x="447" y="30">
<parameter key="name" value="Sentiment"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="x_validation" compatibility="5.3.005" expanded="true" height="112" name="Validation" width="90" x="581" y="30">
<parameter key="number_of_validations" value="5"/>
<process expanded="true">
<operator activated="true" class="select_attributes" compatibility="5.3.005" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="30">
<parameter key="attribute_filter_type" value="no_missing_values"/>
<parameter key="attribute" value="text"/>
</operator>
<operator activated="true" class="nominal_to_binominal" compatibility="5.3.005" expanded="true" height="94" name="Nominal to Binominal" width="90" x="179" y="30">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Sentiment"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="support_vector_machine_linear" compatibility="5.3.005" expanded="true" height="76" name="SVM (Linear)" width="90" x="179" y="210"/>
<connect from_port="training" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Binominal" to_port="example set input"/>
<connect from_op="Nominal to Binominal" from_port="example set output" to_op="SVM (Linear)" to_port="training set"/>
<connect from_op="SVM (Linear)" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="5.3.005" expanded="true" height="76" name="Apply Model" width="90" x="45" y="75">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance" compatibility="5.3.005" expanded="true" height="76" name="Performance" width="90" x="179" y="120"/>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<operator activated="false" class="read_excel" compatibility="5.3.005" expanded="true" height="60" name="Read Excel" width="90" x="45" y="255">
<parameter key="excel_file" value="C:\Users\dingopole\Desktop\Twitter_Data.xlsx"/>
<list key="annotations"/>
<list key="data_set_meta_data_information"/>
</operator>
<operator activated="false" class="nominal_to_text" compatibility="5.3.005" expanded="true" height="76" name="Nominal to Text" width="90" x="179" y="255"/>
<operator activated="true" class="read_database" compatibility="5.3.005" expanded="true" height="60" name="Read Database (2)" width="90" x="45" y="120">
<!-- Reads the id and feed columns of the first 100 rows of dbo.Twitter_Test_Data over the "sqlserver" connection. -->
<parameter key="connection" value="sqlserver"/>
<!-- Double quotes inside an attribute value are written as &quot; so the value is not terminated early and the document stays well-formed. -->
<parameter key="query" value="SELECT top 100 id, feed as feed FROM &quot;dbo&quot;.&quot;Twitter_Test_Data&quot;"/>
<enumeration key="parameters"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.005" expanded="true" height="76" name="Set Role (4)" width="90" x="179" y="120">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="313" y="120">
<parameter key="keep_text" value="true"/>
<parameter key="prune_above_percent" value="90.0"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="5.3.000" expanded="true" height="60" name="Tokenize (2)" width="90" x="112" y="30"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.000" expanded="true" height="60" name="Transform Cases (2)" width="90" x="246" y="30"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.000" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="380" y="30"/>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.000" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="514" y="30">
<parameter key="min_chars" value="3"/>
<parameter key="max_chars" value="999"/>
</operator>
<connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
<connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
<connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
<connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
<connect from_op="Filter Tokens (2)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.005" expanded="true" height="76" name="Set Role (2)" width="90" x="447" y="120">
<parameter key="name" value="text"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="apply_model" compatibility="5.3.005" expanded="true" height="76" name="Apply Model (2)" width="90" x="648" y="255">
<list key="application_parameters"/>
</operator>
<connect from_op="Read Database" from_port="output" to_op="Set Role (3)" to_port="example set input"/>
<connect from_op="Set Role (3)" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Validation" to_port="training"/>
<connect from_op="Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
<connect from_op="Validation" from_port="training" to_port="result 1"/>
<connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
<connect from_op="Read Database (2)" from_port="output" to_op="Set Role (4)" to_port="example set input"/>
<connect from_op="Set Role (4)" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
<connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Set Role (2)" to_port="example set input"/>
<connect from_op="Set Role (2)" from_port="example set output" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>
Tagged:
0
Answers
At first glance I'd suggest that you do all your pre-processing before the validation. As you are set up, only the training examples get filtered and re-typed; the testing examples therefore have different attributes and RM gets confused.
Hope that works!
Best wishes,
H
Most of the pre-processing is done before it arrives in RM via an ETL process on SQL.
Are you suggesting moving the operators below outside of the x-validation operator?
<operator activated="true" class="select_attributes" compatibility="5.3.005" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="30">
<parameter key="attribute_filter_type" value="no_missing_values"/>
<parameter key="attribute" value="text"/>
</operator>
<operator activated="true" class="nominal_to_binominal" compatibility="5.3.005" expanded="true" height="94" name="Nominal to Binominal" width="90" x="179" y="30">
<!-- Applies nominal_to_binominal to the single attribute "Sentiment", with special attributes included in the selection. -->
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Sentiment"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
Paul
The point is that the training and testing example sets need to have the same attributes; something like the following is better. Ciao,
H
The input ExampleSet does not match the training ExampleSet. Missing attribute:aaa
The operator expects the input to have a set of Attributes which is equal or a superset of the ExampleSet used for training of the input model.
Please make sure that the attributes of the two example sets satisfy this condition.
Cause: Apply Model
You are training on one set of attributes, and testing on another; put a break before each model applier to check that they are all using the same set of attributes.
Ciao,
H