Due to recent updates, all users are required to create an Altair One account to login to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to login to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you login, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
Selected Attribute Not Appearing in Output
minerthreat
Member Posts: 2 Contributor I
I am running a Naive Bayes analysis on textual data. The Naive Bayes model itself is in another process that is input into the Apply Model operator in the process described below. The input data in the process is a 162 row dataset in MySQL. 'Title' is one of the columns/attributes in this table. These are simply titles of various news articles from around the web. As my XML code below shows, I want title to be included in my output. However, it does not appear even though the process completes successfully and the other selected attributes do appear.
My log contains the following warnings:
WARNING: SimpleDistribution: The number of regular attributes of the given example set does not fit the number of attributes of the training example set, training: 26228, application: 3162
WARNING: SimpleDistribution: The given example set does not contain a regular attribute with name 'aa_batteri'. This might cause problems for some models depending on this particular attribute.
The second warning repeats for many many times over for separate n-grams.
My XML:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve Model" width="90" x="45" y="30">
<parameter key="repository_entry" value="//NewLocalRepository/Virtualization/Disruption Prediction Model/Realtime Predictions/Modeling/Model"/>
</operator>
<operator activated="true" class="read_database" compatibility="5.3.015" expanded="true" height="60" name="Read Database" width="90" x="45" y="210">
<parameter key="connection" value="MySQL"/>
<parameter key="query" value="SELECT `title`, `clean_text` FROM `potential_tech_disruptions_clean_data_tbl`"/>
<enumeration key="parameters"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="210">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="clean_text|title|"/>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="5.3.015" expanded="true" height="76" name="Nominal to Text" width="90" x="313" y="210">
<parameter key="attributes" value="|title|clean_text"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="447" y="210">
<parameter key="prune_method" value="absolute"/>
<parameter key="prune_below_percent" value="0.0"/>
<parameter key="prune_above_percent" value="10.0"/>
<parameter key="prune_below_absolute" value="2"/>
<parameter key="prune_above_absolute" value="99999"/>
<parameter key="prune_below_rank" value="0.5"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="45" y="30"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (2)" width="90" x="180" y="30"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="315" y="30"/>
<operator activated="true" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (2)" width="90" x="450" y="30"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="5.3.002" expanded="true" height="60" name="Generate n-Grams (2)" width="90" x="313" y="165">
<parameter key="max_length" value="3"/>
</operator>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="447" y="165">
<parameter key="min_chars" value="3"/>
<parameter key="max_chars" value="125"/>
</operator>
<connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
<connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
<connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
<connect from_op="Filter Stopwords (2)" from_port="document" to_op="Stem (2)" to_port="document"/>
<connect from_op="Stem (2)" from_port="document" to_op="Generate n-Grams (2)" to_port="document"/>
<connect from_op="Generate n-Grams (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
<connect from_op="Filter Tokens (2)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="Apply Model (2)" width="90" x="179" y="30">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes (2)" width="90" x="313" y="30">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="|title|confidence(N)|confidence(Y)|prediction(Disruptive)"/>
</operator>
<operator activated="true" class="write_database" compatibility="5.3.015" expanded="true" height="60" name="Write Database" width="90" x="447" y="30">
<parameter key="connection" value="MySQL"/>
<parameter key="table_name" value="predicted_disruption_tbl2"/>
<parameter key="overwrite_mode" value="overwrite"/>
<parameter key="set_default_varchar_length" value="true"/>
<parameter key="default_varchar_length" value="255"/>
</operator>
<connect from_op="Retrieve Model" from_port="output" to_op="Apply Model (2)" to_port="model"/>
<connect from_op="Read Database" from_port="output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
<connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Apply Model (2)" from_port="labelled data" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Write Database" to_port="input"/>
<connect from_op="Write Database" from_port="through" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Thank you for any help that can be offered.
My log contains the following warnings:
WARNING: SimpleDistribution: The number of regular attributes of the given example set does not fit the number of attributes of the training example set, training: 26228, application: 3162
WARNING: SimpleDistribution: The given example set does not contain a regular attribute with name 'aa_batteri'. This might cause problems for some models depending on this particular attribute.
The second warning repeats for many many times over for separate n-grams.
My XML:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve Model" width="90" x="45" y="30">
<parameter key="repository_entry" value="//NewLocalRepository/Virtualization/Disruption Prediction Model/Realtime Predictions/Modeling/Model"/>
</operator>
<operator activated="true" class="read_database" compatibility="5.3.015" expanded="true" height="60" name="Read Database" width="90" x="45" y="210">
<parameter key="connection" value="MySQL"/>
<parameter key="query" value="SELECT `title`, `clean_text` FROM `potential_tech_disruptions_clean_data_tbl`"/>
<enumeration key="parameters"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="210">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="clean_text|title|"/>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="5.3.015" expanded="true" height="76" name="Nominal to Text" width="90" x="313" y="210">
<parameter key="attributes" value="|title|clean_text"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="447" y="210">
<parameter key="prune_method" value="absolute"/>
<parameter key="prune_below_percent" value="0.0"/>
<parameter key="prune_above_percent" value="10.0"/>
<parameter key="prune_below_absolute" value="2"/>
<parameter key="prune_above_absolute" value="99999"/>
<parameter key="prune_below_rank" value="0.5"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="45" y="30"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (2)" width="90" x="180" y="30"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="315" y="30"/>
<operator activated="true" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (2)" width="90" x="450" y="30"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="5.3.002" expanded="true" height="60" name="Generate n-Grams (2)" width="90" x="313" y="165">
<parameter key="max_length" value="3"/>
</operator>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="447" y="165">
<parameter key="min_chars" value="3"/>
<parameter key="max_chars" value="125"/>
</operator>
<connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
<connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
<connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
<connect from_op="Filter Stopwords (2)" from_port="document" to_op="Stem (2)" to_port="document"/>
<connect from_op="Stem (2)" from_port="document" to_op="Generate n-Grams (2)" to_port="document"/>
<connect from_op="Generate n-Grams (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
<connect from_op="Filter Tokens (2)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="Apply Model (2)" width="90" x="179" y="30">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes (2)" width="90" x="313" y="30">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="|title|confidence(N)|confidence(Y)|prediction(Disruptive)"/>
</operator>
<operator activated="true" class="write_database" compatibility="5.3.015" expanded="true" height="60" name="Write Database" width="90" x="447" y="30">
<parameter key="connection" value="MySQL"/>
<parameter key="table_name" value="predicted_disruption_tbl2"/>
<parameter key="overwrite_mode" value="overwrite"/>
<parameter key="set_default_varchar_length" value="true"/>
<parameter key="default_varchar_length" value="255"/>
</operator>
<connect from_op="Retrieve Model" from_port="output" to_op="Apply Model (2)" to_port="model"/>
<connect from_op="Read Database" from_port="output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
<connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Apply Model (2)" from_port="labelled data" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Write Database" to_port="input"/>
<connect from_op="Write Database" from_port="through" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Thank you for any help that can be offered.
0
Answers
when you right-click an operator, you can add breakpoints before/after it. That way, you can check if your data is coming out / going into an operator and see where the problem is, i.e. where the attribute gets lost.
Regards,
Marco
This means that those attributes clean_text & title would both disappear.
I would suggest ticking the keep text parameter & trying that:
I added a textual variable titled 'text' that comprises the bodies of the articles to the second Select Attributes operator just to see if I could get that column in the output. This worked once I applied JEdward's tip about checking the 'keep text' box in the Process Docs operator.
So, I'm getting closer to the source of the issue, but I still don't get why title isn't included in my output.
Before applying the model you must apply the same Process Document Operator
you used when train the model. This explains your warning messages.
Process Documents eats every attribute you gave the type "text". You must copy it into
a new attribute name with type "nominal" to prevent this.
Ticking "Keep text" merges every text into one single attribute "text" after processing.