Due to recent updates, all users are required to create an Altair One account to login to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to login to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you login, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
Select By Weights Criteria
Hey,
I am currently building a text-mining process that uses TF-IDF. In short, the goal is to extract important information from news articles. I filter the articles by topic and date so that I can assign the extracted information to the right article.
A friend recommended the operator Select by Weights to me. However, I always get an error message with the code:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process (format version 9.2.001): dimensionality reduction on the
  repository entry "reut2-000".

  Main chain: Retrieve -> Select Attributes -> Generate ID -> Filter Examples
  -> Set Role -> Remove Correlated Attributes -> Feature Engineering.
  The Feature Engineering subprocess fans the data out into three branches
  (chi-squared weights, PCA transformation, PCA weights) and delivers one
  result per branch on its ports out 1 / out 2 / out 3.

  NOTE(review): no operator in this process creates TF-IDF attributes. Unless
  the stored "reut2-000" entry already contains them, the weighting operators
  have no numeric features to rank. TODO confirm against the stored data.
-->
<process version="9.2.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.2.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true">
      <!-- Loads the source data from the repository entry "reut2-000". -->
      <operator activated="true" class="retrieve" compatibility="9.2.001" expanded="true" height="68" name="Retrieve reut2-000" width="90" x="45" y="85">
        <parameter key="repository_entry" value="reut2-000"/>
      </operator>
      <!-- Subset filter with invert_selection="true": the listed attributes are the ones dropped
           (step b of the yellow note: filter out the non-TFIDF fields). -->
      <operator activated="true" class="select_attributes" compatibility="9.2.001" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="85">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="|exchanges|orgs|people|text_orig|title|topics|zahlen"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- Adds a numeric id attribute (create_nominal_ids="false", offset 0). -->
      <operator activated="true" class="generate_id" compatibility="9.2.001" expanded="true" height="82" name="Generate ID" width="90" x="313" y="85">
        <parameter key="create_nominal_ids" value="false"/>
        <parameter key="offset" value="0"/>
      </operator>
      <!-- Custom filter "places does_not_equal ?"; per step c of the yellow note this is meant to
           keep only rows with a known target value. breakpoints="after" pauses execution here. -->
      <operator activated="true" breakpoints="after" class="filter_examples" compatibility="9.2.001" expanded="true" height="103" name="Filter Examples" width="90" x="45" y="187">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="custom_filters"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list">
          <parameter key="filters_entry_key" value="places.does_not_equal.?"/>
        </list>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
      </operator>
      <!-- Declares "places" as the label (target variable). -->
      <operator activated="true" class="set_role" compatibility="9.2.001" expanded="true" height="82" name="Set Role" width="90" x="179" y="187">
        <parameter key="attribute_name" value="places"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Correlation filter: threshold 0.8, relation "greater", absolute correlation
           (step d of the yellow note). breakpoints="after" pauses execution here. -->
      <operator activated="true" breakpoints="after" class="remove_correlated_attributes" compatibility="9.2.001" expanded="true" height="82" name="Remove Correlated Attributes" width="90" x="380" y="187">
        <parameter key="correlation" value="0.8"/>
        <parameter key="filter_relation" value="greater"/>
        <parameter key="attribute_order" value="random"/>
        <parameter key="use_absolute_correlation" value="true"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!-- Three parallel reduction branches; see the connections at the end of the subprocess. -->
      <operator activated="true" class="subprocess" compatibility="9.2.001" expanded="true" height="124" name="Feature Engineering" width="90" x="581" y="85">
        <process expanded="true">
          <operator activated="true" class="multiply" compatibility="9.2.001" expanded="true" height="124" name="Multiply (2)" width="90" x="112" y="187"/>
          <!-- Branch 1: chi-squared weights, then keep the top k=50 attributes. -->
          <operator activated="true" class="weight_by_chi_squared_statistic" compatibility="9.2.001" expanded="true" height="82" name="Weight by Chi Squared Statistic" width="90" x="313" y="34">
            <parameter key="normalize_weights" value="false"/>
            <parameter key="sort_weights" value="true"/>
            <parameter key="sort_direction" value="descending"/>
            <parameter key="number_of_bins" value="10"/>
          </operator>
          <operator activated="true" breakpoints="after" class="select_by_weights" compatibility="9.2.001" expanded="true" height="103" name="Select by Weights (ChiSq)" width="90" x="514" y="34">
            <parameter key="weight_relation" value="top k"/>
            <parameter key="weight" value="10.0"/>
            <parameter key="k" value="50"/>
            <parameter key="p" value="0.1"/>
            <parameter key="deselect_unknown" value="true"/>
            <parameter key="use_absolute_weights" value="false"/>
          </operator>
          <!-- Own repository entry per branch, so the three results do not overwrite each other
               or the source entry read by Retrieve. -->
          <operator activated="true" class="store" compatibility="9.2.001" expanded="true" height="68" name="Store" width="90" x="715" y="34">
            <parameter key="repository_entry" value="reut2-000_chisq"/>
          </operator>
          <!-- Branch 2: PCA transformation keeping 80 % of the variance. -->
          <operator activated="true" class="principal_component_analysis" compatibility="9.2.001" expanded="true" height="103" name="PCA" width="90" x="313" y="187">
            <parameter key="dimensionality_reduction" value="keep variance"/>
            <parameter key="variance_threshold" value="0.8"/>
            <parameter key="number_of_components" value="1"/>
          </operator>
          <!-- Branch 3: weights from PCA component 1, then keep the top k=50 by absolute weight. -->
          <operator activated="true" class="weight_by_pca" compatibility="9.2.001" expanded="true" height="82" name="Weight by PCA" width="90" x="313" y="340">
            <parameter key="normalize_weights" value="false"/>
            <parameter key="sort_weights" value="true"/>
            <parameter key="sort_direction" value="ascending"/>
            <parameter key="component_number" value="1"/>
          </operator>
          <operator activated="true" breakpoints="after" class="select_by_weights" compatibility="9.2.001" expanded="true" height="103" name="Select by Weights (PCA)" width="90" x="514" y="340">
            <parameter key="weight_relation" value="top k"/>
            <parameter key="weight" value="10.0"/>
            <parameter key="k" value="50"/>
            <parameter key="p" value="0.1"/>
            <parameter key="deselect_unknown" value="true"/>
            <parameter key="use_absolute_weights" value="true"/>
          </operator>
          <operator activated="true" class="store" compatibility="9.2.001" expanded="true" height="68" name="Store (3)" width="90" x="715" y="340">
            <parameter key="repository_entry" value="reut2-000_pca_weights"/>
          </operator>
          <operator activated="true" class="store" compatibility="9.2.001" expanded="true" height="68" name="Store (2)" width="90" x="715" y="187">
            <parameter key="repository_entry" value="reut2-000_pca"/>
          </operator>
          <connect from_port="in 1" to_op="Multiply (2)" to_port="input"/>
          <connect from_op="Multiply (2)" from_port="output 1" to_op="Weight by Chi Squared Statistic" to_port="example set"/>
          <connect from_op="Multiply (2)" from_port="output 2" to_op="PCA" to_port="example set input"/>
          <connect from_op="Multiply (2)" from_port="output 3" to_op="Weight by PCA" to_port="example set"/>
          <connect from_op="Weight by Chi Squared Statistic" from_port="weights" to_op="Select by Weights (ChiSq)" to_port="weights"/>
          <connect from_op="Weight by Chi Squared Statistic" from_port="example set" to_op="Select by Weights (ChiSq)" to_port="example set input"/>
          <connect from_op="Select by Weights (ChiSq)" from_port="example set output" to_op="Store" to_port="input"/>
          <connect from_op="Store" from_port="through" to_port="out 1"/>
          <connect from_op="PCA" from_port="example set output" to_op="Store (2)" to_port="input"/>
          <connect from_op="Weight by PCA" from_port="weights" to_op="Select by Weights (PCA)" to_port="weights"/>
          <connect from_op="Weight by PCA" from_port="example set" to_op="Select by Weights (PCA)" to_port="example set input"/>
          <connect from_op="Select by Weights (PCA)" from_port="example set output" to_op="Store (3)" to_port="input"/>
          <connect from_op="Store (3)" from_port="through" to_port="out 3"/>
          <connect from_op="Store (2)" from_port="through" to_port="out 2"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Retrieve reut2-000" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Remove Correlated Attributes" to_port="example set input"/>
      <connect from_op="Remove Correlated Attributes" from_port="example set output" to_op="Feature Engineering" to_port="in 1"/>
      <connect from_op="Feature Engineering" from_port="out 1" to_port="result 1"/>
      <connect from_op="Feature Engineering" from_port="out 2" to_port="result 2"/>
      <connect from_op="Feature Engineering" from_port="out 3" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <!--
        Canvas notes (German, text kept verbatim). The HTML line breaks inside the note text are
        escaped so that the document stays well-formed XML. English summary of the first note:
        "Reduction of dimensionality. Two possible ways: based on PCA (needs no target variable)
        or based on ChiSquared (target variable; the original sentence is cut off). With a target
        variable only the fields with high potential for a model can be kept. Steps: a. input
        data TF-IDF; b. filter out non-TFIDF fields (exchanges, org, people, etc.); c. keep only
        records with complete target values; d. remove correlated TFIDF fields; e. apply both
        reduction methods and store the data."
        Second note: "For the reduction of dimensionality a target variable and the TF-IDF
        fields remain."
      -->
      <description align="left" color="yellow" colored="false" height="278" resized="true" width="815" x="39" y="325">REDUKTION DER DIMENSIONALIT&#196;T&lt;br/&gt;&lt;br/&gt;Hier geht hier darum, die Reduktion der Dimensionalit&#228;t anzustreben. Zwei m&#246;gliche Arten:&lt;br&gt;-- auf Basis PCA (braucht kein Zielvariable)&lt;br&gt;-- auf Basis ChiSquared (Zielvariable vorausus&lt;br&gt;Gibt es eine Zielvariable, so ist es m&#246;glich nur diejenigen Felder zu behalten, die hohes Potenzial f&#252;r ein Model haben.&lt;br&gt;&lt;br&gt;Schritte:&lt;br&gt;a. Input Daten TF-IDF&lt;br&gt;b. Non-TFIDF Felder rausfiltern: exchanges, org, people, usw.&lt;br&gt;c. Filter nur Datens&#228;tze mit vollst&#228;ndigen Werte &#252;r Zielvariable&lt;br&gt;d. Entferne korrelierte TFIDF Felder&lt;br&gt;e. Verwende beiden Methoden zur Reduktion der Dimensionalit&#228;t. Daten speichern.&lt;br&gt;&lt;br&gt;</description>
      <description align="left" color="yellow" colored="false" height="58" resized="true" width="301" x="177" y="22">F&#252;r die Reduktion der Dimensionalit&#228;t bleibt eine Zielvariable und die TF-IDF Felder.</description>
    </process>
  </operator>
</process>
The input is a CSV file which I downloaded from the news agency Reuters.
Thanks
Tagged:
0
Best Answer
-
MartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,531 RM Data Scientist Hi @Flixport, your process itself states that you want to work on the TF-IDF: "Für die Reduktion der Dimensionalität bleibt eine Zielvariable und die TF-IDF Felder."
But you are not creating these. Here is a process that adds the TF-IDF step. I've marked the difference with a red note and switched to Read CSV. Note that you need to install the Text Processing extension to be able to run this.
I've written you an e-mail to the e-mail address you used for registration. I would appreciate an answer.
Best,
Martin
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process (format version 9.2.001): TF-IDF creation followed by
  dimensionality reduction.

  Main chain: Read CSV -> Select Attributes -> Generate ID -> Filter Examples
  -> Set Role -> Remove Correlated Attributes -> Nominal to Text
  -> Process Documents from Data -> Feature Engineering.
  "Retrieve reut2-000" is kept but deactivated; Read CSV replaces it.

  Requires the Text Processing extension (operators in the "text:" namespace).
  The csv_file path of Read CSV is a machine-specific location and has to be
  adapted before running.
-->
<process version="9.2.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.2.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true">
      <!-- Deactivated (activated="false") and not connected; superseded by Read CSV. -->
      <operator activated="false" class="retrieve" compatibility="9.2.001" expanded="true" height="68" name="Retrieve reut2-000" width="90" x="112" y="697">
        <parameter key="repository_entry" value="reut2-000"/>
      </operator>
      <!-- Reads the semicolon-separated CSV (windows-1252); all nine columns are declared
           polynominal in the meta data list. -->
      <operator activated="true" class="read_csv" compatibility="9.2.001" expanded="true" height="68" name="Read CSV" width="90" x="45" y="85">
        <parameter key="csv_file" value="C:\Users\MartinSchmitz\Downloads\reut2-000.csv"/>
        <parameter key="column_separators" value=";"/>
        <parameter key="trim_lines" value="false"/>
        <parameter key="use_quotes" value="true"/>
        <!-- The double quote has to be written as an entity inside a quoted attribute value. -->
        <parameter key="quotes_character" value="&quot;"/>
        <parameter key="escape_character" value="\"/>
        <parameter key="skip_comments" value="true"/>
        <parameter key="comment_characters" value="#"/>
        <parameter key="starting_row" value="1"/>
        <parameter key="parse_numbers" value="true"/>
        <parameter key="decimal_character" value="."/>
        <parameter key="grouped_digits" value="false"/>
        <parameter key="grouping_character" value=","/>
        <parameter key="infinity_representation" value=""/>
        <parameter key="date_format" value=""/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="encoding" value="windows-1252"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="exchanges.true.polynominal.attribute"/>
          <parameter key="1" value="orgs.true.polynominal.attribute"/>
          <parameter key="2" value="people.true.polynominal.attribute"/>
          <parameter key="3" value="places.true.polynominal.attribute"/>
          <parameter key="4" value="text.true.polynominal.attribute"/>
          <parameter key="5" value="title.true.polynominal.attribute"/>
          <parameter key="6" value="topics.true.polynominal.attribute"/>
          <parameter key="7" value="zahlen.true.polynominal.attribute"/>
          <parameter key="8" value="text_orig.true.polynominal.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>
      <!-- Subset filter with invert_selection="true": the listed attributes are the ones dropped,
           which leaves "places" and "text" of the nine CSV columns. -->
      <operator activated="true" class="select_attributes" compatibility="9.2.001" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="85">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="|exchanges|orgs|people|text_orig|title|topics|zahlen"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- Adds a numeric id attribute (create_nominal_ids="false", offset 0). -->
      <operator activated="true" class="generate_id" compatibility="9.2.001" expanded="true" height="82" name="Generate ID" width="90" x="313" y="85">
        <parameter key="create_nominal_ids" value="false"/>
        <parameter key="offset" value="0"/>
      </operator>
      <!-- Custom filter "places does_not_equal ?"; per step c of the yellow note this is meant to
           keep only rows with a known target value. breakpoints="after" pauses execution here. -->
      <operator activated="true" breakpoints="after" class="filter_examples" compatibility="9.2.001" expanded="true" height="103" name="Filter Examples" width="90" x="45" y="187">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="custom_filters"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list">
          <parameter key="filters_entry_key" value="places.does_not_equal.?"/>
        </list>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
      </operator>
      <!-- Declares "places" as the label (target variable). -->
      <operator activated="true" class="set_role" compatibility="9.2.001" expanded="true" height="82" name="Set Role" width="90" x="179" y="187">
        <parameter key="attribute_name" value="places"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- NOTE(review): this runs before the TF-IDF attributes exist, so it presumably cannot
           remove correlated TF-IDF fields as step d of the yellow note intends. TODO confirm
           whether it should move behind Process Documents from Data. -->
      <operator activated="true" breakpoints="after" class="remove_correlated_attributes" compatibility="9.2.001" expanded="true" height="82" name="Remove Correlated Attributes" width="90" x="380" y="187">
        <parameter key="correlation" value="0.8"/>
        <parameter key="filter_relation" value="greater"/>
        <parameter key="attribute_order" value="random"/>
        <parameter key="use_absolute_correlation" value="true"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <description align="center" color="transparent" colored="false" width="126">Type your comment</description>
      </operator>
      <!-- Converts the single attribute "text" from nominal to text type for the document step. -->
      <operator activated="true" class="nominal_to_text" compatibility="9.2.001" expanded="true" height="82" name="Nominal to Text" width="90" x="581" y="187">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="text"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!-- New step: builds the TF-IDF word vector (percentual pruning below 3 % / above 30 %). -->
      <operator activated="true" class="text:process_document_from_data" compatibility="9.1.000-SNAPSHOT" expanded="true" height="82" name="Process Documents from Data" width="90" x="715" y="187">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!-- Tokenization in mode "non letters" is the only document processing step. -->
          <operator activated="true" class="text:tokenize" compatibility="9.1.000-SNAPSHOT" expanded="true" height="68" name="Tokenize" width="90" x="313" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Three parallel reduction branches; see the connections at the end of the subprocess. -->
      <operator activated="true" class="subprocess" compatibility="9.2.001" expanded="true" height="124" name="Feature Engineering" width="90" x="849" y="187">
        <process expanded="true">
          <operator activated="true" class="multiply" compatibility="9.2.001" expanded="true" height="124" name="Multiply (2)" width="90" x="112" y="187"/>
          <!-- Branch 1: chi-squared weights, then keep the top k=50 attributes. -->
          <operator activated="true" class="weight_by_chi_squared_statistic" compatibility="9.2.001" expanded="true" height="82" name="Weight by Chi Squared Statistic" width="90" x="313" y="34">
            <parameter key="normalize_weights" value="false"/>
            <parameter key="sort_weights" value="true"/>
            <parameter key="sort_direction" value="descending"/>
            <parameter key="number_of_bins" value="10"/>
          </operator>
          <operator activated="true" breakpoints="after" class="select_by_weights" compatibility="9.2.001" expanded="true" height="103" name="Select by Weights (ChiSq)" width="90" x="514" y="34">
            <parameter key="weight_relation" value="top k"/>
            <parameter key="weight" value="10.0"/>
            <parameter key="k" value="50"/>
            <parameter key="p" value="0.1"/>
            <parameter key="deselect_unknown" value="true"/>
            <parameter key="use_absolute_weights" value="false"/>
          </operator>
          <!-- Own repository entry per branch, so the three results do not overwrite each other
               or the "reut2-000" source entry. -->
          <operator activated="true" class="store" compatibility="9.2.001" expanded="true" height="68" name="Store" width="90" x="715" y="34">
            <parameter key="repository_entry" value="reut2-000_chisq"/>
          </operator>
          <!-- Branch 2: PCA transformation keeping 80 % of the variance. -->
          <operator activated="true" class="principal_component_analysis" compatibility="9.2.001" expanded="true" height="103" name="PCA" width="90" x="313" y="187">
            <parameter key="dimensionality_reduction" value="keep variance"/>
            <parameter key="variance_threshold" value="0.8"/>
            <parameter key="number_of_components" value="1"/>
          </operator>
          <!-- Branch 3: weights from PCA component 1, then keep the top k=50 by absolute weight. -->
          <operator activated="true" class="weight_by_pca" compatibility="9.2.001" expanded="true" height="82" name="Weight by PCA" width="90" x="313" y="340">
            <parameter key="normalize_weights" value="false"/>
            <parameter key="sort_weights" value="true"/>
            <parameter key="sort_direction" value="ascending"/>
            <parameter key="component_number" value="1"/>
          </operator>
          <operator activated="true" breakpoints="after" class="select_by_weights" compatibility="9.2.001" expanded="true" height="103" name="Select by Weights (PCA)" width="90" x="514" y="340">
            <parameter key="weight_relation" value="top k"/>
            <parameter key="weight" value="10.0"/>
            <parameter key="k" value="50"/>
            <parameter key="p" value="0.1"/>
            <parameter key="deselect_unknown" value="true"/>
            <parameter key="use_absolute_weights" value="true"/>
          </operator>
          <operator activated="true" class="store" compatibility="9.2.001" expanded="true" height="68" name="Store (3)" width="90" x="715" y="340">
            <parameter key="repository_entry" value="reut2-000_pca_weights"/>
          </operator>
          <operator activated="true" class="store" compatibility="9.2.001" expanded="true" height="68" name="Store (2)" width="90" x="715" y="187">
            <parameter key="repository_entry" value="reut2-000_pca"/>
          </operator>
          <connect from_port="in 1" to_op="Multiply (2)" to_port="input"/>
          <connect from_op="Multiply (2)" from_port="output 1" to_op="Weight by Chi Squared Statistic" to_port="example set"/>
          <connect from_op="Multiply (2)" from_port="output 2" to_op="PCA" to_port="example set input"/>
          <connect from_op="Multiply (2)" from_port="output 3" to_op="Weight by PCA" to_port="example set"/>
          <connect from_op="Weight by Chi Squared Statistic" from_port="weights" to_op="Select by Weights (ChiSq)" to_port="weights"/>
          <connect from_op="Weight by Chi Squared Statistic" from_port="example set" to_op="Select by Weights (ChiSq)" to_port="example set input"/>
          <connect from_op="Select by Weights (ChiSq)" from_port="example set output" to_op="Store" to_port="input"/>
          <connect from_op="Store" from_port="through" to_port="out 1"/>
          <connect from_op="PCA" from_port="example set output" to_op="Store (2)" to_port="input"/>
          <connect from_op="Weight by PCA" from_port="weights" to_op="Select by Weights (PCA)" to_port="weights"/>
          <connect from_op="Weight by PCA" from_port="example set" to_op="Select by Weights (PCA)" to_port="example set input"/>
          <connect from_op="Select by Weights (PCA)" from_port="example set output" to_op="Store (3)" to_port="input"/>
          <connect from_op="Store (3)" from_port="through" to_port="out 3"/>
          <connect from_op="Store (2)" from_port="through" to_port="out 2"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Read CSV" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Remove Correlated Attributes" to_port="example set input"/>
      <connect from_op="Remove Correlated Attributes" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Feature Engineering" to_port="in 1"/>
      <connect from_op="Feature Engineering" from_port="out 1" to_port="result 1"/>
      <connect from_op="Feature Engineering" from_port="out 2" to_port="result 2"/>
      <connect from_op="Feature Engineering" from_port="out 3" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <!--
        Canvas notes (German, text kept verbatim). The HTML line breaks inside the note text are
        escaped so that the document stays well-formed XML. English summary:
        Yellow note 1: "Reduction of dimensionality. Two possible ways: based on PCA (needs no
        target variable) or based on ChiSquared (target variable; the original sentence is cut
        off). With a target variable only the fields with high potential for a model can be kept.
        Steps: a. input data TF-IDF; b. filter out non-TFIDF fields (exchanges, org, people,
        etc.); c. keep only records with complete target values; d. remove correlated TFIDF
        fields; e. apply both reduction methods and store the data."
        Yellow note 2: "For the reduction of dimensionality a target variable and the TF-IDF
        fields remain."
        Purple note: "This part is new and creates the TF/IDF values. Pruning and things like
        Transform Cases may have to be used."
      -->
      <description align="left" color="yellow" colored="false" height="278" resized="true" width="815" x="39" y="325">REDUKTION DER DIMENSIONALIT&#196;T&lt;br/&gt;&lt;br/&gt;Hier geht hier darum, die Reduktion der Dimensionalit&#228;t anzustreben. Zwei m&#246;gliche Arten:&lt;br&gt;-- auf Basis PCA (braucht kein Zielvariable)&lt;br&gt;-- auf Basis ChiSquared (Zielvariable vorausus&lt;br&gt;Gibt es eine Zielvariable, so ist es m&#246;glich nur diejenigen Felder zu behalten, die hohes Potenzial f&#252;r ein Model haben.&lt;br&gt;&lt;br&gt;Schritte:&lt;br&gt;a. Input Daten TF-IDF&lt;br&gt;b. Non-TFIDF Felder rausfiltern: exchanges, org, people, usw.&lt;br&gt;c. Filter nur Datens&#228;tze mit vollst&#228;ndigen Werte &#252;r Zielvariable&lt;br&gt;d. Entferne korrelierte TFIDF Felder&lt;br&gt;e. Verwende beiden Methoden zur Reduktion der Dimensionalit&#228;t. Daten speichern.&lt;br&gt;&lt;br&gt;</description>
      <description align="left" color="yellow" colored="false" height="58" resized="true" width="301" x="177" y="22">F&#252;r die Reduktion der Dimensionalit&#228;t bleibt eine Zielvariable und die TF-IDF Felder.</description>
      <description align="center" color="purple" colored="true" height="213" resized="true" width="316" x="517" y="97">Das hier ist neu und erstellt die TF/IDF Werte&lt;br/&gt;Evtl muss pruning sowie dinge wie Transfom Cases benutzt werden.</description>
    </process>
  </operator>
</process>
- Sr. Director Data Solutions, Altair RapidMiner -
Dortmund, Germany5
Answers
Dortmund, Germany