Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
Logistic Regression with polynominal attributes
Hi,
Novice here. My member type is a university professor but not in the data science field.
I am trying to run logistic regression using the data downloaded from here: https://www.kaggle.com/fayomi/advertising. As stated in the data description of the site, the goal is to predict people who are more likely to click the ad. I split the data into training and scoring data.
I want to get some help in how to handle the polynominal attributes in the data set. For example, there are so many Ad Topic Line and City values. When I ran the model, it ran but the results did not look right.
Also, the confidence percentage for ad click (yes, no) was either one or zero for all rows. I feel something might be wrong here.
Thanks for your help...
Novice here. My member type is a university professor but not in the data science field.
I am trying to run logistic regression using the data downloaded from here: https://www.kaggle.com/fayomi/advertising. As stated in the data description of the site, the goal is to predict people who are more likely to click the ad. I split the data into training and scoring data.
I want to get some help in how to handle the polynominal attributes in the data set. For example, there are so many Ad Topic Line and City values. When I ran the model, it ran but the results did not look right.
Also, the confidence percentage for ad click (yes, no) was either one or zero for all rows. I feel something might be wrong here.
Thanks for your help...
Tagged:
0
Best Answer
-
sgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager
Hi @jykim, so that was fun. Here's a quick solution.
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process (version 9.6.000).
  Top-level flow, as wired by the <connect> elements:
    Retrieve advertising -> Subprocess (data preparation) -> Cross Validation
  Cross Validation trains an h2o:logistic_regression model and scores it with
  Apply Model + Performance (accuracy is the only enabled criterion).
-->
<process version="9.6.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.6.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="9.6.000" expanded="true" height="68" name="Retrieve advertising" width="90" x="45" y="85">
        <parameter key="repository_entry" value="//NewLocalRepository/advertising"/>
      </operator>
      <!-- Data preparation: text features, label, country lookup, date parts. -->
      <operator activated="true" class="subprocess" compatibility="9.6.000" expanded="true" height="82" name="Subprocess" width="90" x="179" y="85">
        <process expanded="true">
          <!-- "Ad Topic Line" is converted to text so it can feed Process Documents from Data. -->
          <operator activated="true" class="nominal_to_text" compatibility="9.6.000" expanded="true" height="82" name="Nominal to Text" width="90" x="45" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Ad Topic Line"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Builds a TF-IDF word vector; percentual pruning bounds are 2.0 and 30.0. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="9.3.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="179" y="34">
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="TF-IDF"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="true"/>
            <parameter key="prune_method" value="percentual"/>
            <parameter key="prune_below_percent" value="2.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- "Clicked on Ad" is made binominal here and given the label role by Set Role below. -->
          <operator activated="true" class="numerical_to_binominal" compatibility="9.6.000" expanded="true" height="82" name="Numerical to Binominal" width="90" x="313" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Clicked on Ad"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="numeric"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="real"/>
            <parameter key="block_type" value="value_series"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_series_end"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="min" value="0.0"/>
            <parameter key="max" value="0.0"/>
          </operator>
          <operator activated="true" class="set_role" compatibility="9.6.000" expanded="true" height="82" name="Set Role" width="90" x="447" y="34">
            <parameter key="attribute_name" value="Clicked on Ad"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <!--
            Rewrites "Country" values before the join on "Country Name" further down.
            NOTE(review): the target spellings presumably match the World Countries
            data set; confirm against that data before changing any of them.
          -->
          <operator activated="true" class="map" compatibility="9.6.000" expanded="true" height="82" name="Map" width="90" x="581" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Country"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <list key="value_mappings">
              <parameter key="United States of America" value="United States"/>
              <parameter key="Antarctica (the territory South of 60 deg S)" value="Antarctica"/>
              <parameter key="Bouvet Island (Bouvetoya)" value="Bouvet Island"/>
              <parameter key="British Indian Ocean Territory (Chagos Archipelago)" value="British Indian Ocean Territory"/>
              <parameter key="Brunei Darussalam" value="Brunei"/>
              <parameter key="Cote d'Ivoire" value="Ivory Coast (Côte d'Ivoire)"/>
              <parameter key="Falkland Islands (Malvinas)" value="Falkland Islands"/>
              <parameter key="Gambia" value="The Gambia"/>
              <parameter key="Holy See (Vatican City State)" value="Vatican City"/>
              <parameter key="Korea" value="South Korea"/>
              <parameter key="Kyrgyz Republic" value="Kyrgyzstan"/>
              <parameter key="Lao People's Democratic Republic" value="Laos"/>
              <parameter key="Libyan Arab Jamahiriya" value="Libya"/>
              <parameter key="Macao" value="Macau"/>
              <parameter key="Micronesia" value="Federated States of Micronesia"/>
              <parameter key="Netherlands Antilles" value="Caribbean Netherlands (Bonaire, Sint Eustatius and Saba)"/>
              <parameter key="Palestinian Territory" value="Palestine"/>
              <parameter key="Reunion" value="Réunion"/>
              <parameter key="Russian Federation" value="Russia"/>
              <parameter key="Saint Barthelemy" value="Saint Barthélemy"/>
              <parameter key="Saint Helena" value="Saint Helena, Ascension and Tristan da Cunha"/>
              <parameter key="Saint Martin" value="Saint Martin (French part)"/>
              <parameter key="Sao Tome and Principe" value="São Tomé and Principé"/>
              <parameter key="Slovakia (Slovak Republic)" value="Slovakia"/>
              <!-- A literal ampersand is not allowed in an attribute value; it is written as an entity. -->
              <parameter key="Svalbard &amp; Jan Mayen Islands" value="Svalbard and Jan Mayen"/>
              <parameter key="Syrian Arab Republic" value="Syria"/>
              <parameter key="Timor-Leste" value="East Timor"/>
            </list>
            <parameter key="consider_regular_expressions" value="false"/>
            <parameter key="add_default_mapping" value="false"/>
          </operator>
          <operator activated="true" class="retrieve" compatibility="9.6.000" expanded="true" height="68" name="Retrieve World Countries" width="90" x="179" y="238">
            <parameter key="repository_entry" value="//Community Samples/Community Data Sets/World Geography/World Countries"/>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="9.6.000" expanded="true" height="82" name="Select Attributes" width="90" x="313" y="238">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="Country Name|UN continental region"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Left join: advertising rows (left port) keyed on Country, lookup rows (right port) on Country Name. -->
          <operator activated="true" class="concurrency:join" compatibility="9.6.000" expanded="true" height="82" name="Join" width="90" x="782" y="136">
            <parameter key="remove_double_attributes" value="true"/>
            <parameter key="join_type" value="left"/>
            <parameter key="use_id_attribute_as_key" value="false"/>
            <list key="key_attributes">
              <parameter key="Country" value="Country Name"/>
            </list>
            <parameter key="keep_both_join_attributes" value="false"/>
          </operator>
          <!-- Dummy-codes the joined "UN continental region" attribute. -->
          <operator activated="true" class="nominal_to_numerical" compatibility="9.6.000" expanded="true" height="103" name="Nominal to Numerical" width="90" x="916" y="136">
            <parameter key="return_preprocessing_model" value="false"/>
            <parameter key="create_view" value="false"/>
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="UN continental region"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="coding_type" value="dummy coding"/>
            <parameter key="use_comparison_groups" value="false"/>
            <list key="comparison_groups"/>
            <parameter key="unexpected_value_handling" value="all 0 and warning"/>
            <parameter key="use_underscore_in_name" value="false"/>
          </operator>
          <!--
            Four Date to Numerical steps all read "Timestamp". The first three keep the
            original attribute; only the fourth sets keep_old_attribute to false.
            This one: time_unit hour, hour_relative_to day.
          -->
          <operator activated="true" class="date_to_numerical" compatibility="9.6.000" expanded="true" height="82" name="Date to Numerical" width="90" x="1050" y="136">
            <parameter key="attribute_name" value="Timestamp"/>
            <parameter key="time_unit" value="hour"/>
            <parameter key="millisecond_relative_to" value="epoch"/>
            <parameter key="second_relative_to" value="minute"/>
            <parameter key="minute_relative_to" value="hour"/>
            <parameter key="hour_relative_to" value="day"/>
            <parameter key="day_relative_to" value="month"/>
            <parameter key="week_relative_to" value="year"/>
            <parameter key="month_relative_to" value="year"/>
            <parameter key="quarter_relative_to" value="year"/>
            <parameter key="half_year_relative_to" value="year"/>
            <parameter key="year_relative_to" value="era"/>
            <parameter key="keep_old_attribute" value="true"/>
          </operator>
          <!-- time_unit day, day_relative_to week. -->
          <operator activated="true" class="date_to_numerical" compatibility="9.6.000" expanded="true" height="82" name="Date to Numerical (2)" width="90" x="1184" y="136">
            <parameter key="attribute_name" value="Timestamp"/>
            <parameter key="time_unit" value="day"/>
            <parameter key="millisecond_relative_to" value="epoch"/>
            <parameter key="second_relative_to" value="minute"/>
            <parameter key="minute_relative_to" value="hour"/>
            <parameter key="hour_relative_to" value="day"/>
            <parameter key="day_relative_to" value="week"/>
            <parameter key="week_relative_to" value="year"/>
            <parameter key="month_relative_to" value="year"/>
            <parameter key="quarter_relative_to" value="year"/>
            <parameter key="half_year_relative_to" value="year"/>
            <parameter key="year_relative_to" value="era"/>
            <parameter key="keep_old_attribute" value="true"/>
          </operator>
          <!-- time_unit week, week_relative_to year. -->
          <operator activated="true" class="date_to_numerical" compatibility="9.6.000" expanded="true" height="82" name="Date to Numerical (3)" width="90" x="1318" y="136">
            <parameter key="attribute_name" value="Timestamp"/>
            <parameter key="time_unit" value="week"/>
            <parameter key="millisecond_relative_to" value="epoch"/>
            <parameter key="second_relative_to" value="minute"/>
            <parameter key="minute_relative_to" value="hour"/>
            <parameter key="hour_relative_to" value="day"/>
            <parameter key="day_relative_to" value="month"/>
            <parameter key="week_relative_to" value="year"/>
            <parameter key="month_relative_to" value="year"/>
            <parameter key="quarter_relative_to" value="year"/>
            <parameter key="half_year_relative_to" value="year"/>
            <parameter key="year_relative_to" value="era"/>
            <parameter key="keep_old_attribute" value="true"/>
          </operator>
          <!-- time_unit day, day_relative_to month; last step, so the original Timestamp is not kept. -->
          <operator activated="true" class="date_to_numerical" compatibility="9.6.000" expanded="true" height="82" name="Date to Numerical (4)" width="90" x="1452" y="136">
            <parameter key="attribute_name" value="Timestamp"/>
            <parameter key="time_unit" value="day"/>
            <parameter key="millisecond_relative_to" value="epoch"/>
            <parameter key="second_relative_to" value="minute"/>
            <parameter key="minute_relative_to" value="hour"/>
            <parameter key="hour_relative_to" value="day"/>
            <parameter key="day_relative_to" value="month"/>
            <parameter key="week_relative_to" value="year"/>
            <parameter key="month_relative_to" value="year"/>
            <parameter key="quarter_relative_to" value="year"/>
            <parameter key="half_year_relative_to" value="year"/>
            <parameter key="year_relative_to" value="era"/>
            <parameter key="keep_old_attribute" value="false"/>
          </operator>
          <!-- function_descriptions is empty, so no new attributes are defined here. -->
          <operator activated="true" class="generate_attributes" compatibility="9.6.000" expanded="true" height="82" name="Generate Attributes" width="90" x="1653" y="136">
            <list key="function_descriptions"/>
            <parameter key="keep_all" value="true"/>
          </operator>
          <!-- invert_selection is true: the subset names the attributes to exclude, not to keep. -->
          <operator activated="true" class="select_attributes" compatibility="9.6.000" expanded="true" height="82" name="Select Attributes (2)" width="90" x="1787" y="136">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="City|Country|Country Name|text"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <connect from_port="in 1" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Numerical to Binominal" to_port="example set input"/>
          <connect from_op="Numerical to Binominal" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Map" to_port="example set input"/>
          <connect from_op="Map" from_port="example set output" to_op="Join" to_port="left"/>
          <connect from_op="Retrieve World Countries" from_port="output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Join" to_port="right"/>
          <connect from_op="Join" from_port="join" to_op="Nominal to Numerical" to_port="example set input"/>
          <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Date to Numerical" to_port="example set input"/>
          <connect from_op="Date to Numerical" from_port="example set output" to_op="Date to Numerical (2)" to_port="example set input"/>
          <connect from_op="Date to Numerical (2)" from_port="example set output" to_op="Date to Numerical (3)" to_port="example set input"/>
          <connect from_op="Date to Numerical (3)" from_port="example set output" to_op="Date to Numerical (4)" to_port="example set input"/>
          <connect from_op="Date to Numerical (4)" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- 10-fold cross validation; first inner process trains, second applies and scores. -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="9.6.000" expanded="true" height="145" name="Cross Validation" width="90" x="313" y="85">
        <parameter key="split_on_batch_attribute" value="false"/>
        <parameter key="leave_one_out" value="false"/>
        <parameter key="number_of_folds" value="10"/>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true" class="h2o:logistic_regression" compatibility="9.3.001" expanded="true" height="124" name="Logistic Regression" width="90" x="112" y="34">
            <parameter key="solver" value="AUTO"/>
            <parameter key="reproducible" value="false"/>
            <parameter key="maximum_number_of_threads" value="4"/>
            <parameter key="use_regularization" value="false"/>
            <parameter key="lambda_search" value="false"/>
            <parameter key="number_of_lambdas" value="0"/>
            <parameter key="lambda_min_ratio" value="0.0"/>
            <parameter key="early_stopping" value="true"/>
            <parameter key="stopping_rounds" value="3"/>
            <parameter key="stopping_tolerance" value="0.001"/>
            <parameter key="standardize" value="true"/>
            <parameter key="non-negative_coefficients" value="false"/>
            <parameter key="add_intercept" value="true"/>
            <parameter key="compute_p-values" value="true"/>
            <parameter key="remove_collinear_columns" value="true"/>
            <parameter key="missing_values_handling" value="MeanImputation"/>
            <parameter key="max_iterations" value="0"/>
            <parameter key="max_runtime_seconds" value="0"/>
          </operator>
          <connect from_port="training set" to_op="Logistic Regression" to_port="training set"/>
          <connect from_op="Logistic Regression" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="9.6.000" expanded="true" height="82" name="Apply Model" width="90" x="45" y="34">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="performance_binominal_classification" compatibility="9.6.000" expanded="true" height="82" name="Performance" width="90" x="179" y="34">
            <parameter key="manually_set_positive_class" value="false"/>
            <parameter key="main_criterion" value="first"/>
            <parameter key="accuracy" value="true"/>
            <parameter key="classification_error" value="false"/>
            <parameter key="kappa" value="false"/>
            <parameter key="AUC (optimistic)" value="false"/>
            <parameter key="AUC" value="false"/>
            <parameter key="AUC (pessimistic)" value="false"/>
            <parameter key="precision" value="false"/>
            <parameter key="recall" value="false"/>
            <parameter key="lift" value="false"/>
            <parameter key="fallout" value="false"/>
            <parameter key="f_measure" value="false"/>
            <parameter key="false_positive" value="false"/>
            <parameter key="false_negative" value="false"/>
            <parameter key="true_positive" value="false"/>
            <parameter key="true_negative" value="false"/>
            <parameter key="sensitivity" value="false"/>
            <parameter key="specificity" value="false"/>
            <parameter key="youden" value="false"/>
            <parameter key="positive_predictive_value" value="false"/>
            <parameter key="negative_predictive_value" value="false"/>
            <parameter key="psep" value="false"/>
            <parameter key="skip_undefined_labels" value="true"/>
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Retrieve advertising" from_port="output" to_op="Subprocess" to_port="in 1"/>
      <connect from_op="Subprocess" from_port="out 1" to_op="Cross Validation" to_port="example set"/>
      <connect from_op="Cross Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Cross Validation" from_port="example set" to_port="result 2"/>
      <connect from_op="Cross Validation" from_port="performance 1" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
Scott1
Answers
Scott