RapidMiner

Polynomial regression gives wrong results (?)

SOLVED
Regular Contributor

Polynomial regression gives wrong results (?)

[ Edited ]

Hi RapidMiner,

 

I'm trying to use Polynomial Regression with a dataset generated from the function: y= 2*x^2 + 3*x + 1, and test the model with the same dataset, but the prediction results look like a straight line (attached picture). All the parameters are set as default (max iterations= 5000, replication factor= 1, max degree= 5). Could you show me how to get the correct result which can capture the quadratic curve using Polynomial Regression? I paste the dataset below and also insert the process code at the end. Thank you very much!

x y
1 6
2 15
3 28
4 45
5 66
6 91
7 120
8 153
9 190
10 231
11 276
12 325
13 378
14 435
15 496
16 561
17 630
18 703
19 780
20 861
21 946
22 1035
23 1128
24 1225

Polynomial-regression-result.jpg

<?xml version="1.0" encoding="UTF-8"?><process version="7.3.000">
  <!-- Process posted with the original question: it trains Polynomial Regression on the
       repository entry "Polynomial" and then scores the same repository entry with the model. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.3.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Training branch: load the data, mark y as the label, rescale x, fit the model. -->
      <operator activated="true" class="retrieve" compatibility="7.3.000" expanded="true" height="68" name="Retrieve Polynomial" width="90" x="112" y="34">
        <parameter key="repository_entry" value="Polynomial"/>
      </operator>
      <!-- Attribute y gets the role "label". -->
      <operator activated="true" class="set_role" compatibility="7.3.000" expanded="true" height="82" name="Set Role" width="90" x="246" y="34">
        <parameter key="attribute_name" value="y"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Only the single attribute x is selected; method is a range transformation with min -1.0 and max 1.0. -->
      <operator activated="true" class="normalize" compatibility="7.3.000" expanded="true" height="103" name="Normalize" width="90" x="380" y="34">
        <parameter key="return_preprocessing_model" value="false"/>
        <parameter key="create_view" value="false"/>
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="x"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="numeric"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="real"/>
        <parameter key="block_type" value="value_series"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_series_end"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="method" value="range transformation"/>
        <parameter key="min" value="-1.0"/>
        <parameter key="max" value="1.0"/>
      </operator>
      <!-- Learner settings as described in the question: 5000 iterations, replication factor 1,
           max degree 5, coefficients limited to the range -100.0 to 100.0.
           NOTE(review): according to the reply in this thread, a replication factor of 1 means the
           attribute x is used only a single time; the reply recommends 2 for the quadratic target
           and more iterations for small data sets. -->
      <operator activated="true" class="polynomial_regression" compatibility="7.3.000" expanded="true" height="82" name="Polynomial Regression" width="90" x="648" y="34">
        <parameter key="max_iterations" value="5000"/>
        <parameter key="replication_factor" value="1"/>
        <parameter key="max_degree" value="5"/>
        <parameter key="min_coefficient" value="-100.0"/>
        <parameter key="max_coefficient" value="100.0"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!-- Scoring branch: the same repository entry is loaded again and prepared with the same
           Set Role and Normalize parameters as the training branch. -->
      <operator activated="true" class="retrieve" compatibility="7.3.000" expanded="true" height="68" name="Retrieve Polynomial (2)" width="90" x="112" y="289">
        <parameter key="repository_entry" value="Polynomial"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="7.3.000" expanded="true" height="82" name="Set Role (2)" width="90" x="246" y="289">
        <parameter key="attribute_name" value="y"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- NOTE(review): this is a second, independent Normalize rather than a reuse of the first
           one's preprocessing model; its preprocessing model port feeds De-Normalize below. -->
      <operator activated="true" class="normalize" compatibility="7.3.000" expanded="true" height="103" name="Normalize (2)" width="90" x="380" y="289">
        <parameter key="return_preprocessing_model" value="false"/>
        <parameter key="create_view" value="false"/>
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="x"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="numeric"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="real"/>
        <parameter key="block_type" value="value_series"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_series_end"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="method" value="range transformation"/>
        <parameter key="min" value="-1.0"/>
        <parameter key="max" value="1.0"/>
      </operator>
      <!-- Receives the preprocessing model of Normalize (2); its output model is applied to the
           scored data by Apply Model (2). Presumably this restores x to its original scale; confirm. -->
      <operator activated="true" class="denormalize" compatibility="7.3.000" expanded="true" height="82" name="De-Normalize" width="90" x="581" y="493">
        <parameter key="missing_attribute_handling" value="proceed on missing"/>
      </operator>
      <!-- Applies the regression model to the normalized scoring data. -->
      <operator activated="true" class="apply_model" compatibility="7.3.000" expanded="true" height="82" name="Apply Model" width="90" x="514" y="289">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <!-- Applies the De-Normalize output model to the data labelled by Apply Model. -->
      <operator activated="true" class="apply_model" compatibility="7.3.000" expanded="true" height="82" name="Apply Model (2)" width="90" x="715" y="289">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <!-- Wiring of the training branch. -->
      <connect from_op="Retrieve Polynomial" from_port="output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Normalize" to_port="example set input"/>
      <connect from_op="Normalize" from_port="example set output" to_op="Polynomial Regression" to_port="training set"/>
      <connect from_op="Polynomial Regression" from_port="model" to_op="Apply Model" to_port="model"/>
      <!-- Wiring of the scoring branch and of the de-normalization step. -->
      <connect from_op="Retrieve Polynomial (2)" from_port="output" to_op="Set Role (2)" to_port="example set input"/>
      <connect from_op="Set Role (2)" from_port="example set output" to_op="Normalize (2)" to_port="example set input"/>
      <connect from_op="Normalize (2)" from_port="example set output" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Normalize (2)" from_port="preprocessing model" to_op="De-Normalize" to_port="model input"/>
      <connect from_op="De-Normalize" from_port="model output" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <!-- Process results: the scored data (result 1) and the model passed through Apply Model (2) (result 2). -->
      <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model (2)" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
3 REPLIES
RMStaff

Re: Polynomial regression gives wrong results (?)

Hi,

 

You need to set the "replication factor" to 2 - otherwise the attribute "x" will only be used a single time. I also recommend increasing the number of "max iterations" for small data sets.

 

Here is a process generating the same data set and training the model. For this process I also restricted the max degree to 2 and the coefficients to a range between 1 and 3, so it is pretty much forced to learn your function. In reality, you would most likely use a larger range for the coefficients and the degrees, of course...

 

Hope this helps,

Ingo

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.3.000">
  <!-- Process from the first reply: it builds the example data inside the process, trains
       Polynomial Regression with a replication factor of 2, and applies the model to the
       training data. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.3.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Subprocess that produces the data set: attribute x plus label y = 2*x^2+3*x+1. -->
      <operator activated="true" class="subprocess" compatibility="7.3.000" expanded="true" height="82" name="Generate Data (2)" width="90" x="45" y="34">
        <process expanded="true">
          <!-- 24 examples with 1 generated attribute; only the id added in the next step is kept. -->
          <operator activated="true" class="generate_data" compatibility="7.3.000" expanded="true" height="68" name="Generate Data" width="90" x="45" y="34">
            <parameter key="number_examples" value="24"/>
            <parameter key="number_of_attributes" value="1"/>
          </operator>
          <operator activated="true" class="generate_id" compatibility="7.3.000" expanded="true" height="82" name="Generate ID" width="90" x="179" y="34"/>
          <!-- No target_role is given here, so the operator's default role is used for id
               (presumably "regular"; confirm against the Set Role documentation). -->
          <operator activated="true" class="set_role" compatibility="7.3.000" expanded="true" height="82" name="Set Role" width="90" x="313" y="34">
            <parameter key="attribute_name" value="id"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Keeps only the single attribute id. -->
          <operator activated="true" class="select_attributes" compatibility="7.3.000" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="id"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- The id column becomes the input attribute x. -->
          <operator activated="true" class="rename" compatibility="7.3.000" expanded="true" height="82" name="Rename" width="90" x="581" y="34">
            <parameter key="old_name" value="id"/>
            <parameter key="new_name" value="x"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!-- Computes the target attribute y from x with the function given in the question. -->
          <operator activated="true" class="generate_attributes" compatibility="7.3.000" expanded="true" height="82" name="Generate Attributes" width="90" x="715" y="34">
            <list key="function_descriptions">
              <parameter key="y" value="2*x^2+3*x+1"/>
            </list>
          </operator>
          <!-- Attribute y gets the role "label". -->
          <operator activated="true" class="set_role" compatibility="7.3.000" expanded="true" height="82" name="Set Role (2)" width="90" x="849" y="34">
            <parameter key="attribute_name" value="y"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="materialize_data" compatibility="7.3.000" expanded="true" height="82" name="Materialize Data" width="90" x="983" y="34"/>
          <!-- Linear chain through the operators above; the result leaves the subprocess on "out 1". -->
          <connect from_op="Generate Data" from_port="output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
          <connect from_op="Set Role (2)" from_port="example set output" to_op="Materialize Data" to_port="example set input"/>
          <connect from_op="Materialize Data" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Settings explained in the reply text: replication factor 2 so that x can be used more
           than once, more iterations for the small data set, and max degree 2 with coefficients
           restricted to the range 1.0 to 3.0 so the learner is steered towards the known function. -->
      <operator activated="true" class="polynomial_regression" compatibility="7.3.000" expanded="true" height="82" name="Polynomial Regression" width="90" x="179" y="34">
        <parameter key="max_iterations" value="100000"/>
        <parameter key="replication_factor" value="2"/>
        <parameter key="max_degree" value="2"/>
        <parameter key="min_coefficient" value="1.0"/>
        <parameter key="max_coefficient" value="3.0"/>
      </operator>
      <operator activated="true" class="apply_model" compatibility="7.3.000" expanded="true" height="82" name="Apply Model" width="90" x="313" y="34">
        <list key="application_parameters"/>
      </operator>
      <!-- The learner's model and its passed-through example set both go to Apply Model, so the
           model is applied to the data it was trained on. -->
      <connect from_op="Generate Data (2)" from_port="out 1" to_op="Polynomial Regression" to_port="training set"/>
      <connect from_op="Polynomial Regression" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Polynomial Regression" from_port="exampleSet" to_op="Apply Model" to_port="unlabelled data"/>
      <!-- Process results: the scored data (result 1) and the model (result 2). -->
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

How to load processes in XML from the forum into RapidMiner: Read this!
Highlighted
RMStaff

Re: Polynomial regression gives wrong results (?)

It is interesting to test the polynomial regression model. Maybe it is a good chance to try RapidMiner's evolutionary optimization algorithm YAGGA (Yet Another Generating Genetic Algorithm). In short, YAGGA will generate new attributes using some combinations of math functions: +, -, *, /, power function, etc.

The attached sample process shows that YAGGA kept the original attribute x and also generated a new attribute for x^2. You can apply linear regression with the constructed attributes from YAGGA.

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.3.000">
  <!-- Process from the second reply: it builds the example data, lets YAGGA construct
       attributes while evaluating each candidate with a linear regression inside a split
       validation, and finally trains a linear regression on the YAGGA output. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.3.000" expanded="true" name="Process">
    <parameter key="notification_email" value=""/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Subprocess that produces the data set: attribute x plus label y = 2*x^2 + 3*x + 1.
           This operator carries breakpoints="after". -->
      <operator activated="true" breakpoints="after" class="subprocess" compatibility="7.3.000" expanded="true" height="82" name="Prepare data" width="90" x="45" y="34">
        <process expanded="true">
          <!-- 24 examples with 1 generated attribute; only the id added in the next step is kept. -->
          <operator activated="true" class="generate_data" compatibility="7.3.000" expanded="true" height="68" name="Generate Data" width="90" x="45" y="34">
            <parameter key="number_examples" value="24"/>
            <parameter key="number_of_attributes" value="1"/>
          </operator>
          <operator activated="true" class="generate_id" compatibility="7.3.000" expanded="true" height="82" name="Generate ID" width="90" x="179" y="34"/>
          <!-- No target_role is given here, so the operator's default role is used for id
               (presumably "regular"; confirm against the Set Role documentation). -->
          <operator activated="true" class="set_role" compatibility="7.3.000" expanded="true" height="82" name="Set Role" width="90" x="313" y="34">
            <parameter key="attribute_name" value="id"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Keeps only the single attribute id. -->
          <operator activated="true" class="select_attributes" compatibility="7.3.000" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="id"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- The id column becomes the input attribute x. -->
          <operator activated="true" class="rename" compatibility="7.3.000" expanded="true" height="82" name="Rename" width="90" x="581" y="34">
            <parameter key="old_name" value="id"/>
            <parameter key="new_name" value="x"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!-- Computes the target attribute y from x with the function given in the question. -->
          <operator activated="true" class="generate_attributes" compatibility="7.3.000" expanded="true" height="82" name="Generate Attributes" width="90" x="715" y="34">
            <list key="function_descriptions">
              <parameter key="y" value="2*x^2 + 3*x + 1"/>
            </list>
          </operator>
          <!-- Attribute y gets the role "label". -->
          <operator activated="true" class="set_role" compatibility="7.3.000" expanded="true" height="82" name="Set Role (2)" width="90" x="849" y="34">
            <parameter key="attribute_name" value="y"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="materialize_data" compatibility="7.3.000" expanded="true" height="82" name="Materialize Data" width="90" x="983" y="34"/>
          <!-- Linear chain through the operators above; the result leaves the subprocess on "out 1". -->
          <connect from_op="Generate Data" from_port="output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
          <connect from_op="Set Role (2)" from_port="example set output" to_op="Materialize Data" to_port="example set input"/>
          <connect from_op="Materialize Data" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- YAGGA settings: population of 100, at most 10 generations, use_plus and
           reciprocal_value switched off, tournament size 0.8, best individual kept.
           The nested process below is the evaluation that delivers the performance to YAGGA. -->
      <operator activated="true" class="optimize_by_generation_yagga" compatibility="7.3.000" expanded="true" height="103" name="YAGGA" width="90" x="179" y="34">
        <parameter key="population_size" value="100"/>
        <parameter key="maximum_number_of_generations" value="10"/>
        <parameter key="use_plus" value="false"/>
        <parameter key="reciprocal_value" value="false"/>
        <parameter key="tournament_size" value="0.8"/>
        <parameter key="keep_best_individual" value="true"/>
        <process expanded="true">
          <!-- NOTE(review): local_random_seed is set to 10, but use_local_random_seed is not set
               in this XML; confirm whether the seed actually takes effect. -->
          <operator activated="true" class="split_validation" compatibility="7.3.000" expanded="true" height="124" name="Split Validation" width="90" x="313" y="34">
            <parameter key="local_random_seed" value="10"/>
            <!-- Training side: a linear regression with no parameters set in this XML. -->
            <process expanded="true">
              <operator activated="true" class="linear_regression" compatibility="7.3.000" expanded="true" height="103" name="Linear Regression" width="90" x="109" y="30"/>
              <connect from_port="training" to_op="Linear Regression" to_port="training set"/>
              <connect from_op="Linear Regression" from_port="model" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <!-- Testing side: apply the model to the test set and measure regression performance. -->
            <process expanded="true">
              <!-- NOTE(review): this operator declares compatibility 7.1.001 while the others use 7.3.000. -->
              <operator activated="true" class="apply_model" compatibility="7.1.001" expanded="true" height="82" name="Apply Model (3)" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <!-- root_mean_squared_error is switched off and root_relative_squared_error is switched on. -->
              <operator activated="true" class="performance_regression" compatibility="7.3.000" expanded="true" height="82" name="Performance (Regression)" width="90" x="179" y="34">
                <parameter key="root_mean_squared_error" value="false"/>
                <parameter key="root_relative_squared_error" value="true"/>
              </operator>
              <connect from_port="model" to_op="Apply Model (3)" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/>
              <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Performance (Regression)" to_port="labelled data"/>
              <connect from_op="Performance (Regression)" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <!-- The candidate example set goes into the validation; its averaged performance is
               handed back to YAGGA through the performance sink. -->
          <connect from_port="example set source" to_op="Split Validation" to_port="training"/>
          <connect from_op="Split Validation" from_port="averagable 1" to_port="performance sink"/>
          <portSpacing port="source_example set source" spacing="0"/>
          <portSpacing port="sink_performance sink" spacing="0"/>
        </process>
      </operator>
      <!-- Post-processing of the YAGGA output: Rename by Constructions, then a final linear
           regression trained on the resulting example set. -->
      <operator activated="true" class="rename_by_constructions" compatibility="7.3.000" expanded="true" height="82" name="Rename by Constructions" width="90" x="313" y="34"/>
      <operator activated="true" class="linear_regression" compatibility="7.3.000" expanded="true" height="103" name="Linear Regression (2)" width="90" x="447" y="34"/>
      <connect from_op="Prepare data" from_port="out 1" to_op="YAGGA" to_port="example set in"/>
      <connect from_op="YAGGA" from_port="example set out" to_op="Rename by Constructions" to_port="example set input"/>
      <connect from_op="Rename by Constructions" from_port="example set output" to_op="Linear Regression (2)" to_port="training set"/>
      <!-- Process results: the final model (result 1), its example set (result 2) and its weights (result 3). -->
      <connect from_op="Linear Regression (2)" from_port="model" to_port="result 1"/>
      <connect from_op="Linear Regression (2)" from_port="exampleSet" to_port="result 2"/>
      <connect from_op="Linear Regression (2)" from_port="weights" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="42"/>
    </process>
  </operator>
</process>
 

 

 

Regular Contributor

Re: Polynomial regression gives wrong results (?)

It works now, after I set the replication factor to 2. Thank you!