RapidMiner

How to convert logged label to actual prediction result?

Highlighted
Contributor II

How to convert logged label to actual prediction result?

I am currently participating in the Kaggle House Prices competition, and I have log-transformed my dependent variable, SalePrice, because it is originally skewed to the right.

 

The reason why I'm doing this is I'm trying to average out the variance and improve the RMSE score.

 

However, the prediction(SalePrice) values that I get are around 2.xx to 5.xx. I understand how logarithms work, but is there a way to convert these predictions back to the true SalePrice scale at the end of the process?

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.5.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.5.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load the training data from the repository entry "Train". -->
      <operator activated="true" class="retrieve" compatibility="7.5.001" expanded="true" height="68" name="Retrieve Train" width="90" x="45" y="34">
        <parameter key="repository_entry" value="Train"/>
      </operator>
      <!-- Assign the "label" role to SalePrice so that it is the prediction target. -->
      <operator activated="true" class="set_role" compatibility="7.5.001" expanded="true" height="82" name="Set Role" width="90" x="45" y="136">
        <parameter key="attribute_name" value="SalePrice"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Replace SalePrice with log(SalePrice): the generated attribute has the same name as
           its input, so the original values are overwritten. In RapidMiner's expression
           language log() is the base-10 logarithm (ln() is the natural logarithm), so every
           model trained on this label predicts on the log10 scale. -->
      <operator activated="true" class="generate_attributes" compatibility="7.5.001" expanded="true" height="82" name="Log Transform" width="90" x="45" y="238">
        <list key="function_descriptions">
          <parameter key="SalePrice" value="log(SalePrice)"/>
        </list>
      </operator>
      <!-- Keep only the label (SalePrice) and the eleven listed predictor attributes. -->
      <operator activated="true" class="select_attributes" compatibility="7.5.001" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="SalePrice|1stFlrSF|GarageArea|GarageCars|GarageCond|GrLivArea|MSZoning|Neighborhood|OverallCond|OverallQual|TotalBsmtSF|YearBuilt"/>
      </operator>
      <!-- Test-data branch: load the repository entry "Test" and prepare it for scoring. -->
      <operator activated="true" class="retrieve" compatibility="7.5.001" expanded="true" height="68" name="Retrieve Test" width="90" x="45" y="391">
        <parameter key="repository_entry" value="Test"/>
      </operator>
      <!-- Keep only examples without missing attribute values.
           NOTE(review): this removes test rows from the scored output; a competition
           submission usually needs a prediction for every row. Confirm this is intended. -->
      <operator activated="true" class="filter_examples" compatibility="7.5.001" expanded="true" height="103" name="Filter Examples" width="90" x="179" y="391">
        <parameter key="condition_class" value="no_missing_attributes"/>
        <list key="filters_list"/>
      </operator>
      <!-- Inverted single-attribute selection: keep everything except SalePrice. -->
      <operator activated="true" class="select_attributes" compatibility="7.5.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="313" y="391">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="SalePrice"/>
        <parameter key="invert_selection" value="true"/>
      </operator>
      <!-- Convert nominal attributes of the test data to numerical ones.
           NOTE(review): this encoding is computed separately from the training branch's
           "Nominal to Numerical"; verify that both branches produce the same columns. -->
      <operator activated="true" class="nominal_to_numerical" compatibility="7.5.001" expanded="true" height="103" name="Nominal to Numerical (2)" width="90" x="447" y="391">
        <parameter key="attribute_filter_type" value="value_type"/>
        <list key="comparison_groups"/>
      </operator>
      <!-- Impute Missing Values: the nested process receives the example set and must deliver
           a model on the "model sink" port. Here that model is a grid-optimised, normalised
           k-NN learner. -->
      <operator activated="true" class="impute_missing_values" compatibility="7.5.001" expanded="true" height="68" name="Impute Missing Values" width="90" x="246" y="187">
        <process expanded="true">
          <!-- Grid search over the k parameter of the nested learner. The key has the form
               "operator name.parameter" and must name "k-NN (2)", the learner that runs inside
               this optimizer. The earlier key "k-NN.k" referred to the stacking base learner
               called "k-NN" in the main Cross Validation, so the search did not vary the
               imputation learner at all. -->
          <operator activated="true" class="optimize_parameters_grid" compatibility="7.5.001" expanded="true" height="124" name="Optimize Parameters (Grid)" width="90" x="246" y="34">
            <list key="parameters">
              <parameter key="k-NN (2).k" value="[1;10;10;linear]"/>
            </list>
            <process expanded="true">
              <operator activated="true" class="concurrency:cross_validation" compatibility="7.5.001" expanded="true" height="145" name="Cross Validation (2)" width="90" x="179" y="34">
                <!-- Training side: normalise, train k-NN on the normalised data, and group the
                     preprocessing model with the k-NN model so both are applied together. -->
                <process expanded="true">
                  <operator activated="true" class="normalize" compatibility="7.5.001" expanded="true" height="103" name="Normalize" width="90" x="45" y="187"/>
                  <operator activated="true" class="k_nn" compatibility="7.5.001" expanded="true" height="82" name="k-NN (2)" width="90" x="112" y="34">
                    <parameter key="k" value="3"/>
                    <parameter key="weighted_vote" value="true"/>
                  </operator>
                  <operator activated="true" class="group_models" compatibility="7.5.001" expanded="true" height="103" name="Group Models" width="90" x="246" y="136"/>
                  <connect from_port="training set" to_op="Normalize" to_port="example set input"/>
                  <connect from_op="Normalize" from_port="example set output" to_op="k-NN (2)" to_port="training set"/>
                  <connect from_op="Normalize" from_port="preprocessing model" to_op="Group Models" to_port="models in 1"/>
                  <connect from_op="k-NN (2)" from_port="model" to_op="Group Models" to_port="models in 2"/>
                  <connect from_op="Group Models" from_port="model out" to_port="model"/>
                  <portSpacing port="source_training set" spacing="0"/>
                  <portSpacing port="sink_model" spacing="0"/>
                  <portSpacing port="sink_through 1" spacing="0"/>
                </process>
                <!-- Testing side: apply the grouped model to the held-out fold and measure
                     its performance, which the optimizer uses to compare grid points. -->
                <process expanded="true">
                  <operator activated="true" class="apply_model" compatibility="7.5.001" expanded="true" height="82" name="Apply Model (3)" width="90" x="45" y="34">
                    <list key="application_parameters"/>
                  </operator>
                  <operator activated="true" class="performance" compatibility="7.5.001" expanded="true" height="82" name="Performance (2)" width="90" x="179" y="34"/>
                  <connect from_port="model" to_op="Apply Model (3)" to_port="model"/>
                  <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/>
                  <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
                  <connect from_op="Performance (2)" from_port="performance" to_port="performance 1"/>
                  <connect from_op="Performance (2)" from_port="example set" to_port="test set results"/>
                  <portSpacing port="source_model" spacing="0"/>
                  <portSpacing port="source_test set" spacing="0"/>
                  <portSpacing port="source_through 1" spacing="0"/>
                  <portSpacing port="sink_test set results" spacing="0"/>
                  <portSpacing port="sink_performance 1" spacing="0"/>
                  <portSpacing port="sink_performance 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="input 1" to_op="Cross Validation (2)" to_port="example set"/>
              <connect from_op="Cross Validation (2)" from_port="model" to_port="result 1"/>
              <connect from_op="Cross Validation (2)" from_port="performance 1" to_port="performance"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_performance" spacing="0"/>
              <portSpacing port="sink_result 1" spacing="0"/>
              <portSpacing port="sink_result 2" spacing="0"/>
            </process>
          </operator>
          <!-- Deactivated and unconnected plain k-NN learner, kept for reference. -->
          <operator activated="false" class="k_nn" compatibility="7.5.001" expanded="true" height="82" name="k-NN (3)" width="90" x="246" y="340">
            <parameter key="k" value="3"/>
            <parameter key="measure_types" value="NumericalMeasures"/>
          </operator>
          <connect from_port="example set source" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
          <connect from_op="Optimize Parameters (Grid)" from_port="result 1" to_port="model sink"/>
          <portSpacing port="source_example set source" spacing="0"/>
          <portSpacing port="sink_model sink" spacing="0"/>
          <description align="center" color="yellow" colored="false" height="105" resized="false" width="180" x="206" y="194">Optimise kNN, only improve by a bit though</description>
        </process>
      </operator>
      <!-- Flag distance-based outliers using the operator's default parameters. -->
      <operator activated="true" class="detect_outlier_distances" compatibility="7.5.001" expanded="true" height="82" name="Detect Outlier (Distances)" width="90" x="313" y="34"/>
      <!-- Keep only the examples whose "outlier" attribute equals false. -->
      <operator activated="true" class="filter_examples" compatibility="7.5.001" expanded="true" height="103" name="Remove Outliers" width="90" x="380" y="187">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="outlier.equals.false"/>
        </list>
      </operator>
      <!-- Convert the nominal training attributes to numerical ones before modelling. -->
      <operator activated="true" class="nominal_to_numerical" compatibility="7.5.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="447" y="34">
        <list key="comparison_groups"/>
      </operator>
      <!-- Main model: cross-validated stacking ensemble. Its "model" output is later applied
           to the test data, and "performance 1" is delivered as result 1. -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="7.5.001" expanded="true" height="145" name="Cross Validation" width="90" x="581" y="34">
        <!-- Training side: a Stacking operator with four base learners and a meta learner. -->
        <process expanded="true">
          <operator activated="true" class="stacking" compatibility="7.5.001" expanded="true" height="68" name="Stacking" width="90" x="112" y="34">
            <!-- Base learners: k-NN (k = 10), SVM, H2O deep learning with two hidden layers
                 of 50 units, and H2O gradient boosted trees. Each gets its own training
                 set port and feeds one base model port. -->
            <process expanded="true">
              <operator activated="true" class="k_nn" compatibility="7.5.001" expanded="true" height="82" name="k-NN" width="90" x="179" y="34">
                <parameter key="k" value="10"/>
              </operator>
              <operator activated="true" class="support_vector_machine" compatibility="7.5.001" expanded="true" height="124" name="SVM" width="90" x="179" y="136"/>
              <operator activated="true" class="h2o:deep_learning" compatibility="7.5.000" expanded="true" height="82" name="Deep Learning" width="90" x="179" y="289">
                <enumeration key="hidden_layer_sizes">
                  <parameter key="hidden_layer_sizes" value="50"/>
                  <parameter key="hidden_layer_sizes" value="50"/>
                </enumeration>
                <enumeration key="hidden_dropout_ratios"/>
                <list key="expert_parameters"/>
                <list key="expert_parameters_"/>
              </operator>
              <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="7.5.000" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="179" y="391">
                <list key="expert_parameters"/>
              </operator>
              <connect from_port="training set 1" to_op="k-NN" to_port="training set"/>
              <connect from_port="training set 2" to_op="SVM" to_port="training set"/>
              <connect from_port="training set 3" to_op="Deep Learning" to_port="training set"/>
              <connect from_port="training set 4" to_op="Gradient Boosted Trees" to_port="training set"/>
              <connect from_op="k-NN" from_port="model" to_port="base model 1"/>
              <connect from_op="SVM" from_port="model" to_port="base model 2"/>
              <connect from_op="Deep Learning" from_port="model" to_port="base model 3"/>
              <connect from_op="Gradient Boosted Trees" from_port="model" to_port="base model 4"/>
              <portSpacing port="source_training set 1" spacing="0"/>
              <portSpacing port="source_training set 2" spacing="0"/>
              <portSpacing port="source_training set 3" spacing="0"/>
              <portSpacing port="source_training set 4" spacing="0"/>
              <portSpacing port="source_training set 5" spacing="0"/>
              <portSpacing port="sink_base model 1" spacing="0"/>
              <portSpacing port="sink_base model 2" spacing="0"/>
              <portSpacing port="sink_base model 3" spacing="0"/>
              <portSpacing port="sink_base model 4" spacing="0"/>
              <portSpacing port="sink_base model 5" spacing="0"/>
            </process>
            <!-- Meta learner: a linear regression trained on the stacking examples. -->
            <process expanded="true">
              <operator activated="true" class="linear_regression" compatibility="7.5.001" expanded="true" height="103" name="Linear Regression" width="90" x="179" y="34"/>
              <connect from_port="stacking examples" to_op="Linear Regression" to_port="training set"/>
              <connect from_op="Linear Regression" from_port="model" to_port="stacking model"/>
              <portSpacing port="source_stacking examples" spacing="0"/>
              <portSpacing port="sink_stacking model" spacing="0"/>
            </process>
          </operator>
          <connect from_port="training set" to_op="Stacking" to_port="training set"/>
          <connect from_op="Stacking" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing side: apply the stacked model to the held-out fold and report regression
             performance with RMSE as the main criterion. The label reaching this operator
             has been log-transformed upstream, so the error is measured on the log scale. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="7.5.001" expanded="true" height="82" name="Apply Model" width="90" x="45" y="34">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_regression" compatibility="7.5.001" expanded="true" height="82" name="Performance" width="90" x="179" y="34">
            <parameter key="main_criterion" value="root_mean_squared_error"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
          <connect from_op="Performance" from_port="example set" to_port="test set results"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
        </process>
      </operator>
      <!-- Score the prepared test data with the model returned by Cross Validation. The label
           was log-transformed before training, so prediction(SalePrice) is on the log scale. -->
      <operator activated="true" class="apply_model" compatibility="7.5.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="581" y="238">
        <list key="application_parameters"/>
      </operator>
      <!-- Undo the Log Transform on the predictions. log() in RapidMiner expressions is the
           base-10 logarithm, so its inverse is pow(10, x). The square brackets are needed
           because the attribute name contains parentheses. The result goes into a new
           attribute, predicted_SalePrice, so the raw log-scale prediction stays available. -->
      <operator activated="true" class="generate_attributes" compatibility="7.5.001" expanded="true" height="82" name="Inverse Log Transform" width="90" x="715" y="238">
        <list key="function_descriptions">
          <parameter key="predicted_SalePrice" value="pow(10, [prediction(SalePrice)])"/>
        </list>
      </operator>
      <!-- Training branch: role, log transform, attribute selection. -->
      <connect from_op="Retrieve Train" from_port="output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Log Transform" to_port="example set input"/>
      <connect from_op="Log Transform" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Impute Missing Values" to_port="example set in"/>
      <!-- Test branch: filter, drop SalePrice, encode nominals, then score. -->
      <connect from_op="Retrieve Test" from_port="output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Nominal to Numerical (2)" to_port="example set input"/>
      <connect from_op="Nominal to Numerical (2)" from_port="example set output" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <!-- Training branch continued: imputation, outlier removal, encoding, cross validation. -->
      <connect from_op="Impute Missing Values" from_port="example set out" to_op="Detect Outlier (Distances)" to_port="example set input"/>
      <connect from_op="Detect Outlier (Distances)" from_port="example set output" to_op="Remove Outliers" to_port="example set input"/>
      <connect from_op="Remove Outliers" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Cross Validation" to_port="example set"/>
      <connect from_op="Cross Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Cross Validation" from_port="example set" to_port="result 2"/>
      <connect from_op="Cross Validation" from_port="performance 1" to_port="result 1"/>
      <!-- Result 3 now carries the scored test data including the back-transformed price. -->
      <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Inverse Log Transform" to_port="example set input"/>
      <connect from_op="Inverse Log Transform" from_port="example set output" to_port="result 3"/>
      <connect from_op="Apply Model (2)" from_port="model" to_port="result 4"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
      <description align="center" color="yellow" colored="false" height="105" resized="false" width="180" x="144" y="274">Log dependent variable, remote Log Transform operator to see the difference</description>
    </process>
  </operator>
</process>

 

 

1 REPLY
Elite III

Re: How to convert logged label to actual prediction result?

Of course, you can just use Generate Attributes to create a new value by exponentiating your resulting prediction score (using whatever base you originally performed the log conversion with) and that will turn it back into dollars in the appropriate range.

Brian T., Lindon Ventures - www.lindonventures.com
Analytics Consulting by Certified RapidMiner Analysts