Options

"Normalisation changes random forest performance?"

MuehliManMuehliMan Member Posts: 85 Maven
edited May 2019 in Help
Hi,

I am working with random forest learners and I was wondering whether normalisation affects the outcome of Random Forest predictions. Against my expectations it does. Here is the workflow I used, with randomly generated data (so everyone can run it).
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.8" expanded="true" name="Process">
    <process expanded="true" height="476" width="1552">
      <!-- Generate Data: creates the synthetic example set (200 examples, 4 attributes) that feeds the rest of the process.
           No target_function is set, so the operator default applies. -->
      <operator activated="true" class="generate_data" compatibility="5.0.8" expanded="true" height="60" name="Generate Data" width="90" x="45" y="30">
        <parameter key="number_examples" value="200"/>
        <parameter key="number_of_attributes" value="4"/>
      </operator>
      <!-- Discretize: applies a user-specified discretisation to the single attribute "label"
           (special attributes are included so that the label can be selected).
           Two classes named "0" and "1" are defined with the values 0.5 and 1.0;
           presumably these are the upper limits of each class (confirm against the operator documentation). -->
      <operator activated="true" class="discretize_by_user_specification" compatibility="5.0.8" expanded="true" height="94" name="Discretize" width="90" x="179" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="include_special_attributes" value="true"/>
        <list key="classes">
          <parameter key="0" value="0.5"/>
          <parameter key="1" value="1.0"/>
        </list>
      </operator>
      <!-- Nominal to Binominal: converts the single attribute "label" (special attributes included),
           which the Performance operators of class performance_binominal_classification used further down evaluate. -->
      <operator activated="true" class="nominal_to_binominal" compatibility="5.0.8" expanded="true" height="94" name="Nominal to Binominal" width="90" x="313" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- Generate ID: run with default parameters; the later Select Attributes expressions keep an attribute called "id",
           presumably the one created here so that the Join operators can match examples (confirm). -->
      <operator activated="true" class="generate_id" compatibility="5.0.8" expanded="true" height="76" name="Generate ID" width="90" x="447" y="30"/>
      <!-- Multiply: fans the prepared example set out to the three branches (outputs 1 to 3 are used in the connect lines below). -->
      <operator activated="true" class="multiply" compatibility="5.0.8" expanded="true" height="112" name="Multiply" width="90" x="581" y="30"/>
      <!-- Branch without normalisation (fed from output 1 of Multiply in the enclosing process).
           Trains a Random Forest (criterion gini_index) and applies the model to the example set passed through
           by the learner, so the performance is measured on the training data itself. No validation operator is
           present inside this subprocess despite the "CV" in its name.
           Outputs: out 1 = performance, out 2 = model (as passed through by Apply Model),
                    out 3 = example set after Select Attributes, Rename and Set Role.
           NOTE(review): no random seed parameter is set on Random Forest; if the operator draws from a shared
           process-level random generator, the forests of the three branches may differ by chance alone. TODO confirm. -->
      <operator activated="true" class="subprocess" compatibility="5.0.8" expanded="true" height="112" name="CV-DT + performance (3)" width="90" x="849" y="30">
        <process expanded="true" height="550" width="915">
          <operator activated="true" class="random_forest" compatibility="5.0.8" expanded="true" height="76" name="Random Forest (2)" width="90" x="45" y="30">
            <parameter key="criterion" value="gini_index"/>
          </operator>
          <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true" height="76" name="Apply Model (6)" width="90" x="180" y="30">
            <list key="application_parameters"/>
          </operator>
          <!-- Duplicates the labelled data: one copy for the performance measurement, one for the per-example comparison. -->
          <operator activated="true" class="multiply" compatibility="5.0.8" expanded="true" height="94" name="Multiply (4)" width="90" x="315" y="30"/>
          <!-- The name "Performance no" is referenced by the Log operator of the enclosing process. -->
          <operator activated="true" class="performance_binominal_classification" compatibility="5.0.8" expanded="true" height="76" name="Performance no" width="90" x="447" y="30">
            <parameter key="youden" value="true"/>
            <parameter key="psep" value="true"/>
          </operator>
          <!-- Filters attributes with the regular expression below (prediction-related attributes, label and id). -->
          <operator activated="true" class="select_attributes" compatibility="5.0.8" expanded="true" height="76" name="Select Attributes (4)" width="90" x="447" y="120">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value=".*prediction.*|label|id"/>
          </operator>
          <!-- Gives the prediction column a branch-specific name so the three branches can be joined side by side. -->
          <operator activated="true" class="rename" compatibility="5.0.8" expanded="true" height="76" name="Rename (4)" width="90" x="581" y="120">
            <parameter key="old_name" value="prediction(label)"/>
            <parameter key="new_name" value="no_normalisation"/>
          </operator>
          <!-- No target role is given, so the operator default applies (presumably "regular"; confirm). -->
          <operator activated="true" class="set_role" compatibility="5.0.8" expanded="true" height="76" name="Set Role (3)" width="90" x="715" y="120">
            <parameter key="name" value="no_normalisation"/>
          </operator>
          <connect from_port="in 1" to_op="Random Forest (2)" to_port="training set"/>
          <connect from_op="Random Forest (2)" from_port="model" to_op="Apply Model (6)" to_port="model"/>
          <connect from_op="Random Forest (2)" from_port="exampleSet" to_op="Apply Model (6)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (6)" from_port="labelled data" to_op="Multiply (4)" to_port="input"/>
          <connect from_op="Apply Model (6)" from_port="model" to_port="out 2"/>
          <connect from_op="Multiply (4)" from_port="output 1" to_op="Performance no" to_port="labelled data"/>
          <connect from_op="Multiply (4)" from_port="output 2" to_op="Select Attributes (4)" to_port="example set input"/>
          <connect from_op="Performance no" from_port="performance" to_port="out 1"/>
          <connect from_op="Select Attributes (4)" from_port="example set output" to_op="Rename (4)" to_port="example set input"/>
          <connect from_op="Rename (4)" from_port="example set output" to_op="Set Role (3)" to_port="example set input"/>
          <connect from_op="Set Role (3)" from_port="example set output" to_port="out 3"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
        </process>
      </operator>
      <!-- Z-Transformation: Normalize with no parameters set, so the operator defaults apply
           (the name suggests the default method is a Z-transformation; confirm).
           NOTE(review): create_view is not set here. A reply further down in this thread argues that normalising
           a Multiply copy without a view also changes the data seen by the other branches; verify before
           trusting the comparison between branches. -->
      <operator activated="true" class="normalize" compatibility="5.0.8" expanded="true" height="94" name="Z-Transformation" width="90" x="715" y="165"/>
      <!-- Branch fed from the "Z-Transformation" Normalize operator of the enclosing process.
           Same structure as the other two branches: trains a Random Forest (criterion gini_index) and applies the
           model to the example set passed through by the learner, so the performance is measured on the training
           data itself. No validation operator is present inside this subprocess despite the "CV" in its name.
           Outputs: out 1 = performance, out 2 = model (as passed through by Apply Model),
                    out 3 = example set after Select Attributes, Rename and Set Role.
           NOTE(review): no random seed parameter is set on Random Forest; if the operator draws from a shared
           process-level random generator, the forests of the three branches may differ by chance alone. TODO confirm. -->
      <operator activated="true" class="subprocess" compatibility="5.0.8" expanded="true" height="112" name="CV-DT + performance (2)" width="90" x="849" y="165">
        <process expanded="true" height="550" width="892">
          <operator activated="true" class="random_forest" compatibility="5.0.8" expanded="true" height="76" name="Random Forest (3)" width="90" x="45" y="30">
            <parameter key="criterion" value="gini_index"/>
          </operator>
          <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true" height="76" name="Apply Model (4)" width="90" x="180" y="30">
            <list key="application_parameters"/>
          </operator>
          <!-- Duplicates the labelled data: one copy for the performance measurement, one for the per-example comparison. -->
          <operator activated="true" class="multiply" compatibility="5.0.8" expanded="true" height="94" name="Multiply (3)" width="90" x="315" y="30"/>
          <!-- The name "Performance z" is referenced by the Log operator of the enclosing process. -->
          <operator activated="true" class="performance_binominal_classification" compatibility="5.0.8" expanded="true" height="76" name="Performance z" width="90" x="447" y="30">
            <parameter key="youden" value="true"/>
            <parameter key="psep" value="true"/>
          </operator>
          <!-- Filters attributes with the regular expression below (prediction-related attributes, label and id). -->
          <operator activated="true" class="select_attributes" compatibility="5.0.8" expanded="true" height="76" name="Select Attributes (3)" width="90" x="447" y="120">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value=".*prediction.*|label|id"/>
          </operator>
          <!-- Gives the prediction column a branch-specific name so the three branches can be joined side by side. -->
          <operator activated="true" class="rename" compatibility="5.0.8" expanded="true" height="76" name="Rename (3)" width="90" x="581" y="120">
            <parameter key="old_name" value="prediction(label)"/>
            <parameter key="new_name" value="z_normalisation"/>
          </operator>
          <!-- No target role is given, so the operator default applies (presumably "regular"; confirm). -->
          <operator activated="true" class="set_role" compatibility="5.0.8" expanded="true" height="76" name="Set Role" width="90" x="715" y="120">
            <parameter key="name" value="z_normalisation"/>
          </operator>
          <connect from_port="in 1" to_op="Random Forest (3)" to_port="training set"/>
          <connect from_op="Random Forest (3)" from_port="model" to_op="Apply Model (4)" to_port="model"/>
          <connect from_op="Random Forest (3)" from_port="exampleSet" to_op="Apply Model (4)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (4)" from_port="labelled data" to_op="Multiply (3)" to_port="input"/>
          <connect from_op="Apply Model (4)" from_port="model" to_port="out 2"/>
          <connect from_op="Multiply (3)" from_port="output 1" to_op="Performance z" to_port="labelled data"/>
          <connect from_op="Multiply (3)" from_port="output 2" to_op="Select Attributes (3)" to_port="example set input"/>
          <connect from_op="Performance z" from_port="performance" to_port="out 1"/>
          <connect from_op="Select Attributes (3)" from_port="example set output" to_op="Rename (3)" to_port="example set input"/>
          <connect from_op="Rename (3)" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_port="out 3"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
        </process>
      </operator>
      <!-- Join: combines out 3 of the un-normalised branch (left) with out 3 of the Z-transformed branch (right),
           as wired in the connect lines below. Default parameters are used; presumably the examples are matched
           on the id attribute (confirm). -->
      <operator activated="true" class="join" compatibility="5.0.8" expanded="true" height="76" name="Join" width="90" x="983" y="30"/>
      <!-- Normalize with method "range transformation"; no range limits are set, so the operator defaults apply.
           The operator name is spelled "Range Transortmation" (sic) and is referenced under exactly that spelling
           by the connect lines of this process, so it must not be corrected in isolation.
           NOTE(review): create_view is not set here. A reply further down in this thread argues that normalising
           a Multiply copy without a view also changes the data seen by the other branches; verify before
           trusting the comparison between branches. -->
      <operator activated="true" class="normalize" compatibility="5.0.8" expanded="true" height="94" name="Range Transortmation" width="90" x="715" y="300">
        <parameter key="method" value="range transformation"/>
      </operator>
      <!-- Branch fed from the range-transformation Normalize operator of the enclosing process.
           Same structure as the other two branches: trains a Random Forest (criterion gini_index) and applies the
           model to the example set passed through by the learner, so the performance is measured on the training
           data itself. No validation operator is present inside this subprocess despite the "CV" in its name.
           Outputs: out 1 = performance, out 2 = model (as passed through by Apply Model),
                    out 3 = example set after Select Attributes, Rename and Set Role.
           NOTE(review): no random seed parameter is set on Random Forest; if the operator draws from a shared
           process-level random generator, the forests of the three branches may differ by chance alone. TODO confirm. -->
      <operator activated="true" class="subprocess" compatibility="5.0.8" expanded="true" height="112" name="CV-DT + performance" width="90" x="849" y="300">
        <process expanded="true" height="550" width="910">
          <operator activated="true" class="random_forest" compatibility="5.0.8" expanded="true" height="76" name="Random Forest" width="90" x="45" y="30">
            <parameter key="criterion" value="gini_index"/>
          </operator>
          <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true" height="76" name="Apply Model" width="90" x="179" y="30">
            <list key="application_parameters"/>
          </operator>
          <!-- Duplicates the labelled data: one copy for the performance measurement, one for the per-example comparison. -->
          <operator activated="true" class="multiply" compatibility="5.0.8" expanded="true" height="94" name="Multiply (2)" width="90" x="313" y="30"/>
          <!-- The name "Performance range" is referenced by the Log operator of the enclosing process. -->
          <operator activated="true" class="performance_binominal_classification" compatibility="5.0.8" expanded="true" height="76" name="Performance range" width="90" x="447" y="30">
            <parameter key="youden" value="true"/>
            <parameter key="psep" value="true"/>
          </operator>
          <!-- Filters attributes with the regular expression below (prediction-related attributes, label and id). -->
          <operator activated="true" class="select_attributes" compatibility="5.0.8" expanded="true" height="76" name="Select Attributes (2)" width="90" x="449" y="120">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value=".*prediction.*|label|id"/>
          </operator>
          <!-- Gives the prediction column a branch-specific name so the three branches can be joined side by side. -->
          <operator activated="true" class="rename" compatibility="5.0.8" expanded="true" height="76" name="Rename (2)" width="90" x="581" y="120">
            <parameter key="old_name" value="prediction(label)"/>
            <parameter key="new_name" value="range_normalisation"/>
          </operator>
          <!-- No target role is given, so the operator default applies (presumably "regular"; confirm). -->
          <operator activated="true" class="set_role" compatibility="5.0.8" expanded="true" height="76" name="Set Role (2)" width="90" x="715" y="120">
            <parameter key="name" value="range_normalisation"/>
          </operator>
          <connect from_port="in 1" to_op="Random Forest" to_port="training set"/>
          <connect from_op="Random Forest" from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_op="Random Forest" from_port="exampleSet" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Multiply (2)" to_port="input"/>
          <connect from_op="Apply Model" from_port="model" to_port="out 2"/>
          <connect from_op="Multiply (2)" from_port="output 1" to_op="Performance range" to_port="labelled data"/>
          <connect from_op="Multiply (2)" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Performance range" from_port="performance" to_port="out 1"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
          <connect from_op="Rename (2)" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
          <connect from_op="Set Role (2)" from_port="example set output" to_port="out 3"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
        </process>
      </operator>
      <!-- Log: records the youden and accuracy values of the three Performance operators, one column per
           branch and metric. The operator names inside the value strings must match the Performance
           operator names in the subprocesses exactly.
           NOTE(review): only youden and psep are switched on explicitly on the Performance operators;
           accuracy is presumably enabled by default (confirm). -->
      <operator activated="true" class="log" compatibility="5.0.8" expanded="true" height="112" name="Log" width="90" x="1117" y="300">
        <list key="log">
          <parameter key="youden_range" value="operator.Performance range.value.youden"/>
          <parameter key="youden_z" value="operator.Performance z.value.youden"/>
          <parameter key="youden_no" value="operator.Performance no.value.youden"/>
          <parameter key="accuracy_range" value="operator.Performance range.value.accuracy"/>
          <parameter key="accuracy_z" value="operator.Performance z.value.accuracy"/>
          <parameter key="accuracy_no" value="operator.Performance no.value.accuracy"/>
        </list>
      </operator>
      <!-- Log to Data: run with default parameters; presumably turns the logged values into an example set (confirm). -->
      <operator activated="true" class="log_to_data" compatibility="5.0.8" expanded="true" height="94" name="Log to Data" width="90" x="1251" y="300"/>
      <!-- Join (2): adds out 3 of the range-transformed branch (right) to the result of the first Join (left),
           as wired in the connect lines below. -->
      <operator activated="true" class="join" compatibility="5.0.8" expanded="true" height="76" name="Join (2)" width="90" x="1117" y="30"/>
      <!-- Nominal to Numerical: converts the attributes whose names match .*normalisation.* (the renamed prediction
           columns of the three branches) so that they can be aggregated numerically by the next operator. -->
      <operator activated="true" class="nominal_to_numerical" compatibility="5.0.8" expanded="true" height="94" name="Nominal to Numerical" width="90" x="1251" y="30">
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value=".*normalisation.*"/>
      </operator>
      <!-- Stdev over all labels: adds the attribute stdev_normalisation holding the standard deviation over the
           attributes matching .*normalisation.*; presumably computed per example, so that a non-zero value marks
           an example on which the branches disagree (confirm). -->
      <operator activated="true" class="generate_aggregation" compatibility="5.0.8" expanded="true" height="76" name="Stdev over all labels" width="90" x="1385" y="30">
        <parameter key="attribute_name" value="stdev_normalisation"/>
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value=".*normalisation.*"/>
        <parameter key="aggregation_function" value="standard_deviation"/>
      </operator>
      <!-- Wiring of the top-level process.
           Data preparation chain: Generate Data, Discretize, Nominal to Binominal, Generate ID, Multiply.
           Multiply output 1 goes to the un-normalised branch, output 2 to Z-Transformation, output 3 to the range transformation.
           out 1 of every branch (performance) passes through Log; out 3 of every branch (predictions) is joined,
           converted to numerical and aggregated.
           NOTE(review): none of the connect lines below targets a result port of the process, so neither the
           Log to Data output nor the aggregated example set is delivered as a process result; confirm whether
           this is intended. -->
      <connect from_op="Generate Data" from_port="output" to_op="Discretize" to_port="example set input"/>
      <connect from_op="Discretize" from_port="example set output" to_op="Nominal to Binominal" to_port="example set input"/>
      <connect from_op="Nominal to Binominal" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="CV-DT + performance (3)" to_port="in 1"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Z-Transformation" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 3" to_op="Range Transortmation" to_port="example set input"/>
      <connect from_op="CV-DT + performance (3)" from_port="out 1" to_op="Log" to_port="through 1"/>
      <connect from_op="CV-DT + performance (3)" from_port="out 3" to_op="Join" to_port="left"/>
      <connect from_op="Z-Transformation" from_port="example set output" to_op="CV-DT + performance (2)" to_port="in 1"/>
      <connect from_op="CV-DT + performance (2)" from_port="out 1" to_op="Log" to_port="through 2"/>
      <connect from_op="CV-DT + performance (2)" from_port="out 3" to_op="Join" to_port="right"/>
      <connect from_op="Join" from_port="join" to_op="Join (2)" to_port="left"/>
      <connect from_op="Range Transortmation" from_port="example set output" to_op="CV-DT + performance" to_port="in 1"/>
      <connect from_op="CV-DT + performance" from_port="out 1" to_op="Log" to_port="through 3"/>
      <connect from_op="CV-DT + performance" from_port="out 3" to_op="Join (2)" to_port="right"/>
      <connect from_op="Log" from_port="through 1" to_op="Log to Data" to_port="through 1"/>
      <connect from_op="Join (2)" from_port="join" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Stdev over all labels" to_port="example set input"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
    </process>
  </operator>
</process>
I don't know why normalization affects the performance and even the classification of some examples. I would be happy if someone could explain this to me.

Cheers,
Markus
Tagged:

Answers

  • Options
    fischerfischer Member Posts: 439 Maven
    Hi,

    are you sure these results have any statistical significance? I would say it's just random.

    Best,
    Simon
  • Options
    MuehliManMuehliMan Member Posts: 85 Maven
    Well, I was expecting that normalisation would not affect the performance of my model, which is true for the decision tree. So I wonder why it affects a random forest. The question arose as we were skipping the normalization and got different results in our workflow with a random forest.

    I also did not find any literature on that. As I am no statistics guy, it may very well be that I am missing something that should be obvious.

    Cheers,
    Markus
  • Options
    haddockhaddock Member Posts: 849 Maven
    Hi Folks,

    In a separate thread today Pop brought up the issue of what happens when you multiply datasets and normalise a copy, but don't create views. In effect, if you normalise in one place without creating views, the underlying data is altered for all the other operators that access that data, because multiplying just makes new pointers to the same underlying object. Heroically in this case Pop was able to quote chapter and verse from the help !!!!! Mutual back slapping all round is in order I think.

    I couldn't help noticing that your code involved multiplying and normalisation without views, so I've cannibalised it to get to the original question, and it actually doesn't seem to be the case that normalisation changes performance, or not that I can notice on the following..
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="5.0">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.0.8" expanded="true" name="Process">
        <process expanded="true" height="370" width="748">
          <!-- Generate Data: 200 examples with 4 attributes, using the target function
               "simple polynomial classification" instead of the discretisation steps of the original workflow. -->
          <operator activated="true" class="generate_data" compatibility="5.0.8" expanded="true" height="60" name="Generate Data" width="90" x="45" y="120">
            <parameter key="target_function" value="simple polynomial classification"/>
            <parameter key="number_examples" value="200"/>
            <parameter key="number_of_attributes" value="4"/>
          </operator>
          <!-- Normalize with create_view switched on, which is the point of this variant: the connect lines below use both
               the "example set output" port (normalised) and the "original" port of this operator.
               No method is set, so the operator default applies (the name suggests a Z-transformation; confirm). -->
          <operator activated="true" class="normalize" compatibility="5.0.8" expanded="true" height="94" name="Z-Transformation" width="90" x="179" y="120">
            <parameter key="create_view" value="true"/>
          </operator>
          <!-- Branch on the original data: Random Forest (3) is fed from the "original" port of the Normalize
               operator (see the connect lines below) and Apply Model (2) scores the example set passed through by the
               learner, i.e. the training data.
               NOTE(review): no random seed parameter is set, so the two forests of this process may differ by chance. TODO confirm. -->
          <operator activated="true" class="random_forest" compatibility="5.0.8" expanded="true" height="76" name="Random Forest (3)" width="90" x="313" y="210">
            <parameter key="criterion" value="gini_index"/>
          </operator>
          <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true" height="76" name="Apply Model (2)" width="90" x="447" y="210">
            <list key="application_parameters"/>
          </operator>
          <!-- Branch on the normalised data: Random Forest (2) is fed from the "example set output" port of the
               Normalize operator (see the connect lines below) and Apply Model (6) scores the example set passed through
               by the learner, i.e. the training data.
               NOTE(review): no random seed parameter is set, so the two forests of this process may differ by chance. TODO confirm. -->
          <operator activated="true" class="random_forest" compatibility="5.0.8" expanded="true" height="76" name="Random Forest (2)" width="90" x="315" y="30">
            <parameter key="criterion" value="gini_index"/>
          </operator>
          <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true" height="76" name="Apply Model (6)" width="90" x="450" y="30">
            <list key="application_parameters"/>
          </operator>
          <!-- NOTE(review): the operator names were carried over from the original workflow and do not describe the branches here.
               According to the connect lines below, "Performance no" evaluates the branch trained on the normalised
               "example set output", while "Performance no (2)" evaluates the branch trained on the "original" port. -->
          <operator activated="true" class="performance_binominal_classification" compatibility="5.0.8" expanded="true" height="76" name="Performance no" width="90" x="585" y="30">
            <parameter key="youden" value="true"/>
            <parameter key="psep" value="true"/>
          </operator>
          <operator activated="true" class="performance_binominal_classification" compatibility="5.0.8" expanded="true" height="76" name="Performance no (2)" width="90" x="581" y="210">
            <parameter key="youden" value="true"/>
            <parameter key="psep" value="true"/>
          </operator>
          <!-- Wiring: Generate Data feeds Normalize directly (no Multiply). The normalised output trains Random Forest (2),
               the "original" port trains Random Forest (3). Results 1 and 2 are the performance and example set of the
               normalised branch, results 3 and 4 those of the branch on the original data. -->
          <connect from_op="Generate Data" from_port="output" to_op="Z-Transformation" to_port="example set input"/>
          <connect from_op="Z-Transformation" from_port="example set output" to_op="Random Forest (2)" to_port="training set"/>
          <connect from_op="Z-Transformation" from_port="original" to_op="Random Forest (3)" to_port="training set"/>
          <connect from_op="Random Forest (3)" from_port="model" to_op="Apply Model (2)" to_port="model"/>
          <connect from_op="Random Forest (3)" from_port="exampleSet" to_op="Apply Model (2)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance no (2)" to_port="labelled data"/>
          <connect from_op="Random Forest (2)" from_port="model" to_op="Apply Model (6)" to_port="model"/>
          <connect from_op="Random Forest (2)" from_port="exampleSet" to_op="Apply Model (6)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (6)" from_port="labelled data" to_op="Performance no" to_port="labelled data"/>
          <connect from_op="Performance no" from_port="performance" to_port="result 1"/>
          <connect from_op="Performance no" from_port="example set" to_port="result 2"/>
          <connect from_op="Performance no (2)" from_port="performance" to_port="result 3"/>
          <connect from_op="Performance no (2)" from_port="example set" to_port="result 4"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
          <portSpacing port="sink_result 5" spacing="0"/>
        </process>
      </operator>
    </process>
Sign In or Register to comment.