RapidMiner 9.7 is Now Available

Lots of amazing new improvements including true version control! Learn more about what's new here.

CLICK HERE TO DOWNLOAD

[HowTo] Create Box Plots to Check Regressions

mschmitzmschmitz Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 2,462  RM Data Scientist
Hey guys!

This is not a question, but rather a how-to. I frequently use box plots to assess the quality of regression models.
What I do is discretize the prediction and then look at a box plot to compare it to the real value. It looks like this:

Here we see a lot. Most importantly, this model is flat at the beginning and the end, and there is a bit of a correlation in the center. I prefer these plots over normal scatter plots of True-vs-Predicted, because in a scatter plot you may get distracted by some outliers.

Attached is an example process showing how to generate such a plot. It needs a bit of preprocessing.

<?xml version="1.0" encoding="UTF-8"?><process version="9.6.000">
  <!--
    Example RapidMiner process: prepares the data for a box plot of the true
    value grouped by the discretized prediction of a regression model
    (see the green note near the end of the inner process).
    Pipeline, as wired by the <connect> elements:
    Retrieve, Windowing, Filter Examples, Generalized Linear Model and
    Apply Model, Discretize, Sort, Append, then "result 1".
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.6.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Step 1: load the "Prices of Gas Station" sample data set from the repository. -->
      <operator activated="true" class="retrieve" compatibility="9.6.000" expanded="true" height="68" name="Retrieve Prices of Gas Station" width="90" x="112" y="136">
        <parameter key="repository_entry" value="//Samples/Time Series/data sets/Prices of Gas Station"/>
      </operator>
      <!--
        Step 2: window the attribute "gas price / euro (times 1000)" with "date"
        as the index attribute. Window size is 24 and a horizon of size 1 is
        created on the same attribute (create_horizon_(labels) = true).
        Presumably this horizon is the label that later operators refer to as
        "gas price / euro (times 1000) + 1 (horizon)" (confirm).
      -->
      <operator activated="true" class="time_series:windowing" compatibility="9.6.000" expanded="true" height="82" name="Windowing" width="90" x="246" y="136">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="gas price / euro (times 1000)"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="has_indices" value="true"/>
        <parameter key="indices_attribute" value="date"/>
        <parameter key="window_size" value="24"/>
        <parameter key="no_overlapping_windows" value="true"/>
        <parameter key="step_size" value="1"/>
        <parameter key="create_horizon_(labels)" value="true"/>
        <parameter key="horizon_attribute" value="gas price / euro (times 1000)"/>
        <parameter key="horizon_size" value="1"/>
        <parameter key="horizon_offset" value="0"/>
      </operator>
      <!--
        Step 3: split the windows with a custom filter on "Last date in window"
        (.gt. 01/01/2018 00:00:01 AM). Per the connections below, the matching
        examples become the training set of the model and the unmatched
        examples are the data the model is applied to.
      -->
      <operator activated="true" class="filter_examples" compatibility="9.6.000" expanded="true" height="103" name="Filter Examples" width="90" x="380" y="136">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="custom_filters"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list">
          <parameter key="filters_entry_key" value="Last date in window.gt.01/01/2018 00:00:01 AM"/>
        </list>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
        <description align="center" color="transparent" colored="false" width="126">Filter on date&lt;br/&gt;</description>
      </operator>
      <!--
        Step 4: H2O Generalized Linear Model trained on the examples that passed
        the filter (family and solver AUTO, regularization and standardization
        enabled). Note that this operator carries compatibility 9.3.001 while
        all other operators in this process carry 9.6.000.
      -->
      <operator activated="true" class="h2o:generalized_linear_model" compatibility="9.3.001" expanded="true" height="124" name="Generalized Linear Model" width="90" x="514" y="34">
        <parameter key="family" value="AUTO"/>
        <parameter key="link" value="family_default"/>
        <parameter key="solver" value="AUTO"/>
        <parameter key="reproducible" value="false"/>
        <parameter key="maximum_number_of_threads" value="4"/>
        <parameter key="use_regularization" value="true"/>
        <parameter key="lambda_search" value="false"/>
        <parameter key="number_of_lambdas" value="0"/>
        <parameter key="lambda_min_ratio" value="0.0"/>
        <parameter key="early_stopping" value="true"/>
        <parameter key="stopping_rounds" value="3"/>
        <parameter key="stopping_tolerance" value="0.001"/>
        <parameter key="standardize" value="true"/>
        <parameter key="non-negative_coefficients" value="false"/>
        <parameter key="add_intercept" value="true"/>
        <parameter key="compute_p-values" value="false"/>
        <parameter key="remove_collinear_columns" value="false"/>
        <parameter key="missing_values_handling" value="MeanImputation"/>
        <parameter key="max_iterations" value="0"/>
        <parameter key="specify_beta_constraints" value="false"/>
        <list key="beta_constraints"/>
        <parameter key="max_runtime_seconds" value="0"/>
        <list key="expert_parameters"/>
      </operator>
      <!-- Step 5: apply the trained model to the examples rejected by the filter. -->
      <operator activated="true" class="apply_model" compatibility="9.6.000" expanded="true" height="82" name="Apply Model" width="90" x="648" y="187">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <!--
        Step 6: discretize the single attribute
        "prediction(gas price / euro (times 1000) + 1 (horizon))" into 20 bins
        named by interval. include_special_attributes is true, presumably
        because the prediction is a special attribute (confirm).
      -->
      <operator activated="true" class="discretize_by_bins" compatibility="9.6.000" expanded="true" height="103" name="Discretize" width="90" x="782" y="136">
        <parameter key="return_preprocessing_model" value="false"/>
        <parameter key="create_view" value="false"/>
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="prediction(gas price / euro (times 1000) + 1 (horizon))"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="numeric"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="real"/>
        <parameter key="block_type" value="value_series"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_series_end"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="number_of_bins" value="20"/>
        <parameter key="define_boundaries" value="false"/>
        <parameter key="range_name_type" value="interval"/>
        <parameter key="automatic_number_of_digits" value="true"/>
        <parameter key="number_of_digits" value="3"/>
      </operator>
      <!-- Step 7: sort by the discretized prediction attribute in increasing order. -->
      <operator activated="true" class="sort" compatibility="9.6.000" expanded="true" height="82" name="Sort" width="90" x="916" y="136">
        <parameter key="attribute_name" value="prediction(gas price / euro (times 1000) + 1 (horizon))"/>
        <parameter key="sorting_direction" value="increasing"/>
      </operator>
      <!--
        Step 8: Append is connected with a single input only ("example set 1").
        According to the yellow note below, Sort plus Append is what
        "gets the binning into the right order"; presumably Append rebuilds the
        example set so that the bin values follow the sorted order (confirm
        before removing this seemingly redundant operator).
      -->
      <operator activated="true" class="append" compatibility="9.6.000" expanded="true" height="82" name="Append" width="90" x="1050" y="136">
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="merge_type" value="all"/>
      </operator>
      <!-- Wiring between the operators; the only process output is "result 1". -->
      <connect from_op="Retrieve Prices of Gas Station" from_port="output" to_op="Windowing" to_port="example set"/>
      <connect from_op="Windowing" from_port="windowed example set" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Generalized Linear Model" to_port="training set"/>
      <connect from_op="Filter Examples" from_port="unmatched example set" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Generalized Linear Model" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_op="Discretize" to_port="example set input"/>
      <connect from_op="Discretize" from_port="example set output" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_op="Append" to_port="example set 1"/>
      <connect from_op="Append" from_port="merged set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="84"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <!--
        Free-text notes with position and size attributes. The green note holds
        the plot settings to use on the result: a box plot of the horizon
        attribute grouped by the discretized prediction attribute.
      -->
      <description align="center" color="yellow" colored="false" height="176" resized="true" width="275" x="893" y="91">This gets the binning into the right order</description>
      <description align="center" color="green" colored="true" height="92" resized="true" width="448" x="1173" y="178">Use Boxplot.&lt;br&gt;&lt;br&gt;Volume Column: gas price / euro (times 1000) + 1 (horizon)&lt;br&gt;Group By Column: prediction(gas price / euro (times 1000) + 1 (horizon))&lt;br&gt;</description>
    </process>
  </operator>
</process>




- Head of Data Science Services at RapidMiner -
Dortmund, Germany
Tagged:
MPB_hbajpaiJasmine_MarcoBarradasyyhuangsgenzerDavid_ALeMarc
Sign In or Register to comment.