Due to recent updates, all users are required to create an Altair One account to login to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to login to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you login, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
Walking Forward Testing
Hello,
I built a multivariate regression forecasting using NN. Results seems to be ok so far.
However and since I'm forecasting the next value (+1) using all past values I would like to be able to test the model in a walk forward way, i.e. using past values to predict next in a rolling way till last example. I thought about using sliding window validation.
Each sliding window iteration produces one example only, because it is the prediction of next one. It seems to me that this is the right approach to get this kind of validation.
However I would like to visualize all iterations results appended to have something like this: https://3qeqpr26caki16dnhd19sv6by6v-wpengine.netdna-ssl.com/wp-content/uploads/2016/12/Sunspot-Dataset-Train-Test-Split.png
I though about collection and remember/recall operators, but with zero success.
Enclosed my mock process and data source.
Thanks for your help
I built a multivariate regression forecasting using NN. Results seems to be ok so far.
However and since I'm forecasting the next value (+1) using all past values I would like to be able to test the model in a walk forward way, i.e. using past values to predict next in a rolling way till last example. I thought about using sliding window validation.
Each sliding window iteration produces one example only, because it is the prediction of next one. It seems to me that this is the right approach to get this kind of validation.
However I would like to visualize all iterations results appended to have something like this: https://3qeqpr26caki16dnhd19sv6by6v-wpengine.netdna-ssl.com/wp-content/uploads/2016/12/Sunspot-Dataset-Train-Test-Split.png
I though about collection and remember/recall operators, but with zero success.
Enclosed my mock process and data source.
Thanks for your help
Tagged:
0
Best Answer
-
hughesfleming68 Member Posts: 323 UnicornHi Oprick, you are correct to use the sliding window operator. Normally you would use the log operator to collect details about what is happening within the window. Looking at the train/test image, it looks like that because it is only showing where the data is split.
Are you trying to overlay a sequence of predictions over actual data?2
Answers
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.2.001" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="generate_data" compatibility="9.2.001" expanded="true" height="68" name="Generate Data" width="90" x="112" y="34">
<parameter key="target_function" value="random"/>
<parameter key="number_examples" value="1000"/>
<parameter key="number_of_attributes" value="1"/>
<parameter key="attributes_lower_bound" value="-10.0"/>
<parameter key="attributes_upper_bound" value="10.0"/>
<parameter key="gaussian_standard_deviation" value="10.0"/>
<parameter key="largest_radius" value="10.0"/>
<parameter key="use_local_random_seed" value="false"/>
<parameter key="local_random_seed" value="1992"/>
<parameter key="datamanagement" value="double_array"/>
<parameter key="data_management" value="auto"/>
</operator>
<operator activated="true" class="time_series:windowing" compatibility="9.2.001" expanded="true" height="82" name="Windowing" width="90" x="380" y="34">
<parameter key="attribute_filter_type" value="all"/>
<parameter key="attribute" value=""/>
<parameter key="attributes" value=""/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="nominal"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="time"/>
<parameter key="block_type" value="single_value"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="value_matrix_row_start"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
<parameter key="has_indices" value="false"/>
<parameter key="indices_attribute" value=""/>
<parameter key="window_size" value="50"/>
<parameter key="no_overlapping_windows" value="false"/>
<parameter key="step_size" value="1"/>
<parameter key="create_horizon_(labels)" value="true"/>
<parameter key="horizon_attribute" value="att1"/>
<parameter key="horizon_size" value="1"/>
<parameter key="horizon_offset" value="0"/>
</operator>
<operator activated="true" class="concurrency:cross_validation" compatibility="8.2.000" expanded="true" height="145" name="Validation" width="90" x="648" y="34">
<parameter key="split_on_batch_attribute" value="false"/>
<parameter key="leave_one_out" value="false"/>
<parameter key="number_of_folds" value="10"/>
<parameter key="sampling_type" value="shuffled sampling"/>
<parameter key="use_local_random_seed" value="false"/>
<parameter key="local_random_seed" value="1992"/>
<parameter key="enable_parallel_execution" value="true"/>
<process expanded="true">
<operator activated="true" class="h2o:generalized_linear_model" compatibility="7.2.000" expanded="true" height="124" name="Generalized Linear Model" width="90" x="179" y="34">
<parameter key="family" value="AUTO"/>
<parameter key="link" value="family_default"/>
<parameter key="solver" value="AUTO"/>
<parameter key="reproducible" value="false"/>
<parameter key="maximum_number_of_threads" value="4"/>
<parameter key="use_regularization" value="true"/>
<parameter key="lambda_search" value="false"/>
<parameter key="number_of_lambdas" value="0"/>
<parameter key="lambda_min_ratio" value="0.0"/>
<parameter key="early_stopping" value="true"/>
<parameter key="stopping_rounds" value="3"/>
<parameter key="stopping_tolerance" value="0.001"/>
<parameter key="standardize" value="true"/>
<parameter key="non-negative_coefficients" value="false"/>
<parameter key="add_intercept" value="true"/>
<parameter key="compute_p-values" value="false"/>
<parameter key="remove_collinear_columns" value="false"/>
<parameter key="missing_values_handling" value="MeanImputation"/>
<parameter key="max_iterations" value="0"/>
<parameter key="specify_beta_constraints" value="false"/>
<list key="beta_constraints"/>
<parameter key="max_runtime_seconds" value="0"/>
<list key="expert_parameters"/>
</operator>
<connect from_port="training set" to_op="Generalized Linear Model" to_port="training set"/>
<connect from_op="Generalized Linear Model" from_port="model" to_port="model"/>
<portSpacing port="source_training set" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
<description align="left" color="green" colored="true" height="113" resized="true" width="284" x="33" y="148">Builds a model on the current training data set (90 % of the data by default, 10 times).<br><br>Make sure that you only put numerical attributes into a linear regression!</description>
</process>
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="9.2.001" expanded="true" height="82" name="Apply Model" width="90" x="45" y="34">
<list key="application_parameters"/>
<parameter key="create_view" value="false"/>
</operator>
<operator activated="true" class="performance" compatibility="9.2.001" expanded="true" height="82" name="Performance" width="90" x="179" y="34">
<parameter key="use_example_weights" value="true"/>
</operator>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="performance 1"/>
<connect from_op="Performance" from_port="example set" to_port="test set results"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_test set results" spacing="0"/>
<portSpacing port="sink_performance 1" spacing="0"/>
<portSpacing port="sink_performance 2" spacing="0"/>
<description align="left" color="blue" colored="true" height="107" resized="true" width="333" x="28" y="139">Applies the model built from the training data set on the current test set (10 % by default).<br/>The Performance operator calculates performance indicators and sends them to the operator result.</description>
</process>
<description align="center" color="transparent" colored="false" width="126">A cross validation including a linear regression.</description>
</operator>
<connect from_op="Generate Data" from_port="output" to_op="Windowing" to_port="example set"/>
<connect from_op="Windowing" from_port="windowed example set" to_op="Validation" to_port="example set"/>
<connect from_op="Validation" from_port="model" to_port="result 1"/>
<connect from_op="Validation" from_port="test result set" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
Kind regards,
Alex
I used log family operators. I was not very familiar with that.
Now I can overlap prediction_label and label.
Enclosed the corrected process. I hope it helps someone else.
Many Thanks
regards,
Alex
Sure I will. Thanks for the tip