how to explain why k-nn gives better results using bootstrap sampling?

Fred12Fred12 Member Posts: 344 Unicorn
edited November 2018 in Help

hi,

How can I explain why k-NN gives a better X-Val performance (93% compared to 86%) when I use a Sample (Bootstrap) operator after selecting attributes?

But if I don't use Bootstrap for the X-Validation, I get better performance on the separate test data later on (87%), whereas if I use Bootstrap, I get only 83% on the separate test data...

I use bootstrap with 1.0 sample ratio... how can this be explained?

 

my process looks like this:

<?xml version="1.0" encoding="UTF-8"?><process version="7.2.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.2.001" expanded="true" name="Process">
<process expanded="true">
<!-- Data source: reads the example set stored at the given repository entry. -->
<operator name="Retrieve Master3Klassen_nominal" class="retrieve"
          activated="true" compatibility="7.2.001" expanded="true"
          height="68" width="90" x="45" y="340">
  <parameter key="repository_entry" value="//Marc/data/Master3Klassen_nominal"/>
</operator>
<!-- Splits the input into two equally sized partitions (ratio 0.5 each)
     using stratified sampling. -->
<operator name="Split Data" class="split_data"
          activated="true" compatibility="7.2.001" expanded="true"
          height="103" width="90" x="179" y="340">
  <enumeration key="partitions">
    <parameter key="ratio" value="0.5"/>
    <parameter key="ratio" value="0.5"/>
  </enumeration>
  <parameter key="sampling_type" value="stratified sampling"/>
</operator>
<!-- Keeps only the listed subset of attributes (names separated by '|'). -->
<operator name="Select Attributes" class="select_attributes"
          activated="true" compatibility="7.2.001" expanded="true"
          height="82" width="90" x="179" y="85">
  <parameter key="attribute_filter_type" value="subset"/>
  <parameter key="attributes" value="Durchmesser|FlächezuGesamtfläche LIMI|Fläche LIMI"/>
</operator>
<!-- Duplicates its input so several downstream operators can consume the same data. -->
<operator name="Multiply Trainings Data" class="multiply"
          activated="true" compatibility="7.2.001" expanded="true"
          height="124" width="90" x="447" y="85"/>
<operator activated="true" class="optimize_parameters_grid" compatibility="7.2.001" expanded="true" height="103" name="Optimize Parameters (Grid)" width="90" x="782" y="34">
<!-- Parameter grid: varies parameter "k" of the operator named "k-NN".
     The value reads as [min;max;steps;scale], i.e. from 1.0 to 5 in 2 linear steps. -->
<list key="parameters">
  <parameter key="k-NN.k" value="[1.0;5;2;linear]"/>
</list>
<process expanded="true">
<!-- Cross-validation with stratified sampling. The first subprocess trains the
     model, the second applies it to the held-out part and measures performance. -->
<operator activated="true" class="x_validation" compatibility="7.2.001" expanded="true" height="124" name="Validation" width="90" x="313" y="34">
<parameter key="sampling_type" value="stratified sampling"/>
<!-- Training subprocess: a single k-NN learner fed from the "training" port. -->
<process expanded="true">
<!-- Learner named "k-NN"; its "k" parameter is the one addressed by the
     enclosing grid entry "k-NN.k". Uses weighted voting and the numerical
     measure "CamberraDistance". -->
<operator activated="true" class="k_nn" compatibility="7.2.001" expanded="true" height="82" name="k-NN" width="90" x="179" y="34">
<parameter key="k" value="5"/>
<parameter key="weighted_vote" value="true"/>
<parameter key="measure_types" value="NumericalMeasures"/>
<parameter key="numerical_measure" value="CamberraDistance"/>
</operator>
<connect from_port="training" to_op="k-NN" to_port="training set"/>
<connect from_op="k-NN" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<!-- Testing subprocess: applies the model to the "test set" port and forwards
     the resulting classification performance to "averagable 1". -->
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="7.2.001" expanded="true" height="82" name="Apply Model" width="90" x="112" y="34">
<list key="application_parameters"/>
</operator>
<!-- Classification performance with kappa enabled; referenced by name
     ("XVAL_Performance") from Log operators elsewhere in this process. -->
<operator activated="true" class="performance_classification" compatibility="7.2.001" expanded="true" height="82" name="XVAL_Performance" width="90" x="313" y="34">
<parameter key="kappa" value="true"/>
<list key="class_weights"/>
</operator>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="XVAL_Performance" to_port="labelled data"/>
<connect from_op="XVAL_Performance" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<!-- Logs one row per grid iteration, right after the cross-validation has run.
     Only values that exist while the optimization loop is executing are logged:
     the current k and distance measure of "k-NN", the performance of
     "XVAL_Performance", of "Validation" and of the optimizer itself.
     The former perf2_* / perf3_* columns read TRAIN_Performance and
     TEST_Performance, which run only after the optimization has finished, so
     they could never contain a meaningful value here and were removed. Train
     and test results are logged by their own Log operators further downstream. -->
<operator activated="true" class="log" compatibility="7.2.001" expanded="true" height="82" name="Log" width="90" x="514" y="34">
<list key="log">
<parameter key="k" value="operator.k-NN.parameter.k"/>
<parameter key="num_measures" value="operator.k-NN.parameter.numerical_measure"/>
<parameter key="Performance_perf" value="operator.XVAL_Performance.value.performance"/>
<parameter key="opt_par_perf" value="operator.Optimize Parameters (Grid).value.performance"/>
<parameter key="xval_perf" value="operator.Validation.value.performance"/>
</list>
</operator>
<!-- Inner wiring of the optimizer: input 1 -> Validation -> Log -> performance sink. -->
<connect from_port="input 1" to_op="Validation" to_port="training"/>
<connect from_op="Validation" from_port="averagable 1"
         to_op="Log" to_port="through 1"/>
<connect from_op="Log" from_port="through 1" to_port="performance"/>

<!-- Port layout of the optimizer subprocess. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_performance" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
</process>
</operator>
<!-- Applies a received parameter set to other operators. The name map
     redirects parameters recorded for "k-NN" to the operator "k-NN2". -->
<operator name="Set Parameters" class="set_parameters"
          activated="true" compatibility="7.2.001" expanded="true"
          height="82" width="90" x="916" y="85">
  <list key="name_map">
    <parameter key="k-NN" value="k-NN2"/>
  </list>
</operator>
<!-- Final model. "Set Parameters" only maps the optimized parameters of "k-NN"
     onto this operator (the grid varies k only), so every other setting must be
     repeated here to match the learner that was cross-validated. weighted_vote
     is therefore set explicitly, like the measure settings; without it the final
     model would not use the same voting scheme as the validated one. -->
<operator activated="true" class="k_nn" compatibility="7.2.001" expanded="true" height="82" name="k-NN2" width="90" x="581" y="391">
<parameter key="weighted_vote" value="true"/>
<parameter key="measure_types" value="NumericalMeasures"/>
<parameter key="numerical_measure" value="CamberraDistance"/>
<description align="center" color="transparent" colored="false" width="126">Final Model</description>
</operator>
<!-- Duplicates the trained model so it can be applied to more than one data set. -->
<operator name="Multiply Model" class="multiply"
          activated="true" compatibility="7.2.001" expanded="true"
          height="124" width="90" x="715" y="391"/>
<!-- Deactivated (activated="false"): would write the model to the given file
     with output type "Binary". -->
<operator name="Write Model" class="legacy:write_model"
          activated="false" compatibility="7.2.001" expanded="true"
          height="68" width="90" x="849" y="391">
  <parameter key="model_file" value="C:\Users\Marc\.RapidMiner\repositories\Marc\results\knnModell.mod"/>
  <parameter key="output_type" value="Binary"/>
</operator>
<!-- Model application feeding TRAIN_Performance; no application parameters set. -->
<operator name="Apply Model (2)" class="apply_model"
          activated="true" compatibility="7.2.001" expanded="true"
          height="82" width="90" x="849" y="238">
  <list key="application_parameters"/>
</operator>
<!-- Classification performance on the output of "Apply Model (2)", with the
     classification_error and kappa criteria switched on. -->
<operator name="TRAIN_Performance" class="performance_classification"
          activated="true" compatibility="7.2.001" expanded="true"
          height="82" width="90" x="983" y="238">
  <parameter key="classification_error" value="true"/>
  <parameter key="kappa" value="true"/>
  <list key="class_weights"/>
  <description align="center" color="gray" colored="true" width="126">Training Error</description>
</operator>
<!-- Logs accuracy and classification error of TRAIN_Performance, the operator
     wired directly in front of this log. It previously read XVAL_Performance,
     so the "train" log did not contain the training results at all.
     The operator name keeps its original spelling because the connections of
     the process refer to it by exactly this name. -->
<operator activated="true" class="log" compatibility="7.2.001" expanded="true" height="82" name="Log Train Perfromance" width="90" x="1117" y="238">
<list key="log">
<parameter key="accuracy" value="operator.TRAIN_Performance.value.accuracy"/>
<parameter key="classification error" value="operator.TRAIN_Performance.value.classification_error"/>
</list>
</operator>
<!-- Same attribute subset as the "Select Attributes" operator, applied to the
     second data partition. -->
<operator name="Select Attributes (3)" class="select_attributes"
          activated="true" compatibility="7.2.001" expanded="true"
          height="82" width="90" x="447" y="595">
  <parameter key="attribute_filter_type" value="subset"/>
  <parameter key="attributes" value="Durchmesser|FlächezuGesamtfläche LIMI|Fläche LIMI"/>
</operator>
<!-- Model application feeding TEST_Performance; no application parameters set. -->
<operator name="Apply Model (3)" class="apply_model"
          activated="true" compatibility="7.2.001" expanded="true"
          height="82" width="90" x="983" y="595">
  <list key="application_parameters"/>
</operator>
<!-- Classification performance on the output of "Apply Model (3)", with the
     classification_error and kappa criteria switched on. -->
<operator name="TEST_Performance" class="performance_classification"
          activated="true" compatibility="7.2.001" expanded="true"
          height="82" width="90" x="1117" y="595">
  <parameter key="classification_error" value="true"/>
  <parameter key="kappa" value="true"/>
  <list key="class_weights"/>
</operator>
<!-- Logs accuracy and classification error reported by TEST_Performance. -->
<operator name="Log Test Performance" class="log"
          activated="true" compatibility="7.2.001" expanded="true"
          height="82" width="90" x="1251" y="595">
  <list key="log">
    <parameter key="accuracy" value="operator.TEST_Performance.value.accuracy"/>
    <parameter key="classification error" value="operator.TEST_Performance.value.classification_error"/>
  </list>
</operator>
<!-- Data loading: partition 1 goes to the training branch, partition 2 to the test branch. -->
<connect from_op="Retrieve Master3Klassen_nominal" from_port="output" to_op="Split Data" to_port="example set"/>
<connect from_op="Split Data" from_port="partition 1" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Split Data" from_port="partition 2" to_op="Select Attributes (3)" to_port="example set input"/>

<!-- Training data fan-out: optimizer, training-error check and final learner. -->
<connect from_op="Select Attributes" from_port="example set output" to_op="Multiply Trainings Data" to_port="input"/>
<connect from_op="Multiply Trainings Data" from_port="output 1" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
<connect from_op="Multiply Trainings Data" from_port="output 2" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Multiply Trainings Data" from_port="output 3" to_op="k-NN2" to_port="training set"/>

<!-- Optimizer results: performance to result 1, parameter set via Set Parameters to result 2. -->
<connect from_op="Optimize Parameters (Grid)" from_port="performance" to_port="result 1"/>
<connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_op="Set Parameters" to_port="parameter set"/>
<connect from_op="Set Parameters" from_port="parameter set" to_port="result 2"/>

<!-- Final model fan-out to both model applications. -->
<connect from_op="k-NN2" from_port="model" to_op="Multiply Model" to_port="input"/>
<connect from_op="Multiply Model" from_port="output 1" to_op="Apply Model (2)" to_port="model"/>
<connect from_op="Multiply Model" from_port="output 3" to_op="Apply Model (3)" to_port="model"/>

<!-- Training-data evaluation chain, delivered to result 3. -->
<connect from_op="Apply Model (2)" from_port="labelled data" to_op="TRAIN_Performance" to_port="labelled data"/>
<connect from_op="TRAIN_Performance" from_port="performance" to_op="Log Train Perfromance" to_port="through 1"/>
<connect from_op="Log Train Perfromance" from_port="through 1" to_port="result 3"/>

<!-- Test-data evaluation chain, delivered to result 4. -->
<connect from_op="Select Attributes (3)" from_port="example set output" to_op="Apply Model (3)" to_port="unlabelled data"/>
<connect from_op="Apply Model (3)" from_port="labelled data" to_op="TEST_Performance" to_port="labelled data"/>
<connect from_op="TEST_Performance" from_port="performance" to_op="Log Test Performance" to_port="through 1"/>
<connect from_op="Log Test Performance" from_port="through 1" to_port="result 4"/>

<!-- Port layout of the top-level process. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="21"/>
<portSpacing port="sink_result 3" spacing="147"/>
<portSpacing port="sink_result 4" spacing="336"/>
<portSpacing port="sink_result 5" spacing="0"/>
<!-- Annotation notes for the process. The last note states the optimized range
     of k; it now says 5 as the upper bound so that it agrees with the grid
     "k-NN.k" = [1.0;5;2;linear] defined on the optimizer (it previously said 7). -->
<description align="center" color="blue" colored="true" height="201" resized="true" width="563" x="22" y="44">Load and Prep Training Data</description>
<description align="center" color="blue" colored="true" height="169" resized="true" width="461" x="130" y="552">Load and Prep Testing Data</description>
<description align="center" color="purple" colored="true" height="179" resized="true" width="301" x="745" y="10">Find Optimal Parameters</description>
<description align="center" color="purple" colored="true" height="183" resized="true" width="467" x="525" y="353">Apply Parameters and Train Model</description>
<description align="center" color="gray" colored="true" height="136" resized="false" width="417" x="996" y="553">Testing Error</description>
<description align="center" color="yellow" colored="false" height="120" resized="false" width="180" x="1086" y="387">Optimized parameters are k for k-NN between 1 and 5.&lt;br/&gt;&lt;br/&gt;Best parameter applied to the operator on the left with the note &amp;quot;Final Model&amp;quot;</description>
</process>
</operator>
</process>
Tagged:
Sign In or Register to comment.