RapidMiner

RapidMiner

Substantial bug in scoring

Contributor II

Substantial bug in scoring

Hello,

I've come across a pretty big bug. I build a model and then I read in a new dataset and try to score it.
The numeric attribute information stays the same, but the names I use as labels are written incorrectly. Is there
a fix or workaround for this problem?  I'm using WinXP and RapidMiner Community 4.2.0000.

Thanks, Mike



======= debug1.csv (TRAIN) =========

name,var1,var2,var3
Jimi,0.352612363,0.590121045,0.564992742
Janis,0.922569485,0.790112692,0.00504262
Bob,0.766240589,0.908079931,0.734902274
Peter,0.460154945,0.464329674,0.686559339
Paul,0.393046641,0.393054941,0.910596227
Mary,0.322384817,0.403900951,0.176867868
Joni,0.466668921,0.366803665,0.230654245

======= debug2.csv (SCORE) =========
* SAME NAME AS IN TRAINING DATASET
name,var1,var2,var3
Buddy,,0.576962167,0.318579208
Ringo,,0.571770059,0.977731173
Peter,,0.464329674,0.686559339 *
Paul,,0.393054941,0.910596227  *
Mary,,0.403900951,0.176867868  *
Joni,,0.366803665,0.230654245  *
John,,0.565080214,0.509177042
Jimi,,0.590121045,0.564992742  *
Jim,,0.864104026,0.839055131
Janis,,0.790112692,0.00504262  *
Bob,,0.908079931,0.734902274  *

======= SCORING OUTPUT =============
ROW NAME VAR1 PREDICTION(VAR1) VAR2 VAR3
1 Buddy NaN 0.506006992847086 0.577 0.319
2 Ringo NaN 0.5026999432485151 0.572 0.978
3 Paul NaN 0.4312676719193833 0.464 0.687
4 Mary NaN 0.38430756761967644 0.393 0.911
5 Joni NaN 0.39158307673653237 0.404 0.177
6 John NaN 0.3671109097071076 0.367 0.231<< *** ATTRIBUTES ARE OK
7 John NaN 0.4980700738105158 0.565 0.509<< *** BUT THE NAMES ARE WRONG!
8 Buddy NaN 0.5146053218033704 0.59 0.565
9 Jim NaN 0.8006220695941839 0.864 0.839
10 Ringo NaN 0.7430405001131846 0.79 0.0050
11 Peter NaN 0.8348597595558593 0.908 0.735


<!-- Reproduction process from the bug report: the "Model" chain trains a model and writes it to disk, the "Score" chain reloads that model and applies it to a second data set. -->
<operator name="Root" class="Process" expanded="yes">
    <!-- Training chain: read the training data, learn a model, apply it to the training set, save the model. -->
    <operator name="Model" class="OperatorChain" expanded="yes">
        <!-- Active training source. Column 1 is used as id and column 2 as label; with the posted header (name,var1,var2,var3) that is presumably name = id and var1 = label, which matches the prediction(var1) column in the posted output. -->
        <operator name="CSVExampleSource" class="CSVExampleSource" breakpoints="after">
            <parameter key="filename" value="c:\debug1.csv"/>
            <parameter key="id_column" value="1"/>
            <parameter key="label_column" value="2"/>
        </operator>
        <!-- Alternative Excel source with the same id/label columns; deactivated (activated="no"). -->
        <operator name="ExcelExampleSource" class="ExcelExampleSource" breakpoints="after" activated="no">
            <parameter key="excel_file" value="c:\debug1.xls"/>
            <parameter key="first_row_as_names" value="true"/>
            <parameter key="id_column" value="1"/>
            <parameter key="label_column" value="2"/>
        </operator>
        <!-- Learner; keep_example_set=true so the example set is still available to the ModelApplier that follows. -->
        <operator name="W-M5P" class="W-M5P">
            <parameter key="keep_example_set" value="true"/>
        </operator>
        <!-- Apply the model to the training data; keep_model=true so the model can still be written afterwards. -->
        <operator name="ModelApplier" class="ModelApplier">
            <list key="application_parameters">
            </list>
            <parameter key="keep_model" value="true"/>
        </operator>
        <!-- Persist the model; the "Score" chain below loads this same file. -->
        <operator name="ModelWriter" class="ModelWriter">
            <parameter key="model_file" value="c:\debug.mod"/>
            <parameter key="output_type" value="XML"/>
        </operator>
        <!-- The two consumers below presumably discard the training ExampleSet and the in-memory Model so that neither reaches the "Score" chain (TODO confirm against the IOConsumer documentation). -->
        <operator name="IOConsumer" class="IOConsumer">
            <parameter key="io_object" value="ExampleSet"/>
        </operator>
        <operator name="IOConsumer (2)" class="IOConsumer">
            <parameter key="io_object" value="Model"/>
        </operator>
    </operator>
    <!-- Scoring chain: read the data to score, load the saved model, apply it and write the result. -->
    <operator name="Score" class="OperatorChain" expanded="yes">
        <!-- NOTE(review): this CSV source is deactivated (activated="no"); the Excel source below is the one that is active in the posted process. -->
        <operator name="CSVExampleSource (2)" class="CSVExampleSource" breakpoints="after" activated="no">
            <parameter key="filename" value="c:\debug2.csv"/>
            <parameter key="id_column" value="1"/>
            <parameter key="label_column" value="2"/>
        </operator>
        <!-- Active scoring source. The contents of debug2.xls are not shown in the post; presumably it mirrors debug2.csv (verify). -->
        <operator name="ExcelExampleSource (2)" class="ExcelExampleSource">
            <parameter key="excel_file" value="c:\debug2.xls"/>
            <parameter key="first_row_as_names" value="true"/>
            <parameter key="id_column" value="1"/>
            <parameter key="label_column" value="2"/>
        </operator>
        <!-- Reload the model written by ModelWriter in the "Model" chain. -->
        <operator name="ModelLoader" class="ModelLoader">
            <parameter key="model_file" value="c:\debug.mod"/>
        </operator>
        <!-- Scoring step; according to the report, the id names in the output are wrong after this point while the numeric attributes are correct. -->
        <operator name="ModelApplier (2)" class="ModelApplier">
            <list key="application_parameters">
            </list>
        </operator>
        <!-- Write the scored example set as comma-separated values. -->
        <operator name="CSVExampleSetWriter" class="CSVExampleSetWriter" breakpoints="after">
            <parameter key="column_separator" value=","/>
            <parameter key="csv_file" value="c:\debug3.csv"/>
        </operator>
    </operator>
</operator>
2 REPLIES
Regular Contributor

Re: Substantial bug in scoring

Hello

Uh, this is really not nice. Another "Nominal Mapping sealed its own doom" error, I suppose...

here is a workaround:
I simply create a new id, apply the model and then restore the old id. Since "ExampleSetJoin" is used, your set should not be that large...

<!-- Workaround process: same training chain as the report, but the "Score" chain replaces the name id with a freshly generated id before applying the model and restores the name id afterwards via a join. -->
<operator name="Root" class="Process" expanded="yes">
    <!-- Training chain: read the training data, learn a model, apply it, save it to debug.mod. -->
    <operator name="Model" class="OperatorChain" expanded="yes">
        <!-- Column 1 as id, column 2 as label (presumably name and var1 for the posted CSV header). -->
        <operator name="CSVExampleSource" class="CSVExampleSource">
            <parameter key="filename" value="debug1.csv"/>
            <parameter key="id_column" value="1"/>
            <parameter key="label_column" value="2"/>
        </operator>
        <!-- Learner; keep_example_set=true so the example set is still available to the ModelApplier that follows. -->
        <operator name="W-M5P" class="W-M5P">
            <parameter key="keep_example_set" value="true"/>
        </operator>
        <!-- keep_model=true so the model can still be written by the ModelWriter below. -->
        <operator name="ModelApplier" class="ModelApplier">
            <list key="application_parameters">
            </list>
            <parameter key="keep_model" value="true"/>
        </operator>
        <!-- Persist the model; the "Score" chain loads this same file. -->
        <operator name="ModelWriter" class="ModelWriter">
            <parameter key="model_file" value="debug.mod"/>
            <parameter key="output_type" value="XML"/>
        </operator>
        <!-- Presumably discard the training ExampleSet and the in-memory Model before the "Score" chain runs (TODO confirm). -->
        <operator name="IOConsumer" class="IOConsumer">
            <parameter key="io_object" value="ExampleSet"/>
        </operator>
        <operator name="IOConsumer (2)" class="IOConsumer">
            <parameter key="io_object" value="Model"/>
        </operator>
    </operator>
    <!-- Scoring chain with the id workaround. -->
    <operator name="Score" class="OperatorChain" expanded="yes">
        <operator name="CSVExampleSource (2)" class="CSVExampleSource">
            <parameter key="filename" value="debug2.csv"/>
            <parameter key="id_column" value="1"/>
            <parameter key="label_column" value="2"/>
        </operator>
        <!-- Load the model written by the "Model" chain. -->
        <operator name="ModelLoader" class="ModelLoader">
            <parameter key="model_file" value="debug.mod"/>
        </operator>
        <!-- Step 1: change the role of the "name" attribute from id to "ignore" so that a new id can be created next. -->
        <operator name="ChangeAttributeRole" class="ChangeAttributeRole">
            <parameter key="name" value="name"/>
            <parameter key="target_role" value="ignore"/>
        </operator>
        <!-- Step 2: create the new id. -->
        <operator name="IdTagging" class="IdTagging">
        </operator>
        <!-- Step 3: presumably duplicates the ExampleSet so that one copy can keep the id/name pairs while the other is scored (TODO confirm). -->
        <operator name="IOMultiplier" class="IOMultiplier">
            <parameter key="io_object" value="ExampleSet"/>
        </operator>
        <!-- Step 4: reduce one copy to the id and name attributes only. -->
        <operator name="reduce_id_saver_set" class="OperatorChain" expanded="yes">
            <!-- Skips every attribute (".*") except those matched by the exception pattern. NOTE(review): "id||name" contains an empty alternative if it is interpreted as a regular expression; "id|name" was probably intended (confirm before changing). -->
            <operator name="remove_all_except_id_name" class="FeatureNameFilter">
                <parameter key="except_features_with_name" value="id||name"/>
                <parameter key="filter_special_features" value="true"/>
                <parameter key="skip_features_with_name" value=".*"/>
            </operator>
        </operator>
        <!-- Step 5: select the second ExampleSet, presumably the unreduced copy, as the one to score (verify the selection order). -->
        <operator name="IOSelector" class="IOSelector">
            <parameter key="io_object" value="ExampleSet"/>
            <parameter key="select_which" value="2"/>
        </operator>
        <!-- Step 6: drop the "name" attribute from the set that is scored; filter_special_features=true because "name" now has a special role. -->
        <operator name="remove_name" class="FeatureNameFilter">
            <parameter key="filter_special_features" value="true"/>
            <parameter key="skip_features_with_name" value="name"/>
        </operator>
        <!-- Step 7: apply the model to the set that carries only the generated id. -->
        <operator name="ModelApplier (2)" class="ModelApplier">
            <list key="application_parameters">
            </list>
            <parameter key="keep_model" value="true"/>
        </operator>
        <!-- Step 8: join the scored set with the saved id/name set, presumably on the generated id. The author of the workaround notes that the data set should not be too large because of this join. -->
        <operator name="ExampleSetJoin" class="ExampleSetJoin">
        </operator>
        <!-- Step 9: make "name" the id attribute again. -->
        <operator name="restore_old_id" class="ChangeAttributeRole">
            <parameter key="name" value="name"/>
            <parameter key="target_role" value="id"/>
        </operator>
        <!-- Write the scored result as comma-separated values. -->
        <operator name="CSVExampleSetWriter" class="CSVExampleSetWriter">
            <parameter key="column_separator" value=","/>
            <parameter key="csv_file" value="debug4.csv"/>
        </operator>
    </operator>
</operator>



hope this was helpful

Steffen

PS: Setting the old id to "inactive" does not work (as expected)
PPS: Note that there is an attachment function. It is hidden under "additional options" in the post-reply dialog, so you can attach XML files and saved .csv files (of moderate size) to your post.
Contributor II

Re: Substantial bug in scoring

Thanks for the workaround. 
-Mike