Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you previously used to log in to the RapidMiner community. This will ensure that any previously created content is synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Replacing a missing value with the mean average, but wait there's more....

juanVjuanV Member Posts: 2 Learner I
edited September 2019 in Help
I was using the "Replace Missing Values" operator to fill in the missing values with an average, but what it does is calculate the average over the entire row. What I want instead is to replace the missing values in Rent and Sqft with the average based on area and Unit Type. I don't know whether I'm using the operator incorrectly or whether there is another one I should be using. I have provided the data below; it is data from apartments.com, so there is no privacy issue to worry about. I have a background in Biovia Pipeline Pilot and am now learning the RapidMiner way, so bear with me. :smile:

Tagged:

Answers

  • yyhuangyyhuang Administrator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 364 RM Data Scientist
    edited September 2019
    Hi @juanV,

    Do you want to try the "Impute Missing Values" operator for the missing values in Rent and Sqft?

    It imputes each missing Rent value from the non-missing Rent values of its nearest neighbors (k = 5).

    Sample process below:

    <?xml version="1.0" encoding="UTF-8"?><process version="9.4.000-BETA2">
      <!-- Sample process: loads the example data, then produces three results:
           average-replaced data, the untouched data, and data whose Rent/SqFt
           values were imputed with a k-NN model. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.4.000-BETA2" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value="yhuang@rapidminer.com"/>
        <parameter key="process_duration_for_mail" value="1"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Load the input example set from the repository entry below. -->
          <operator activated="true" class="retrieve" compatibility="9.4.000-BETA2" expanded="true" height="68" name="Retrieve example_data" width="90" x="112" y="34">
            <parameter key="repository_entry" value="//demo/AutoModel/example_data"/>
          </operator>
          <!-- Baseline: attribute filter is "all" and the default replacement is "average".
               Its "original" port is what feeds the rest of the process (see connections). -->
          <operator activated="true" class="replace_missing_values" compatibility="9.4.000-BETA2" expanded="true" height="103" name="Replace Missing Values" width="90" x="313" y="34">
            <parameter key="return_preprocessing_model" value="false"/>
            <parameter key="create_view" value="false"/>
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="default" value="average"/>
            <list key="columns"/>
          </operator>
          <!-- Splits the data into two outputs: output 1 goes directly to result 3,
               output 2 feeds the Declare Missing Value / Impute Missing Values branch. -->
          <operator activated="true" class="multiply" compatibility="9.4.000-BETA2" expanded="true" height="103" name="Multiply" width="90" x="514" y="85"/>
          <!-- Declares the numeric value 1.0 in the single attribute SqFt as missing. -->
          <operator activated="true" class="declare_missing_value" compatibility="9.4.000-BETA2" expanded="true" height="82" name="Declare Missing Value" width="90" x="648" y="136">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="SqFt"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="mode" value="numeric"/>
            <parameter key="numeric_value" value="1.0"/>
            <parameter key="expression_value" value=""/>
            <description align="center" color="transparent" colored="false" width="126">sqft = 1?</description>
          </operator>
          <!-- Same as the previous operator, but for the numeric value 10.0 in SqFt. -->
          <operator activated="true" class="declare_missing_value" compatibility="9.4.000-BETA2" expanded="true" height="82" name="Declare Missing Value (2)" width="90" x="782" y="136">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="SqFt"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="mode" value="numeric"/>
            <parameter key="numeric_value" value="10.0"/>
            <parameter key="expression_value" value=""/>
            <description align="center" color="transparent" colored="false" width="126">sqft =10?</description>
          </operator>
          <!-- Imputes the attribute subset Rent|SqFt using the model built by the
               inner subprocess (a k-NN learner, see below). -->
          <operator activated="true" class="impute_missing_values" compatibility="9.4.000-BETA2" expanded="true" height="68" name="Impute Missing Values" width="90" x="916" y="136">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="Rent|SqFt"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="iterate" value="true"/>
            <parameter key="learn_on_complete_cases" value="true"/>
            <parameter key="order" value="chronological"/>
            <parameter key="sort" value="ascending"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <process expanded="true">
              <!-- Inner learner: k-NN with k=5, weighted vote, mixed Euclidean distance.
                   Its model is handed back to Impute Missing Values via the model sink. -->
              <operator activated="true" class="k_nn" compatibility="9.4.000-BETA2" expanded="true" height="82" name="k-NN" width="90" x="246" y="34">
                <parameter key="k" value="5"/>
                <parameter key="weighted_vote" value="true"/>
                <parameter key="measure_types" value="MixedMeasures"/>
                <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
                <parameter key="nominal_measure" value="NominalDistance"/>
                <parameter key="numerical_measure" value="EuclideanDistance"/>
                <parameter key="divergence" value="GeneralizedIDivergence"/>
                <parameter key="kernel_type" value="radial"/>
                <parameter key="kernel_gamma" value="1.0"/>
                <parameter key="kernel_sigma1" value="1.0"/>
                <parameter key="kernel_sigma2" value="0.0"/>
                <parameter key="kernel_sigma3" value="2.0"/>
                <parameter key="kernel_degree" value="3.0"/>
                <parameter key="kernel_shift" value="1.0"/>
                <parameter key="kernel_a" value="1.0"/>
                <parameter key="kernel_b" value="0.0"/>
              </operator>
              <connect from_port="example set source" to_op="k-NN" to_port="training set"/>
              <connect from_op="k-NN" from_port="model" to_port="model sink"/>
              <portSpacing port="source_example set source" spacing="0"/>
              <portSpacing port="sink_model sink" spacing="0"/>
            </process>
          </operator>
          <!-- Wiring summary:
               result 1 = Replace Missing Values output (average replacement);
               result 3 = Multiply output 1 (data from the "original" port of Replace Missing Values);
               result 2 = Impute Missing Values output, after both Declare Missing Value steps. -->
          <connect from_op="Retrieve example_data" from_port="output" to_op="Replace Missing Values" to_port="example set input"/>
          <connect from_op="Replace Missing Values" from_port="example set output" to_port="result 1"/>
          <connect from_op="Replace Missing Values" from_port="original" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_port="result 3"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Declare Missing Value" to_port="example set input"/>
          <connect from_op="Declare Missing Value" from_port="example set output" to_op="Declare Missing Value (2)" to_port="example set input"/>
          <connect from_op="Declare Missing Value (2)" from_port="example set output" to_op="Impute Missing Values" to_port="example set in"/>
          <connect from_op="Impute Missing Values" from_port="example set out" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
        </process>
      </operator>
    </process>
    
    


  • yyhuangyyhuang Administrator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 364 RM Data Scientist
    I just noticed that in the data, Sqft has some very low values (1 or 10), so I added "Declare Missing Value" operators before the imputation to treat those values as missing.
Sign In or Register to comment.