The RapidMiner community is in read-only mode until further notice. Technical support via cases will continue to work as is. For any urgent licensing-related requests from Students/Faculty members, please use the Altair academic forum here.

Discretizing by frequency with highly modal data

tennenrishin Member Posts: 177 Contributor II
edited November 2018 in Help
Discretize by Frequency operator says:
The selected number of ranges is not applicable for the attribute x, because it has too many equal values.
If there are too many same values, a bin might grow over specified size, because values can't be distinguished. If it grows more than twice it's size some bins would vanish completely, causing this error.

The parent process is run on a wide variety of different input example sets. Is there any simple way to make RM solve this problem by allowing individual bins to grow indefinitely, and basing the frequency discretization on the remainder of the data?

For example, the data {1,1,1,1,1,2,3,8,9} with bin count 3, should be binned as follows:
1,1,1,1,1
2,3
8,9

EDIT: What I'm basically saying is:
The Discretize by Frequency operator can fail fatally just because of a coincidence in the input data. Should this exception not be handled internally by the operator, perhaps with a warning message stating that some bins might be bigger than expected?

Answers

  • MariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    Hi,

    I created an internal bug report for that.

    Best, Marius
  • tennenrishin Member Posts: 177 Contributor II
    Can anyone think of a workaround to this problem? That is, a way of determining when this error might occur, so that one can fail over to a different discretization, such as Discretize by Binning, or add some random noise before discretizing by frequency.

    Here is a minimalistic demo of the problem.
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      Minimal reproduction of the Discretize by Frequency failure described in
      this thread. Six "Generate Data by User Specification" operators each
      define an attribute "a"; their outputs are appended and the merged set
      is discretized by frequency into 3 bins.
      NOTE(review): each generator presumably emits a single example, which
      would make the merged data {1,1,1,1,2,3}. TODO confirm.
    -->
    <process version="5.3.005">
     <context>
       <input/>
       <output/>
       <macros/>
     </context>
     <operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Process">
       <process expanded="true">
         <!-- Data generators. The value of "a" is 1 for four of them, 2 for one and 3 for one. -->
         <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="45" y="75">
           <list key="attribute_values">
             <parameter key="a" value="1"/>
           </list>
           <list key="set_additional_roles"/>
         </operator>
         <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (4)" width="90" x="45" y="525">
           <list key="attribute_values">
             <parameter key="a" value="3"/>
           </list>
           <list key="set_additional_roles"/>
         </operator>
         <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (2)" width="90" x="45" y="165">
           <list key="attribute_values">
             <parameter key="a" value="1"/>
           </list>
           <list key="set_additional_roles"/>
         </operator>
         <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (3)" width="90" x="45" y="255">
           <list key="attribute_values">
             <parameter key="a" value="1"/>
           </list>
           <list key="set_additional_roles"/>
         </operator>
         <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (5)" width="90" x="45" y="345">
           <list key="attribute_values">
             <parameter key="a" value="1"/>
           </list>
           <list key="set_additional_roles"/>
         </operator>
         <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (6)" width="90" x="45" y="435">
           <list key="attribute_values">
             <parameter key="a" value="2"/>
           </list>
           <list key="set_additional_roles"/>
         </operator>
         <!-- Merge the six generated sets into one example set. -->
         <operator activated="true" class="append" compatibility="5.3.005" expanded="true" height="166" name="Append" width="90" x="179" y="165"/>
         <!-- Operator under test: frequency discretization with a fixed bin count of 3. -->
         <operator activated="true" class="discretize_by_frequency" compatibility="5.3.005" expanded="true" height="94" name="Discretize" width="90" x="313" y="120">
           <parameter key="number_of_bins" value="3"/>
         </operator>
         <!-- Wiring: Append ports 1 to 4 receive the sets with a=1, port 5 the set with a=2, port 6 the set with a=3. -->
         <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
         <connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append" to_port="example set 6"/>
         <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
         <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
         <connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append" to_port="example set 4"/>
         <connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append" to_port="example set 5"/>
         <connect from_op="Append" from_port="merged set" to_op="Discretize" to_port="example set input"/>
         <connect from_op="Discretize" from_port="example set output" to_port="result 1"/>
         <portSpacing port="source_input 1" spacing="0"/>
         <portSpacing port="sink_result 1" spacing="0"/>
         <portSpacing port="sink_result 2" spacing="0"/>
       </process>
     </operator>
    </process>
  • tennenrishin Member Posts: 177 Contributor II
    This workaround isn't ideal. Does anyone have a better idea?
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      Workaround sketch for the Discretize by Frequency failure. Same input
      data as the minimal demo, but before discretizing the process:
        1. aggregates attribute "a" with the "mode" function and stores the
           result in macro "mode";
        2. filters the examples with a=%{mode} and stores a value derived
           from that subset in macro "modecount";
        3. stores a value derived from the unfiltered set in macro "count";
        4. sets macro "bincount" to 3;
        5. branches on modecount >= 2 * count / bincount, where one branch
           adds noise (1.0E-9) to "a" and the other passes the data through;
        6. discretizes by frequency into %{bincount} bins.
      NOTE(review): only the single most frequent value is checked; whether
      other heavily repeated values can still trigger the error is not
      addressed here.
    -->
    <process version="5.3.005">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Process">
        <process expanded="true">
          <!-- Data generators. The value of "a" is 1 for four of them, 2 for one and 3 for one. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="45" y="75">
            <list key="attribute_values">
              <parameter key="a" value="1"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (4)" width="90" x="45" y="525">
            <list key="attribute_values">
              <parameter key="a" value="3"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (2)" width="90" x="45" y="165">
            <list key="attribute_values">
              <parameter key="a" value="1"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (3)" width="90" x="45" y="255">
            <list key="attribute_values">
              <parameter key="a" value="1"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (5)" width="90" x="45" y="345">
            <list key="attribute_values">
              <parameter key="a" value="1"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (6)" width="90" x="45" y="435">
            <list key="attribute_values">
              <parameter key="a" value="2"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Merge the six generated sets into one example set. -->
          <operator activated="true" class="append" compatibility="5.3.005" expanded="true" height="166" name="Append" width="90" x="179" y="165"/>
          <!-- Aggregate attribute "a" with the default function "mode"; the result is read below as attribute "mode(a)". -->
          <operator activated="true" class="aggregate" compatibility="5.3.005" expanded="true" height="76" name="Aggregate" width="90" x="313" y="210">
            <parameter key="use_default_aggregation" value="true"/>
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="a"/>
            <parameter key="default_aggregation_function" value="mode"/>
            <list key="aggregation_attributes"/>
          </operator>
          <!-- Macro "mode": data value of attribute "mode(a)" at example index 1 of the aggregated set. -->
          <operator activated="true" class="extract_macro" compatibility="5.3.005" expanded="true" height="60" name="Extract Macro" width="90" x="380" y="120">
            <parameter key="macro" value="mode"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="mode(a)"/>
            <parameter key="example_index" value="1"/>
            <list key="additional_macros"/>
          </operator>
          <!-- Keep only the examples whose "a" equals the extracted mode (input is the Aggregate "original" port). -->
          <operator activated="true" class="filter_examples" compatibility="5.3.005" expanded="true" height="76" name="Filter Examples" width="90" x="447" y="210">
            <parameter key="condition_class" value="attribute_value_filter"/>
            <parameter key="parameter_string" value="a=%{mode}"/>
          </operator>
          <!--
            Macro "modecount", extracted from the filtered set.
            NOTE(review): no macro_type is set here, so the operator default
            applies. Presumably that default is the number of examples, in
            which case attribute_name and example_index are unused leftovers.
            TODO confirm against the Extract Macro documentation.
          -->
          <operator activated="true" class="extract_macro" compatibility="5.3.005" expanded="true" height="60" name="Extract Macro (3)" width="90" x="581" y="165">
            <parameter key="macro" value="modecount"/>
            <parameter key="attribute_name" value="mode(a)"/>
            <parameter key="example_index" value="1"/>
            <list key="additional_macros"/>
          </operator>
          <!--
            Macro "count", extracted from the Filter Examples "original" port
            (the unfiltered set). NOTE(review): again relies on the default
            macro_type, presumably the number of examples. TODO confirm.
          -->
          <operator activated="true" class="extract_macro" compatibility="5.3.005" expanded="true" height="60" name="Extract Macro (2)" width="90" x="447" y="345">
            <parameter key="macro" value="count"/>
            <list key="additional_macros"/>
          </operator>
          <!-- Macro "bincount" = 3; used by both the Branch condition and the Discretize operator. -->
          <operator activated="true" class="set_macro" compatibility="5.3.005" expanded="true" height="76" name="Set Macro" width="90" x="581" y="345">
            <parameter key="macro" value="bincount"/>
            <parameter key="value" value="3"/>
          </operator>
          <!--
            Expression condition: modecount >= 2 * count / bincount.
            The first subprocess adds noise to "a"; the second passes the data
            through unchanged. NOTE(review): presumably the first subprocess
            runs when the condition is true. TODO confirm.
          -->
          <operator activated="true" class="branch" compatibility="5.3.005" expanded="true" height="76" name="Branch" width="90" x="581" y="435">
            <parameter key="condition_type" value="expression"/>
            <parameter key="condition_value" value="%{modecount}&gt;=2*%{count}/%{bincount}"/>
            <process expanded="true">
              <!-- Perturb attribute "a" only (noise 1.0E-9, label noise 0.0) so that equal values become distinguishable. -->
              <operator activated="true" class="add_noise" compatibility="5.3.005" expanded="true" height="94" name="Add Noise" width="90" x="179" y="30">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="a"/>
                <parameter key="label_noise" value="0.0"/>
                <parameter key="default_attribute_noise" value="1.0E-9"/>
                <list key="noise"/>
              </operator>
              <connect from_port="condition" to_op="Add Noise" to_port="example set input"/>
              <connect from_op="Add Noise" from_port="example set output" to_port="input 1"/>
              <portSpacing port="source_condition" spacing="0"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="sink_input 1" spacing="0"/>
              <portSpacing port="sink_input 2" spacing="0"/>
            </process>
            <process expanded="true">
              <!-- Pass-through: the condition input is forwarded unchanged. -->
              <connect from_port="condition" to_port="input 1"/>
              <portSpacing port="source_condition" spacing="0"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="sink_input 1" spacing="0"/>
              <portSpacing port="sink_input 2" spacing="0"/>
            </process>
          </operator>
          <!-- Frequency discretization; the bin count comes from macro "bincount". -->
          <operator activated="true" class="discretize_by_frequency" compatibility="5.3.005" expanded="true" height="94" name="Discretize" width="90" x="782" y="390">
            <parameter key="number_of_bins" value="%{bincount}"/>
          </operator>
          <!-- Wiring: generators feed Append, then Aggregate, Filter Examples and the macro operators, then Branch and Discretize. -->
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append" to_port="example set 6"/>
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
          <connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append" to_port="example set 4"/>
          <connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append" to_port="example set 5"/>
          <connect from_op="Append" from_port="merged set" to_op="Aggregate" to_port="example set input"/>
          <connect from_op="Aggregate" from_port="example set output" to_op="Extract Macro" to_port="example set"/>
          <connect from_op="Aggregate" from_port="original" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Extract Macro (3)" to_port="example set"/>
          <connect from_op="Filter Examples" from_port="original" to_op="Extract Macro (2)" to_port="example set"/>
          <connect from_op="Extract Macro (2)" from_port="example set" to_op="Set Macro" to_port="through 1"/>
          <connect from_op="Set Macro" from_port="through 1" to_op="Branch" to_port="condition"/>
          <connect from_op="Branch" from_port="input 1" to_op="Discretize" to_port="example set input"/>
          <connect from_op="Discretize" from_port="example set output" to_port="result 1"/>
          <connect from_op="Discretize" from_port="original" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>
Sign In or Register to comment.