RapidMiner 9.7 is Now Available

Lots of amazing new improvements including true version control! Learn more about what's new here.

CLICK HERE TO DOWNLOAD

Loop K-means - how to write a CSV choosing the # of K

SLaxmidasSLaxmidas Member Posts: 2 Learner I
edited June 11 in Help
Hi there,

I'm running a process that loops k-means over k = 2 to k = 13. After manually comparing the Davies-Bouldin (DB) index for each k, I like the results for k = 3.
Now I want to write a CSV file with a new attribute, the cluster (0, 1, 2), but when I add the operator to this process it writes the file based on k=13.

Attached is the data set, and below is the process XML.

Thanks in advance!

<?xml version="1.0" encoding="UTF-8"?><process version="9.6.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.4.000" expanded="true" name="Root" origin="GENERATED_SAMPLE">
    <parameter key="logverbosity" value="warning"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Entry point of the data flow: reads the repository entry given below
           (a relative repository path) and hands it to "Filter Examples (2)". -->
      <operator activated="true" class="retrieve" compatibility="9.6.000" expanded="true" height="68" name="Retrieve CL_loan_ow" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../Clustering_loans/CL_loan_ow"/>
      </operator>
      <!-- Row filter using condition_class "custom_filters" with a single entry,
           client_disp_type_owner.eq.1, and invert_filter set to false.
           parameter_expression is left empty; presumably it is unused for this
           condition class (confirm against the operator documentation). -->
      <operator activated="true" class="filter_examples" compatibility="9.6.000" expanded="true" height="103" name="Filter Examples (2)" width="90" x="179" y="34">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="custom_filters"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list">
          <parameter key="filters_entry_key" value="client_disp_type_owner.eq.1"/>
        </list>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
      </operator>
      <!-- Missing value handling: attribute_filter_type is "all", the default
           replacement is "zero", and no per-column overrides are listed
           (the "columns" list is empty). Special attributes are not included
           (include_special_attributes is false). -->
      <operator activated="true" class="replace_missing_values" compatibility="9.6.000" expanded="true" height="103" name="Replace Missing Values" width="90" x="313" y="34">
        <parameter key="return_preprocessing_model" value="false"/>
        <parameter key="create_view" value="false"/>
        <parameter key="attribute_filter_type" value="all"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="default" value="zero"/>
        <list key="columns"/>
      </operator>
      <!-- Derives two new attributes, each defined as log(x + 1) of an existing
           monthly-average amount attribute. keep_all is true, so the existing
           attributes are kept alongside the new ones. -->
      <operator activated="true" class="generate_attributes" compatibility="9.6.000" expanded="true" height="82" name="Generate Attributes" width="90" x="112" y="238">
        <list key="function_descriptions">
          <parameter key="log_avg_month_ks_oldage_pension_amount" value="log(avg_month_ks_oldage_pension_amount+1)"/>
          <parameter key="log_avg_month_ks_insur_payment_amount" value="log(avg_month_ks_insur_payment_amount+1)"/>
        </list>
        <parameter key="keep_all" value="true"/>
      </operator>
      <!-- Keeps the 13 attributes named in the pipe-separated "attributes" subset
           (invert_selection is false).
           NOTE(review): log_avg_month_ks_oldage_pension_amount, created by the
           upstream "Generate Attributes" operator, is not in this list, so it
           appears to be dropped here; confirm that is intended.
           NOTE(review): client_id is kept. Unless it carries a special role in the
           stored data it would reach the clustering step as a regular attribute;
           verify against the data set. -->
      <operator activated="true" class="select_attributes" compatibility="9.6.000" expanded="true" height="82" name="Select Attributes" width="90" x="246" y="238">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="avg_month_ks_household_amount|avg_month_ks_negative_bal_amount|avg_month_total_transactions_amount|avg_month_transactions_to_other_banks_amount|client_id|last_transaction_age_year|loan_age_months|loan_amount|loan_duration|loan_status|log_avg_month_ks_insur_payment_amount|no_enterpreneurs_per_1000_inhabitants|has_credit_card"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!-- Normalizes the attributes named in the "attributes" subset using the
           "Z-transformation" method. The min / max / allow_negative_values
           parameters are also stored; presumably they only apply to other
           methods (confirm against the operator documentation).
           NOTE(review): the subset names attributes that the upstream
           "Select Attributes" list does not keep (for example
           avg_month_avg_transaction_amount and client_age_years) and does not
           name client_id, which is kept; confirm the two lists are meant to differ.
           NOTE(review): return_preprocessing_model is false while the process
           wiring consumes this operator's "preprocessing model" port; confirm
           the model is still delivered with this setting. -->
      <operator activated="true" class="normalize" compatibility="9.6.000" expanded="true" height="103" name="Normalize" width="90" x="447" y="238">
        <parameter key="return_preprocessing_model" value="false"/>
        <parameter key="create_view" value="false"/>
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="avg_month_avg_transaction_amount|avg_month_ks_household_amount|avg_month_ks_negative_bal_amount|avg_month_total_credit_amount|avg_month_total_transactions_amount|avg_month_total_withdrawal_amount|avg_month_transactions_to_other_banks_amount|client_age_years|client_card_age_months|last_transaction_age_year|loan_age_months|loan_amount|loan_duration|loan_status|log_avg_month_ks_insur_payment_amount|log_avg_month_ks_oldage_pension_amount|no_enterpreneurs_per_1000_inhabitants|has_credit_card"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="numeric"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="real"/>
        <parameter key="block_type" value="value_series"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_series_end"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="method" value="Z-transformation"/>
        <parameter key="min" value="0.0"/>
        <parameter key="max" value="1.0"/>
        <parameter key="allow_negative_values" value="false"/>
      </operator>
      <!-- Takes the preprocessing model coming from "Normalize" and passes its
           "model output" to ParameterIteration input 2, where it is applied by
           the apply_model operator inside the loop.
           missing_attribute_handling is set to "proceed on missing". -->
      <operator activated="true" class="denormalize" compatibility="9.6.000" expanded="true" height="82" name="De-Normalize" width="90" x="581" y="391">
        <parameter key="missing_attribute_handling" value="proceed on missing"/>
      </operator>
      <operator activated="true" class="loop_parameters" compatibility="9.6.000" expanded="true" height="124" name="ParameterIteration" origin="GENERATED_SAMPLE" width="90" x="715" y="238">
        <list key="parameters">
          <parameter key="KMeans.k" value="2,3,4,5,6,7,8,9,10,11,12,13"/>
        </list>
        <parameter key="error_handling" value="fail on error"/>
        <parameter key="synchronize" value="false"/>
        <process expanded="true">
          <!-- Clustering step of the parameter loop. The enclosing loop_parameters
               operator lists "KMeans.k" with the values 2 to 13, so the k stored
               below (13, the last value in that list) is the parameter the loop
               varies. add_cluster_attribute is true and add_as_label is false.
               measure_types is BregmanDivergences with divergence
               SquaredEuclideanDistance; the kernel_* and other *_measure values
               are stored as well, presumably unused for this measure type
               (confirm against the operator documentation).
               use_local_random_seed is false. -->
          <operator activated="true" class="concurrency:k_means" compatibility="9.0.001" expanded="true" height="82" name="KMeans" origin="GENERATED_SAMPLE" width="90" x="246" y="34">
            <parameter key="add_cluster_attribute" value="true"/>
            <parameter key="add_as_label" value="false"/>
            <parameter key="remove_unlabeled" value="false"/>
            <parameter key="k" value="13"/>
            <parameter key="max_runs" value="10"/>
            <parameter key="determine_good_start_values" value="false"/>
            <parameter key="measure_types" value="BregmanDivergences"/>
            <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
            <parameter key="nominal_measure" value="NominalDistance"/>
            <parameter key="numerical_measure" value="EuclideanDistance"/>
            <parameter key="divergence" value="SquaredEuclideanDistance"/>
            <parameter key="kernel_type" value="radial"/>
            <parameter key="kernel_gamma" value="1.0"/>
            <parameter key="kernel_sigma1" value="1.0"/>
            <parameter key="kernel_sigma2" value="0.0"/>
            <parameter key="kernel_sigma3" value="2.0"/>
            <parameter key="kernel_degree" value="3.0"/>
            <parameter key="kernel_shift" value="1.0"/>
            <parameter key="kernel_a" value="1.0"/>
            <parameter key="kernel_b" value="0.0"/>
            <parameter key="max_optimization_steps" value="100"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
          </operator>
          <!-- "Multiply (3)" fans out the KMeans cluster model to "Evaluation" and
               "Cluster Model Visualizer". -->
          <operator activated="true" class="multiply" compatibility="9.6.000" expanded="true" height="103" name="Multiply (3)" width="90" x="447" y="34"/>
          <!-- "Multiply" fans out the KMeans clustered set to five consumers:
               "Evaluation", "Loop Values", the apply_model operator,
               "Cluster Model Visualizer" and "Write CSV". -->
          <operator activated="true" class="multiply" compatibility="9.6.000" expanded="true" height="166" name="Multiply" width="90" x="380" y="187"/>
          <!-- Writes the clustered example set of the current loop iteration to a
               semicolon-separated CSV file with a header row.
               This operator runs once per value of KMeans.k in the enclosing
               parameter loop. With a fixed file name and append_to_file set to
               false, every iteration overwrote the previous file, so only the
               last run (k=13) survived. The file name therefore embeds %{a},
               RapidMiner's predefined macro for the number of times this operator
               has been applied, giving one file per iteration.
               Assuming the loop applies the k values in the listed order (2 to 13),
               the file ending in run2 holds the k=3 clustering; verify against the
               k column of the process log. -->
          <operator activated="true" class="write_csv" compatibility="9.6.000" expanded="true" height="82" name="Write CSV" width="90" x="908" y="340">
            <parameter key="csv_file" value="C:\Users\sarit\OneDrive\Documents\PBS\PROJETO_I\Clustering\20200611_run%{a}.csv"/>
            <parameter key="column_separator" value=";"/>
            <parameter key="write_attribute_names" value="true"/>
            <parameter key="quote_nominal_values" value="true"/>
            <parameter key="format_date_attributes" value="true"/>
            <parameter key="append_to_file" value="false"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <!-- Receives the cluster model (via "Multiply (3)") and the clustered data
               (via "Multiply"); its "visualizer output" is delivered on result 2.
               No parameters are stored for this operator. -->
          <operator activated="true" class="model_simulator:cluster_model_visualizer" compatibility="9.6.000" expanded="true" height="103" name="Cluster Model Visualizer" width="90" x="715" y="34"/>
          <!-- Cluster distance performance for the current k. The main criterion is
               "Avg. within centroid distance", and main_criterion_only is false.
               "ProcessLog" reads the DaviesBouldin and avg_within_distance values
               from this operator. normalize and maximize are both true. -->
          <operator activated="true" class="cluster_distance_performance" compatibility="9.3.001" expanded="true" height="103" name="Evaluation" origin="GENERATED_SAMPLE" width="90" x="648" y="187">
            <parameter key="main_criterion" value="Avg. within centroid distance"/>
            <parameter key="main_criterion_only" value="false"/>
            <parameter key="normalize" value="true"/>
            <parameter key="maximize" value="true"/>
          </operator>
          <!-- Logs three columns to the file named below: "k" (the KMeans k
               parameter), "DB" (the Evaluation DaviesBouldin value) and
               "Avg_distance_within_cluster" (the Evaluation avg_within_distance
               value). sorting_type is "none" and persistent is false.
               The file name is an absolute path on the author's machine. -->
          <operator activated="true" class="log" compatibility="9.6.000" expanded="true" height="103" name="ProcessLog" origin="GENERATED_SAMPLE" width="90" x="782" y="187">
            <parameter key="filename" value="C:\Users\sarit\OneDrive\Documents\PBS\PROJETO_I\Clustering\ficheiros_Joao\teste9with_cor_matrix.log"/>
            <list key="log">
              <parameter key="k" value="operator.KMeans.parameter.k"/>
              <parameter key="DB" value="operator.Evaluation.value.DaviesBouldin"/>
              <parameter key="Avg_distance_within_cluster" value="operator.Evaluation.value.avg_within_distance"/>
            </list>
            <parameter key="sorting_type" value="none"/>
            <parameter key="sorting_k" value="100"/>
            <parameter key="persistent" value="false"/>
          </operator>
          <!-- apply_model step: its "model" port is fed from loop input 2 (the
               De-Normalize model output) and its "unlabelled data" port from the
               clustered set (Multiply output 3). The result goes to
               "Loop Values" input 2. No application parameters are set. -->
          <operator activated="true" class="apply_model" compatibility="7.1.001" expanded="true" height="82" name="Denormalised original data with clusters and labels" width="90" x="514" y="442">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <!-- Iterates over the values of the "cluster" attribute, exposing the
               current value as the macro loop_value. Parallel execution is enabled.
               Inside the loop only "input 2" is wired: it is filtered with the
               expression cluster==%{loop_value}, passed to "Statistics", and the
               statistics are delivered on output 1. "input 1" is not connected
               inside the subprocess. -->
          <operator activated="true" class="concurrency:loop_values" compatibility="9.6.000" expanded="true" height="103" name="Loop Values" width="90" x="715" y="442">
            <parameter key="attribute" value="cluster"/>
            <parameter key="iteration_macro" value="loop_value"/>
            <parameter key="reuse_results" value="false"/>
            <parameter key="enable_parallel_execution" value="true"/>
            <process expanded="true">
              <!-- condition_class is "expression", so parameter_expression is the
                   active condition; the filters_list entry is presumably ignored
                   for this condition class (confirm).
                   NOTE(review): the macro is substituted without quotes; if the
                   cluster values are nominal, check that the expression still
                   compares against a string literal as intended. -->
              <operator activated="true" class="filter_examples" compatibility="9.6.000" expanded="true" height="103" name="Filter Examples" width="90" x="313" y="136">
                <parameter key="parameter_expression" value="cluster==%{loop_value}"/>
                <parameter key="condition_class" value="expression"/>
                <parameter key="invert_filter" value="false"/>
                <list key="filters_list">
                  <parameter key="filters_entry_key" value="cluster.equals.loop_value"/>
                </list>
                <parameter key="filters_logic_and" value="true"/>
                <parameter key="filters_check_metadata" value="true"/>
              </operator>
              <!-- Statistics over the filtered examples of the current cluster value. -->
              <operator activated="true" class="model_simulator:data_statistics" compatibility="9.6.000" expanded="true" height="82" name="Statistics" width="90" x="581" y="136"/>
              <connect from_port="input 2" to_op="Filter Examples" to_port="example set input"/>
              <connect from_op="Filter Examples" from_port="example set output" to_op="Statistics" to_port="example set"/>
              <connect from_op="Statistics" from_port="statistics" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="source_input 3" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <!-- Wiring inside the parameter loop.
               Inputs: input 1 (normalized data) goes to KMeans; input 2
               (De-Normalize model) goes to the apply_model operator.
               Results: result 1 = "Loop Values" output, result 2 = visualizer
               output, result 3 = "Write CSV" pass-through.
               The "performance" sink and result 4 are not connected; the
               "ProcessLog" outputs are not wired onward. -->
          <connect from_port="input 1" to_op="KMeans" to_port="example set"/>
          <connect from_port="input 2" to_op="Denormalised original data with clusters and labels" to_port="model"/>
          <connect from_op="KMeans" from_port="cluster model" to_op="Multiply (3)" to_port="input"/>
          <connect from_op="KMeans" from_port="clustered set" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply (3)" from_port="output 1" to_op="Evaluation" to_port="cluster model"/>
          <connect from_op="Multiply (3)" from_port="output 2" to_op="Cluster Model Visualizer" to_port="model"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Evaluation" to_port="example set"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Loop Values" to_port="input 1"/>
          <connect from_op="Multiply" from_port="output 3" to_op="Denormalised original data with clusters and labels" to_port="unlabelled data"/>
          <connect from_op="Multiply" from_port="output 4" to_op="Cluster Model Visualizer" to_port="clustered data"/>
          <connect from_op="Multiply" from_port="output 5" to_op="Write CSV" to_port="input"/>
          <connect from_op="Write CSV" from_port="through" to_port="result 3"/>
          <connect from_op="Cluster Model Visualizer" from_port="visualizer output" to_port="result 2"/>
          <connect from_op="Evaluation" from_port="performance" to_op="ProcessLog" to_port="through 1"/>
          <connect from_op="Evaluation" from_port="example set" to_op="ProcessLog" to_port="through 2"/>
          <connect from_op="Denormalised original data with clusters and labels" from_port="labelled data" to_op="Loop Values" to_port="input 2"/>
          <connect from_op="Loop Values" from_port="output 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="source_input 3" spacing="0"/>
          <portSpacing port="sink_performance" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
        </process>
      </operator>
      <!-- Correlation matrix over all attributes (attribute_filter_type "all") of
           the example set taken from the "original" port of "Normalize"; the
           matrix is delivered as process result 3. normalize_weights is true and
           squared_correlation is false. -->
      <operator activated="true" class="concurrency:correlation_matrix" compatibility="9.6.000" expanded="true" height="103" name="Correlation Matrix" width="90" x="648" y="85">
        <parameter key="attribute_filter_type" value="all"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="normalize_weights" value="true"/>
        <parameter key="squared_correlation" value="false"/>
      </operator>
      <!-- Top-level wiring. Preprocessing chain: Retrieve, Filter Examples (2),
           Replace Missing Values, Generate Attributes, Select Attributes,
           Normalize. Normalize then feeds three consumers: the parameter loop
           (normalized data), the correlation matrix (original data) and
           De-Normalize (preprocessing model).
           Process results: 1 and 2 come from the parameter loop, 3 is the
           correlation matrix. The loop's result 3 (the "Write CSV" pass-through)
           is not connected to a process result. -->
      <connect from_op="Retrieve CL_loan_ow" from_port="output" to_op="Filter Examples (2)" to_port="example set input"/>
      <connect from_op="Filter Examples (2)" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
      <connect from_op="Replace Missing Values" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Normalize" to_port="example set input"/>
      <connect from_op="Normalize" from_port="example set output" to_op="ParameterIteration" to_port="input 1"/>
      <connect from_op="Normalize" from_port="original" to_op="Correlation Matrix" to_port="example set"/>
      <connect from_op="Normalize" from_port="preprocessing model" to_op="De-Normalize" to_port="model input"/>
      <connect from_op="De-Normalize" from_port="model output" to_op="ParameterIteration" to_port="input 2"/>
      <connect from_op="ParameterIteration" from_port="result 1" to_port="result 1"/>
      <connect from_op="ParameterIteration" from_port="result 2" to_port="result 2"/>
      <connect from_op="Correlation Matrix" from_port="matrix" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>


Answers

  • hbajpaihbajpai Member Posts: 100   Unicorn

    The file output path is the same in every iteration, and I think that is why the outputs are being overwritten. Try using a macro in the file name, e.g. the k value of the current iteration (\..\..\..\%{k-value}). This will help you to create 12 different files.
    Best,
    Harshit
  • SLaxmidasSLaxmidas Member Posts: 2 Learner I
    @hbajpai

    I have tried to add that macro to the file name, but it resulted in an "undefined macro" error.
    Then I tried to use the "Set Macro" operator, but when I ran it the CSV file contained 13 clusters and not the 3 clusters I wanted (that is, the same result as without the macro).

    Maybe I did something wrong... I'm new to all this!

    can you help?

    thanks!
  • hbajpaihbajpai Member Posts: 100   Unicorn
    @SLaxmidas

    Sure, I have added the XML of a process that you can check out; the process saves 5 files, one for each row, using the macro approach. You can update the path based on your directory for testing. In essence, you will have to incorporate a similar process into your clustering piece, wherein you can use Extract Macro to figure out the current k and then save the file with the k value embedded in the file name.

    Let me know if this works for you.

    <?xml version="1.0" encoding="UTF-8"?><process version="9.7.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.7.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Synthetic input for the demo: target_function "random" with
               5 examples and 5 attributes, attribute bounds -10.0 to 10.0.
               use_local_random_seed is false. -->
          <operator activated="true" class="generate_data" compatibility="9.7.000" expanded="true" height="68" name="Generate Data" width="90" x="45" y="34">
            <parameter key="target_function" value="random"/>
            <parameter key="number_examples" value="5"/>
            <parameter key="number_of_attributes" value="5"/>
            <parameter key="attributes_lower_bound" value="-10.0"/>
            <parameter key="attributes_upper_bound" value="10.0"/>
            <parameter key="gaussian_standard_deviation" value="10.0"/>
            <parameter key="largest_radius" value="10.0"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
          </operator>
          <!-- Defines the macro total_examples with macro_type number_of_examples;
               the downstream "Loop" uses it as its iteration count. The statistics
               and attribute_name parameters are stored too, presumably unused for
               this macro type (confirm). -->
          <operator activated="true" class="extract_macro" compatibility="9.7.000" expanded="true" height="68" name="Extract Macro" width="90" x="179" y="34">
            <parameter key="macro" value="total_examples"/>
            <parameter key="macro_type" value="number_of_examples"/>
            <parameter key="statistics" value="average"/>
            <parameter key="attribute_name" value=""/>
            <list key="additional_macros"/>
          </operator>
          <!-- Runs %{total_examples} iterations with the iteration macro i, with
               parallel execution enabled. Each iteration keeps the example range
               %{i} to %{i} of input 1 (a single row) and writes it to a CSV file
               whose name contains %{i}, so every iteration produces its own file.
               The written set is passed on through output 1. -->
          <operator activated="true" class="concurrency:loop" compatibility="9.7.000" expanded="true" height="82" name="Loop" width="90" x="380" y="34">
            <parameter key="number_of_iterations" value="%{total_examples}"/>
            <parameter key="iteration_macro" value="i"/>
            <parameter key="reuse_results" value="false"/>
            <parameter key="enable_parallel_execution" value="true"/>
            <process expanded="true">
              <!-- first_example and last_example are both %{i}: one row per iteration. -->
              <operator activated="true" class="filter_example_range" compatibility="9.7.000" expanded="true" height="82" name="Filter Example Range" width="90" x="45" y="34">
                <parameter key="first_example" value="%{i}"/>
                <parameter key="last_example" value="%{i}"/>
                <parameter key="invert_filter" value="false"/>
              </operator>
              <!-- The path is an absolute directory on the author's machine; adjust
                   it before running. append_to_file is false. -->
              <operator activated="true" class="write_csv" compatibility="9.7.000" expanded="true" height="82" name="Write CSV" width="90" x="313" y="34">
                <parameter key="csv_file" value="C:/Users/harsh/OneDrive/Documents/test/%{i}.csv"/>
                <parameter key="column_separator" value=";"/>
                <parameter key="write_attribute_names" value="true"/>
                <parameter key="quote_nominal_values" value="true"/>
                <parameter key="format_date_attributes" value="true"/>
                <parameter key="append_to_file" value="false"/>
                <parameter key="encoding" value="SYSTEM"/>
              </operator>
              <connect from_port="input 1" to_op="Filter Example Range" to_port="example set input"/>
              <connect from_op="Filter Example Range" from_port="example set output" to_op="Write CSV" to_port="input"/>
              <connect from_op="Write CSV" from_port="through" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <!-- Receives the "Loop" output on port "example set 1" and delivers the
               "merged set" as process result 1. merge_type is "all". -->
          <operator activated="true" class="append" compatibility="9.7.000" expanded="true" height="82" name="Append" width="90" x="581" y="34">
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="merge_type" value="all"/>
          </operator>
          <!-- Top-level wiring of the demo: Generate Data, Extract Macro, Loop,
               Append, then process result 1. -->
          <connect from_op="Generate Data" from_port="output" to_op="Extract Macro" to_port="example set"/>
          <connect from_op="Extract Macro" from_port="example set" to_op="Loop" to_port="input 1"/>
          <connect from_op="Loop" from_port="output 1" to_op="Append" to_port="example set 1"/>
          <connect from_op="Append" from_port="merged set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    Best,
    Harshit
    lionelderkrikor
Sign In or Register to comment.