"[SOLVED] Filtering examples containing multiple attributes with a blacklist"

scepxkoscepxko Member Posts: 15 Maven
edited June 2019 in Help
Hi everyone,

I'm stuck on something since several hours and can't definitely find the solution by myself.
I've searched for a solution on the forum without any success so far.

Here's the situation:

- The source file is an Excel file "input.xls" containing 4 columns/attributes:
ID ---- industry --- location ------- title
111 xyz Cayman Islands xyz
222 xyz Peru xyz
333 xyz USA xyz

- I want to remove the lines whose attribute "location" has a value contained in a blacklist "location-to-filter-out.xls".
This Excel file looks like
location2
Cayman
France
USA


- Then save the filtered file as "output.xls". Here it would look like:
ID ---- industry --- location ------- title
222 xyz Peru xyz

What I have done so far:

- In the code below, I have a "Loop Attributes" on "location"

- Inside it (subprocess), I have a "Read Excel" operator, extracting the "location2" values from the blacklist. And a second "Loop attributes" running on "location2" this time.

- Inside this second "Loop attributes" (sub-subprocess), I have the filtering condition with the operator "Filter Examples", with the condition class "attribute_value_filter".

The parameter string is: %{loop_attribute}=(.*?)%{loop_attribute2}(.*?)

Where the %{loop_attribute} is the original attribute "location" in the source file. For example "Cayman Islands".

And %{loop_attribute2} is a location blacklisted. For example "Cayman" (the blacklist contains only single words for the sake of simplicity. That's why I use "(.*?)" in the filtering condition.


Problem:
It does not work and I'm unable to find the flaw in my logic.

Maybe can somebody enlighten me?
Thanks in advance for your help.
Alex

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
   <process expanded="true">
     <operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel (3)" width="90" x="45" y="30">
       <parameter key="excel_file" value="E:\Rapidminer\example\input.xls"/>
       <parameter key="imported_cell_range" value="A1:G650"/>
       <parameter key="first_row_as_names" value="false"/>
       <list key="annotations">
         <parameter key="0" value="Name"/>
       </list>
       <list key="data_set_meta_data_information">
         <parameter key="0" value="ID.true.integer.attribute"/>
         <parameter key="1" value="industry.true.polynominal.attribute"/>
         <parameter key="2" value="location.true.polynominal.attribute"/>
         <parameter key="3" value="title.true.polynominal.attribute"/>
       </list>
     </operator>
     <operator activated="true" class="loop_attributes" compatibility="5.3.015" expanded="true" height="94" name="Loop Attributes" width="90" x="514" y="30">
       <parameter key="attribute_filter_type" value="single"/>
       <parameter key="attribute" value="location"/>
       <process expanded="true">
         <operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel (4)" width="90" x="246" y="165">
           <parameter key="excel_file" value="E:\Rapidminer\example\location-to-filter-out.xls"/>
           <parameter key="imported_cell_range" value="A1:E67"/>
           <parameter key="first_row_as_names" value="false"/>
           <list key="annotations">
             <parameter key="0" value="Name"/>
           </list>
           <list key="data_set_meta_data_information">
             <parameter key="0" value="location2.true.polynominal.attribute"/>
           </list>
         </operator>
         <operator activated="true" class="loop_attributes" compatibility="5.3.015" expanded="true" height="76" name="Loop Attributes (2)" width="90" x="514" y="165">
           <parameter key="attribute_filter_type" value="single"/>
           <parameter key="attribute" value="location2"/>
           <parameter key="iteration_macro" value="loop_attribute2"/>
           <process expanded="true">
             <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (3)" width="90" x="447" y="30">
               <parameter key="condition_class" value="attribute_value_filter"/>
               <parameter key="parameter_string" value="%{loop_attribute}=(.*?)%{loop_attribute2}(.*?)"/>
               <parameter key="invert_filter" value="true"/>
             </operator>
             <connect from_port="example set" to_op="Filter Examples (3)" to_port="example set input"/>
             <connect from_op="Filter Examples (3)" from_port="example set output" to_port="example set"/>
             <portSpacing port="source_example set" spacing="0"/>
             <portSpacing port="sink_example set" spacing="0"/>
             <portSpacing port="sink_result 1" spacing="0"/>
           </process>
         </operator>
         <connect from_port="example set" to_port="example set"/>
         <connect from_op="Read Excel (4)" from_port="output" to_op="Loop Attributes (2)" to_port="example set"/>
         <connect from_op="Loop Attributes (2)" from_port="example set" to_port="result 1"/>
         <portSpacing port="source_example set" spacing="0"/>
         <portSpacing port="sink_example set" spacing="0"/>
         <portSpacing port="sink_result 1" spacing="0"/>
         <portSpacing port="sink_result 2" spacing="0"/>
       </process>
     </operator>
     <operator activated="true" class="write_excel" compatibility="5.3.015" expanded="true" height="76" name="Write Excel" width="90" x="849" y="30">
       <parameter key="excel_file" value="E:\Rapidminer\example\output.xls"/>
     </operator>
     <connect from_op="Read Excel (3)" from_port="output" to_op="Loop Attributes" to_port="example set"/>
     <connect from_op="Loop Attributes" from_port="example set" to_op="Write Excel" to_port="input"/>
     <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
   </process>
 </operator>
</process>


Tagged:

Best Answer

  • scepxkoscepxko Member Posts: 15 Maven
    Solution Accepted
    Hi Andrew.

    You saved me.
    Using the Cartesian operator was one of the working solutions.

    As for the resources, I am currently processing 22'000 examples.
    And by launching the processes via command-line, I can make batches of 1400 examples at a time (in the final process, one example has 8 attributes, and there are 4 filtering processes).
    When I go above 1400, I simply get no results.
    But it's fine to me, it will take less than 30 minutes to process it all, whereas I spent +5 hours this week looking for a solution to this filtering issue!!

    Thanks again Andrew, enjoy this weekend fully, you deserve it :-)
    Alex

    PS: For those having a similar challenge to solve, I copy/paste below the working code.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="5.3.015">
     <context>
       <input/>
       <output/>
       <macros/>
     </context>
     <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
       <process expanded="true">
         <operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel (3)" width="90" x="45" y="30">
           <parameter key="excel_file" value="E:\Rapidminer\example\input.xls"/>
           <parameter key="imported_cell_range" value="A1:G1000"/>
           <parameter key="first_row_as_names" value="false"/>
           <list key="annotations">
             <parameter key="0" value="Name"/>
           </list>
           <list key="data_set_meta_data_information">
             <parameter key="0" value="ID.true.integer.attribute"/>
             <parameter key="1" value="industry.true.polynominal.attribute"/>
             <parameter key="2" value="location.true.polynominal.attribute"/>
             <parameter key="3" value="title.true.polynominal.attribute"/>
           </list>
         </operator>
         <operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel (2)" width="90" x="45" y="120">
           <parameter key="excel_file" value="E:\Rapidminer\example\location-to-filter-out.xls"/>
           <parameter key="imported_cell_range" value="A1:E1000"/>
           <parameter key="first_row_as_names" value="false"/>
           <list key="annotations">
             <parameter key="0" value="Name"/>
           </list>
           <list key="data_set_meta_data_information">
             <parameter key="0" value="location2.true.polynominal.attribute"/>
           </list>
         </operator>
         <operator activated="true" class="cartesian_product" compatibility="5.3.015" expanded="true" height="76" name="Cartesian" width="90" x="246" y="30"/>
         <operator activated="true" class="generate_attributes" compatibility="5.3.015" expanded="true" height="76" name="Generate Attributes" width="90" x="380" y="30">
           <list key="function_descriptions">
             <parameter key="toRemove" value="if(matches(location,&quot;.*&quot;+location2+&quot;.*&quot;),1,0)"/>
           </list>
         </operator>
         <operator activated="true" class="aggregate" compatibility="5.3.015" expanded="true" height="76" name="Aggregate" width="90" x="514" y="30">
           <list key="aggregation_attributes">
             <parameter key="toRemove" value="sum"/>
           </list>
           <parameter key="group_by_attributes" value="industry|location|title|ID"/>
         </operator>
         <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (2)" width="90" x="648" y="30">
           <parameter key="condition_class" value="attribute_value_filter"/>
           <parameter key="parameter_string" value="sum(toRemove)!=1"/>
         </operator>
         <operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="782" y="30">
           <parameter key="attribute_filter_type" value="subset"/>
           <parameter key="attributes" value="industry|location|title||ID"/>
         </operator>
         <operator activated="true" class="write_excel" compatibility="5.3.015" expanded="true" height="76" name="Write Excel" width="90" x="916" y="30">
           <parameter key="excel_file" value="E:\Rapidminer\example\output.xls"/>
         </operator>
         <connect from_op="Read Excel (3)" from_port="output" to_op="Cartesian" to_port="left"/>
         <connect from_op="Read Excel (2)" from_port="output" to_op="Cartesian" to_port="right"/>
         <connect from_op="Cartesian" from_port="join" to_op="Generate Attributes" to_port="example set input"/>
         <connect from_op="Generate Attributes" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
         <connect from_op="Aggregate" from_port="example set output" to_op="Filter Examples (2)" to_port="example set input"/>
         <connect from_op="Filter Examples (2)" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
         <connect from_op="Select Attributes" from_port="example set output" to_op="Write Excel" to_port="input"/>
         <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
         <portSpacing port="source_input 1" spacing="0"/>
         <portSpacing port="sink_result 1" spacing="0"/>
         <portSpacing port="sink_result 2" spacing="0"/>
       </process>
     </operator>
    </process>

Answers

  • bkrieverbkriever RapidMiner Certified Analyst, Member Posts: 11 Contributor II
    1. Read in both "input.xls"  and "location-to-filter-out.xls"
    2. Do a left join where location=location2 (with input.xls going to the left input) and select "keep both join attributes".
    3. Use filter examples to filter for examples that are missing location2.

    Hope this helps!
  • scepxkoscepxko Member Posts: 15 Maven
    Thank you, it helped to go further.

    So here with the updated code below, if I have an exact match between the attribute "location" and the blacklisted attribute "location2", the example is removed.

    Problem: it only works with an exact match.

    I would like to filter the attribute "location" using a wildcard, like this %{location}=(.*?)%{location2}(.*?)
    If I put "India" in the blacklist, it would remove the example with "Gurgaon, India", "Delhi, India" etc.

    Any idea?
    Thanks again

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="5.3.015">
     <context>
       <input/>
       <output/>
       <macros/>
     </context>
     <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
       <process expanded="true">
         <operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel (3)" width="90" x="45" y="30">
           <parameter key="excel_file" value="E:\Rapidminer\example\input.xls"/>
           <parameter key="imported_cell_range" value="A1:G650"/>
           <parameter key="first_row_as_names" value="false"/>
           <list key="annotations">
             <parameter key="0" value="Name"/>
           </list>
           <list key="data_set_meta_data_information">
             <parameter key="0" value="ID.true.integer.attribute"/>
             <parameter key="1" value="industry.true.polynominal.attribute"/>
             <parameter key="2" value="location.true.polynominal.attribute"/>
             <parameter key="3" value="title.true.polynominal.attribute"/>
           </list>
         </operator>
         <operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel (2)" width="90" x="45" y="165">
           <parameter key="excel_file" value="E:\Rapidminer\example\location-to-filter-out.xls"/>
           <parameter key="imported_cell_range" value="A1:E67"/>
           <parameter key="first_row_as_names" value="false"/>
           <list key="annotations">
             <parameter key="0" value="Name"/>
           </list>
           <list key="data_set_meta_data_information">
             <parameter key="0" value="location2.true.polynominal.attribute"/>
           </list>
         </operator>
         <operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join" width="90" x="246" y="165">
           <parameter key="join_type" value="left"/>
           <parameter key="use_id_attribute_as_key" value="false"/>
           <list key="key_attributes">
             <parameter key="location" value="location2"/>
           </list>
           <parameter key="keep_both_join_attributes" value="true"/>
         </operator>
         <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (2)" width="90" x="447" y="165">
           <parameter key="condition_class" value="no_missing_attributes"/>
           <parameter key="parameter_string" value="location=(.*?)location2(.*?)"/>
           <parameter key="invert_filter" value="true"/>
         </operator>
         <operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="648" y="165">
           <parameter key="attribute_filter_type" value="subset"/>
           <parameter key="attributes" value="|ID|industry|location|title"/>
         </operator>
         <operator activated="true" class="write_excel" compatibility="5.3.015" expanded="true" height="76" name="Write Excel" width="90" x="849" y="165">
           <parameter key="excel_file" value="E:\Rapidminer\example\output.xls"/>
         </operator>
         <connect from_op="Read Excel (3)" from_port="output" to_op="Join" to_port="left"/>
         <connect from_op="Read Excel (2)" from_port="output" to_op="Join" to_port="right"/>
         <connect from_op="Join" from_port="join" to_op="Filter Examples (2)" to_port="example set input"/>
         <connect from_op="Filter Examples (2)" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
         <connect from_op="Select Attributes" from_port="example set output" to_op="Write Excel" to_port="input"/>
         <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
         <portSpacing port="source_input 1" spacing="0"/>
         <portSpacing port="sink_result 1" spacing="0"/>
         <portSpacing port="sink_result 2" spacing="0"/>
       </process>
     </operator>
    </process>



  • awchisholmawchisholm RapidMiner Certified Expert, Member Posts: 458 Unicorn
    Version 6 of RapidMiner, the Filter Examples operator lets you filter based on partial matches.

    Andrew
  • scepxkoscepxko Member Posts: 15 Maven
    Hi Andrew,

    First I own your book. I wish I had it 3 years ago when I started to play with Rapidminer. Thank you for your work

    As for your suggestion, yes, RM6 has this more advanced filtering option.
    I just tried it with "contains" and "matches" and I don't get the result wished.

    In RM5, I introduced a new attribute "ToRemove" that compares the values of "location" and "location2", and then I filtered the examples with the ToRemove values of true/false.
    That's a method I saw in other posts.

    I tried "finds", "matches", and "if".
    Here are my results:

    ** Filters only the exact match (see my first post, it would remove the "USA" row, but not the "Cayman" row)

    finds(location,location2)
    finds(location,%{location2})
    matches(location,location2)
    matches(location,%{location2})
    matches(location,".*%{location2}.*")
    if((location==location2), true, false)

    ** Filters nothing at all (input and output files identical)

    finds(location,"%{location2}")
    finds(location,".*%{location2}.*")
    finds(location,"%{location2}")
    finds(location,".*%{location2}.*")
    if((location==".*location2.*"), true, false)
    if((%{location}==".*%{location2}.*"), true, false)
    if((%{location}=="(.*?)%{location2}(.*?)"), true, false)

    I am surely doing something wrong with the wildcards but I have really no idea what it is.

    Any idea?
    Thank you in advance

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="5.3.015">
     <context>
       <input/>
       <output/>
       <macros/>
     </context>
     <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
       <process expanded="true">
         <operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel (3)" width="90" x="45" y="30">
           <parameter key="excel_file" value="E:\Rapidminer\example\input.xls"/>
           <parameter key="imported_cell_range" value="A1:G650"/>
           <parameter key="first_row_as_names" value="false"/>
           <list key="annotations">
             <parameter key="0" value="Name"/>
           </list>
           <list key="data_set_meta_data_information">
             <parameter key="0" value="ID.true.integer.attribute"/>
             <parameter key="1" value="industry.true.polynominal.attribute"/>
             <parameter key="2" value="location.true.polynominal.attribute"/>
             <parameter key="3" value="title.true.polynominal.attribute"/>
           </list>
         </operator>
         <operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel (2)" width="90" x="45" y="165">
           <parameter key="excel_file" value="E:\Rapidminer\example\location-to-filter-out.xls"/>
           <parameter key="imported_cell_range" value="A1:E67"/>
           <parameter key="first_row_as_names" value="false"/>
           <list key="annotations">
             <parameter key="0" value="Name"/>
           </list>
           <list key="data_set_meta_data_information">
             <parameter key="0" value="location2.true.polynominal.attribute"/>
           </list>
         </operator>
         <operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join" width="90" x="246" y="165">
           <parameter key="join_type" value="left"/>
           <parameter key="use_id_attribute_as_key" value="false"/>
           <list key="key_attributes">
             <parameter key="location" value="location2"/>
           </list>
           <parameter key="keep_both_join_attributes" value="true"/>
         </operator>
         <operator activated="true" class="generate_attributes" compatibility="5.3.015" expanded="true" height="76" name="Generate Attributes" width="90" x="380" y="165">
           <list key="function_descriptions">
             <parameter key="toRemove" value="finds(location,location2)"/>
           </list>
         </operator>
         <operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (2)" width="90" x="514" y="165">
           <parameter key="condition_class" value="attribute_value_filter"/>
           <parameter key="parameter_string" value="toRemove != true"/>
         </operator>
         <operator activated="true" class="write_excel" compatibility="5.3.015" expanded="true" height="76" name="Write Excel" width="90" x="849" y="165">
           <parameter key="excel_file" value="E:\Rapidminer\example\output.xls"/>
         </operator>
         <connect from_op="Read Excel (3)" from_port="output" to_op="Join" to_port="left"/>
         <connect from_op="Read Excel (2)" from_port="output" to_op="Join" to_port="right"/>
         <connect from_op="Join" from_port="join" to_op="Generate Attributes" to_port="example set input"/>
         <connect from_op="Generate Attributes" from_port="example set output" to_op="Filter Examples (2)" to_port="example set input"/>
         <connect from_op="Filter Examples (2)" from_port="example set output" to_op="Write Excel" to_port="input"/>
         <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
         <portSpacing port="source_input 1" spacing="0"/>
         <portSpacing port="sink_result 1" spacing="0"/>
         <portSpacing port="sink_result 2" spacing="0"/>
       </process>
     </operator>
    </process>


  • awchisholmawchisholm RapidMiner Certified Expert, Member Posts: 458 Unicorn
    Hello

    Thanks for the thanks about the book!

    You could try a right join like in the following.
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="6.1.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.1.000" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="retrieve" compatibility="6.1.000" expanded="true" height="60" name="Retrieve Iris" width="90" x="179" y="75">
            <parameter key="repository_entry" value="//Samples/data/Iris"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="179" y="210">
            <list key="attribute_values">
              <parameter key="label" value="&quot;Iris-setosa&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="join" compatibility="6.1.000" expanded="true" height="76" name="Join" width="90" x="447" y="75">
            <parameter key="join_type" value="right"/>
            <parameter key="use_id_attribute_as_key" value="false"/>
            <list key="key_attributes">
              <parameter key="label" value="label"/>
            </list>
          </operator>
          <connect from_op="Retrieve Iris" from_port="output" to_op="Join" to_port="left"/>
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Join" to_port="right"/>
          <connect from_op="Join" from_port="join" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    regards

    Andrew
  • scepxkoscepxko Member Posts: 15 Maven
    Hi Andrew,

    Thank you for helping again.

    that's what I did with the post #3 and I'm afraid it doesn't work.
    I need to find a solution for this wildcards to work.
    Maybe go again in the direction of a Loop subprocess for each attribute "location" in the input file.
    Otherwise I'm out of resources.

    Black Friday for inspiration!
  • awchisholmawchisholm RapidMiner Certified Expert, Member Posts: 458 Unicorn
    Hello scepxko

    Here is one solution - the trick is to use the Cartesian operator - this might cause memory explosions - be brave. Memory saving is something I know a lot about so report back with progress.
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="6.1.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.1.000" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="subprocess" compatibility="6.1.000" expanded="true" height="76" name="source" width="90" x="112" y="75">
            <process expanded="true">
              <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="112" y="120">
                <list key="attribute_values">
                  <parameter key="country" value="&quot;Cayman Islands&quot;"/>
                </list>
                <list key="set_additional_roles"/>
              </operator>
              <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000" expanded="true" height="60" name="Generate Data by User Specification (2)" width="90" x="112" y="210">
                <list key="attribute_values">
                  <parameter key="country" value="&quot;Peru&quot;"/>
                </list>
                <list key="set_additional_roles"/>
              </operator>
              <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000" expanded="true" height="60" name="Generate Data by User Specification (3)" width="90" x="112" y="300">
                <list key="attribute_values">
                  <parameter key="country" value="&quot;USA&quot;"/>
                </list>
                <list key="set_additional_roles"/>
              </operator>
              <operator activated="true" class="append" compatibility="6.1.000" expanded="true" height="112" name="Append" width="90" x="313" y="120"/>
              <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
              <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
              <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
              <connect from_op="Append" from_port="merged set" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="subprocess" compatibility="6.1.000" expanded="true" height="76" name="remove" width="90" x="112" y="165">
            <process expanded="true">
              <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000" expanded="true" height="60" name="Generate Data by User Specification (4)" width="90" x="112" y="75">
                <list key="attribute_values">
                  <parameter key="country" value="&quot;Cayman&quot;"/>
                </list>
                <list key="set_additional_roles"/>
              </operator>
              <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000" expanded="true" height="60" name="Generate Data by User Specification (5)" width="90" x="112" y="165">
                <list key="attribute_values">
                  <parameter key="country" value="&quot;France&quot;"/>
                </list>
                <list key="set_additional_roles"/>
              </operator>
              <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000" expanded="true" height="60" name="Generate Data by User Specification (6)" width="90" x="112" y="255">
                <list key="attribute_values">
                  <parameter key="country" value="&quot;USA&quot;"/>
                </list>
                <list key="set_additional_roles"/>
              </operator>
              <operator activated="true" class="append" compatibility="6.1.000" expanded="true" height="112" name="Append (2)" width="90" x="313" y="75"/>
              <connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append (2)" to_port="example set 1"/>
              <connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append (2)" to_port="example set 2"/>
              <connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append (2)" to_port="example set 3"/>
              <connect from_op="Append (2)" from_port="merged set" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="cartesian_product" compatibility="6.1.000" expanded="true" height="76" name="Cartesian" width="90" x="246" y="75">
            <parameter key="remove_double_attributes" value="false"/>
          </operator>
          <operator activated="true" class="generate_attributes" compatibility="6.1.000" expanded="true" height="76" name="Generate Attributes" width="90" x="380" y="75">
            <list key="function_descriptions">
              <parameter key="deleteme" value="if(matches(country,&quot;.*&quot;+country_from_ES2+&quot;.*&quot;),1,0)"/>
            </list>
          </operator>
          <operator activated="true" class="aggregate" compatibility="6.1.000" expanded="true" height="76" name="Aggregate" width="90" x="514" y="75">
            <list key="aggregation_attributes">
              <parameter key="deleteme" value="sum"/>
            </list>
            <parameter key="group_by_attributes" value="country"/>
          </operator>
          <operator activated="true" class="filter_examples" compatibility="6.1.000" expanded="true" height="94" name="Filter Examples" width="90" x="648" y="75">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="sum(deleteme).eq.0"/>
            </list>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="6.1.000" expanded="true" height="76" name="Select Attributes" width="90" x="782" y="75">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="country"/>
          </operator>
          <connect from_op="source" from_port="out 1" to_op="Cartesian" to_port="left"/>
          <connect from_op="remove" from_port="out 1" to_op="Cartesian" to_port="right"/>
          <connect from_op="Cartesian" from_port="join" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
          <connect from_op="Aggregate" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    Good luck!

    regards

    Andrew

Sign In or Register to comment.