Keep ID in Extract Structured Data output objects + combine in one ExampleSet in Object Collection

damon_kostdamon_kost Member Posts: 2 Contributor I
edited February 2020 in Help
I have tried the suggestions from many different threads to solve what seems like it should be simple (https://community.rapidminer.com/discussion/18154/solved-joining-examplesets-of-a-collection and https://community.rapidminer.com/discussion/38582/problem-with-combining-all-example-set-from-io-object-collection seemed close).

The basic problem is that no matter how I configure it (unfolding/not unfolding), the result is an IOObjectCollection with discrete ExampleSets, and the original data as annotations.

My goal is simply to append the columns resulting from Extract Structured Data to the original data.  Any help would be appreciated!

My XML is below:

<?xml version="1.0" encoding="UTF-8"?><process version="9.5.001">
  <!-- Overview: Read Excel, then Set Role (3), then Set Role (4), then the "Scrape and Loop" subprocess
       (Extract Structured Data, Loop Collection (3), Collect), then Flatten Collection, delivered as result 1.
       The final output is therefore a collection; no operator in this process merges the
       collection elements into a single example set. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.5.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Read Excel: imports cell range A1:G20 with the first row used as attribute names.
           The meta data list declares seven columns; DocumentName carries the role "id",
           the remaining six carry the role "attribute".
           NOTE(review): sheet_selection is "sheet number" with sheet_number 1, so the
           sheet_name value "Sheet2" is presumably unused. Confirm. -->
      <operator activated="true" class="read_excel" compatibility="9.5.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="187">
        <parameter key="excel_file" value="H:\Address Sales\RP-HA_COIP-PCR.xlsx"/>
        <parameter key="sheet_selection" value="sheet number"/>
        <parameter key="sheet_name" value="Sheet2"/>
        <parameter key="sheet_number" value="1"/>
        <parameter key="imported_cell_range" value="A1:G20"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="DocumentName.true.polynominal.id"/>
          <parameter key="1" value="CurrentStatusInternalName.true.polynominal.attribute"/>
          <parameter key="2" value="Address.true.polynominal.attribute"/>
          <parameter key="3" value="City.true.polynominal.attribute"/>
          <parameter key="4" value="State.true.polynominal.attribute"/>
          <parameter key="5" value="zip.true.integer.attribute"/>
          <parameter key="6" value="Link.true.polynominal.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>
      <!-- Set Role (3): assigns the role "regular" to the Link attribute.
           NOTE(review): the connection further down takes this operator's "original" port,
           not its example set output. If "original" passes the input through unchanged,
           this role assignment never reaches the next operator. Confirm against the Set Role documentation. -->
      <operator activated="true" class="set_role" compatibility="9.5.001" expanded="true" height="82" name="Set Role (3)" width="90" x="179" y="187">
        <parameter key="attribute_name" value="Link"/>
        <parameter key="target_role" value="regular"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Set Role (4): assigns the role "id" to DocumentName (the same role Read Excel already declares for it).
           NOTE(review): this operator is also wired from its "original" port; same caveat as Set Role (3). -->
      <operator activated="true" class="set_role" compatibility="9.5.001" expanded="true" height="82" name="Set Role (4)" width="90" x="313" y="187">
        <parameter key="attribute_name" value="DocumentName"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Scrape and Loop: subprocess that receives the example set on "in 1" and returns the output of Collect on "out 1". -->
      <operator activated="true" class="subprocess" compatibility="9.5.001" expanded="true" height="82" name="Scrape and Loop" width="90" x="581" y="238">
        <process expanded="true">
          <!-- Extract Structured Data: takes the example set as its resource and reads the source from the
               "Link" attribute. Schema values are entered manually: encapsulator attribute class = "notice-table",
               value node attribute class = "notice". Its output port is a collection of example sets.
               NOTE(review): presumably one example set per input row/URL, with no id column carried over. Confirm. -->
          <operator activated="true" class="web_table_extraction:extract_structured_data_operator" compatibility="1.1.000" expanded="true" height="68" name="Extract Structured Data" width="90" x="45" y="187">
            <parameter key="resource_type" value="example set"/>
            <parameter key="attribute" value="Link"/>
            <parameter key="schema item" value="enter manually"/>
            <parameter key="encapsulator node's attribute" value="class"/>
            <parameter key="attribute value" value="notice-table"/>
            <parameter key="user agent" value="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"/>
            <parameter key="connection timeout (ms)" value="300000"/>
            <parameter key="specify value node's attribute" value="true"/>
            <parameter key="attribute name" value="class"/>
            <parameter key="value node's attribute" value="notice"/>
            <parameter key="encoding" value="UTF-8"/>
          </operator>
          <!-- Loop Collection (3): the inner process contains no operators; it only connects "single" to "output 1".
               NOTE(review): each element therefore appears to pass through unchanged, which does not match
               the description text "improve table format". The iteration macro is disabled
               (set_iteration_macro = false), so no per-element index is available inside the loop. -->
          <operator activated="true" class="loop_collection" compatibility="9.5.001" expanded="true" height="82" name="Loop Collection (3)" origin="GENERATED_TUTORIAL" width="90" x="380" y="85">
            <parameter key="set_iteration_macro" value="false"/>
            <parameter key="macro_name" value="iteration"/>
            <parameter key="macro_start_value" value="1"/>
            <parameter key="unfold" value="true"/>
            <process expanded="true">
              <connect from_port="single" to_port="output 1"/>
              <portSpacing port="source_single" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
            <description align="center" color="transparent" colored="false" width="126">Loop over tables and improve table format</description>
          </operator>
          <!-- Collect: gathers the loop output on "input 1" into a collection, with unfold enabled. -->
          <operator activated="true" class="collect" compatibility="9.5.001" expanded="true" height="82" name="Collect" width="90" x="648" y="85">
            <parameter key="unfold" value="true"/>
          </operator>
          <connect from_port="in 1" to_op="Extract Structured Data" to_port="input example set"/>
          <connect from_op="Extract Structured Data" from_port="collection of product data as example sets" to_op="Loop Collection (3)" to_port="collection"/>
          <connect from_op="Loop Collection (3)" from_port="output 1" to_op="Collect" to_port="input 1"/>
          <connect from_op="Collect" from_port="collection" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Flatten Collection: receives the subprocess output on "collection" and delivers "flat" as result 1.
           The result is still a collection of separate example sets, not one combined example set. -->
      <operator activated="true" class="flatten_collection" compatibility="9.5.001" expanded="true" height="68" name="Flatten Collection" width="90" x="916" y="238"/>
      <connect from_port="input 1" to_op="Read Excel" to_port="file"/>
      <connect from_op="Read Excel" from_port="output" to_op="Set Role (3)" to_port="example set input"/>
      <connect from_op="Set Role (3)" from_port="original" to_op="Set Role (4)" to_port="example set input"/>
      <connect from_op="Set Role (4)" from_port="original" to_op="Scrape and Loop" to_port="in 1"/>
      <connect from_op="Scrape and Loop" from_port="out 1" to_op="Flatten Collection" to_port="collection"/>
      <connect from_op="Flatten Collection" from_port="flat" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>




Answers

  • lionelderkrikorlionelderkrikor Moderator, RapidMiner Certified Analyst, Member Posts: 1,195 Unicorn
    Hi @damon_kost,

    I am having difficulty understanding what you want to achieve.
    Can you provide a sample of what you have initially and what you want to obtain?

    Regards,

    Lionel
  • damon_kostdamon_kost Member Posts: 2 Contributor I
    Hi @lionelderkrikor

    Sorry if I wasn't clear...

    I have an Excel sheet with IDs in one column, and URLs in another (see attached).  I am trying to extract specific items from the page at each URL (the pages are structured), and append them onto the original list of IDs and URLs as additional columns.  Ideally I would like to export this as one excel/csv file.

    Let me know if this makes sense....


Sign In or Register to comment.