Split data

ostlundtheo · October 2023

Hi!

I have a CSV file in which each row consists of an id, which identifies a unique movie, and the keywords for that movie. It looks something like this: 15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392, 'name': 'best friend'}, {'id': 179431, 'name': 'duringcreditsstinger'}, {'id': 208510, 'name': 'old men'}]"

I want to split the data so that every movie (the id) gets each of its keywords as a separate value. But when I use Read CSV, I only get one column with the id and one column with all the keywords, including each keyword's id and the 'name' label. Is there any way to get only the keyword names themselves?

rjones13 · October 2023

Hi,

Would the following process work for you? It's not the most elegant, but it does the job. If you need it to automatically select the correct attributes, it might need a little more fancy work.

Let me know how you get on.

Best,

Roland

<?xml version="1.0" encoding="UTF-8"?>
<process version="10.2.000">
  <!--
    Extracts the keyword names from a column holding a Python-style list of
    {'id': ..., 'name': ...} records, producing one attribute per keyword.

    Pipeline:
      1. Create ExampleSet         : inline sample row (att1 = movie id, att2 = raw keyword list).
      2. Set Role                  : marks att1 as the id so the later operators
                                     (include_special_attributes = false) leave it alone.
      3. Remove special characters : strips brackets, braces and apostrophes from att2.
      4. Split                     : splits att2 on commas into att2_1, att2_2, ...
      5. Select Attributes         : keeps the even-numbered pieces (the "name: ..." parts).
      6. Remove name:              : strips the leading "name:" label.

    NOTE(review): keyword names that themselves contain a comma or an apostrophe
    would be altered by steps 3 and 4; confirm this cannot occur in the real data.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="10.2.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true">
      <!-- Sample data only: replace this operator with Read CSV for the real file. -->
      <operator activated="true" class="utility:create_exampleset" compatibility="10.2.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="45" y="34">
        <parameter key="generator_type" value="comma separated text"/>
        <parameter key="number_of_examples" value="100"/>
        <parameter key="use_stepsize" value="false"/>
        <list key="function_descriptions"/>
        <parameter key="add_id_attribute" value="false"/>
        <list key="numeric_series_configuration"/>
        <list key="date_series_configuration"/>
        <list key="date_series_configuration (interval)"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="input_csv_text" value="att1; att2&#10;15602;[{'id': 1495, 'name': 'fishing'}, {'id': 12392, 'name': 'best friend'}, {'id': 179431, 'name': 'duringcreditsstinger'}, {'id': 208510, 'name': 'old men'}]"/>
        <parameter key="column_separator" value=";"/>
        <parameter key="parse_all_as_nominal" value="true"/>
        <parameter key="decimal_point_character" value="."/>
        <parameter key="trim_attribute_names" value="true"/>
      </operator>
      <operator activated="true" class="blending:set_role" compatibility="10.2.000" expanded="true" height="82" name="Set Role" width="90" x="179" y="34">
        <list key="set_roles">
          <parameter key="att1" value="id"/>
        </list>
      </operator>
      <!--
        Strip only the list/record punctuation. Whitespace is deliberately NOT part
        of the character class: removing it would glue multi-word keywords together
        ("best friend" would become "bestfriend").
      -->
      <operator activated="true" class="replace" compatibility="10.2.000" expanded="true" height="82" name="Remove special characters" width="90" x="313" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="att2"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="replace_what" value="[\[\]\{\}\']"/>
        <parameter key="replace_by" value=""/>
      </operator>
      <!-- The pattern also consumes the whitespace after each comma so the pieces carry no leading blank. -->
      <operator activated="true" class="split" compatibility="10.2.000" expanded="true" height="82" name="Split" width="90" x="447" y="34">
        <parameter key="attribute_filter_type" value="all"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="split_pattern" value=",\s*"/>
        <parameter key="split_mode" value="ordered_split"/>
      </operator>
      <!--
        The subset is hard-coded for the four keywords in the sample row: the pieces
        alternate "id: N" / "name: X", so the even-numbered ones hold the names.
        Rows with a different number of keywords need this list adjusted.
      -->
      <operator activated="true" class="blending:select_attributes" compatibility="10.2.000" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="34">
        <parameter key="type" value="include attributes"/>
        <parameter key="attribute_filter_type" value="a subset"/>
        <parameter key="select_attribute" value=""/>
        <parameter key="select_subset" value="att2_2␞att2_4␞att2_6␞att2_8"/>
        <parameter key="also_apply_to_special_attributes_(id,_label..)" value="false"/>
      </operator>
      <!-- Drops the "name:" label together with the blank that follows it, leaving just the keyword. -->
      <operator activated="true" class="replace" compatibility="10.2.000" expanded="true" height="82" name="Remove name:" width="90" x="715" y="34">
        <parameter key="attribute_filter_type" value="all"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="replace_what" value="name:\s*"/>
        <parameter key="replace_by" value=""/>
      </operator>
      <connect from_op="Create ExampleSet" from_port="output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Remove special characters" to_port="example set input"/>
      <connect from_op="Remove special characters" from_port="example set output" to_op="Split" to_port="example set input"/>
      <connect from_op="Split" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Remove name:" to_port="example set input"/>
      <connect from_op="Remove name:" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

CKönig · October 2023

Hi @ostlundtheo,

Your data structure in the CSV looks like JSON, so you would need to parse that first. It may also be worth looking at the process that generates your CSV file, in case you can change it to make processing easier.

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

Split data

Best Answers