Split data

ostlundtheoostlundtheo Member Posts: 3 Newbie
Hi!

I have a CSV file that consists of an id, which is an unique movie, and the keywords for this movie. It looks something like this: 15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392, 'name': 'best friend'}, {'id': 179431, 'name': 'duringcreditsstinger'}, {'id': 208510, 'name': 'old men'}]"

I want to split the data so every movie (the id) gets every keyword. But using read csv-file, it only gets me a column with the id and then one column with all the keywords, including keyword-id and 'name'. Is there any solution to only get the specific keyword?

Best Answers

  • rjones13rjones13 Member Posts: 126 Unicorn
    Solution Accepted
    Hi,

    Would the following process work for you? It's not the most elegant, but does the job. If you need it to automatically select the correct attributes, might need a little more fancy work.

    Let me know how you get one.

    Best,

    Roland

    <?xml version="1.0" encoding="UTF-8"?><process version="10.2.000">
    
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="10.2.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true">
    <operator activated="true" class="utility:create_exampleset" compatibility="10.2.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="45" y="34">
    <parameter key="generator_type" value="comma separated text"/>
    <parameter key="number_of_examples" value="100"/>
    <parameter key="use_stepsize" value="false"/>
    <list key="function_descriptions"/>
    <parameter key="add_id_attribute" value="false"/>
    <list key="numeric_series_configuration"/>
    <list key="date_series_configuration"/>
    <list key="date_series_configuration (interval)"/>
    <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
    <parameter key="time_zone" value="SYSTEM"/>
    <parameter key="input_csv_text" value="att1; att2&#10;15602;[{'id': 1495, 'name': 'fishing'}, {'id': 12392, 'name': 'best friend'}, {'id': 179431, 'name': 'duringcreditsstinger'}, {'id': 208510, 'name': 'old men'}]"/>
    <parameter key="column_separator" value=";"/>
    <parameter key="parse_all_as_nominal" value="true"/>
    <parameter key="decimal_point_character" value="."/>
    <parameter key="trim_attribute_names" value="true"/>
    </operator>
    <operator activated="true" class="blending:set_role" compatibility="10.2.000" expanded="true" height="82" name="Set Role" width="90" x="179" y="34">
    <list key="set_roles">
    <parameter key="att1" value="id"/>
    </list>
    </operator>
    <operator activated="true" class="replace" compatibility="10.2.000" expanded="true" height="82" name="Remove special characters" width="90" x="313" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="att2"/>
    <parameter key="attributes" value=""/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="nominal"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="file_path"/>
    <parameter key="block_type" value="single_value"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="single_value"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    <parameter key="replace_what" value="[\[\]\{\}\s\']"/>
    <parameter key="replace_by" value=""/>
    </operator>
    <operator activated="true" class="split" compatibility="10.2.000" expanded="true" height="82" name="Split" width="90" x="447" y="34">
    <parameter key="attribute_filter_type" value="all"/>
    <parameter key="attribute" value=""/>
    <parameter key="attributes" value=""/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="nominal"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="file_path"/>
    <parameter key="block_type" value="single_value"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="single_value"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    <parameter key="split_pattern" value=","/>
    <parameter key="split_mode" value="ordered_split"/>
    </operator>
    <operator activated="true" class="blending:select_attributes" compatibility="10.2.000" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="34">
    <parameter key="type" value="include attributes"/>
    <parameter key="attribute_filter_type" value="a subset"/>
    <parameter key="select_attribute" value=""/>
    <parameter key="select_subset" value="att2_2␞att2_4␞att2_6␞att2_8"/>
    <parameter key="also_apply_to_special_attributes_(id,_label..)" value="false"/>
    </operator>
    <operator activated="true" class="replace" compatibility="10.2.000" expanded="true" height="82" name="Remove name:" width="90" x="715" y="34">
    <parameter key="attribute_filter_type" value="all"/>
    <parameter key="attribute" value=""/>
    <parameter key="attributes" value=""/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="nominal"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="file_path"/>
    <parameter key="block_type" value="single_value"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="single_value"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    <parameter key="replace_what" value="name:"/>
    </operator>
    <connect from_op="Create ExampleSet" from_port="output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="Remove special characters" to_port="example set input"/>
    <connect from_op="Remove special characters" from_port="example set output" to_op="Split" to_port="example set input"/>
    <connect from_op="Split" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Remove name:" to_port="example set input"/>
    <connect from_op="Remove name:" from_port="example set output" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>
  • CKönigCKönig Administrator, Moderator, Employee, Member Posts: 70 RM Team Member
    Solution Accepted
    Hi @ostlundtheo,

    your data structure in the CSV looks like JSON, so you would need to parse that first. Maybe even have a look at the process that is generating your csv file, if you can change that to make processing easier.
Sign In or Register to comment.