RapidMiner

Extracting ID from JSON string

SOLVED
Regular Contributor

Extracting ID from JSON string

Hi

 

I am attempting to extract the id from a JSON string, the below process pulls the entire field required but for some reason it is not breaking the data out into the various columns. What am I missing to pull out the filed "._id"?

 

As a side note for some reason the JSON is also sending the id field as incremented, so in other word the id's come in sucessing with labels of [0]._id, [1000]._id, [1001]._id. I am not sure if this is having an impact on reading the data in?

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
<operator activated="true" class="productivity:execute_program" compatibility="7.5.003" expanded="true" height="103" name="Execute Program (2)" width="90" x="45" y="187">
<parameter key="command" value="./imp_id.sh"/>
<parameter key="log_stdout" value="true"/>
<parameter key="log_stderr" value="true"/>
<parameter key="working_directory" value="/import_id"/>
<list key="env_variables"/>
</operator>
</process>
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
<operator activated="true" class="text:read_document" compatibility="7.5.000" expanded="true" height="68" name="Read Document (2)" width="90" x="179" y="187">
<parameter key="extract_text_only" value="false"/>
<parameter key="use_file_extension_as_type" value="true"/>
<parameter key="content_type" value="txt"/>
<parameter key="encoding" value="SYSTEM"/>
</operator>
</process>
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
<operator activated="true" class="text:json_to_data" compatibility="7.5.000" expanded="true" height="82" name="JSON To Data" width="90" x="313" y="187">
<parameter key="ignore_arrays" value="false"/>
<parameter key="limit_attributes" value="false"/>
<parameter key="skip_invalid_documents" value="false"/>
</operator>
</process>
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
<operator activated="true" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="447" y="187">
<parameter key="select_attributes_and_weights" value="true"/>
<list key="specify_weights">
<parameter key="id" value="1.0"/>
</list>
</operator>
</process>
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
<operator activated="true" class="text:combine_documents" compatibility="7.5.000" expanded="true" height="82" name="Combine Documents" width="90" x="581" y="187"/>
</process>
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
<operator activated="true" class="text:write_document" compatibility="7.5.000" expanded="true" height="82" name="Write Document" width="90" x="715" y="187">
<parameter key="file" value="/test.csv.*"/>
<parameter key="overwrite" value="true"/>
<parameter key="encoding" value="SYSTEM"/>
</operator>
</process>

5 REPLIES
Moderator

Re: Extracting ID from JSON string

Would you please use the </> toggle box to post your XML, it's corrupted for some reason. 

Regular Contributor

Re: Extracting ID from JSON string

[ Edited ]
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.5.003" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="productivity:execute_program" compatibility="7.5.003" expanded="true" height="103" name="Execute Program (2)" width="90" x="45" y="187">
        <parameter key="command" value="./tw_imp_id.sh"/>
        <parameter key="working_directory" value="/Users/Robin/Dropbox/import_id"/>
        <list key="env_variables"/>
      </operator>
      <operator activated="true" class="text:read_document" compatibility="7.5.000" expanded="true" height="68" name="Read Document (2)" width="90" x="179" y="187">
        <parameter key="extract_text_only" value="false"/>
      </operator>
      <operator activated="true" class="text:json_to_data" compatibility="7.5.000" expanded="true" height="82" name="JSON To Data" width="90" x="313" y="187"/>
      <operator activated="true" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="447" y="187">
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights">
          <parameter key="id" value="1.0"/>
        </list>
      </operator>
      <operator activated="true" class="text:combine_documents" compatibility="7.5.000" expanded="true" height="82" name="Combine Documents" width="90" x="581" y="187"/>
      <operator activated="true" class="text:cut_document" compatibility="7.5.000" expanded="true" height="68" name="Cut Document" width="90" x="715" y="187">
        <parameter key="query_type" value="JsonPath"/>
        <list key="string_machting_queries">
          <parameter key="id" value="&quot;*\._id&quot;: (\\d+)."/>
        </list>
        <list key="regular_expression_queries"/>
        <list key="regular_region_queries"/>
        <list key="xpath_queries"/>
        <list key="namespaces"/>
        <list key="index_queries"/>
        <list key="jsonpath_queries">
          <parameter key="id" value="(\d+),"/>
        </list>
        <process expanded="true">
          <connect from_port="segment" to_port="document 1"/>
          <portSpacing port="source_segment" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Execute Program (2)" from_port="out" to_op="Read Document (2)" to_port="file"/>
      <connect from_op="Read Document (2)" from_port="output" to_op="JSON To Data" to_port="documents 1"/>
      <connect from_op="JSON To Data" from_port="example set" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Combine Documents" to_port="documents 1"/>
      <connect from_op="Combine Documents" from_port="document" to_op="Cut Document" to_port="document"/>
      <connect from_op="Cut Document" from_port="documents" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="21"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Regular Contributor

Re: Extracting ID from JSON string

What the output is looking like: 

 

[0]._id

[0]._links.people.href

[0]._links.people.method

[0]._links.self.href

[0]._links.self.method

[0].created

[0].handles_count

[0].operation

[0].percent_complete

[0].person

[0].search_key

[0].status

[0].total_handles

[0].updated

[1000]._id

[1000]._links.people.href

[1000]._links.people.method

[1000]._links.self.href

59b149368066510579744f83

/v2/api/import/twitter/requests/59b149368066510579744f83/people

GET

/v2/api/import/twitter/requests/59b149368066510579744f83

GET

2017-09-07T13:27:18.578000+00:00

1.0

Individual

0.99

59b1493b80665106cfb4c871

Screen_name

Finished

1.0

2017-09-07T13:27:23.924000+00:00

59ac4ea58066510579743c2e

/v2/api/import/requests/59ac4ea58066510579743c2e/people

GET

/v2/api/import/requests/59ac4ea58066510579743c2e

 

Highlighted
Moderator
Solution
Accepted by topic author robin
3 weeks ago

Re: Extracting ID from JSON string

From initial inspection the logic appears to be ok. Without the actual data file, its hard to diagnos but I do have some suggestions. I'm not sure why you use "select attributes and weights" on the Data to Documents operator? Maybe toggle that off. Also, would an Extract Information operator embeded inside the Cut Documents work for you?  This way you can cut the JSON string and extract what you need.

 

Regular Contributor

Re: Extracting ID from JSON string

Thank you