Read Document Error & Skipping Over Errors

carlcarl Member Posts: 30 Guru
edited November 2018 in Help

I get the following error from the Read Document operator (inside Loop Examples after Read Excel with the input URLs).  It stops after successfully reading several hundred records.  I have a log that tells me where the process stops, but do not see anything obviously wrong with the input URL.

 

Any thoughts on the possible cause?  And is there a way to skip past any troublesome input URLs rather than stopping the process with no output?

Error.jpg

<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: reads a list of document URLs from an Excel sheet, and for
  each row opens the URL, reads it as a PDF, extracts regular-expression matches
  ("Ark", "Mainframe"), then appends the per-row results, keeps rows with at
  least one match, and writes them to an Excel file.
-->
<process version="7.3.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.3.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Source of the URLs: column A, exposed as the attribute "SvceDef Link". -->
      <operator activated="true" class="read_excel" compatibility="7.3.000" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
        <parameter key="excel_file" value="/Users/carl/Documents/SD PDFs.xlsx"/>
        <parameter key="imported_cell_range" value="A"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <parameter key="locale" value="English (United Kingdom)"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="SvceDef Link.true.file_path.attribute"/>
        </list>
      </operator>
      <operator activated="true" class="loop_examples" compatibility="7.3.000" expanded="true" height="103" name="Loop Examples" width="90" x="179" y="34">
        <process expanded="true">
          <!-- Copies the current row's "SvceDef Link" value into the macro GetURL. -->
          <operator activated="true" class="extract_macro" compatibility="7.3.000" expanded="true" height="68" name="Extract Macro" width="90" x="45" y="136">
            <parameter key="macro" value="GetURL"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="SvceDef Link"/>
            <parameter key="example_index" value="%{example}"/>
            <list key="additional_macros"/>
          </operator>
          <!-- Progress log: records the apply count of Extract Macro on each pass. -->
          <operator activated="true" class="log" compatibility="7.3.000" expanded="true" height="82" name="Log" width="90" x="179" y="136">
            <parameter key="filename" value="/Users/carl/Documents/Log.log"/>
            <list key="log">
              <parameter key="Log" value="operator.Extract Macro.value.applycount"/>
            </list>
          </operator>
          <!-- Has no input connection; the URL comes from the GetURL macro. -->
          <operator activated="true" class="open_file" compatibility="7.3.000" expanded="true" height="68" name="Open File" width="90" x="246" y="34">
            <parameter key="resource_type" value="URL"/>
            <parameter key="url" value="%{GetURL}"/>
          </operator>
          <operator activated="true" class="text:read_document" compatibility="7.3.000" expanded="true" height="68" name="Read Document" width="90" x="380" y="34">
            <parameter key="content_type" value="pdf"/>
            <parameter key="encoding" value="UTF-8"/>
          </operator>
          <!--
            Regex queries. "(?:re|er)" is a non-capturing alternation so both
            "Centre" and "Center" match in full; "[Mm]" accepts either
            capitalisation of "mainframe".
          -->
          <operator activated="true" class="text:extract_information" compatibility="7.3.000" expanded="true" height="68" name="Extract Information" width="90" x="514" y="34">
            <parameter key="query_type" value="Regular Expression"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries">
              <parameter key="Ark" value="[Cc]rown [Hh]osting|Ark Data Cent(?:re|er)s?|Cody Park|[sS]kyscape|Spring Park"/>
              <parameter key="Mainframe" value="[Mm]ainframe"/>
            </list>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
          </operator>
          <operator activated="true" class="text:documents_to_data" compatibility="7.3.000" expanded="true" height="82" name="Documents to Data" width="90" x="648" y="34">
            <parameter key="text_attribute" value="OriginalText"/>
          </operator>
          <!-- Adds the source URL as an attribute so each result row is traceable. -->
          <operator activated="true" class="generate_attributes" compatibility="7.3.000" expanded="true" height="82" name="Generate Attributes" width="90" x="782" y="34">
            <list key="function_descriptions">
              <parameter key="URL" value="%{GetURL}"/>
            </list>
          </operator>
          <connect from_port="example set" to_op="Extract Macro" to_port="example set"/>
          <connect from_op="Extract Macro" from_port="example set" to_op="Log" to_port="through 1"/>
          <connect from_op="Open File" from_port="file" to_op="Read Document" to_port="file"/>
          <connect from_op="Read Document" from_port="output" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="sink_example set" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="append" compatibility="7.3.000" expanded="true" height="82" name="Append" width="90" x="313" y="34"/>
      <operator activated="true" class="select_attributes" compatibility="7.3.000" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value="Text"/>
        <parameter key="attributes" value="URL|Ark|Mainframe"/>
      </operator>
      <!-- filters_logic_and is false: a row is kept when Ark OR Mainframe is present. -->
      <operator activated="true" class="filter_examples" compatibility="7.3.000" expanded="true" height="103" name="Filter Examples" width="90" x="581" y="34">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="Ark.is_not_missing."/>
          <parameter key="filters_entry_key" value="Mainframe.is_not_missing."/>
        </list>
        <parameter key="filters_logic_and" value="false"/>
      </operator>
      <operator activated="true" class="order_attributes" compatibility="7.3.000" expanded="true" height="82" name="Reorder Attributes" width="90" x="715" y="34">
        <parameter key="attribute_ordering" value="URL|Mainframe|Ark"/>
      </operator>
      <operator activated="true" class="write_excel" compatibility="7.3.000" expanded="true" height="82" name="Write Excel" width="90" x="849" y="34">
        <parameter key="excel_file" value="/Users/carl/Documents/Service Definition Matches.xlsx"/>
      </operator>
      <connect from_port="input 1" to_op="Read Excel" to_port="file"/>
      <connect from_op="Read Excel" from_port="output" to_op="Loop Examples" to_port="example set"/>
      <connect from_op="Loop Examples" from_port="output 1" to_op="Append" to_port="example set 1"/>
      <connect from_op="Append" from_port="merged set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
      <connect from_op="Reorder Attributes" from_port="example set output" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Best Answer

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager
    Solution Accepted

    hi...ok I've looked at your process.  Some thoughts..

    - Do all of the URLs you're reading point to PDF files?  Your Read Document operator is only looking for PDFs.

    - I tend not to use the Open File operator to get a web page.  I prefer to use the "Get Page" operator in the Web Mining extension.  There's a lot more functionality there.

    - That yellow text warning is what you want.  It's telling you that Handle Exception is skipping over the operator "Read Document" when it cannot do it.  If it were me, I would put both the Open File and the Read Document in the "Try" section.

    - That red text warning is telling you that whatever succeeds in the Handle Exception and is being passed onto Extract Information is not always a document, and hence it gives you an error (Extract Information requires a document).

     

    SO if it were me, I would try the following:

    - Place ALL the operators inside the Loop Examples inside the Handle Exception.  This way it skips over any problems it has along the way, and only passes complete successes to the output.

    - Rebuild the URL grab using Get Page rather than Open File.

     

    Scott

Answers

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager

    for skipping over errors, I would recommend the "Handle Exception" operator.  It's very handy.

     

    Scott

  • carlcarl Member Posts: 30 Guru

    Thank you Scott.  It feels like this approach should get me there.  I can't quite implement it correctly though.  

     

    I've copied the approach in the tutorial example for the operator.  When I run it, Handle exceptions cycles through the good URLs, and logs the bad one, but then the Extract Information operator (following Handle Exceptions) gives me this error.

     

    Dec 14, 2016 7:25:45 PM WARNING: Error occurred and will be neglected by Handle Exception: Could not read file 'InputFileObject': java.io.IOException: javax.crypto.BadPaddingException: Given final block not properly padded.
    Dec 14, 2016 7:25:45 PM SEVERE: Process failed: Wrong input of type 'File' at port 'document'. Expected type 'Document'.

     

    I tried Create Document (after the log) on the catch side of Handle Exceptions.  But that just moves the problem to the Append operator.  I don't really need to do any more than log the error, then proceed with the good URLs, but can't quite find a formulation to get me there.  Could you point me in the right direction?

     


    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      RapidMiner process: same pipeline as the original post, with Read Document
      wrapped in Handle Exception and three extra columns (Supplier, Lot,
      Service ID) carried through as macros. The import is limited to cells
      A1830:D1850.
    -->
    <process version="7.3.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.3.001" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="read_excel" compatibility="7.3.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
            <parameter key="excel_file" value="/Users/carl/Documents/SD PDFs.xlsx"/>
            <parameter key="imported_cell_range" value="A1830:D1850"/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="locale" value="English (United Kingdom)"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="SvceDef Link.true.file_path.attribute"/>
              <parameter key="3" value="Supplier.true.nominal.attribute"/>
              <parameter key="1" value="Lot.true.nominal.attribute"/>
              <parameter key="2" value="Service ID.true.nominal.attribute"/>
            </list>
          </operator>
          <operator activated="true" class="loop_examples" compatibility="7.3.001" expanded="true" height="103" name="Loop Examples" width="90" x="179" y="34">
            <process expanded="true">
              <!-- Copies the current row's URL into GetURL, plus three additional macros. -->
              <operator activated="true" class="extract_macro" compatibility="7.3.001" expanded="true" height="68" name="Extract Macro" width="90" x="45" y="136">
                <parameter key="macro" value="GetURL"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="SvceDef Link"/>
                <parameter key="example_index" value="%{example}"/>
                <list key="additional_macros">
                  <parameter key="Supplier" value="Supplier"/>
                  <parameter key="Lot" value="Lot"/>
                  <parameter key="Service ID" value="Service ID"/>
                </list>
              </operator>
              <!-- Progress log: records the apply count of Extract Macro on each pass. -->
              <operator activated="true" class="log" compatibility="7.3.001" expanded="true" height="82" name="Log" width="90" x="179" y="136">
                <parameter key="filename" value="/Users/carl/Documents/Log.log"/>
                <list key="log">
                  <parameter key="Log" value="operator.Extract Macro.value.applycount"/>
                </list>
              </operator>
              <!-- Has no input connection; the URL comes from the GetURL macro. -->
              <operator activated="true" class="open_file" compatibility="7.3.001" expanded="true" height="68" name="Open File" width="90" x="246" y="34">
                <parameter key="resource_type" value="URL"/>
                <parameter key="url" value="%{GetURL}"/>
              </operator>
              <operator activated="true" class="handle_exception" compatibility="7.3.001" expanded="true" height="82" name="Handle Exception" width="90" x="380" y="34">
                <!-- First subprocess: the guarded step. Only Read Document is inside it. -->
                <process expanded="true">
                  <operator activated="true" class="text:read_document" compatibility="7.3.000" expanded="true" height="68" name="Read Document" width="90" x="179" y="34">
                    <parameter key="content_type" value="pdf"/>
                    <parameter key="encoding" value="UTF-8"/>
                  </operator>
                  <connect from_port="in 1" to_op="Read Document" to_port="file"/>
                  <connect from_op="Read Document" from_port="output" to_port="out 1"/>
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="source_in 2" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                </process>
                <!--
                  Second subprocess: logs the exception message.
                  NOTE(review): "in 1" (the file from Open File) is passed through
                  Log (2) to "out 1", and "out 1" feeds the "document" port of
                  Extract Information. After a failed read that port therefore
                  receives a File instead of a Document.
                -->
                <process expanded="true">
                  <operator activated="true" class="log" compatibility="7.3.001" expanded="true" height="82" name="Log (2)" width="90" x="179" y="34">
                    <parameter key="filename" value="/Users/carl/Documents/Error Log.log"/>
                    <list key="log">
                      <parameter key="Error Message" value="operator.Handle Exception.value.exception"/>
                    </list>
                  </operator>
                  <connect from_port="in 1" to_op="Log (2)" to_port="through 1"/>
                  <connect from_op="Log (2)" from_port="through 1" to_port="out 1"/>
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="source_in 2" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                </process>
              </operator>
              <!--
                Regex queries. "(?:re|er)" is a non-capturing alternation so both
                "Centre" and "Center" match in full; "[Mm]" accepts either
                capitalisation of "mainframe".
              -->
              <operator activated="true" class="text:extract_information" compatibility="7.3.000" expanded="true" height="68" name="Extract Information" width="90" x="514" y="34">
                <parameter key="query_type" value="Regular Expression"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries">
                  <parameter key="Ark" value="[Cc]rown [Hh]osting|Ark Data Cent(?:re|er)s?|Cody Park|[sS]kyscape|Spring Park"/>
                  <parameter key="Mainframe" value="[Mm]ainframe"/>
                </list>
                <list key="regular_region_queries"/>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
                <list key="index_queries"/>
                <list key="jsonpath_queries"/>
              </operator>
              <operator activated="true" class="text:documents_to_data" compatibility="7.3.000" expanded="true" height="82" name="Documents to Data" width="90" x="648" y="34">
                <parameter key="text_attribute" value="OriginalText"/>
              </operator>
              <!-- Writes the macro values back as attributes on the result row. -->
              <operator activated="true" class="generate_attributes" compatibility="7.3.001" expanded="true" height="82" name="Generate Attributes" width="90" x="782" y="34">
                <list key="function_descriptions">
                  <parameter key="URL" value="%{GetURL}"/>
                  <parameter key="Supplier" value="%{Supplier}"/>
                  <parameter key="Lot" value="%{Lot}"/>
                  <parameter key="Service ID" value="%{Service ID}"/>
                </list>
              </operator>
              <connect from_port="example set" to_op="Extract Macro" to_port="example set"/>
              <connect from_op="Extract Macro" from_port="example set" to_op="Log" to_port="through 1"/>
              <connect from_op="Open File" from_port="file" to_op="Handle Exception" to_port="in 1"/>
              <connect from_op="Handle Exception" from_port="out 1" to_op="Extract Information" to_port="document"/>
              <connect from_op="Extract Information" from_port="document" to_op="Documents to Data" to_port="documents 1"/>
              <connect from_op="Documents to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
              <connect from_op="Generate Attributes" from_port="example set output" to_port="output 1"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="append" compatibility="7.3.001" expanded="true" height="82" name="Append" width="90" x="313" y="34"/>
          <operator activated="true" class="select_attributes" compatibility="7.3.001" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value="Text"/>
            <parameter key="attributes" value="Supplier|URL|Ark|Mainframe|Lot|Service ID"/>
          </operator>
          <!-- filters_logic_and is false: a row is kept when Ark OR Mainframe is present. -->
          <operator activated="true" class="filter_examples" compatibility="7.3.001" expanded="true" height="103" name="Filter Examples" width="90" x="581" y="34">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="Ark.is_not_missing."/>
              <parameter key="filters_entry_key" value="Mainframe.is_not_missing."/>
            </list>
            <parameter key="filters_logic_and" value="false"/>
          </operator>
          <operator activated="true" class="order_attributes" compatibility="7.3.001" expanded="true" height="82" name="Reorder Attributes" width="90" x="715" y="34">
            <parameter key="attribute_ordering" value="Supplier|Lot|Service ID|URL|Mainframe|Ark"/>
          </operator>
          <operator activated="true" class="write_excel" compatibility="7.3.001" expanded="true" height="82" name="Write Excel" width="90" x="849" y="34">
            <parameter key="excel_file" value="/Users/carl/Documents/Service Definition Matches.xlsx"/>
          </operator>
          <connect from_port="input 1" to_op="Read Excel" to_port="file"/>
          <connect from_op="Read Excel" from_port="output" to_op="Loop Examples" to_port="example set"/>
          <connect from_op="Loop Examples" from_port="output 1" to_op="Append" to_port="example set 1"/>
          <connect from_op="Append" from_port="merged set" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
          <connect from_op="Reorder Attributes" from_port="example set output" to_op="Write Excel" to_port="input"/>
          <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

     

  • carlcarl Member Posts: 30 Guru

    Perfect, thank you.   That worked.

     

    Yes, only after PDFs.  There was at least one ODT, but I filtered those out as I couldn't see an operator to handle those.

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager

    oh good glad it worked.

     

    Yes I don't know of a way to pull in .doc, .docx, .odt, etc... nicely.  Maybe there's an API that you can use to convert to pdf or text?  Otherwise submit to "Ideas".  :)

     

    Scott

Sign In or Register to comment.