Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Process failed abnormally in web mining

SGolbertSGolbert RapidMiner Certified Analyst, Member Posts: 344 Unicorn
edited April 2020 in Help

Hi, I have a web mining process in which some of the links are incorrect, so I use Handle Exception. Because Loop Examples was not working, the process is a bit messy:

 

<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: fetch the web page behind every "link" value of the
  stored data set and join the extracted page text back onto the original rows.

  Data flow (as wired by the connect elements at the end of each subprocess):
    Retrieve joined, Generate ID, Multiply
      output 1: left input of Join
      output 2: Select Attributes (link), Filter Examples (link not missing),
                Extract Macro (2), Loop, Append, Filter Examples (2),
                Process Documents from Data, Parse Numbers, Set Role,
                right input of Join
    Join, Select Attributes (2), Store
-->
<process version="8.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load the source data and give every row an id. -->
      <operator activated="true" class="retrieve" compatibility="8.2.000" expanded="true" height="68" name="Retrieve joined" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/sources/joined"/>
      </operator>
      <operator activated="true" class="generate_id" compatibility="8.2.000" expanded="true" height="82" name="Generate ID" width="90" x="179" y="34"/>
      <operator activated="true" class="multiply" compatibility="8.2.000" expanded="true" height="103" name="Multiply" width="90" x="179" y="238"/>

      <!-- Second copy of the data: filter on the "link" attribute and drop rows without a link. -->
      <operator activated="true" class="select_attributes" compatibility="8.2.000" expanded="true" height="82" name="Select Attributes" width="90" x="313" y="442">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="link"/>
      </operator>
      <operator activated="true" class="filter_examples" compatibility="8.2.000" expanded="true" height="103" name="Filter Examples" width="90" x="447" y="442">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="link.is_not_missing."/>
        </list>
      </operator>

      <!-- Deactivated (activated="false") and not referenced by any connect element below. -->
      <operator activated="false" class="split_data" compatibility="8.2.000" expanded="true" height="68" name="Split Data" width="90" x="581" y="442">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.2"/>
          <parameter key="ratio" value="0.8"/>
        </enumeration>
        <description align="center" color="transparent" colored="false" width="126">Downsampled!</description>
      </operator>

      <!--
        Defines the macro "number_examples", which the Loop below uses as its
        number_of_iterations. No macro_type is set here, so the operator default
        applies (presumably the example count; confirm against the operator docs).
      -->
      <operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro (2)" width="90" x="715" y="442">
        <parameter key="macro" value="number_examples"/>
        <list key="additional_macros"/>
      </operator>

      <!-- One iteration per row, executed sequentially (parallel execution is disabled). -->
      <operator activated="true" class="concurrency:loop" compatibility="8.2.000" expanded="true" height="82" name="Loop" width="90" x="514" y="697">
        <parameter key="number_of_iterations" value="%{number_examples}"/>
        <parameter key="enable_parallel_execution" value="false"/>
        <process expanded="true">
          <!-- Keep only the row whose position equals the current value of %{iteration}. -->
          <operator activated="true" class="filter_example_range" compatibility="8.2.000" expanded="true" height="82" name="Filter Example Range" width="90" x="112" y="34">
            <parameter key="first_example" value="%{iteration}"/>
            <parameter key="last_example" value="%{iteration}"/>
          </operator>
          <!--
            Copies the row's "link" value into the macro "link" and its "id" value
            into the macro "id". This operator has no outgoing connection; the
            following operators read the values through the macros only.
          -->
          <operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro (3)" width="90" x="246" y="34">
            <parameter key="macro" value="link"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="link"/>
            <parameter key="example_index" value="1"/>
            <list key="additional_macros">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <!--
            First subprocess: download the page at %{link}.
            Second subprocess (presumably the fallback that runs when the first
            one fails): forwards port "in 1" to port "out 1".
            NOTE(review): none of the connect elements of this loop body feeds an
            input of Handle Exception, so the second subprocess appears to have
            nothing to forward. "Documents to Data" would then receive no document
            for a failing link, which may be related to the reported
            "Could not create meta attributes" error. Confirm before relying on it.
          -->
          <operator activated="true" class="handle_exception" compatibility="8.2.000" expanded="true" height="82" name="Handle Exception" width="90" x="380" y="34">
            <process expanded="true">
              <operator activated="true" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="112" y="34">
                <parameter key="url" value="%{link}"/>
                <list key="query_parameters"/>
                <list key="request_properties"/>
              </operator>
              <connect from_op="Get Page" from_port="output" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
            <process expanded="true">
              <connect from_port="in 1" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>
          <!-- Convert the fetched document into an example set with the text in attribute "Text". -->
          <operator activated="true" class="text:documents_to_data" compatibility="8.1.000" expanded="true" height="82" name="Documents to Data" width="90" x="514" y="34">
            <parameter key="text_attribute" value="Text"/>
          </operator>
          <!-- Generate an "id" attribute from the macro %{id} so the page can be joined back later. -->
          <operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes" width="90" x="648" y="34">
            <list key="function_descriptions">
              <parameter key="id" value="%{id}"/>
            </list>
          </operator>
          <connect from_port="input 1" to_op="Filter Example Range" to_port="example set input"/>
          <connect from_op="Filter Example Range" from_port="example set output" to_op="Extract Macro (3)" to_port="example set"/>
          <connect from_op="Handle Exception" from_port="out 1" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>

      <!-- Combine the per-iteration results and filter on Response-Code equal to 200. -->
      <operator activated="true" class="append" compatibility="8.2.000" expanded="true" height="82" name="Append" width="90" x="648" y="697"/>
      <operator activated="true" class="filter_examples" compatibility="8.2.000" expanded="true" height="103" name="Filter Examples (2)" width="90" x="782" y="697">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="Response-Code.eq.200"/>
        </list>
      </operator>
      <!-- Run Extract Content on the "Text" attribute; no word vector is created and the text is kept. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="916" y="697">
        <parameter key="create_word_vector" value="false"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights">
          <parameter key="Text" value="1.0"/>
        </list>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content (2)" width="90" x="313" y="34"/>
          <connect from_port="document" to_op="Extract Content (2)" to_port="document"/>
          <connect from_op="Extract Content (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Apply Parse Numbers to "id" and give it the id role before it enters the right input of Join. -->
      <operator activated="true" class="parse_numbers" compatibility="8.2.000" expanded="true" height="82" name="Parse Numbers" width="90" x="1050" y="697">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="id"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="8.2.000" expanded="true" height="82" name="Set Role" width="90" x="1184" y="697">
        <parameter key="attribute_name" value="id"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles"/>
      </operator>

      <!-- Left join of the original rows (left) with the fetched page text (right); no explicit key attributes are listed. -->
      <operator activated="true" class="concurrency:join" compatibility="8.2.000" expanded="true" height="82" name="Join" width="90" x="916" y="85">
        <parameter key="join_type" value="left"/>
        <list key="key_attributes"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="8.2.000" expanded="true" height="82" name="Select Attributes (2)" width="90" x="1050" y="85">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="timestamp|text|message|link|excerpt"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <operator activated="true" class="store" compatibility="8.2.000" expanded="true" height="68" name="Store" width="90" x="1184" y="85">
        <parameter key="repository_entry" value="../data/sources/joined with text from links"/>
      </operator>

      <!-- Wiring of the top-level process. -->
      <connect from_op="Retrieve joined" from_port="output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Join" to_port="left"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Extract Macro (2)" to_port="example set"/>
      <connect from_op="Extract Macro (2)" from_port="example set" to_op="Loop" to_port="input 1"/>
      <connect from_op="Loop" from_port="output 1" to_op="Append" to_port="example set 1"/>
      <connect from_op="Append" from_port="merged set" to_op="Filter Examples (2)" to_port="example set input"/>
      <connect from_op="Filter Examples (2)" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Parse Numbers" to_port="example set input"/>
      <connect from_op="Parse Numbers" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Join" to_port="right"/>
      <connect from_op="Join" from_port="join" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Store" to_port="input"/>
      <connect from_op="Store" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

When I run it, I get an abnormal error:

 

Process failed abnormally
Ooops. Seems like you have found a bug. Please report it in our community at https://community.rapidminer.com. Reason: Could not create meta attributes
com.rapidminer.operator.OperatorException: Could not create meta attributes
at com.rapidminer.operator.text.io.Document2ExampleSet.doWork(Document2ExampleSet.java:101)
at com.rapidminer.operator.Operator.execute(Operator.java:1025)
at com.rapidminer.operator.execution.SimpleUnitExecutor.execute(SimpleUnitExecutor.java:77)
at com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:812)
at com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:807)
at java.security.AccessController.doPrivileged(Native Method)
at com.rapidminer.operator.ExecutionUnit.execute(ExecutionUnit.java:807)
at com.rapidminer.extension.concurrency.operator.process_control.loops.AbstractLoopOperator.doIteration(AbstractLoopOperator.java:408)
at com.rapidminer.extension.concurrency.operator.process_control.loops.AbstractLoopOperator.performSynchronizedLoop(AbstractLoopOperator.java:381)
at com.rapidminer.extension.concurrency.operator.process_control.loops.AbstractLoopOperator.doWork(AbstractLoopOperator.java:457)
at com.rapidminer.operator.Operator.execute(Operator.java:1025)
at com.rapidminer.operator.execution.SimpleUnitExecutor.execute(SimpleUnitExecutor.java:77)
at com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:812)
at com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:807)
at java.security.AccessController.doPrivileged(Native Method)
at com.rapidminer.operator.ExecutionUnit.execute(ExecutionUnit.java:807)
at com.rapidminer.operator.OperatorChain.doWork(OperatorChain.java:428)
at com.rapidminer.operator.Operator.execute(Operator.java:1025)
at com.rapidminer.Process.execute(Process.java:1315)
at com.rapidminer.Process.run(Process.java:1290)
at com.rapidminer.Process.run(Process.java:1181)
at com.rapidminer.Process.run(Process.java:1134)
at com.rapidminer.Process.run(Process.java:1129)
at com.rapidminer.Process.run(Process.java:1119)
at com.rapidminer.execution.jobcontainer.execution.SimpleExecutor.executeProcess(SimpleExecutor.java:84)
at com.rapidminer.execution.jobcontainer.EngineRunner.onApplicationEvent(EngineRunner.java:77)
at com.rapidminer.execution.jobcontainer.EngineRunner.onApplicationEvent(EngineRunner.java:31)
at org.springframework.context.event.SimpleApplicationEventMulticaster.invokeListener(SimpleApplicationEventMulticaster.java:167)
at org.springframework.context.event.SimpleApplicationEventMulticaster.multicastEvent(SimpleApplicationEventMulticaster.java:139)
at org.springframework.context.support.AbstractApplicationContext.publishEvent(AbstractApplicationContext.java:393)
at org.springframework.context.support.AbstractApplicationContext.publishEvent(AbstractApplicationContext.java:347)
at org.springframework.boot.context.event.EventPublishingRunListener.finished(EventPublishingRunListener.java:101)
at org.springframework.boot.SpringApplicationRunListeners.callFinishedListener(SpringApplicationRunListeners.java:79)
at org.springframework.boot.SpringApplicationRunListeners.finished(SpringApplicationRunListeners.java:72)
at org.springframework.boot.SpringApplication.run(SpringApplication.java:305)
at com.rapidminer.execution.jobcontainer.Application.main(Application.java:44)

Any idea what it could be? Since 8.0, the loops have been behaving a bit unpredictably :(

Tagged:

Answers

  • MaerkliMaerkli Member Posts: 84 Guru

    Hallo SGolbert,

     

    I tried to reproduce your XML in my own RM 8.2.000, but because I don't have your data, it was quite difficult to recreate the same situation. Suggestion: it could be a good idea to use breakpoints in your process. I also did some research in the pure Java literature, but I didn't find any relevant information.

    Maerkli

  • SGolbertSGolbert RapidMiner Certified Analyst, Member Posts: 344 Unicorn

    Hi Maerkli,

     

    Thank you for looking into the problem. As it turns out, I replaced the process with another one (I actually moved this step to the web-scraping process, which is made with Scrapy).

     

    If I come across the error again, I will try to reproduce it with simple data.

     

    Regards,

    Sebastian

Sign In or Register to comment.