Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
Using 3 GB RAM for Rapidminer
Hi All,
I am trying to process 143,000 records and am using 3 GB of RAM for RapidMiner. It is taking too many days to process. The input file size is only 337 MB.
I integrated MySQL with RapidMiner and loaded the data into MySQL.
My XML is like this:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process definition: reads rows from a database, runs them through a text-processing subprocess, converts nominal attributes to numerical ones, and feeds the result to k-means clustering. -->
<process version="6.0.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
<process expanded="true">
<!-- Step 1: load the columns id, title, keywords, keyphrases and description from table cat45 through the connection named "mysql". The query has no WHERE or LIMIT clause. -->
<operator activated="true" class="read_database" compatibility="6.0.002" expanded="true" height="60" name="Read Database" width="90" x="45" y="30">
<parameter key="connection" value="mysql"/>
<parameter key="query" value="SELECT `id`, `title`, `keywords`, `keyphrases`, `description` FROM `cat45`"/>
<enumeration key="parameters"/>
</operator>
<!-- Step 2: Nominal to Text operator; no parameters are set on it here. -->
<operator activated="true" class="nominal_to_text" compatibility="6.0.002" expanded="true" height="76" name="Nominal to Text" width="90" x="45" y="165"/>
<!-- Step 3: Process Documents from Data. Its inner process chains Transform Cases, Filter Stopwords (English) and Generate n-Grams (Terms), in that order. -->
<!-- NOTE(review): no tokenize operator appears in this inner process; confirm whether that is intended. The text operators also carry compatibility 5.3.002 while the other operators carry 6.0.002. -->
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="179" y="390">
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="165"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="246" y="165"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="5.3.002" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="380" y="165"/>
<!-- Inner wiring: document input, then Transform Cases, then Filter Stopwords (English), then Generate n-Grams (Terms), then output port "document 1". -->
<connect from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<!-- Step 4: Nominal to Numerical, configured only with an empty comparison_groups list. -->
<operator activated="true" class="nominal_to_numerical" compatibility="6.0.002" expanded="true" height="94" name="Nominal to Numerical" width="90" x="380" y="345">
<list key="comparison_groups"/>
</operator>
<!-- Step 5: k_means clustering with k = 4 and measure_types set to MixedMeasures. -->
<operator activated="true" class="k_means" compatibility="6.0.002" expanded="true" height="76" name="Clustering" width="90" x="581" y="120">
<parameter key="k" value="4"/>
<parameter key="measure_types" value="MixedMeasures"/>
</operator>
<!-- Top-level wiring: Read Database, Nominal to Text, Process Documents from Data, Nominal to Numerical, Clustering. Only the cluster model port is connected to a result port. -->
<connect from_op="Read Database" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Nominal to Numerical" to_port="example set input"/>
<connect from_op="Nominal to Numerical" from_port="example set output" to_op="Clustering" to_port="example set"/>
<connect from_op="Clustering" from_port="cluster model" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Your help is very much appreciated.
Thanks in Advance,
Venkat
I am trying to process 143,000 records and am using 3 GB of RAM for RapidMiner. It is taking too many days to process. The input file size is only 337 MB.
I integrated MySQL with RapidMiner and loaded the data into MySQL.
My XML is like this:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process definition: reads rows from a database, runs them through a text-processing subprocess, converts nominal attributes to numerical ones, and feeds the result to k-means clustering. -->
<process version="6.0.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
<process expanded="true">
<!-- Step 1: load the columns id, title, keywords, keyphrases and description from table cat45 through the connection named "mysql". The query has no WHERE or LIMIT clause. -->
<operator activated="true" class="read_database" compatibility="6.0.002" expanded="true" height="60" name="Read Database" width="90" x="45" y="30">
<parameter key="connection" value="mysql"/>
<parameter key="query" value="SELECT `id`, `title`, `keywords`, `keyphrases`, `description` FROM `cat45`"/>
<enumeration key="parameters"/>
</operator>
<!-- Step 2: Nominal to Text operator; no parameters are set on it here. -->
<operator activated="true" class="nominal_to_text" compatibility="6.0.002" expanded="true" height="76" name="Nominal to Text" width="90" x="45" y="165"/>
<!-- Step 3: Process Documents from Data. Its inner process chains Transform Cases, Filter Stopwords (English) and Generate n-Grams (Terms), in that order. -->
<!-- NOTE(review): no tokenize operator appears in this inner process; confirm whether that is intended. The text operators also carry compatibility 5.3.002 while the other operators carry 6.0.002. -->
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="179" y="390">
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="165"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="246" y="165"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="5.3.002" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="380" y="165"/>
<!-- Inner wiring: document input, then Transform Cases, then Filter Stopwords (English), then Generate n-Grams (Terms), then output port "document 1". -->
<connect from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<!-- Step 4: Nominal to Numerical, configured only with an empty comparison_groups list. -->
<operator activated="true" class="nominal_to_numerical" compatibility="6.0.002" expanded="true" height="94" name="Nominal to Numerical" width="90" x="380" y="345">
<list key="comparison_groups"/>
</operator>
<!-- Step 5: k_means clustering with k = 4 and measure_types set to MixedMeasures. -->
<operator activated="true" class="k_means" compatibility="6.0.002" expanded="true" height="76" name="Clustering" width="90" x="581" y="120">
<parameter key="k" value="4"/>
<parameter key="measure_types" value="MixedMeasures"/>
</operator>
<!-- Top-level wiring: Read Database, Nominal to Text, Process Documents from Data, Nominal to Numerical, Clustering. Only the cluster model port is connected to a result port. -->
<connect from_op="Read Database" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Nominal to Numerical" to_port="example set input"/>
<connect from_op="Nominal to Numerical" from_port="example set output" to_op="Clustering" to_port="example set"/>
<connect from_op="Clustering" from_port="cluster model" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Your help is very much appreciated.
Thanks in Advance,
Venkat
0
Answers
Perhaps you could reduce the size of your SELECT statement by selecting only "title"? If this works, you
really need more RAM.
Why do you need the "Nominal to Numerical" operator if TF-IDF already delivers numerical values for all tokens found?
And last but not least: why do you not apply the "Tokenize" operator inside the "Process Documents" operator?
You should start with tokenizing first; if this works, you can add further operators such as Generate n-Grams.