Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you previously used to log in to the RapidMiner community. This will ensure that any previously created content is synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
LDA Optimization
JEdward
RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 578 Unicorn
Hi guys, any idea how best to tweak the parameters when optimizing the LDA model? I'm playing about with this example using RSS news feeds and am not 100% sure the Optimize operator is working well enough on small values for the number of topics. Do you think the dataset is simply not large enough?
Please note: increasing the start & end values for the number of topics gives better results. I want to build a generic usage example, so any suggested pointers are welcome.
<?xml version="1.0" encoding="UTF-8"?><process version="8.1.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
<process expanded="true">
<!-- Subprocess "Get News Feeds": reads four BBC RSS feeds, appends them into one example set, prepares the attributes and delivers a 70/30 split on ports "out 1" and "out 2". -->
<operator activated="true" class="subprocess" compatibility="8.1.001" expanded="true" height="103" name="Get News Feeds" width="90" x="45" y="34">
<process expanded="true">
<!-- Four RSS readers, one per BBC feed URL. They need network access at run time, so the data changes from run to run. -->
<operator activated="true" class="web:read_rss" compatibility="7.3.000" expanded="true" height="68" name="BBC Top Stories" width="90" x="45" y="34">
<parameter key="url" value="http://feeds.bbci.co.uk/news/rss.xml"/>
</operator>
<operator activated="true" class="web:read_rss" compatibility="7.3.000" expanded="true" height="68" name="BBC Asia" width="90" x="45" y="85">
<parameter key="url" value="http://feeds.bbci.co.uk/news/world/asia/rss.xml"/>
</operator>
<operator activated="true" class="web:read_rss" compatibility="7.3.000" expanded="true" height="68" name="BBC Business" width="90" x="45" y="136">
<parameter key="url" value="http://feeds.bbci.co.uk/news/business/rss.xml"/>
</operator>
<operator activated="true" class="web:read_rss" compatibility="7.3.000" expanded="true" height="68" name="BBC Entertainment" width="90" x="45" y="187">
<parameter key="url" value="http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"/>
</operator>
<!-- Merge the four feed outputs into a single example set. -->
<operator activated="true" class="append" compatibility="8.1.001" expanded="true" height="145" name="Append" width="90" x="179" y="34"/>
<!-- Duplicate the Title attribute as Title2. -->
<operator activated="true" class="generate_copy" compatibility="8.1.001" expanded="true" height="82" name="Generate Copy" width="90" x="313" y="34">
<parameter key="attribute_name" value="Title"/>
<parameter key="new_name" value="Title2"/>
</operator>
<!-- Convert only Link and Title2 to nominal. NOTE(review): presumably so that the later Data to Documents step does not fold them into the document text; the original Title stays a text attribute. Confirm against the Data to Documents settings. -->
<operator activated="true" class="text_to_nominal" compatibility="8.1.001" expanded="true" height="82" name="Text to Nominal" width="90" x="447" y="34">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="Link|Title2"/>
<description align="center" color="transparent" colored="false" width="126">Don't convert article link to document text.</description>
</operator>
<!-- Shuffled 0.7 / 0.3 split. No local random seed is set on this operator, so the split is presumably governed by the process-level seed; verify if reproducibility matters. -->
<operator activated="true" class="split_data" compatibility="8.1.001" expanded="true" height="103" name="Split Data" width="90" x="581" y="34">
<enumeration key="partitions">
<parameter key="ratio" value="0.7"/>
<parameter key="ratio" value="0.3"/>
</enumeration>
<parameter key="sampling_type" value="shuffled sampling"/>
<description align="center" color="transparent" colored="false" width="126">Randomly sort the data from the feeds. Split into training &amp; testing.</description>
</operator>
<!-- Wiring: feeds to Append, then Generate Copy, Text to Nominal and Split Data. Partition 1 (ratio 0.7) leaves on "out 1", partition 2 (ratio 0.3) on "out 2". -->
<connect from_op="BBC Top Stories" from_port="output" to_op="Append" to_port="example set 1"/>
<connect from_op="BBC Asia" from_port="output" to_op="Append" to_port="example set 2"/>
<connect from_op="BBC Business" from_port="output" to_op="Append" to_port="example set 3"/>
<connect from_op="BBC Entertainment" from_port="output" to_op="Append" to_port="example set 4"/>
<connect from_op="Append" from_port="merged set" to_op="Generate Copy" to_port="example set input"/>
<connect from_op="Generate Copy" from_port="example set output" to_op="Text to Nominal" to_port="example set input"/>
<connect from_op="Text to Nominal" from_port="example set output" to_op="Split Data" to_port="example set"/>
<connect from_op="Split Data" from_port="partition 1" to_port="out 1"/>
<connect from_op="Split Data" from_port="partition 2" to_port="out 2"/>
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
<portSpacing port="sink_out 3" spacing="0"/>
</process>
</operator>
<!-- Turns the second partition (ratio 0.3, port "out 2" of Get News Feeds) into a document collection; no attribute weights are specified. This is the branch that is later scored with the trained LDA model. -->
<operator activated="true" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents (2)" width="90" x="179" y="289">
<list key="specify_weights"/>
</operator>
<!-- Turns the first partition (ratio 0.7, port "out 1" of Get News Feeds) into a document collection; no attribute weights are specified. This is the branch the LDA model is trained on. -->
<operator activated="true" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents" width="90" x="179" y="34">
<list key="specify_weights"/>
</operator>
<!-- Text preparation for the training documents: each document in the collection runs through tokenize, case transformation, token length filter (min_chars = 2), English stopword filter and n-gram generation. All other parameters are left at the operator defaults. -->
<operator activated="true" class="loop_collection" compatibility="8.1.001" expanded="true" height="82" name="Loop Collection" width="90" x="313" y="34">
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
<operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
<!-- Only the lower bound on token length is set explicitly here. -->
<operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="313" y="34">
<parameter key="min_chars" value="2"/>
</operator>
<operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="447" y="34"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="8.1.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="581" y="34"/>
<!-- Linear chain: the loop's "single" document enters Tokenize and the n-gram output leaves on "output 1". -->
<connect from_port="single" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="output 1"/>
<portSpacing port="source_single" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
<description align="center" color="transparent" colored="false" width="126">Text Prep using Text Mining</description>
</operator>
<!-- Text preparation for the held-out documents. It repeats the same operator chain and the same explicit parameter (min_chars = 2) as "Loop Collection", so both branches are tokenized identically; keep the two loops in sync when changing either one. -->
<operator activated="true" class="loop_collection" compatibility="8.1.001" expanded="true" height="82" name="Loop Collection (3)" width="90" x="313" y="238">
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize (3)" width="90" x="45" y="34"/>
<operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34"/>
<operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (3)" width="90" x="313" y="34">
<parameter key="min_chars" value="2"/>
</operator>
<operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (3)" width="90" x="447" y="34"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="8.1.000" expanded="true" height="68" name="Generate n-Grams (2)" width="90" x="581" y="34"/>
<!-- Linear chain: the loop's "single" document enters Tokenize (3) and the n-gram output leaves on "output 1". -->
<connect from_port="single" to_op="Tokenize (3)" to_port="document"/>
<connect from_op="Tokenize (3)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
<connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Tokens (3)" to_port="document"/>
<connect from_op="Filter Tokens (3)" from_port="document" to_op="Filter Stopwords (3)" to_port="document"/>
<connect from_op="Filter Stopwords (3)" from_port="document" to_op="Generate n-Grams (2)" to_port="document"/>
<connect from_op="Generate n-Grams (2)" from_port="document" to_port="output 1"/>
<portSpacing port="source_single" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
<description align="center" color="transparent" colored="false" width="126">Text Prep using Text Mining</description>
</operator>
<!-- Grid search over the number of LDA topics. The inner process trains LDA on the prepared training documents and reports LDA's own performance output to the optimizer. -->
<operator activated="true" class="concurrency:optimize_parameters_grid" compatibility="8.1.001" expanded="true" height="187" name="Optimize Parameters (Grid)" width="90" x="514" y="34">
<!-- Single optimized parameter: LDA.number_of_topics with grid spec [5;20;5;linear]. NOTE(review): presumably min 5, max 20, 5 steps on a linear scale; confirm against the Optimize Parameters (Grid) documentation. -->
<list key="parameters">
<parameter key="LDA.number_of_topics" value="[5;20;5;linear]"/>
</list>
<process expanded="true">
<!-- LDA with 100 iterations and a fixed local random seed (1997). The number_of_topics value of 20 stored here is presumably overridden by the grid on each iteration. -->
<operator activated="true" class="operator_toolbox:lda" compatibility="1.0.000" expanded="true" height="124" name="LDA" width="90" x="179" y="34">
<parameter key="number_of_topics" value="20"/>
<parameter key="iterations" value="100"/>
<parameter key="use_local_random_seed" value="true"/>
<parameter key="local_random_seed" value="1997"/>
</operator>
<!-- Placeholder branch: a decision tree trained on generated direct mailing data, used only to feed the optimizer's "model" port (see the description on Decision Tree). It is unrelated to the news data; the real LDA model is returned through "output 3". -->
<operator activated="true" class="generate_direct_mailing_data" compatibility="8.1.001" expanded="true" height="68" name="Generate Direct Mailing Data" width="90" x="112" y="238"/>
<operator activated="true" class="concurrency:parallel_decision_tree" compatibility="8.1.001" expanded="true" height="103" name="Decision Tree" width="90" x="246" y="238">
<description align="center" color="transparent" colored="false" width="126">This is because the optimize operator doesn't recognize the LDA model type.</description>
</operator>
<!-- Port mapping: input 1 to LDA "col"; LDA "exa" to output 1, "top" to output 2, "mod" to output 3, "per" to performance; the placeholder tree to model. -->
<connect from_port="input 1" to_op="LDA" to_port="col"/>
<connect from_op="LDA" from_port="exa" to_port="output 1"/>
<connect from_op="LDA" from_port="top" to_port="output 2"/>
<connect from_op="LDA" from_port="mod" to_port="output 3"/>
<connect from_op="LDA" from_port="per" to_port="performance"/>
<connect from_op="Generate Direct Mailing Data" from_port="output" to_op="Decision Tree" to_port="training set"/>
<connect from_op="Decision Tree" from_port="model" to_port="model"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_performance" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
<portSpacing port="sink_output 4" spacing="0"/>
</process>
</operator>
<!-- Applies the LDA model returned by the optimizer (its "output 3") to the prepared held-out documents; all parameters are left at the operator defaults. -->
<operator activated="true" class="operator_toolbox:apply_model_documents" compatibility="1.0.000" expanded="true" height="103" name="Apply Model (Documents)" width="90" x="581" y="238"/>
<!-- Top-level wiring. Training branch: "out 1" to Data to Documents, Loop Collection, then Optimize Parameters (Grid). Held-out branch: "out 2" to Data to Documents (2), Loop Collection (3), then Apply Model (Documents). -->
<connect from_op="Get News Feeds" from_port="out 1" to_op="Data to Documents" to_port="example set"/>
<connect from_op="Get News Feeds" from_port="out 2" to_op="Data to Documents (2)" to_port="example set"/>
<connect from_op="Data to Documents (2)" from_port="documents" to_op="Loop Collection (3)" to_port="collection"/>
<connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
<connect from_op="Loop Collection" from_port="output 1" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
<connect from_op="Loop Collection (3)" from_port="output 1" to_op="Apply Model (Documents)" to_port="doc"/>
<!-- Results: result 1 = scored held-out documents, result 2 = optimizer performance, result 3 = LDA "exa" output, result 4 = LDA "top" output. The optimizer's "model" port (the placeholder decision tree) is intentionally left unconnected. -->
<connect from_op="Optimize Parameters (Grid)" from_port="performance" to_port="result 2"/>
<connect from_op="Optimize Parameters (Grid)" from_port="output 1" to_port="result 3"/>
<connect from_op="Optimize Parameters (Grid)" from_port="output 2" to_port="result 4"/>
<connect from_op="Optimize Parameters (Grid)" from_port="output 3" to_op="Apply Model (Documents)" to_port="mod"/>
<connect from_op="Apply Model (Documents)" from_port="exa" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="84"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>
0
Answers
Hi @JEdward, maybe I am missing something here, but does LDA have any parameters that can be optimized? It certainly doesn't look like it in my version of the operator, and conceptually speaking I'm not sure what it would optimize either. Also, is there a reason you are trying to solve this problem with LDA? Conceptually it seems like you'd be better off anyways with either GLM or a SVM for this type of problem---both of which offer plenty of parameters to optimize :-)
Or even Regularized DA gives you a little more flexibility around the assumption of strict linearity.
Lindon Ventures
Data Science Consulting from Certified RapidMiner Experts
tagging @mschmitz, the LDA meister.
Scott