Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
Coding open-ended data from surveys
A RapidMiner user wants to know the answer to this question: "Hey there, I am looking to code open-ended data from surveys. I'm used to QDA that uses a cluster algorithm to help find similar open-ends for easy categorization. Does RapidMiner have such an option? Thank you!"
0
Best Answer
-
yyhuang Administrator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 364 RM Data Scientist
For open-ended questions in a survey, you can apply vectorization to the text and then build clustering models on the TF-IDF values. It will group similar reviews and detect duplicated reviews.
Here is an example of text clustering process on job description data
<?xml version="1.0" encoding="UTF-8"?>
<!-- RapidMiner process (version 9.2.001): retrieve a job post data set, set a role on the
     Title attribute, vectorize the text, and cluster the result with k-means. -->
<process version="9.2.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true"
            class="process"
            compatibility="9.2.001"
            expanded="true"
            name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Step 1: load the sample data set from the repository path below. -->
      <operator activated="true"
                class="retrieve"
                compatibility="9.2.001"
                expanded="true"
                height="68"
                name="Retrieve REDUCED job post data set (5862 examples)"
                width="90"
                x="112"
                y="34">
        <parameter key="repository_entry" value="//Community Samples/Community Data Science/Text Mining Tutorials by Neil McGuigan/data/REDUCED job post data set (5862 examples)"/>
      </operator>
      <!-- Step 2: give the Title attribute the role "jobTitle". -->
      <operator activated="true"
                class="set_role"
                compatibility="9.2.001"
                expanded="true"
                height="82"
                name="Set Role"
                width="90"
                x="313"
                y="34">
        <parameter key="attribute_name" value="Title"/>
        <parameter key="target_role" value="jobTitle"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Step 3: text vectorization; tokens are split on the pattern \s+, pruning is enabled,
           and the number of new columns is capped at 1000.
           NOTE(review): attribute_filter_type is "all" while attribute names JobDescription;
           confirm which setting the operator actually applies. -->
      <operator activated="true"
                class="model_simulator:text_vectorization"
                compatibility="9.2.001"
                expanded="true"
                height="103"
                name="Text Vectorization"
                width="90"
                x="447"
                y="34">
        <parameter key="attribute_filter_type" value="all"/>
        <parameter key="attribute" value="JobDescription"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="add sentiment" value="false"/>
        <parameter key="add language" value="false"/>
        <parameter key="keep original" value="false"/>
        <parameter key="store training documents" value="true"/>
        <parameter key="store scoring documents" value="false"/>
        <parameter key="document class attribute" value=""/>
        <parameter key="token split" value="\s+"/>
        <parameter key="apply pruning" value="true"/>
        <parameter key="max number of new columns" value="1000"/>
      </operator>
      <!-- Step 4: k-means with k=4, up to 10 runs, using CosineSimilarity as the numerical
           measure; the cluster is added as an attribute, not as the label. -->
      <operator activated="true"
                class="concurrency:k_means"
                compatibility="9.2.001"
                expanded="true"
                height="82"
                name="Clustering"
                width="90"
                x="581"
                y="34">
        <parameter key="add_cluster_attribute" value="true"/>
        <parameter key="add_as_label" value="false"/>
        <parameter key="remove_unlabeled" value="false"/>
        <parameter key="k" value="4"/>
        <parameter key="max_runs" value="10"/>
        <parameter key="determine_good_start_values" value="true"/>
        <parameter key="measure_types" value="NumericalMeasures"/>
        <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
        <parameter key="nominal_measure" value="NominalDistance"/>
        <parameter key="numerical_measure" value="CosineSimilarity"/>
        <parameter key="divergence" value="SquaredEuclideanDistance"/>
        <parameter key="kernel_type" value="radial"/>
        <parameter key="kernel_gamma" value="1.0"/>
        <parameter key="kernel_sigma1" value="1.0"/>
        <parameter key="kernel_sigma2" value="0.0"/>
        <parameter key="kernel_sigma3" value="2.0"/>
        <parameter key="kernel_degree" value="3.0"/>
        <parameter key="kernel_shift" value="1.0"/>
        <parameter key="kernel_a" value="1.0"/>
        <parameter key="kernel_b" value="0.0"/>
        <parameter key="max_optimization_steps" value="100"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!-- Wiring: Retrieve, Set Role, Text Vectorization, Clustering in sequence; the cluster
           model goes to result 1 and the clustered example set to result 2. -->
      <connect from_op="Retrieve REDUCED job post data set (5862 examples)" from_port="output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Text Vectorization" to_port="example set input"/>
      <connect from_op="Text Vectorization" from_port="example set output" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_port="result 1"/>
      <connect from_op="Clustering" from_port="clustered set" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
8