Due to recent updates, all users are required to create an Altair One account to log in to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to log in to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you log in, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.
Analysis of Zalando Customer Reviews
Hey everyone
At my university we are using RapidMiner in a big data class for text mining. For a project I wanted to analyse some badly rated and some well rated articles on the e-commerce site Zalando. The goal was to prove that badly rated articles contain a significantly higher number of phrases like "does not fit", "darker than in the picture", etc.
For this purpose I crawled the web pages of 20 women's shoes and downloaded their reviews. I then ran a data analysis and looked at the occurrence of the words in the reviews (see XML code).
What I can't manage to do right now is to cross-reference those occurrences with shorter sentence fragments like "too big", etc. Does anyone have a clue how to do that, or could someone point me in the right direction?
I am a total newbie ;D, but I am planning on using RapidMiner more and more in the future, because it really is a great tool.
Thanks in advance.
At my university we are using RapidMiner in a big data class for text mining. For a project I wanted to analyse some badly rated and some well rated articles on the e-commerce site Zalando. The goal was to prove that badly rated articles contain a significantly higher number of phrases like "does not fit", "darker than in the picture", etc.
For this purpose I crawled the web pages of 20 women's shoes and downloaded their reviews. I then ran a data analysis and looked at the occurrence of the words in the reviews (see XML code).
What I can't manage to do right now is to cross-reference those occurrences with shorter sentence fragments like "too big", etc. Does anyone have a clue how to do that, or could someone point me in the right direction?
I am a total newbie ;D, but I am planning on using RapidMiner more and more in the future, because it really is a great tool.
Thanks in advance.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process: counts term occurrences across a directory of review
  text files.

  Pipeline (see the connect elements at the end of the root sub-process):
    Process Documents from Files, then Remove Duplicates, then Set Role,
    then Process Documents from Data, then Aggregate, then Transpose,
    then result 1.

  Closing tags for the list, operator and process elements were restored so
  the document is well-formed; no element, attribute or value was changed.
-->
<process version="7.0.001">
  <operator activated="true" class="process" compatibility="7.0.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Step 1: read every file (file_pattern "*") from the directory
           registered under the class label "Gute Bewertungen". No word
           vector is built here (create_word_vector=false); the text is kept
           (keep_text=true) for the later steps. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="7.0.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="45" y="34">
        <list key="text_directories">
          <!-- Absolute, machine-specific path: must be adjusted before the
               process is run on another computer. -->
          <parameter key="Gute Bewertungen" value="C:\Users\Sven\Dropbox\Big Data\AP2\Rapidminer\Files\Gute Reviews"/>
        </list>
        <parameter key="file_pattern" value="*"/>
        <parameter key="extract_text_only" value="true"/>
        <parameter key="use_file_extension_as_type" value="false"/>
        <parameter key="content_type" value="txt"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="create_word_vector" value="false"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <!-- Inner sub-process contains no operators: the incoming document
             port is wired directly to the outgoing one. -->
        <process expanded="true">
          <connect from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Step 2: remove duplicates, with the attribute filter restricted
           to the subset "text". -->
      <operator activated="true" class="remove_duplicates" compatibility="7.0.001" expanded="true" height="82" name="Remove Duplicates" width="90" x="246" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="text"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="treat_missing_values_as_duplicates" value="false"/>
      </operator>
      <!-- Step 3: give the "text" attribute the role "regular". -->
      <operator activated="true" class="set_role" compatibility="7.0.001" expanded="true" height="82" name="Set Role" width="90" x="380" y="34">
        <parameter key="attribute_name" value="text"/>
        <parameter key="target_role" value="regular"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Step 4: build the word vector (create_word_vector=true) using
           "Term Occurrences"; the raw text is dropped (keep_text=false)
           and no pruning is applied (prune_method=none). -->
      <operator activated="true" class="text:process_document_from_data" compatibility="7.0.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="514" y="34">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="Term Occurrences"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <!-- Per-document text preparation chain: Transform Cases, Tokenize,
             Stem (Porter), Filter Stopwords (German), Filter Tokens (by
             Length). -->
        <process expanded="true">
          <operator activated="true" class="text:transform_cases" compatibility="7.0.000" expanded="true" height="68" name="Transform Cases" width="90" x="45" y="85">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="7.0.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="85">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <!-- NOTE(review): stemming runs before the German stopword filter,
               so stopwords may already be altered when the filter sees them;
               the stopword list is German while a Porter stemmer is used.
               Confirm that this order and combination are intended. -->
          <operator activated="true" class="text:stem_porter" compatibility="7.0.000" expanded="true" height="68" name="Stem (Porter)" width="90" x="313" y="85"/>
          <operator activated="true" class="text:filter_stopwords_german" compatibility="7.0.000" expanded="true" height="68" name="Filter Stopwords (German)" width="90" x="447" y="85">
            <parameter key="stop_word_list" value="Standard"/>
          </operator>
          <!-- Keeps tokens whose length is between 3 and 25 characters. -->
          <operator activated="true" class="text:filter_by_length" compatibility="7.0.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="581" y="85">
            <parameter key="min_chars" value="3"/>
            <parameter key="max_chars" value="25"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Stem (Porter)" to_port="document"/>
          <connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Step 5: aggregate all attributes with the default function "sum";
           no group-by attributes are set. -->
      <operator activated="true" class="aggregate" compatibility="7.0.001" expanded="true" height="82" name="Aggregate" width="90" x="447" y="238">
        <parameter key="use_default_aggregation" value="true"/>
        <parameter key="attribute_filter_type" value="all"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="default_aggregation_function" value="sum"/>
        <list key="aggregation_attributes"/>
        <parameter key="group_by_attributes" value=""/>
        <parameter key="count_all_combinations" value="false"/>
        <parameter key="only_distinct" value="false"/>
        <parameter key="ignore_missings" value="true"/>
      </operator>
      <!-- Step 6: transpose the aggregated example set before it is
           delivered to result 1. -->
      <operator activated="true" class="transpose" compatibility="7.0.001" expanded="true" height="82" name="Transpose" width="90" x="581" y="187"/>
      <!-- Wiring of the top-level operators. -->
      <connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Remove Duplicates" to_port="example set input"/>
      <connect from_op="Remove Duplicates" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Transpose" to_port="example set input"/>
      <connect from_op="Transpose" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
[ /code]