Dictionary-based sentiment analysis using my own dictionary

danczadancza Member Posts: 2 Contributor I
edited January 2020 in Help

Hey guys,

 

I'm trying to apply a dictionary-based sentiment analysis using my own dictionary.

First, my dataset consists of Excel files of newspaper interviews and is structured in the following way: column 1 contains the text (the text is split into the first question in row 1, the first answer in row 2, the second question in row 3, and so on), and column 2 contains an ID (the interview identifier, as well as whether the row is a question or an answer, and which one). I am able to get as far as retrieving the Excel files and processing the documents, including tokenization. Afterwards I use the "Dictionary Based Sentiment" and "Apply Dictionary Based Sentiment" operators, but I am not able to match the dictionary (an Excel file with column 1 for the word and column 2 for the weight: 1 for positive and -1 for negative) against the newspaper interviews.

 

Can you help me?

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.6.003">
<!-- RapidMiner process posted with the question. It has two branches that meet in
     "Apply Dictionary Based Sentiment (2)": one prepares the interview text as documents,
     the other reads the dictionary workbook and builds the sentiment model. -->
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.6.003" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="UTF-8"/>
<process expanded="true">
<!-- Text branch, step 1: Loop Files hands its "file object" to the nested Read Excel
     and collects that operator's output on "output 1". -->
<operator activated="true" class="concurrency:loop_files" compatibility="7.6.003" expanded="true" height="82" name="Loop Files" width="90" x="45" y="34">
<parameter key="directory" value="C:\Users\g21640\Desktop\ojee\bla"/>
<parameter key="filter_type" value="glob"/>
<parameter key="recursive" value="false"/>
<parameter key="enable_macros" value="false"/>
<parameter key="macro_for_file_name" value="file_name"/>
<parameter key="macro_for_file_type" value="file_type"/>
<parameter key="macro_for_folder_name" value="folder_name"/>
<parameter key="reuse_results" value="false"/>
<parameter key="enable_parallel_execution" value="true"/>
<process expanded="true">
<!-- NOTE(review): excel_file names one fixed workbook while the "file" port is also fed
     by the loop; presumably the port input takes precedence - confirm. -->
<operator activated="true" class="read_excel" compatibility="7.6.003" expanded="true" height="68" name="Read Excel" width="90" x="179" y="34">
<parameter key="excel_file" value="C:\Users\g21640\Desktop\ojee\bla\2000_0101_2003_3112(1).xlsx"/>
<parameter key="sheet_number" value="1"/>
<parameter key="imported_cell_range" value="A1:E43"/>
<parameter key="encoding" value="UTF-8"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<parameter key="date_format" value=""/>
<parameter key="time_zone" value="SYSTEM"/>
<parameter key="locale" value="English (United States)"/>
<!-- Five column definitions (data, id, item, type, idid); "idid" is declared with the id role. -->
<list key="data_set_meta_data_information">
<parameter key="0" value="data.false.file_path.attribute"/>
<parameter key="1" value="id.false.integer.attribute"/>
<parameter key="2" value="item.true.polynominal.attribute"/>
<parameter key="3" value="type.false.polynominal.attribute"/>
<parameter key="4" value="idid.true.polynominal.id"/>
</list>
<parameter key="read_not_matching_values_as_missings" value="true"/>
<parameter key="datamanagement" value="double_array"/>
<parameter key="data_management" value="auto"/>
</operator>
<connect from_port="file object" to_op="Read Excel" to_port="file"/>
<connect from_op="Read Excel" from_port="output" to_port="output 1"/>
<portSpacing port="source_file object" spacing="0"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<!-- Text branch, step 2: Append receives the Loop Files output and emits one merged set. -->
<operator activated="true" class="append" compatibility="7.6.003" expanded="true" height="82" name="Append" width="90" x="179" y="34">
<parameter key="datamanagement" value="double_array"/>
<parameter key="data_management" value="auto"/>
<parameter key="merge_type" value="all"/>
</operator>
<!-- Text branch, step 3: Nominal to Text is restricted to the single attribute "item". -->
<operator activated="true" class="nominal_to_text" compatibility="7.6.003" expanded="true" height="82" name="Nominal to Text" width="90" x="313" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="item"/>
<parameter key="attributes" value=""/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="nominal"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="file_path"/>
<parameter key="block_type" value="single_value"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="single_value"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
</operator>
<!-- Text branch, step 4: the nested process lower-cases and tokenizes each document.
     Only this operator's "example set" output is wired onward (a data table, not documents). -->
<operator activated="true" breakpoints="after" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data (8)" width="90" x="447" y="34">
<parameter key="create_word_vector" value="true"/>
<parameter key="vector_creation" value="Term Occurrences"/>
<parameter key="add_meta_information" value="true"/>
<parameter key="keep_text" value="true"/>
<parameter key="prune_method" value="none"/>
<parameter key="prune_below_percent" value="3.0"/>
<parameter key="prune_above_percent" value="30.0"/>
<parameter key="prune_below_rank" value="0.05"/>
<parameter key="prune_above_rank" value="0.95"/>
<parameter key="datamanagement" value="double_sparse_array"/>
<parameter key="data_management" value="auto"/>
<parameter key="select_attributes_and_weights" value="false"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="34">
<parameter key="transform_to" value="lower case"/>
</operator>
<operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="447" y="34">
<parameter key="mode" value="non letters"/>
<parameter key="characters" value=".:"/>
<parameter key="language" value="English"/>
<parameter key="max_token_length" value="3"/>
</operator>
<connect from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<!-- Text branch, step 5: assigns the role "text" to attribute "text" and the role "id" to "idid". -->
<operator activated="true" class="set_role" compatibility="7.6.003" expanded="true" height="82" name="Set Role" width="90" x="581" y="34">
<parameter key="attribute_name" value="text"/>
<parameter key="target_role" value="text"/>
<list key="set_additional_roles">
<parameter key="idid" value="id"/>
<parameter key="text" value="text"/>
</list>
</operator>
<!-- Text branch, step 6: turns the example set back into documents, selecting attribute "text" with weight 1.0.
     NOTE(review): the accepted answer in this thread states that this route loses the tokenization
     before the sentiment model is applied; it tokenizes inside a Loop Collection instead. -->
<operator activated="true" breakpoints="after" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="514" y="136">
<parameter key="select_attributes_and_weights" value="true"/>
<list key="specify_weights">
<parameter key="text" value="1.0"/>
</list>
</operator>
<!-- Dictionary branch, step 1: reads range A1:B12350 with the first row as names;
     column A is declared polynominal and column B integer. -->
<operator activated="true" breakpoints="after" class="read_excel" compatibility="7.6.003" expanded="true" height="68" name="Read Excel (2)" width="90" x="45" y="187">
<parameter key="excel_file" value="C:\Users\g21640\Desktop\Dropbox\Promotion\rapidminer listen\BPW_Wortlisten_test.xlsx"/>
<parameter key="sheet_number" value="1"/>
<parameter key="imported_cell_range" value="A1:B12350"/>
<parameter key="encoding" value="SYSTEM"/>
<parameter key="first_row_as_names" value="true"/>
<list key="annotations"/>
<parameter key="date_format" value=""/>
<parameter key="time_zone" value="SYSTEM"/>
<parameter key="locale" value="English (United States)"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="A.true.polynominal.attribute"/>
<parameter key="1" value="B.true.integer.attribute"/>
</list>
<parameter key="read_not_matching_values_as_missings" value="true"/>
<parameter key="datamanagement" value="double_array"/>
<parameter key="data_management" value="auto"/>
</operator>
<!-- Dictionary branch, step 2: builds the sentiment model with A as the key attribute
     and B as the value attribute; no negation attribute is set. -->
<operator activated="true" breakpoints="after" class="operator_toolbox:dictionary_sentiment_learner" compatibility="0.9.000" expanded="true" height="82" name="Dictionary Based Sentiment" width="90" x="313" y="187">
<parameter key="Value Attribute" value="B"/>
<parameter key="Key Attribute" value="A"/>
<parameter key="Negation Attribute" value=""/>
<parameter key="Negation Window Size" value="1"/>
</operator>
<!-- Join point: receives the documents on "doc" and the model on "mod"; its "res" output is the process result. -->
<operator activated="true" class="operator_toolbox:apply_dictionary_learner" compatibility="0.9.000" expanded="true" height="103" name="Apply Dictionary Based Sentiment (2)" width="90" x="581" y="238"/>
<!-- Wiring of the text branch. -->
<connect from_op="Loop Files" from_port="output 1" to_op="Append" to_port="example set 1"/>
<connect from_op="Append" from_port="merged set" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data (8)" to_port="example set"/>
<connect from_op="Process Documents from Data (8)" from_port="example set" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
<connect from_op="Data to Documents" from_port="documents" to_op="Apply Dictionary Based Sentiment (2)" to_port="doc"/>
<!-- Wiring of the dictionary branch and of the final result. -->
<connect from_op="Read Excel (2)" from_port="output" to_op="Dictionary Based Sentiment" to_port="exa"/>
<connect from_op="Dictionary Based Sentiment" from_port="mod" to_op="Apply Dictionary Based Sentiment (2)" to_port="mod"/>
<connect from_op="Apply Dictionary Based Sentiment (2)" from_port="res" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<description align="left" color="yellow" colored="true" height="128" resized="true" width="459" x="46" y="27">Step 1&lt;br&gt;</description>
</process>
</operator>
</process>

Best regards,

Daniel

Best Answer

  • MartinLiebigMartinLiebig Administrator, Moderator, Employee, RapidMiner Certified Analyst, RapidMiner Certified Expert, University Professor Posts: 3,314 RM Data Scientist
    Solution Accepted

    Hi @dancza,

     

    Thanks for using my operator! It's always great to see people start using something you wrote.

     

    Please try the attached process. If you do it your way, you lose the tokenization. You need to do it with Loop Collection in order to preserve it.

     

    I am actually in the process of reorganizing some code. In one of the next versions we will have a single operator called "Apply Model (Documents)" to apply various models to a collection of documents. It will check for the tokenization and throw proper error messages.

     

    Best,

    Martin

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.1.001">
    <!-- RapidMiner process attached to the accepted answer. The text branch converts the data to
         documents first and tokenizes them inside "Loop Collection"; the dictionary branch builds the
         sentiment model. Both meet in "Apply Dictionary Based Sentiment (2)". -->
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true">
    <!-- Text branch, step 1: Loop Files hands its "file object" to the nested Read Excel
         and collects that operator's output on "output 1". -->
    <operator activated="true" class="concurrency:loop_files" compatibility="8.1.001" expanded="true" height="82" name="Loop Files" width="90" x="45" y="34">
    <parameter key="directory" value="C:\Users\g21640\Desktop\ojee\bla"/>
    <process expanded="true">
    <!-- NOTE(review): excel_file names one fixed workbook while the "file" port is also fed
         by the loop; presumably the port input takes precedence - confirm. -->
    <operator activated="true" class="read_excel" compatibility="8.1.000" expanded="true" height="68" name="Read Excel" width="90" x="179" y="34">
    <parameter key="excel_file" value="C:\Users\g21640\Desktop\ojee\bla\2000_0101_2003_3112(1).xlsx"/>
    <parameter key="imported_cell_range" value="A1:E43"/>
    <parameter key="encoding" value="UTF-8"/>
    <parameter key="first_row_as_names" value="false"/>
    <list key="annotations">
    <parameter key="0" value="Name"/>
    </list>
    <!-- Five column definitions (data, id, item, type, idid); "idid" is declared with the id role. -->
    <list key="data_set_meta_data_information">
    <parameter key="0" value="data.false.file_path.attribute"/>
    <parameter key="1" value="id.false.integer.attribute"/>
    <parameter key="2" value="item.true.polynominal.attribute"/>
    <parameter key="3" value="type.false.polynominal.attribute"/>
    <parameter key="4" value="idid.true.polynominal.id"/>
    </list>
    </operator>
    <connect from_port="file object" to_op="Read Excel" to_port="file"/>
    <connect from_op="Read Excel" from_port="output" to_port="output 1"/>
    <portSpacing port="source_file object" spacing="0"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    </process>
    </operator>
    <!-- Text branch, step 2: Append receives the Loop Files output and emits one merged set. -->
    <operator activated="true" class="append" compatibility="8.1.001" expanded="true" height="82" name="Append" width="90" x="179" y="34"/>
    <!-- Text branch, step 3: Nominal to Text is restricted to the single attribute "item". -->
    <operator activated="true" class="nominal_to_text" compatibility="8.1.001" expanded="true" height="82" name="Nominal to Text" width="90" x="313" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="item"/>
    </operator>
    <!-- Deactivated (activated="false") and not referenced by any connect element below:
         kept from the question's process but no longer part of the data flow. -->
    <operator activated="false" breakpoints="after" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (8)" width="90" x="514" y="340">
    <parameter key="vector_creation" value="Term Occurrences"/>
    <parameter key="keep_text" value="true"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="34"/>
    <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="447" y="34"/>
    <connect from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <!-- Also deactivated and not referenced by any connect element below. -->
    <operator activated="false" class="set_role" compatibility="8.1.001" expanded="true" height="82" name="Set Role" width="90" x="648" y="340">
    <parameter key="attribute_name" value="text"/>
    <parameter key="target_role" value="text"/>
    <list key="set_additional_roles">
    <parameter key="idid" value="id"/>
    <parameter key="text" value="text"/>
    </list>
    </operator>
    <!-- Text branch, step 4: converts the example set coming from Nominal to Text into documents,
         selecting attribute "text" with weight 1.0.
         NOTE(review): the upstream operator is configured for attribute "item"; confirm that an
         attribute named "text" exists at this point. -->
    <operator activated="true" breakpoints="after" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents" width="90" x="447" y="34">
    <parameter key="select_attributes_and_weights" value="true"/>
    <list key="specify_weights">
    <parameter key="text" value="1.0"/>
    </list>
    </operator>
    <!-- Text branch, step 5: each element of the document collection ("single") is tokenized and then
         passed through Transform Cases. Per the answer text, doing this inside Loop Collection is what
         preserves the tokenization for the sentiment model. -->
    <operator activated="true" class="loop_collection" compatibility="8.1.001" expanded="true" height="82" name="Loop Collection" width="90" x="581" y="34">
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="246" y="34"/>
    <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="514" y="34"/>
    <connect from_port="single" to_op="Tokenize (2)" to_port="document"/>
    <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
    <connect from_op="Transform Cases (2)" from_port="document" to_port="output 1"/>
    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    </process>
    </operator>
    <!-- Dictionary branch, step 1: reads range A1:B12350; column A is declared polynominal and column B integer. -->
    <operator activated="true" breakpoints="after" class="read_excel" compatibility="8.1.000" expanded="true" height="68" name="Read Excel (2)" width="90" x="45" y="289">
    <parameter key="excel_file" value="C:\Users\g21640\Desktop\Dropbox\Promotion\rapidminer listen\BPW_Wortlisten_test.xlsx"/>
    <parameter key="imported_cell_range" value="A1:B12350"/>
    <list key="annotations"/>
    <list key="data_set_meta_data_information">
    <parameter key="0" value="A.true.polynominal.attribute"/>
    <parameter key="1" value="B.true.integer.attribute"/>
    </list>
    </operator>
    <!-- Dictionary branch, step 2: builds the sentiment model with A as the key attribute and B as the value attribute. -->
    <operator activated="true" breakpoints="after" class="operator_toolbox:dictionary_sentiment_learner" compatibility="1.0.000-SNAPSHOT" expanded="true" height="82" name="Dictionary Based Sentiment" width="90" x="313" y="289">
    <parameter key="value_attribute" value="B"/>
    <parameter key="key_attribute" value="A"/>
    </operator>
    <!-- Join point: receives the looped documents on "doc" and the model on "mod"; its "res" output is the process result. -->
    <operator activated="true" class="operator_toolbox:apply_dictionary_learner" compatibility="1.0.000-SNAPSHOT" expanded="true" height="103" name="Apply Dictionary Based Sentiment (2)" width="90" x="849" y="238"/>
    <!-- Wiring of the text branch. -->
    <connect from_op="Loop Files" from_port="output 1" to_op="Append" to_port="example set 1"/>
    <connect from_op="Append" from_port="merged set" to_op="Nominal to Text" to_port="example set input"/>
    <connect from_op="Nominal to Text" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
    <connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
    <connect from_op="Loop Collection" from_port="output 1" to_op="Apply Dictionary Based Sentiment (2)" to_port="doc"/>
    <!-- Wiring of the dictionary branch and of the final result. -->
    <connect from_op="Read Excel (2)" from_port="output" to_op="Dictionary Based Sentiment" to_port="exa"/>
    <connect from_op="Dictionary Based Sentiment" from_port="mod" to_op="Apply Dictionary Based Sentiment (2)" to_port="mod"/>
    <connect from_op="Apply Dictionary Based Sentiment (2)" from_port="res" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <description align="left" color="yellow" colored="true" height="128" resized="true" width="459" x="46" y="27">Step 1&lt;br&gt;</description>
    </process>
    </operator>
    </process>
    - Head of Data Science Services at RapidMiner -
    Dortmund, Germany
    sgenzerdang

Answers

Sign In or Register to comment.