Options

Matching stock return dates (Mo-Fr) with sentiment (daily) - join two datasets

TitzaaaTitzaaa Member Posts: 12 Learner I
Dear community,
I am doing a sentiment analysis on stock returns, using a specific company as an example.
Now I am looking for the easiest way to match my sentiment measures with the firm-specific stock prices.
As Xetra is only open Monday through Friday, with the exception of national holidays, I want to average my sentiment measure whenever two trading days are not consecutive (weekends and holidays).
In essence, I count the number of positive and negative words in firm-specific news and normalize them by the total number of words. I do that for every trading day; when two trading days are not consecutive, I want to average over all articles published between the market close and the next open.

Can a join operator do something like this automatically, meaning just pasting the measure when a stock price is available and averaging it and writing it to the next opening day when no matching date is available?

Thanks a lot for your help!


Answers

  • Options
    Telcontar120Telcontar120 Moderator, RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,635 Unicorn
    Based on your description, I don't think this is going to be done "automatically" with a simple join.  But that doesn't mean it can't be done!  With RapidMiner ETL, almost anything is possible.
    It sounds to me like the best approach would be to create a time index for your sentiment scores that mirrors your stock data.  This can be done using the Windowing operator in both time series and picking a common unit (although the settings used to generate this might be different in the two series).
    Once you have that, you should then be able to join the two series together using that as the join key.

    Brian T.
    Lindon Ventures 
    Data Science Consulting from Certified RapidMiner Experts
  • Options
    TitzaaaTitzaaa Member Posts: 12 Learner I
    Hi Telcontar120,
    thanks a lot for your help!
    However, I do not quite understand the Windowing operator. Could you help me with the settings that would suit my needs, based on my process below?

    <?xml version="1.0" encoding="UTF-8"?><process version="9.3.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Retrieve: loads the repository entry ../Data/Aktienkurs_Volumen_PATRIZIA; its output is wired to "Windowing".
               Judging by the entry name this is presumably the stock price/volume series (confirm in the repository). -->
          <operator activated="true" class="retrieve" compatibility="9.3.001" expanded="true" height="68" name="Retrieve Aktienkurs_Volumen_PATRIZIA" width="90" x="45" y="136">
            <parameter key="repository_entry" value="../Data/Aktienkurs_Volumen_PATRIZIA"/>
          </operator>
          <!-- Windowing (time_series extension) applied to the output of "Retrieve Aktienkurs_Volumen_PATRIZIA".
               Settings: window_size=20, step_size=1, overlapping windows allowed (no_overlapping_windows=false),
               no horizon/label generated (create_horizon_(labels)=false); attribute filter is "all" with
               include_special_attributes=false.
               NOTE(review): has_indices=false and indices_attribute is empty, so no date attribute is declared as
               the index of the series here. Confirm whether the date column should be the index if the result is
               meant to be aligned with the sentiment series by date. -->
          <operator activated="true" class="time_series:windowing" compatibility="9.3.001" expanded="true" height="82" name="Windowing" width="90" x="179" y="136">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="has_indices" value="false"/>
            <parameter key="indices_attribute" value=""/>
            <parameter key="window_size" value="20"/>
            <parameter key="no_overlapping_windows" value="false"/>
            <parameter key="step_size" value="1"/>
            <parameter key="create_horizon_(labels)" value="false"/>
            <parameter key="horizon_attribute" value=""/>
            <parameter key="horizon_size" value="1"/>
            <parameter key="horizon_offset" value="0"/>
          </operator>
          <!-- Retrieve: loads the repository entry ../Data/Nachrichten_Lexis-Nexis (the news data); its output is
               wired to "Set Role (2)" and from there into the text-processing branch. -->
          <operator activated="true" class="retrieve" compatibility="9.3.001" expanded="true" height="68" name="Retrieve" width="90" x="45" y="442">
            <parameter key="repository_entry" value="../Data/Nachrichten_Lexis-Nexis"/>
          </operator>
          <!-- Retrieve: loads the repository entry ../Data/SentiWS, which is passed to
               "Dictionary-Based Sentiment (Documents)" as the dictionary example set. -->
          <operator activated="true" class="retrieve" compatibility="9.3.001" expanded="true" height="68" name="Retrieve (2)" width="90" x="514" y="595">
            <parameter key="repository_entry" value="../Data/SentiWS"/>
          </operator>
          <!-- Builds a dictionary-based sentiment model (Operator Toolbox extension) from the SentiWS example set:
               attribute "Wort" holds the dictionary key and attribute "Sentiment" holds its value.
               No negation attribute is configured (negation_attribute is empty). The resulting model (port "mod")
               is consumed by "Apply Model (Documents)". -->
          <operator activated="true" class="operator_toolbox:dictionary_sentiment_learner" compatibility="2.0.001" expanded="true" height="82" name="Dictionary-Based Sentiment (Documents)" width="90" x="648" y="595">
            <parameter key="value_attribute" value="Sentiment"/>
            <parameter key="key_attribute" value="Wort"/>
            <parameter key="negation_attribute" value=""/>
            <parameter key="negation_window_size" value="1"/>
            <parameter key="use_symmetric_negation_window" value="true"/>
          </operator>
          <!-- Set Role on the news data: assigns the attribute "Datum" a role that is literally named "Datum".
               NOTE(review): "Datum" is not one of the standard role names (such as id or label); presumably a
               custom role was chosen only to mark the date column as special. Confirm that this is intended. -->
          <operator activated="true" class="set_role" compatibility="9.3.001" expanded="true" height="82" name="Set Role (2)" width="90" x="179" y="442">
            <parameter key="attribute_name" value="Datum"/>
            <parameter key="target_role" value="Datum"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Nominal to Text: converts the attributes listed in the subset ("Body Teil 1" and "Body Teil 2") to the
               text type, ahead of "Data to Documents".
               NOTE(review): the subset value starts with a leading "|", i.e. an empty first entry; presumably
               harmless, but verify the selection in the operator dialog. -->
          <operator activated="true" class="nominal_to_text" compatibility="9.3.001" expanded="true" height="82" name="Nominal to Text" width="90" x="313" y="442">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value="Body Teil 1"/>
            <parameter key="attributes" value="|Body Teil 1|Body Teil 2"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Data to Documents (Text Processing extension): turns the example set coming from "Nominal to Text"
               into a collection of documents for "Loop Collection". No per-attribute weights are specified
               (select_attributes_and_weights=false, specify_weights is empty). -->
          <operator activated="true" class="text:data_to_documents" compatibility="8.2.000" expanded="true" height="68" name="Data to Documents" width="90" x="447" y="442">
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
          </operator>
          <!-- Loop Collection: runs the inner text pipeline once per document of the incoming collection
               (port "single") and collects the processed documents on "output 1".
               Inner chain: Tokenize, Transform Cases, Filter Stopwords (German), Filter Tokens (by length). -->
          <operator activated="true" class="loop_collection" compatibility="9.3.001" expanded="true" height="82" name="Loop Collection" width="90" x="581" y="442">
            <parameter key="set_iteration_macro" value="false"/>
            <parameter key="macro_name" value="iteration"/>
            <parameter key="macro_start_value" value="1"/>
            <parameter key="unfold" value="false"/>
            <process expanded="true">
              <!-- Tokenize with mode "non letters".
                   NOTE(review): language="English" and max_token_length="3" are also set here; presumably they
                   only matter in other tokenization modes, but confirm, since the texts are filtered with a
                   German stopword list below. -->
              <operator activated="true" class="text:tokenize" compatibility="8.2.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <!-- Lower-cases all tokens. -->
              <operator activated="true" class="text:transform_cases" compatibility="8.2.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <!-- Removes German stopwords using the "Standard" list. -->
              <operator activated="true" class="text:filter_stopwords_german" compatibility="8.2.000" expanded="true" height="68" name="Filter Stopwords (2)" width="90" x="313" y="34">
                <parameter key="stop_word_list" value="Standard"/>
              </operator>
              <!-- Keeps only tokens whose length is between min_chars=3 and max_chars=10000. -->
              <operator activated="true" class="text:filter_by_length" compatibility="8.2.000" expanded="true" height="68" name="Filter Tokens (2)" width="90" x="514" y="34">
                <parameter key="min_chars" value="3"/>
                <parameter key="max_chars" value="10000"/>
              </operator>
              <connect from_port="single" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
              <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
              <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
              <connect from_op="Filter Tokens (2)" from_port="document" to_port="output 1"/>
              <portSpacing port="source_single" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <!-- Apply Model (Documents): applies the model from "Dictionary-Based Sentiment (Documents)" (port "mod")
               to the processed documents from "Loop Collection" (port "doc"); the example set on port "exa" goes to
               "Generate Attributes". No application parameters are set. -->
          <operator activated="true" class="operator_toolbox:apply_model_documents" compatibility="2.0.001" expanded="true" height="103" name="Apply Model (Documents)" width="90" x="983" y="493">
            <list key="application_parameters"/>
          </operator>
          <!-- Generate Attributes: derives the sentiment measures from the attributes Positivity, Negativity and
               [Uncovered Tokens], keeping all existing attributes (keep_all=true):
                 #Wörter                            = [Uncovered Tokens] + Positivity - Negativity
                 #Pos_Wörter, #Neg_Wörter           = Positivity, -Negativity
                 #Pos_Wörter/#Wörter etc.           = the above normalized by #Wörter
                 (#Pos_Wörter-#Neg_Wörter)/#Wörter  = (Positivity + Negativity) / #Wörter
               NOTE(review): every formula negates Negativity, which assumes the model reports Negativity as a
               non-positive number. TODO confirm against the output of "Apply Model (Documents)".
               NOTE(review): the denominator is not guarded against zero (e.g. a document left without tokens). -->
          <operator activated="true" class="generate_attributes" compatibility="9.3.001" expanded="true" height="82" name="Generate Attributes" width="90" x="1117" y="493">
            <list key="function_descriptions">
              <parameter key="#Pos_Wörter/#Wörter" value="Positivity/(-1*Negativity+Positivity+[Uncovered Tokens])"/>
              <parameter key="#Neg_Wörter/#Wörter" value="-Negativity/(-1*Negativity+Positivity+[Uncovered Tokens])"/>
              <parameter key="#Pos_Wörter" value="Positivity"/>
              <parameter key="#Neg_Wörter" value="-Negativity"/>
              <parameter key="#Wörter" value="[Uncovered Tokens]+Positivity-Negativity"/>
              <parameter key="(#Pos_Wörter-#Neg_Wörter)/#Wörter" value="(Positivity+Negativity)/(-1*Negativity+Positivity+[Uncovered Tokens])"/>
            </list>
            <parameter key="keep_all" value="true"/>
          </operator>
          <!-- Generate ID on the scored documents: non-nominal ids with offset 0. The result is the right input of
               "Join", which matches it against the ids generated by "Generate ID (2)" on the raw news data. -->
          <operator activated="true" class="generate_id" compatibility="9.3.001" expanded="true" height="82" name="Generate ID" width="90" x="1251" y="493">
            <parameter key="create_nominal_ids" value="false"/>
            <parameter key="offset" value="0"/>
          </operator>
          <!-- Retrieve: loads ../Data/Nachrichten_Lexis-Nexis a second time (the operator named "Retrieve" reads the
               same entry). This copy skips the text processing and supplies the original news columns to "Join"
               via "Generate ID (2)". -->
          <operator activated="true" class="retrieve" compatibility="9.3.001" expanded="true" height="68" name="Retrieve Nachrichten_Lexis-Nexis" width="90" x="1117" y="391">
            <parameter key="repository_entry" value="../Data/Nachrichten_Lexis-Nexis"/>
          </operator>
          <!-- Generate ID on the raw news data, with the same settings as "Generate ID" (non-nominal ids, offset 0),
               so that both inputs of "Join" carry comparable ids. -->
          <operator activated="true" class="generate_id" compatibility="9.3.001" expanded="true" height="82" name="Generate ID (2)" width="90" x="1251" y="391">
            <parameter key="create_nominal_ids" value="false"/>
            <parameter key="offset" value="0"/>
          </operator>
          <!-- Join: inner join keyed on the id attribute (use_id_attribute_as_key=true; key_attributes is empty).
               Left input = raw news with generated ids ("Generate ID (2)"), right input = sentiment scores with
               generated ids ("Generate ID"). Duplicate attributes are removed.
               NOTE(review): the ids on both sides are generated independently, so this only pairs each article with
               its own score if both branches deliver the rows in the same order. Confirm. -->
          <operator activated="true" class="concurrency:join" compatibility="9.3.001" expanded="true" height="82" name="Join" width="90" x="1385" y="442">
            <parameter key="remove_double_attributes" value="true"/>
            <parameter key="join_type" value="inner"/>
            <parameter key="use_id_attribute_as_key" value="true"/>
            <list key="key_attributes"/>
            <parameter key="keep_both_join_attributes" value="false"/>
          </operator>
          <!-- Select Attributes: subset filter listing the generated sentiment measures plus Datum,
               Publikation (geordnet), Titel, URL and Uncovered Tokens. The selection is not inverted and
               include_special_attributes=false. -->
          <operator activated="true" class="select_attributes" compatibility="9.3.001" expanded="true" height="82" name="Select Attributes" width="90" x="1519" y="442">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="#Neg_Wörter|#Neg_Wörter/#Wörter|#Pos_Wörter|#Pos_Wörter/#Wörter|#Wörter|(#Pos_Wörter-#Neg_Wörter)/#Wörter|Datum|Publikation (geordnet)|Titel|URL|Uncovered Tokens"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Set Role on the joined sentiment data: gives the date attribute "Datum" the id role.
               Why id: the downstream "Join (2)" is configured with use_id_attribute_as_key=true, so it matches rows
               on whichever attribute carries the id role. The previous target_role value "Datum" only created a
               custom role of that name, which left the date out of the join key.
               NOTE(review): the attribute that held the id role before (the generated id from the first join)
               presumably loses that role here. The stock-price input of "Join (2)" also needs its date attribute
               in the id role for the two series to match by date. Confirm both. -->
          <operator activated="true" class="set_role" compatibility="9.3.001" expanded="true" height="82" name="Set Role" width="90" x="1653" y="442">
            <parameter key="attribute_name" value="Datum"/>
            <parameter key="target_role" value="id"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Sort: orders the sentiment rows by "Datum" in increasing order before they reach "Windowing (2)". -->
          <operator activated="true" class="sort" compatibility="9.3.001" expanded="true" height="82" name="Sort" width="90" x="1787" y="442">
            <parameter key="attribute_name" value="Datum"/>
            <parameter key="sorting_direction" value="increasing"/>
          </operator>
          <!-- Windowing (time_series extension) applied to the sorted sentiment data, with the same settings as the
               "Windowing" operator on the stock branch: window_size=20, step_size=1, overlapping windows allowed,
               no horizon/label generated, attribute filter "all" with include_special_attributes=false.
               NOTE(review): has_indices=false, so "Datum" is not declared as the index attribute here. Confirm
               whether the date should be the index if the windows are meant to line up with trading days. -->
          <operator activated="true" class="time_series:windowing" compatibility="9.3.001" expanded="true" height="82" name="Windowing (2)" width="90" x="1921" y="442">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="has_indices" value="false"/>
            <parameter key="indices_attribute" value=""/>
            <parameter key="window_size" value="20"/>
            <parameter key="no_overlapping_windows" value="false"/>
            <parameter key="step_size" value="1"/>
            <parameter key="create_horizon_(labels)" value="false"/>
            <parameter key="horizon_attribute" value=""/>
            <parameter key="horizon_size" value="1"/>
            <parameter key="horizon_offset" value="0"/>
          </operator>
          <!-- Join (2): inner join of the windowed stock data (left, from "Windowing") with the windowed sentiment
               data (right, from "Windowing (2)"), keyed on the id attribute (use_id_attribute_as_key=true;
               key_attributes is empty). Its output is the process result.
               NOTE(review): the key is whichever attribute carries the id role on each side. The stock branch has
               no Set Role operator in this process, so verify that both inputs really carry the trading date in
               the id role; otherwise the rows will not be matched by date. -->
          <operator activated="true" class="concurrency:join" compatibility="9.3.001" expanded="true" height="82" name="Join (2)" width="90" x="1988" y="136">
            <parameter key="remove_double_attributes" value="true"/>
            <parameter key="join_type" value="inner"/>
            <parameter key="use_id_attribute_as_key" value="true"/>
            <list key="key_attributes"/>
            <parameter key="keep_both_join_attributes" value="false"/>
          </operator>
          <!-- Wiring of the top-level process.
               Stock branch:     Retrieve Aktienkurs_Volumen_PATRIZIA, Windowing, then Join (2) left input.
               Model branch:     Retrieve (2), Dictionary-Based Sentiment (Documents), then the model input of
                                 Apply Model (Documents).
               Text branch:      Retrieve, Set Role (2), Nominal to Text, Data to Documents, Loop Collection, then
                                 the document input of Apply Model (Documents).
               Sentiment branch: Apply Model (Documents), Generate Attributes, Generate ID, then Join right input;
                                 Retrieve Nachrichten_Lexis-Nexis, Generate ID (2), then Join left input;
                                 Join, Select Attributes, Set Role, Sort, Windowing (2), then Join (2) right input.
               Join (2) delivers the single process result (result 1). -->
          <connect from_op="Retrieve Aktienkurs_Volumen_PATRIZIA" from_port="output" to_op="Windowing" to_port="example set"/>
          <connect from_op="Windowing" from_port="windowed example set" to_op="Join (2)" to_port="left"/>
          <connect from_op="Retrieve" from_port="output" to_op="Set Role (2)" to_port="example set input"/>
          <connect from_op="Retrieve (2)" from_port="output" to_op="Dictionary-Based Sentiment (Documents)" to_port="exa"/>
          <connect from_op="Dictionary-Based Sentiment (Documents)" from_port="mod" to_op="Apply Model (Documents)" to_port="mod"/>
          <connect from_op="Set Role (2)" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
          <connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
          <connect from_op="Loop Collection" from_port="output 1" to_op="Apply Model (Documents)" to_port="doc"/>
          <connect from_op="Apply Model (Documents)" from_port="exa" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_op="Join" to_port="right"/>
          <connect from_op="Retrieve Nachrichten_Lexis-Nexis" from_port="output" to_op="Generate ID (2)" to_port="example set input"/>
          <connect from_op="Generate ID (2)" from_port="example set output" to_op="Join" to_port="left"/>
          <connect from_op="Join" from_port="join" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Sort" to_port="example set input"/>
          <connect from_op="Sort" from_port="example set output" to_op="Windowing (2)" to_port="example set"/>
          <connect from_op="Windowing (2)" from_port="windowed example set" to_op="Join (2)" to_port="right"/>
          <connect from_op="Join (2)" from_port="join" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    
    Thanks a lot!

Sign In or Register to comment.