Training-, Testing- and new Data

TextMiner123TextMiner123 Member Posts: 4 Contributor I
edited November 2018 in Help
Hello Guys,

I have created the following process for text classification into 3 classes. I have only two Excel files:
- the first "Read Excel" operator reads the manually labelled training data
- the second "Read Excel" operator reads the data to be predicted (the "to predicted data").

I have used X-Validation. Because there are so many possible ways to build such a process, I am a little confused about whether this is the right way to predict new data.
Interestingly, the accuracy is always the same when I use different "to predicted data" files on the second "Read Excel" operator.
Is my model actually learning from the training data?
If you could have a look at the process, I would be very glad.

Thank you very much & greetings

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.002" expanded="true" name="Process">
    <!-- Root process settings: random seed 2001, mail notification "never", SYSTEM encoding,
         main process not parallelized. -->
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <parameter key="parallelize_main_process" value="false"/>
    <process expanded="true" height="415" width="949">
      <!-- Training data reader: loads sheet 7, cells A1:V361, of the manually labelled workbook and
           takes the first row as column names. Its output feeds "Process Documents from Data". -->
      <operator activated="true" class="read_excel" compatibility="5.2.002" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <!-- Absolute path on the author's machine; must be adjusted to run the process elsewhere. -->
        <parameter key="excel_file" value="D:\Diplomarbeit-E-Government\Ohne Parteinamen45.xls"/>
        <parameter key="sheet_number" value="7"/>
        <parameter key="imported_cell_range" value="A1:V361"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="German (Germany)"/>
        <!-- One entry per column index. Each value has the shape name.flag.type.role; the true/false
             flag presumably marks whether the column is imported (TODO confirm). Only from_user,
             text and Training Data carry "true"; from_user and text are both typed "text", and
             Training Data is declared with role "label". -->
        <list key="data_set_meta_data_information">
          <parameter key="0" value="from_user.true.text.attribute"/>
          <parameter key="1" value="text.true.text.attribute"/>
          <parameter key="2" value="Training Data.true.attribute_value.label"/>
          <parameter key="3" value="Zählen Pos/Neg.false.integer.attribute"/>
          <parameter key="4" value="Sortieren.false.integer.attribute"/>
          <parameter key="5" value="name.false.polynominal.attribute"/>
          <parameter key="6" value="location.false.polynominal.attribute"/>
          <parameter key="7" value="description.false.polynominal.attribute"/>
          <parameter key="8" value="statuses_count.false.integer.attribute"/>
          <parameter key="9" value="followers_count.false.integer.attribute"/>
          <parameter key="10" value="friends_count.false.integer.attribute"/>
          <parameter key="11" value="created_at.false.polynominal.attribute"/>
          <parameter key="12" value="from_user_id.false.integer.attribute"/>
          <parameter key="13" value="geo_coordinates.false.binominal.attribute"/>
          <parameter key="14" value="iso_language_code.false.binominal.attribute"/>
          <parameter key="15" value="to_user_id.false.integer.attribute"/>
          <parameter key="16" value="to_user_id_str.false.integer.attribute"/>
          <parameter key="17" value="source.false.polynominal.attribute"/>
          <parameter key="18" value="from_user_id_str.false.integer.attribute"/>
          <parameter key="19" value="id_str.false.real.attribute"/>
          <parameter key="20" value="profile_image_url.false.polynominal.attribute"/>
          <parameter key="21" value="status_url.false.polynominal.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="true"/>
        <parameter key="datamanagement" value="double_array"/>
      </operator>
      <!-- Turns the training example set into a word vector (vector_creation = Term Occurrences).
           prune_method is "none", so the prune_* thresholds below are presumably inactive.
           The word list produced here is also passed to "Process Documents from Data (2)" by the
           top-level wiring. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.001" expanded="true" height="76" name="Process Documents from Data" width="90" x="45" y="120">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="Term Occurrences"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="none"/>
        <!-- NOTE(review): the key is spelled "prunde_below_percent" in this export. It is left
             untouched because the consuming extension may expect exactly this key; verify before
             renaming. -->
        <parameter key="prunde_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="999"/>
        <parameter key="prune_below_rank" value="5.0"/>
        <parameter key="prune_above_rank" value="5.0"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <parameter key="parallelize_vector_creation" value="false"/>
        <!-- Per-document chain: lower-case the text, then tokenize. Only these two operators are
             connected; the POS filter and the German stopword filter are deactivated and not wired. -->
        <process expanded="true" height="370" width="631">
          <operator activated="true" class="text:transform_cases" compatibility="5.2.001" expanded="true" height="60" name="Transform Cases" width="90" x="45" y="30">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <!-- Tokenizer runs in "regular expression" mode with expression "[ ]" (a single space).
               The "characters", "language" and "max_token_length" values presumably belong to other
               modes and are unused here (TODO confirm). -->
          <operator activated="true" class="text:tokenize" compatibility="5.2.001" expanded="true" height="60" name="Tokenize" width="90" x="45" y="120">
            <parameter key="mode" value="regular expression"/>
            <parameter key="characters" value="fdp-"/>
            <parameter key="expression" value="[ ]"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <operator activated="false" class="text:filter_tokens_by_pos" compatibility="5.2.001" expanded="true" height="60" name="Filter Tokens (by POS Tags)" width="90" x="246" y="120">
            <parameter key="language" value="German"/>
            <parameter key="expression" value="ADJD.*|ADV.*|NN.*|\$.*"/>
            <parameter key="invert_filter" value="false"/>
          </operator>
          <operator activated="false" class="text:filter_stopwords_german" compatibility="5.2.001" expanded="true" height="60" name="Filter Stopwords (3)" width="90" x="447" y="120">
            <parameter key="stop_word_list" value="Standard"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Attribute filter applied to the vectorized training data. The filter type is
           "no_missing_values", which presumably keeps only attributes without missing values
           (TODO confirm). Selection is not inverted and special attributes are not included in
           the filter. -->
      <operator activated="true" class="select_attributes" compatibility="5.2.002" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="255">
        <parameter key="attribute_filter_type" value="no_missing_values"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!-- Assigns the role "label" to the attribute "Training Data" before cross-validation.
           NOTE(review): the training reader's meta data entry for this column already ends in
           ".label", so this step may be redundant; confirm before removing. -->
      <operator activated="true" class="set_role" compatibility="5.2.002" expanded="true" height="76" name="Set Role" width="90" x="313" y="30">
        <parameter key="name" value="Training Data"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <operator activated="true" class="read_excel" compatibility="5.2.002" expanded="true" height="60" name="Read Excel (2)" width="90" x="246" y="255">
        <parameter key="excel_file" value="D:\Diplomarbeit-E-Government\Twitter Daten\TWITTERALYTICS\Forsa\Archiv\25 - 29 02 2012 - 06 03 2012 Twitteralytics v2.2.3b.xls"/>
        <parameter key="sheet_number" value="3"/>
        <parameter key="imported_cell_range" value="A1:S6636"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="German"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="from_user.true.polynominal.attribute"/>
          <parameter key="1" value="text.true.text.attribute"/>
          <parameter key="2" value="name.false.polynominal.attribute"/>
          <parameter key="3" value="location.false.polynominal.attribute"/>
          <parameter key="4" value="description.false.polynominal.attribute"/>
          <parameter key="5" value="statuses_count.false.integer.attribute"/>
          <parameter key="6" value="followers_count.false.integer.attribute"/>
          <parameter key="7" value="friends_count.false.integer.attribute"/>
          <parameter key="8" value="created_at.false.polynominal.attribute"/>
          <parameter key="9" value="from_user_id.false.integer.attribute"/>
          <parameter key="10" value="geo_coordinates.false.binominal.attribute"/>
          <parameter key="11" value="iso_language_code.false.binominal.attribute"/>
          <parameter key="12" value="to_user_id.false.integer.attribute"/>
          <parameter key="13" value="to_user_id_str.false.integer.attribute"/>
          <parameter key="14" value="source.false.polynominal.attribute"/>
          <parameter key="15" value="from_user_id_str.false.integer.attribute"/>
          <parameter key="16" value="id_str.false.real.attribute"/>
          <parameter key="17" value="profile_image_url.false.polynominal.attribute"/>
          <parameter key="18" value="status_url.false.polynominal.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="true"/>
        <parameter key="datamanagement" value="double_array"/>
      </operator>
      <!-- Vectorizes the scoring data with the same settings as the training side (Term
           Occurrences, prune_method "none"). The top-level wiring connects the training
           operator's word list to this operator's "word list" input. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.001" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="447" y="210">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="Term Occurrences"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="none"/>
        <!-- NOTE(review): the key is spelled "prunde_below_percent" in this export. It is left
             untouched because the consuming extension may expect exactly this key; verify before
             renaming. -->
        <parameter key="prunde_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="5.0"/>
        <parameter key="prune_above_rank" value="5.0"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <parameter key="parallelize_vector_creation" value="false"/>
        <!-- Per-document chain: lower-case, then tokenize. The German stopword filter is
             deactivated and not wired. -->
        <process expanded="true" height="314" width="613">
          <operator activated="true" class="text:transform_cases" compatibility="5.2.001" expanded="true" height="60" name="Transform Cases (2)" width="90" x="45" y="30">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <!-- Same mode and expression as the training tokenizer ("regular expression", "[ ]").
               The "characters" value differs from the training side (".:" here, "fdp-" there);
               presumably irrelevant in this mode (TODO confirm). -->
          <operator activated="true" class="text:tokenize" compatibility="5.2.001" expanded="true" height="60" name="Tokenize (2)" width="90" x="246" y="30">
            <parameter key="mode" value="regular expression"/>
            <parameter key="characters" value=".:"/>
            <parameter key="expression" value="[ ]"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <operator activated="false" class="text:filter_stopwords_german" compatibility="5.2.001" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="447" y="30">
            <parameter key="stop_word_list" value="Standard"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Cross-validation with number_of_validations = 10 and stratified sampling.
           The performance is computed solely from the example set entering through this operator's
           "training" port; both subprocesses only read the "training", "model" and "test set"
           ports. Exchanging the scoring workbook elsewhere in the process therefore cannot change
           the performance computed here.
           NOTE(review): create_complete_model is "false" although the process consumes this
           operator's model output; confirm which model that port delivers in this version. -->
      <operator activated="true" class="x_validation" compatibility="5.2.002" expanded="true" height="112" name="Validation" width="90" x="514" y="30">
        <parameter key="create_complete_model" value="false"/>
        <parameter key="average_performances_only" value="true"/>
        <parameter key="leave_one_out" value="false"/>
        <parameter key="number_of_validations" value="10"/>
        <parameter key="sampling_type" value="stratified sampling"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="parallelize_training" value="false"/>
        <parameter key="parallelize_testing" value="false"/>
        <!-- Training subprocess: a k-NN learner with k = 5, unweighted vote, NumericalMeasures with
             CosineSimilarity. The kernel_* and divergence values presumably apply only to other
             measure types (TODO confirm). -->
        <process expanded="true" height="341" width="290">
          <operator activated="true" class="k_nn" compatibility="5.2.002" expanded="true" height="76" name="k-NN" width="90" x="112" y="75">
            <parameter key="k" value="5"/>
            <parameter key="weighted_vote" value="false"/>
            <parameter key="measure_types" value="NumericalMeasures"/>
            <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
            <parameter key="nominal_measure" value="NominalDistance"/>
            <parameter key="numerical_measure" value="CosineSimilarity"/>
            <parameter key="divergence" value="GeneralizedIDivergence"/>
            <parameter key="kernel_type" value="radial"/>
            <parameter key="kernel_gamma" value="1.0"/>
            <parameter key="kernel_sigma1" value="1.0"/>
            <parameter key="kernel_sigma2" value="0.0"/>
            <parameter key="kernel_sigma3" value="2.0"/>
            <parameter key="kernel_degree" value="3.0"/>
            <parameter key="kernel_shift" value="1.0"/>
            <parameter key="kernel_a" value="1.0"/>
            <parameter key="kernel_b" value="0.0"/>
          </operator>
          <connect from_port="training" to_op="k-NN" to_port="training set"/>
          <connect from_op="k-NN" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="72"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: applies the fold model to the test set, passes the labelled result
             to Performance, and hands the performance to "averagable 1". -->
        <process expanded="true" height="341" width="300">
          <operator activated="true" class="apply_model" compatibility="5.2.002" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.2.002" expanded="true" height="76" name="Performance" width="90" x="179" y="120">
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- Top-level model application used for scoring the new data: no application parameters,
           create_view disabled. Its inputs are defined by the top-level connections. -->
      <operator activated="true" class="apply_model" compatibility="5.2.002" expanded="true" height="76" name="Apply Model (2)" width="90" x="648" y="165">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="Process Documents from Data (2)" to_port="word list"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Validation" to_port="training"/>
      <connect from_op="Read Excel (2)" from_port="output" to_op="Process Documents from Data (2)" to_port="example set"/>
      <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Validation" from_port="training" to_port="result 1"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="54"/>
    </process>
  </operator>
</process>
Sign In or Register to comment.