Dimensionality Reduction

FlixportFlixport Member Posts: 33 Contributor II
edited April 2019 in Help
Hey Guys,

We have applied the PCA and Chi-Squared methods for dimensionality reduction. This reduces the data volume considerably, which leads to my question: can I remove the Principal Component Analysis (PCA) operator and keep only the Weight by PCA method?
Thank you in advance for answering my question.

<?xml version="1.0" encoding="UTF-8"?><process version="9.2.001">
  <!--
    RapidMiner process: loads the "reut2" repository entry, prepares it
    (attribute selection, id generation, example filtering, label role,
    correlated attribute removal) and then runs three parallel
    dimensionality reduction branches inside "Subprocess":
      1. Weight by Chi Squared Statistic, then Select by Weights (top k)
      2. PCA (keep variance)
      3. Weight by PCA, then Select by Weights (top k)
    Each branch stores its result in the repository and is also delivered
    to one of the process result ports.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.2.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true">
      <!-- Reads the example set stored under the repository entry "reut2". -->
      <operator activated="true" class="retrieve" compatibility="9.2.001" expanded="true" height="68" name="Retrieve reut2" width="90" x="45" y="187">
        <parameter key="repository_entry" value="reut2"/>
      </operator>
      <!-- Attribute filter of type "subset" with invert_selection set to true:
           the listed attributes (exchanges, orgs, people, places, text_orig,
           topics, zahlen) are the ones excluded from the output. Special
           attributes are included in the filter (include_special_attributes). -->
      <operator activated="true" class="select_attributes" compatibility="9.2.001" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="187">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="exchanges|orgs|people|places|text_orig|topics|zahlen"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- Adds an id attribute; nominal ids are disabled and the offset is 0. -->
      <operator activated="true" class="generate_id" compatibility="9.2.001" expanded="true" height="82" name="Generate ID" width="90" x="313" y="187">
        <parameter key="create_nominal_ids" value="false"/>
        <parameter key="offset" value="0"/>
      </operator>
      <!-- Custom filter: keeps examples whose "category" does not equal "?".
           NOTE(review): this compares against the literal string "?"; if the
           intent is to drop examples with a missing category, confirm that
           the data really contains "?" as a value rather than a true missing
           value. -->
      <operator activated="true" class="filter_examples" compatibility="9.2.001" expanded="true" height="103" name="Filter Examples" width="90" x="45" y="289">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="custom_filters"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list">
          <parameter key="filters_entry_key" value="category.does_not_equal.?"/>
        </list>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
      </operator>
      <!-- Marks the "category" attribute as the label. The chi squared
           weighting further down presumably relies on this label; confirm
           against the operator documentation. -->
      <operator activated="true" class="set_role" compatibility="9.2.001" expanded="true" height="82" name="Set Role" width="90" x="179" y="289">
        <parameter key="attribute_name" value="category"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Removes attributes whose absolute correlation is greater than 0.8.
           NOTE(review): attribute_order is "random" and no local random seed
           is used, so which attribute of a correlated pair survives
           presumably depends on the process random_seed (2001); confirm if
           reproducibility matters. -->
      <operator activated="true" class="remove_correlated_attributes" compatibility="9.2.001" expanded="true" height="82" name="Remove Correlated Attributes" width="90" x="313" y="289">
        <parameter key="correlation" value="0.8"/>
        <parameter key="filter_relation" value="greater"/>
        <parameter key="attribute_order" value="random"/>
        <parameter key="use_absolute_correlation" value="true"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!-- Container for the three reduction branches. Input "in 1" is copied
           by "Multiply (2)"; outputs "out 1", "out 2" and "out 3" carry the
           chi squared, PCA and PCA weight results respectively. -->
      <operator activated="true" class="subprocess" compatibility="9.2.001" expanded="true" height="124" name="Subprocess" width="90" x="581" y="187">
        <process expanded="true">
          <!-- Fans the prepared example set out to the three branches. -->
          <operator activated="true" class="multiply" compatibility="9.2.001" expanded="true" height="124" name="Multiply (2)" width="90" x="380" y="442"/>
          <!-- Branch 1: chi squared attribute weights, not normalized,
               sorted descending, numerical attributes discretized into
               10 bins (number_of_bins). -->
          <operator activated="true" class="weight_by_chi_squared_statistic" compatibility="9.2.001" expanded="true" height="82" name="Weight by Chi Squared Statistic" width="90" x="648" y="289">
            <parameter key="normalize_weights" value="false"/>
            <parameter key="sort_weights" value="true"/>
            <parameter key="sort_direction" value="descending"/>
            <parameter key="number_of_bins" value="10"/>
          </operator>
          <!-- Branch 1: selects attributes by the "top k" relation with
               k = 45. The "weight" and "p" values are presumably ignored for
               this relation; confirm against the operator documentation. -->
          <operator activated="true" class="select_by_weights" compatibility="9.2.001" expanded="true" height="103" name="Select by Weights (ChiSq)" width="90" x="849" y="289">
            <parameter key="weight_relation" value="top k"/>
            <parameter key="weight" value="10.0"/>
            <parameter key="k" value="45"/>
            <parameter key="p" value="0.1"/>
            <parameter key="deselect_unknown" value="true"/>
            <parameter key="use_absolute_weights" value="false"/>
          </operator>
          <!-- Branch 1: persists the chi squared selection in the repository. -->
          <operator activated="true" class="store" compatibility="9.2.001" expanded="true" height="68" name="Store" width="90" x="1050" y="289">
            <parameter key="repository_entry" value="data/data_out_select_by_chisq_weights"/>
          </operator>
          <!-- Branch 2: principal component analysis in "keep variance" mode
               with a variance threshold of 0.8. Its output only feeds
               "Store (2)"; no other operator in this subprocess consumes it. -->
          <operator activated="true" class="principal_component_analysis" compatibility="9.2.001" expanded="true" height="103" name="PCA" width="90" x="648" y="442">
            <parameter key="dimensionality_reduction" value="keep variance"/>
            <parameter key="variance_threshold" value="0.8"/>
            <parameter key="number_of_components" value="1"/>
          </operator>
          <!-- Branch 3: attribute weights taken from component number 1.
               This operator is wired to "Multiply (2)" output 3, not to the
               "PCA" operator above, so it does not depend on branch 2.
               Weights are sorted ascending here, whereas the chi squared
               branch sorts descending. -->
          <operator activated="true" class="weight_by_pca" compatibility="9.2.001" expanded="true" height="82" name="Weight by PCA" width="90" x="648" y="595">
            <parameter key="normalize_weights" value="false"/>
            <parameter key="sort_weights" value="true"/>
            <parameter key="sort_direction" value="ascending"/>
            <parameter key="component_number" value="1"/>
          </operator>
          <!-- Branch 3: "top k" selection with k = 45. Unlike the chi squared
               branch, use_absolute_weights is true here. -->
          <operator activated="true" class="select_by_weights" compatibility="9.2.001" expanded="true" height="103" name="Select by Weights (PCA)" width="90" x="849" y="595">
            <parameter key="weight_relation" value="top k"/>
            <parameter key="weight" value="10.0"/>
            <parameter key="k" value="45"/>
            <parameter key="p" value="0.1"/>
            <parameter key="deselect_unknown" value="true"/>
            <parameter key="use_absolute_weights" value="true"/>
          </operator>
          <!-- Branch 3: persists the PCA weight based selection. -->
          <operator activated="true" class="store" compatibility="9.2.001" expanded="true" height="68" name="Store (3)" width="90" x="1050" y="595">
            <parameter key="repository_entry" value="data/data_out_select_by_pca_weights"/>
          </operator>
          <!-- Branch 2: persists the PCA transformed example set. -->
          <operator activated="true" class="store" compatibility="9.2.001" expanded="true" height="68" name="Store (2)" width="90" x="1050" y="442">
            <parameter key="repository_entry" value="data/data_out_pca"/>
          </operator>
          <!-- Wiring: Multiply output 1 goes to branch 1 (chi squared),
               output 2 to branch 2 (PCA), output 3 to branch 3 (Weight by
               PCA). Store feeds "out 1", Store (2) feeds "out 2" and
               Store (3) feeds "out 3". -->
          <connect from_port="in 1" to_op="Multiply (2)" to_port="input"/>
          <connect from_op="Multiply (2)" from_port="output 1" to_op="Weight by Chi Squared Statistic" to_port="example set"/>
          <connect from_op="Multiply (2)" from_port="output 2" to_op="PCA" to_port="example set input"/>
          <connect from_op="Multiply (2)" from_port="output 3" to_op="Weight by PCA" to_port="example set"/>
          <connect from_op="Weight by Chi Squared Statistic" from_port="weights" to_op="Select by Weights (ChiSq)" to_port="weights"/>
          <connect from_op="Weight by Chi Squared Statistic" from_port="example set" to_op="Select by Weights (ChiSq)" to_port="example set input"/>
          <connect from_op="Select by Weights (ChiSq)" from_port="example set output" to_op="Store" to_port="input"/>
          <connect from_op="Store" from_port="through" to_port="out 1"/>
          <connect from_op="PCA" from_port="example set output" to_op="Store (2)" to_port="input"/>
          <connect from_op="Weight by PCA" from_port="weights" to_op="Select by Weights (PCA)" to_port="weights"/>
          <connect from_op="Weight by PCA" from_port="example set" to_op="Select by Weights (PCA)" to_port="example set input"/>
          <connect from_op="Select by Weights (PCA)" from_port="example set output" to_op="Store (3)" to_port="input"/>
          <connect from_op="Store (3)" from_port="through" to_port="out 3"/>
          <connect from_op="Store (2)" from_port="through" to_port="out 2"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">Subprozess</description>
      </operator>
      <!-- Top level wiring: the preparation operators form a linear chain
           ending in "Subprocess", whose three outputs are mapped one to one
           onto the process result ports 1 to 3. -->
      <connect from_op="Retrieve reut2" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Remove Correlated Attributes" to_port="example set input"/>
      <connect from_op="Remove Correlated Attributes" from_port="example set output" to_op="Subprocess" to_port="in 1"/>
      <connect from_op="Subprocess" from_port="out 1" to_port="result 1"/>
      <connect from_op="Subprocess" from_port="out 2" to_port="result 2"/>
      <connect from_op="Subprocess" from_port="out 3" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>





Best Answer

Answers

  • varunm1varunm1 Moderator, Member Posts: 1,207 Unicorn
    Hello @Flixport

    Sorry, can you explain your question? I saw your process and it looks fine. Why are you asking to remove PCA?
    Regards,
    Varun
    https://www.varunmandalapu.com/

    Be Safe. Follow precautions and Maintain Social Distancing

  • FlixportFlixport Member Posts: 33 Contributor II
    The reason I ask is that I want to simplify the process, so I would like to omit PCA. As you already know, I built the validation model in my other process with Random Forest, and that model does not use the data set produced by PCA ^^


Sign In or Register to comment.