Options

Parallelize nested process & Parallelize main process have no impact

nurmannurman Member Posts: 8 Contributor II
edited November 2018 in Help

I have tried running my process with and without the "parallelize nested process" and "parallelize main process" options, but they have NO impact on performance on my 4-core processors. I have tested on both Ubuntu and Windows machines, but there is no difference in performance.
Can you please help?

I have attached my process below:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition (process version 5.1.014).
  Top level: a single Loop Files operator whose nested process reads a CSV
  file, sorts and splits it, builds windowed series data, runs a sliding
  window validation around an SVM learner, and applies the resulting model.
-->
<process version="5.1.014">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.014" expanded="true" name="Process">
    <!-- Main-process parallelization flag is switched on for the root operator. -->
    <parameter key="parallelize_main_process" value="true"/>
    <process expanded="true" height="350" width="614">
      <!-- Loop Files: configured with a directory and a filename filter for .csv files;
           the nested-process parallelization flag is switched on. -->
      <operator activated="true" class="loop_files" compatibility="5.1.014" expanded="true" height="130" name="Loop Files (2)" width="90" x="179" y="75">
        <parameter key="directory" value="D:\documents\VaR\test"/>
        <parameter key="filter" value=".*\.csv"/>
        <parameter key="parallelize_nested_process" value="true"/>
        <process expanded="true" height="476" width="505">
          <!-- Read CSV: its "file" port is fed by the loop's "file object" port (see the
               connections below). NOTE(review): csv_file is also set to a fixed path that
               differs from the loop directory; presumably the connected port takes
               precedence, confirm. -->
          <operator activated="true" class="read_csv" compatibility="5.1.014" expanded="true" height="60" name="Read CSV (2)" width="90" x="45" y="30">
            <parameter key="csv_file" value="C:\Users\mhelf\tmp\files\file1.txt"/>
            <parameter key="column_separators" value=","/>
            <list key="annotations"/>
            <parameter key="encoding" value="windows-1252"/>
            <!-- Column meta data: Date is a nominal id; the remaining columns are
                 real-valued attributes except Volume, which is an integer attribute. -->
            <list key="data_set_meta_data_information">
              <parameter key="0" value="Date.true.nominal.id"/>
              <parameter key="1" value="Open.true.real.attribute"/>
              <parameter key="2" value="High.true.real.attribute"/>
              <parameter key="3" value="Low.true.real.attribute"/>
              <parameter key="4" value="Close.true.real.attribute"/>
              <parameter key="5" value="Change.true.real.attribute"/>
              <parameter key="6" value="Change Percent.true.real.attribute"/>
              <parameter key="7" value="Volume.true.integer.attribute"/>
            </list>
          </operator>
          <!-- Two sorts run back to back: first by Close, then by Date.
               NOTE(review): the first sort may be redundant given the second; confirm. -->
          <operator activated="true" class="sort" compatibility="5.1.014" expanded="true" height="76" name="Sort (2)" width="90" x="45" y="120">
            <parameter key="attribute_name" value="Close"/>
          </operator>
          <operator activated="true" class="sort" compatibility="5.1.014" expanded="true" height="76" name="Sort" width="90" x="45" y="255">
            <parameter key="attribute_name" value="Date"/>
          </operator>
          <!-- Split Data: two partitions with ratios 0.99 and 0.01, linear sampling.
               Partition 1 feeds Sort (3); partition 2 feeds Windowing (2). -->
          <operator activated="true" class="split_data" compatibility="5.1.014" expanded="true" height="94" name="Split Data" width="90" x="179" y="255">
            <enumeration key="partitions">
              <parameter key="ratio" value="0.99"/>
              <parameter key="ratio" value="0.01"/>
            </enumeration>
            <parameter key="sampling_type" value="linear sampling"/>
          </operator>
          <!-- Windowing (2): window size 1 on partition 2; its output is the data that
               Apply Model (2) scores. -->
          <operator activated="true" class="series:windowing" compatibility="5.1.002" expanded="true" height="76" name="Windowing (2)" width="90" x="179" y="390">
            <parameter key="window_size" value="1"/>
            <parameter key="label_attribute" value="Close"/>
          </operator>
          <!-- Sort (3): sorts partition 1 by Date before windowing. -->
          <operator activated="true" class="sort" compatibility="5.1.014" expanded="true" height="76" name="Sort (3)" width="90" x="246" y="165">
            <parameter key="attribute_name" value="Date"/>
          </operator>
          <!-- Windowing: window size 1, horizon 1, with label creation enabled and
               label_attribute set to Open; its output is the training input of Validation. -->
          <operator activated="true" class="series:windowing" compatibility="5.1.002" expanded="true" height="76" name="Windowing" width="90" x="189" y="32">
            <parameter key="horizon" value="1"/>
            <parameter key="window_size" value="1"/>
            <parameter key="create_label" value="true"/>
            <parameter key="label_attribute" value="Open"/>
          </operator>
          <!-- Validation: sliding window validation with training window width 20,
               training step size 5, test window width 20 and horizon 5.
               It contains two nested processes: training, then testing. -->
          <operator activated="true" class="series:sliding_window_validation" compatibility="5.1.002" expanded="true" height="112" name="Validation" width="90" x="313" y="30">
            <parameter key="training_window_width" value="20"/>
            <parameter key="training_window_step_size" value="5"/>
            <parameter key="test_window_width" value="20"/>
            <parameter key="horizon" value="5"/>
            <!-- Training subprocess: the training data goes into the SVM, whose model is passed on. -->
            <process expanded="true" height="408" width="309">
              <operator activated="true" class="support_vector_machine" compatibility="5.1.014" expanded="true" height="112" name="SVM" width="90" x="114" y="30">
                <parameter key="convergence_epsilon" value="0.001"/>
              </operator>
              <connect from_port="training" to_op="SVM" to_port="training set"/>
              <connect from_op="SVM" from_port="model" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <!-- Testing subprocess: the model is applied to the test set and the labelled
                 result goes to the forecasting performance operator. -->
            <process expanded="true" height="408" width="309">
              <operator activated="true" class="apply_model" compatibility="5.1.014" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <operator activated="true" class="series:forecasting_performance" compatibility="5.1.002" expanded="true" height="76" name="Performance" width="90" x="181" y="30">
                <parameter key="horizon" value="1"/>
              </operator>
              <connect from_port="model" to_op="Apply Model" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
              <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <!-- Apply Model (2): applies the model from Validation to the output of Windowing (2). -->
          <operator activated="true" class="apply_model" compatibility="5.1.014" expanded="true" height="76" name="Apply Model (2)" width="90" x="380" y="210">
            <list key="application_parameters"/>
          </operator>
          <!-- Wiring of the nested process. After Split Data there are two branches:
               partition 1 goes through Sort (3), Windowing and Validation;
               partition 2 goes through Windowing (2); both meet at Apply Model (2).
               NOTE(review): nothing here is connected to "out 1", yet the outer process
               wires "out 1" of Loop Files (2) to "result 1"; confirm this is intended. -->
          <connect from_port="file object" to_op="Read CSV (2)" to_port="file"/>
          <connect from_op="Read CSV (2)" from_port="output" to_op="Sort (2)" to_port="example set input"/>
          <connect from_op="Sort (2)" from_port="example set output" to_op="Sort" to_port="example set input"/>
          <connect from_op="Sort" from_port="example set output" to_op="Split Data" to_port="example set"/>
          <connect from_op="Split Data" from_port="partition 1" to_op="Sort (3)" to_port="example set input"/>
          <connect from_op="Split Data" from_port="partition 2" to_op="Windowing (2)" to_port="example set input"/>
          <connect from_op="Windowing (2)" from_port="example set output" to_op="Apply Model (2)" to_port="unlabelled data"/>
          <connect from_op="Sort (3)" from_port="example set output" to_op="Windowing" to_port="example set input"/>
          <connect from_op="Windowing" from_port="example set output" to_op="Validation" to_port="training"/>
          <connect from_op="Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
          <connect from_op="Validation" from_port="training" to_port="out 2"/>
          <connect from_op="Validation" from_port="averagable 1" to_port="out 3"/>
          <connect from_op="Apply Model (2)" from_port="labelled data" to_port="out 4"/>
          <portSpacing port="source_file object" spacing="0"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
          <portSpacing port="sink_out 5" spacing="0"/>
        </process>
      </operator>
      <!-- Outer wiring: the four outputs of Loop Files (2) become process results 1 to 4. -->
      <connect from_op="Loop Files (2)" from_port="out 1" to_port="result 1"/>
      <connect from_op="Loop Files (2)" from_port="out 2" to_port="result 2"/>
      <connect from_op="Loop Files (2)" from_port="out 3" to_port="result 3"/>
      <connect from_op="Loop Files (2)" from_port="out 4" to_port="result 4"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>
Would really appreciate your assistance. Thank you very much.

Answers

  • Options
    MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    Hi,

    if "parallelize process" is activated, independent paths inside the process are executed in parallel. So in your example process, {Windowing (2)} is executed in parallel to {Sort (3), Windowing, Validation}. Since Windowing (2) is probably quite fast, you don't notice any performance improvement from this.

    Best, Marius
  • Options
    nurmannurman Member Posts: 8 Contributor II
    Dear Marius,

    I hope you can help me here. All I want is to run all the processes in the Loop Files operator with different input datasets (different CSV files) in parallel, leveraging multiple cores. How can I achieve this? Could you please fix my workflow and show how this works?

    Would really appreciate your help.

    Thank you very much.

    regards,
    nurman
  • Options
    MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    There is no parallelized Loop Files operator. But you could install the Parallel Extension and use the X-Validation (Parallel) operator inside your loop. The validation is probably the most time-consuming step in your process, so you would gain a lot from parallelizing it. Be warned, though, that the Parallel Extension sometimes still has some hiccups.
Sign In or Register to comment.