The number of times words appear in the RMP file is not the same as in the CSV file.

Booram · November 2023

Please show me a word/document frequency table like the one in the business interpretation presentation. When I run the .rmp file in RapidMiner, the word "Call" has a frequency of 1, but when I manually check the raw Excel data it has hundreds of occurrences.

Please reply as soon as possible. I am attaching a screenshot for your reference, along with the .rmp file and the CSV file.

rjones13 · November 2023

Hi @Booram,

Please find the process below as a starting point. I would recommend using the following course on RapidMiner Academy as a reference:
https://academy.rapidminer.com/courses/text-and-web-mining-with-rapidminer

From here, you can think about other actions, e.g., filtering out stop words and stemming.

Best,
Roland

<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process definition.
  Pipeline: Read Excel, then Nominal to Text, then Process Documents from Data
  (which runs a nested Tokenize step on every document).
  Delivered results: the example set on "result 1" and the word list on "result 2".
-->
<process version="10.3.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <!-- Root operator: process-wide settings (logging, random seed, mail notification, encoding). -->
  <operator activated="true"
            class="process"
            compatibility="10.3.000"
            expanded="true"
            name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true">
      <!--
        Step 1: read sheet number 1 of the workbook, taking attribute names from the first row.
        The meta data list declares column 0 as "Category" and column 1 as "Message",
        both polynominal.
        NOTE(review): excel_file is an absolute path under one user's Downloads folder;
        presumably it has to be pointed at a local copy before the process can run.
      -->
      <operator activated="true"
                class="read_excel"
                compatibility="10.3.000"
                expanded="true"
                height="68"
                name="Read Excel"
                width="90"
                x="45"
                y="34">
        <parameter key="excel_file" value="C:\Users\rjones\Downloads\SPAM-text-message.xlsx"/>
        <parameter key="sheet_selection" value="sheet number"/>
        <parameter key="sheet_number" value="1"/>
        <parameter key="imported_cell_range" value="A1"/>
        <parameter key="encoding" value="UTF-8"/>
        <parameter key="use_header_row" value="true"/>
        <parameter key="header_row" value="1"/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Category.true.polynominal.attribute"/>
          <parameter key="1" value="Message.true.polynominal.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
      </operator>
      <!--
        Step 2: Nominal to Text, restricted by attribute_filter_type "single"
        to the one attribute named "Message".
      -->
      <operator activated="true"
                class="nominal_to_text"
                compatibility="10.3.000"
                expanded="true"
                height="82"
                name="Nominal to Text"
                width="90"
                x="179"
                y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Message"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!--
        Step 3: Process Documents from Data. A word vector is created with
        vector_creation "TF-IDF" and prune_method "none"; the original text is
        not kept (keep_text is false).
      -->
      <operator activated="true"
                class="text:process_document_from_data"
                compatibility="10.0.000"
                expanded="true"
                height="82"
                name="Process Documents from Data"
                width="90"
                x="313"
                y="34">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!-- Nested sub-process: one Tokenize operator in "non letters" mode. -->
          <operator activated="true"
                    class="text:tokenize"
                    compatibility="10.0.000"
                    expanded="true"
                    height="68"
                    name="Tokenize"
                    width="90"
                    x="112"
                    y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <!-- Inner wiring: incoming document to Tokenize, Tokenize output to the sub-process sink. -->
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Outer wiring: the three operators in sequence, then both outputs of step 3 to the result ports. -->
      <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

The number of times words appear in the RMP file is not the same as in the CSV file.

Answers