How to develop a word cloud from a PDF file using RapidMiner

drsvdpdrsvdp Member Posts: 1 Newbie
How can I develop a word cloud from a PDF file using RapidMiner?


  • GuiGui Member Posts: 10 Contributor II
    You can use the Text Processing extension on Marketplace.

    > Import the data (PDF file) using "Read Document" (you can use "Loop Files" and put the "Read Document" operator inside it to read all the PDF files in a folder).
    Then use "Process Documents" to process your text(s), and finally use "WordList to Data" and run the process with its "EXA" port connected. After that, go to the Results view, open the visualizations on the left, and choose Word Cloud. I think this should help for now, but you can improve the process as you like or need.
  • MarcoBarradasMarcoBarradas Administrator, Employee, RapidMiner Certified Analyst, Member Posts: 271 Unicorn
    Hi @drsvdp,

    Please take a look at the text mining tutorial and make sure you install the Text Mining (Text Processing) extension.

    This is an example that might help you.

    <?xml version="1.0" encoding="UTF-8"?><process version="9.10.010">
      <!--
        Example process: build a word list from PDF files, convert it to a data
        table, sort it, and keep rows 1 to 100 so the result can be shown with
        the Word Cloud visualization in the Results view.

        Pipeline: Process Documents from Files, WordList to Data, Sort,
        Filter Example Range, result 1.

        NOTE(review): the closing tags of the original posting were lost; they
        are restored here so that the document is well-formed XML.
      -->
      <operator activated="true" class="process" compatibility="9.10.010" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="-1"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="UTF-8"/>
        <process expanded="true">
          <operator activated="true" class="text:process_document_from_file" compatibility="9.4.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="179" y="85">
            <!-- The directory list is empty in this example: add an entry that
                 points to the folder containing your PDF files before running. -->
            <list key="text_directories"/>
            <parameter key="file_pattern" value="*.pdf"/>
            <parameter key="extract_text_only" value="true"/>
            <parameter key="use_file_extension_as_type" value="true"/>
            <parameter key="content_type" value="txt"/>
            <parameter key="encoding" value="UTF-8"/>
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="TF-IDF"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="none"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <!-- Per-document text preparation: lower-case, tokenize on non-letters,
                 keep tokens of 4 to 25 characters, then drop English stopwords. -->
            <process expanded="true">
              <operator activated="true" class="text:transform_cases" compatibility="9.4.000" expanded="true" height="68" name="Transform Cases" width="90" x="45" y="34">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <operator activated="true" class="text:tokenize" compatibility="9.4.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:filter_by_length" compatibility="9.4.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="313" y="34">
                <parameter key="min_chars" value="4"/>
                <parameter key="max_chars" value="25"/>
              </operator>
              <operator activated="true" class="text:filter_stopwords_english" compatibility="9.4.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="447" y="34"/>
              <connect from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="text:wordlist_to_data" compatibility="9.4.000" expanded="true" height="82" name="WordList to Data" width="90" x="380" y="85"/>
          <operator activated="true" class="blending:sort" compatibility="9.10.010" expanded="true" height="82" name="Sort" width="90" x="514" y="85">
            <!-- NOTE(review): the attribute name "in class (PDF_Files)" presumably
                 comes from a class named PDF_Files in text_directories; adjust it
                 to match the class name you configure. -->
            <list key="sort_by">
              <parameter key="in class (PDF_Files)" value="descending"/>
            </list>
          </operator>
          <operator activated="true" class="filter_example_range" compatibility="9.10.010" expanded="true" height="82" name="Filter Example Range" width="90" x="648" y="85">
            <parameter key="first_example" value="1"/>
            <parameter key="last_example" value="100"/>
            <parameter key="invert_filter" value="false"/>
          </operator>
          <connect from_op="Process Documents from Files" from_port="word list" to_op="WordList to Data" to_port="word list"/>
          <connect from_op="WordList to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
          <connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
          <connect from_op="Filter Example Range" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

Sign In or Register to comment.