Due to recent updates, all users are required to create an Altair One account to login to the RapidMiner community. Click the Register button to create your account using the same email that you have previously used to login to the RapidMiner community. This will ensure that any previously created content will be synced to your Altair One account. Once you login, you will be asked to provide a username that identifies you to other Community users. Email us at Community with questions.

Include filename in Excel output

MarkwMarkw Member Posts: 1 Learner III
edited December 2018 in Help

Hi,

 

I'm new to RM.

 

I'm analysing a bunch of text files in a folder, getting some great results using n-grams, much excitement.

 

I really want to include the filename in the results that I'm sending out to Excel. I see that the filename is included in the meatada of the ExampleSet but I am converting the WordList to data and outputting that to Excel, how can I get the filenames as the last column in this dataset?

 

I have searched high and low and found several ideas but I think my skill level is still too low to properly use them.

 

I'll post the xml below as I have see other posters do in the past, I'd really appreciate any assistance.

 

<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="text:process_document_from_file" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="112" y="85">
<list key="text_directories">
<parameter key="WFS call text" value="/Users/mark/Downloads/voicetext"/>
</list>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="187"/>
<operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="187"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="313" y="187"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="7.5.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="447" y="187">
<parameter key="max_length" value="7"/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="text:wordlist_to_data" compatibility="7.5.000" expanded="true" height="82" name="WordList to Data" width="90" x="313" y="238"/>
<operator activated="true" class="write_excel" compatibility="8.0.001" expanded="true" height="82" name="Write Excel" width="90" x="514" y="238">
<parameter key="excel_file" value="/Users/mark/Downloads/WFS patterns.xlsx"/>
</operator>
<connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
<connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
<connect from_op="Process Documents from Files" from_port="word list" to_op="WordList to Data" to_port="word list"/>
<connect from_op="WordList to Data" from_port="example set" to_op="Write Excel" to_port="input"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>

 

Tagged:

Answers

  • sgenzersgenzer Administrator, Moderator, Employee, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager

    hello @Markw - welcome to the community. So the filename should be part of the metadata that comes out of Process Documents from Files. I cannot see that column because I don't have your files, but it should be there.

     

    So if that's the case, I would put an "Extract Macro" operator on that wire that will save the filename. Then you can create an attribute with that macro's information. Something like this:

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="true" class="text:process_document_from_file" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="112" y="85">
    <list key="text_directories">
    <parameter key="WFS call text" value="/Users/mark/Downloads/voicetext"/>
    </list>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="187"/>
    <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="187"/>
    <operator activated="true" class="text:filter_stopwords_english" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="313" y="187"/>
    <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.5.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="447" y="187">
    <parameter key="max_length" value="7"/>
    </operator>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
    <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
    <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="extract_macro" compatibility="8.0.001" expanded="true" height="68" name="Extract Macro" width="90" x="380" y="85">
    <parameter key="macro" value="filename"/>
    <parameter key="macro_type" value="data_value"/>
    <parameter key="attribute_name" value="filename"/>
    <parameter key="example_index" value="1"/>
    <list key="additional_macros"/>
    </operator>
    <operator activated="true" class="text:wordlist_to_data" compatibility="7.5.000" expanded="true" height="82" name="WordList to Data" width="90" x="313" y="238"/>
    <operator activated="true" class="generate_attributes" compatibility="8.0.001" expanded="true" height="82" name="Generate Attributes" width="90" x="447" y="238">
    <list key="function_descriptions">
    <parameter key="filename" value="%{filename}"/>
    </list>
    </operator>
    <operator activated="true" class="write_excel" compatibility="8.0.001" expanded="true" height="82" name="Write Excel" width="90" x="581" y="238">
    <parameter key="excel_file" value="/Users/mark/Downloads/WFS patterns.xlsx"/>
    </operator>
    <connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
    <connect from_op="Process Documents from Files" from_port="example set" to_op="Extract Macro" to_port="example set"/>
    <connect from_op="Process Documents from Files" from_port="word list" to_op="WordList to Data" to_port="word list"/>
    <connect from_op="Extract Macro" from_port="example set" to_port="result 1"/>
    <connect from_op="WordList to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
    <connect from_op="Generate Attributes" from_port="example set output" to_op="Write Excel" to_port="input"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="source_input 2" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>

    Scott

     

Sign In or Register to comment.