Analyse logfile and create attributes out of tokens

MultiMulti Member Posts: 4 Contributor I
edited November 2018 in Help
Hi,

I tried to analyse a logfile (using Read Document), extract some information from it, and group that information.
The logfile looks like this:
[22:18:48.421] log.channel.name1: INFO: class#method: message bla bla bla
[22:19:48.421] log.channel.name2: ERROR: class#method: message2 bla bla bla
I tokenized the timestamp, the log channel name, the log level (INFO, WARN, ERROR), the class/method the message comes from, the message itself, and the whole line.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process: reads a log file as a single document, duplicates it with Multiply, and sends one copy to Extract Information and the other copy to Process Documents (a chain of Tokenize operators). -->
<process version="5.2.008">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
   <parameter key="logverbosity" value="all"/>
   <parameter key="encoding" value="UTF-8"/>
   <process expanded="true" height="756" width="748">
     <!-- Loads the log file from a fixed local path; the file extension is not used to determine the document type. -->
     <operator activated="true" class="text:read_document" compatibility="5.2.004" expanded="true" height="60" name="Read Document" width="90" x="45" y="75">
       <parameter key="file" value="C:\Temp\bla.log"/>
       <parameter key="use_file_extension_as_type" value="false"/>
     </operator>
     <!-- Copies the document so that both branches below receive the same input. -->
     <operator activated="true" class="multiply" compatibility="5.2.008" expanded="true" height="94" name="Multiply" width="90" x="45" y="255"/>
     <!-- Branch 1: two regular-expression queries named LogLevel and Class#Method, with attribute_type set to Numerical.
          NOTE(review): the LogLevel expression consists only of a lookbehind, so its overall match is zero-width; confirm that this yields the intended value.
          NOTE(review): only the spellings INFO, WARNING and ERROR are listed; a log that writes WARN would not match. Confirm against the real log format.
          NOTE(review): attribute_type is Numerical although both queries target text; confirm. -->
     <operator activated="true" class="text:extract_information" compatibility="5.2.004" expanded="true" height="60" name="Extract Information" width="90" x="246" y="210">
       <parameter key="query_type" value="Regular Expression"/>
       <list key="string_machting_queries"/>
       <parameter key="attribute_type" value="Numerical"/>
       <list key="regular_expression_queries">
         <parameter key="LogLevel" value="(?&lt;=\s(\bINFO\b|\bWARNING\b|\bERROR\b):\s)"/>
         <parameter key="Class#Method" value="(\w{1,40}#\w{1,50}:\s)"/>
       </list>
       <list key="regular_region_queries"/>
       <list key="xpath_queries"/>
       <list key="namespaces"/>
       <list key="index_queries"/>
     </operator>
     <!-- Branch 2: vector creation is set to Binary Term Occurrences. The inner process wires five Tokenize operators in this order (see the connect elements, which differ from the order of declaration): Tokenize, Tokenize| timestamp, Tokenize| LogLevel, Tokenize|Channel, Tokenize|ClassMethod. -->
     <operator activated="true" class="text:process_documents" compatibility="5.2.004" expanded="true" height="94" name="Process Documents" width="90" x="246" y="300">
       <parameter key="vector_creation" value="Binary Term Occurrences"/>
       <process expanded="true" height="774" width="731">
         <!-- Step 1: regular-expression mode with a newline expression (presumably splitting the document into lines; verify). -->
         <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize" width="90" x="112" y="75">
           <parameter key="mode" value="regular expression"/>
           <parameter key="characters" value="\n"/>
           <parameter key="expression" value="\n"/>
         </operator>
         <!-- Step 2: matches the whitespace character that directly follows a bracketed run of 1 to 20 digits, colons or dots. -->
         <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize| timestamp" width="90" x="318" y="212">
           <parameter key="mode" value="regular expression"/>
           <parameter key="expression" value="(?&lt;=\[[\d:\.]{1,20}\])\s"/>
         </operator>
         <!-- Step 3: zero-width match positioned right after INFO, WARNING or ERROR followed by a colon and whitespace; same expression as the LogLevel query in Extract Information. -->
         <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize| LogLevel" width="90" x="313" y="120">
           <parameter key="mode" value="regular expression"/>
           <parameter key="expression" value="(?&lt;=\s(\bINFO\b|\bWARNING\b|\bERROR\b):\s)"/>
         </operator>
         <!-- Step 4: matches the whitespace after a three-part dotted name ending in a colon (first part at most 6 word characters, the other two at most 30 each). -->
         <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize|Channel" width="90" x="313" y="30">
           <parameter key="mode" value="regular expression"/>
           <parameter key="expression" value="(?&lt;=\w{1,6}\.\w{1,30}\.\w{1,30}:)\s"/>
         </operator>
         <!-- Step 5: zero-width match positioned right after a class#method: prefix followed by whitespace. -->
         <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize|ClassMethod" width="90" x="447" y="30">
           <parameter key="mode" value="regular expression"/>
           <parameter key="expression" value="(?&lt;=\w{1,40}#\w{1,50}:\s)"/>
         </operator>
         <connect from_port="document" to_op="Tokenize" to_port="document"/>
         <connect from_op="Tokenize" from_port="document" to_op="Tokenize| timestamp" to_port="document"/>
         <connect from_op="Tokenize| timestamp" from_port="document" to_op="Tokenize| LogLevel" to_port="document"/>
         <connect from_op="Tokenize| LogLevel" from_port="document" to_op="Tokenize|Channel" to_port="document"/>
         <connect from_op="Tokenize|Channel" from_port="document" to_op="Tokenize|ClassMethod" to_port="document"/>
         <connect from_op="Tokenize|ClassMethod" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <!-- Wiring: result 1 is the document from Extract Information, result 2 is the example set from Process Documents, result 3 is its word list. -->
     <connect from_op="Read Document" from_port="output" to_op="Multiply" to_port="input"/>
     <connect from_op="Multiply" from_port="output 1" to_op="Extract Information" to_port="document"/>
     <connect from_op="Multiply" from_port="output 2" to_op="Process Documents" to_port="documents 1"/>
     <connect from_op="Extract Information" from_port="document" to_port="result 1"/>
     <connect from_op="Process Documents" from_port="example set" to_port="result 2"/>
     <connect from_op="Process Documents" from_port="word list" to_port="result 3"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="198"/>
     <portSpacing port="sink_result 2" spacing="54"/>
     <portSpacing port="sink_result 3" spacing="0"/>
     <portSpacing port="sink_result 4" spacing="18"/>
   </process>
 </operator>
</process>
But how can I group these tokens? I want to create an attribute called timestamp that contains all timestamps (matched by the regex), e.g.
attribute           Values
timestamp [18:18:48.421],[19:20:48.421],[22:38:43.421],[22:44:44.421]
line [22:18:48.421] log.channel.name1: INFO: class#method: message bla bla bla,[22:19:48.421] log.channel.name2: ERROR: class#method: message2 bla bla bla
Is RapidMiner the right tool for this?
Maybe "Text:Extract Information" could solve the problem (I can specify an attribute and the matching regex), but I don't know how to proceed.
It was already tricky to find out that you have to wrap the regular expression in parentheses ( ); otherwise I got a "process failed: no group 1" exception.

Another option was to use "Web:Read Server Log" and describe the file format in a config file. The problem is that the logfile also includes stack traces and other information (but that's another matter altogether).

So far...

Answers

  • SkirzynskiSkirzynski Member Posts: 164 Maven
    Hey,

    Do you want an ExampleSet with the timestamp attribute and a single example in which all timestamps are concatenated, or do you want one example for every line in your log file? Since I think the second case makes more sense, here is a solution for it.

    Actually, it is quite easy to do this, and you do not even need the tokenizing operators of the text plugin. All you need is a single operator called "Generate Extract", which creates additional attributes from one attribute. So first of all you have to read in the log file, for instance with the "Read CSV" operator. This operator has to be configured so that the resulting ExampleSet has just one attribute, which contains a single line of your log. Now you can specify the additional attributes in the "Generate Extract" operator with the appropriate regular expressions. You can add more attributes in the same way, for instance the log level and so on.

    That is all. Here is a small example of this process design.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- RapidMiner process: reads the log file with Read CSV into an example set with one column named "line", then uses Generate Extract to derive a "timestamp" attribute from that column. -->
    <process version="5.3.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.000" expanded="true" name="Process">
        <process expanded="true" height="538" width="692">
          <!-- Reads the file without treating the first row as attribute names; column 0 is declared as an attribute named "line".
               NOTE(review): the column type is declared as binominal; confirm that this is the intended type for free-text log lines.
               NOTE(review): no column separator is set here, so the operator default applies; presumably it must not occur inside the log lines for each line to stay in one column. Verify. -->
          <operator activated="true" class="read_csv" compatibility="5.3.000" expanded="true" height="60" name="Read CSV" width="90" x="45" y="30">
            <parameter key="csv_file" value="/home/marcin/temp/console.log"/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations"/>
            <parameter key="encoding" value="UTF-8"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="line.true.binominal.attribute"/>
            </list>
          </operator>
          <!-- Applies one regular-expression query to the "line" attribute. The parentheses enclose the bracketed timestamp only; the trailing whitespace is matched outside the group. Further attributes (e.g. the log level) can be added as additional entries in regular_expression_queries. -->
          <operator activated="true" class="text:generate_extract" compatibility="5.2.005" expanded="true" height="60" name="Generate Extract" width="90" x="179" y="30">
            <parameter key="source_attribute" value="line"/>
            <parameter key="query_type" value="Regular Expression"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries">
              <parameter key="timestamp" value="(\[[\d:\.]{1,20}\])\s"/>
            </list>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <!-- Wiring: Read CSV output feeds Generate Extract, whose example set is delivered as result 1. -->
          <connect from_op="Read CSV" from_port="output" to_op="Generate Extract" to_port="Example Set"/>
          <connect from_op="Generate Extract" from_port="Example Set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    Best regards
      Marcin
  • MultiMulti Member Posts: 4 Contributor I
    Hi Marcin,

    I actually used "Cut Document" to read in each line. Inside Cut Document I used "Extract Information" to get my attributes. Now I have an IOObjectCollection, and I have to figure out how to visualize it. =)

    About your example:
    I use "Read Document", so I need to convert the document to an ExampleSet in order to use "Generate Extract". Thanks for the example. I will try to pursue both strategies!
  • MultiMulti Member Posts: 4 Contributor I
    For now, i got my attributes, they are shown in the results.

    I tried to add attributes with Generate Extract, but there are no source attributes in the dropdown menu. I created 2 attributes in "Process Documents", but the example set output port says 0 attributes.
    Even if I type "Line" into the source attribute field, an error occurs telling me that "the example set does not contain an attribute with the given name".
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- RapidMiner process: reads a log file as a document, cuts it into segments inside Process Documents, then tries to extract a Timestamp attribute from the resulting example set, convert it to a time value, add an ID and visualize the examples. -->
    <process version="5.2.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
        <parameter key="logverbosity" value="status"/>
        <process expanded="true" height="898" width="1016">
          <!-- Loads the log file from a fixed local path; extract_text_only and use_file_extension_as_type are both switched off. -->
          <operator activated="true" class="text:read_document" compatibility="5.2.004" expanded="true" height="60" name="Read Document" width="90" x="45" y="75">
            <parameter key="file" value="C:\Bla\bla.log"/>
            <parameter key="extract_text_only" value="false"/>
            <parameter key="use_file_extension_as_type" value="false"/>
          </operator>
          <!-- Only "output 1" of this Multiply is connected below, so it currently acts as a pass-through. -->
          <operator activated="true" class="multiply" compatibility="5.2.008" expanded="true" height="76" name="Multiply" width="90" x="112" y="255"/>
          <!-- Word-vector creation is disabled; the inner process only runs Cut Document (3) and passes its documents on. -->
          <operator activated="true" class="text:process_documents" compatibility="5.2.004" expanded="true" height="94" name="Process Documents -&gt;Collection (3)" width="90" x="246" y="165">
            <parameter key="create_word_vector" value="false"/>
            <process expanded="true" height="756" width="808">
              <!-- Two regular-region queries, named Line and Exception. The inner process forwards each segment unchanged.
                   NOTE(review): the query values contain doubled backslashes, unlike the single-backslash expression used in Generate Extract (2); confirm which escaping this operator expects. -->
              <operator activated="true" class="text:cut_document" compatibility="5.2.004" expanded="true" height="60" name="Cut Document (3)" width="90" x="179" y="30">
                <parameter key="query_type" value="Regular Region"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries">
                  <parameter key="Line" value="\\[[\\d:\\\.]{1,20}\\]\\s.[\\r\\n]"/>
                  <parameter key="Exception" value="Exception\\s.[\\r\\n]"/>
                </list>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
                <list key="index_queries"/>
                <process expanded="true" height="774" width="826">
                  <connect from_port="segment" to_port="document 1"/>
                  <portSpacing port="source_segment" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="document" to_op="Cut Document (3)" to_port="document"/>
              <connect from_op="Cut Document (3)" from_port="documents" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Extracts a Timestamp attribute from the source attribute "Line" using one regular-expression query; here the trailing whitespace is inside the capturing parentheses.
               NOTE(review): "Line" is the name of a region query in Cut Document (3); whether the example set coming out of Process Documents really contains an attribute with that name should be confirmed, e.g. with a breakpoint after Process Documents. -->
          <operator activated="true" class="text:generate_extract" compatibility="5.2.004" expanded="true" height="60" name="Generate Extract (2)" width="90" x="380" y="255">
            <parameter key="source_attribute" value="Line"/>
            <parameter key="query_type" value="Regular Expression"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries">
              <parameter key="Timestamp" value="(\[[\d:\.]{1,20}\]\s)"/>
            </list>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <!-- Converts the Timestamp attribute to a time value using the German locale.
               NOTE(review): the date_format has a space after the opening bracket, whereas the Timestamp expression above captures whitespace after the closing bracket; confirm that the two agree.
               NOTE(review): presumably Java SimpleDateFormat letters apply, where lowercase s means seconds and uppercase S means milliseconds; confirm whether the fractional part should be SSS. -->
          <operator activated="true" class="nominal_to_date" compatibility="5.2.008" expanded="true" height="76" name="Nominal to Date (2)" width="90" x="514" y="255">
            <parameter key="attribute_name" value="Timestamp"/>
            <parameter key="date_type" value="time"/>
            <parameter key="date_format" value="[ HH:mm:ss.sss]"/>
            <parameter key="locale" value="German"/>
          </operator>
          <operator activated="true" class="generate_id" compatibility="5.2.008" expanded="true" height="76" name="Generate ID" width="90" x="715" y="255"/>
          <operator activated="true" class="examplevisualizer" compatibility="5.2.008" expanded="true" height="76" name="ExampleVisualizer" width="90" x="849" y="255"/>
          <!-- Wiring: a single linear chain from Read Document to ExampleVisualizer, whose example set output is delivered as result 1. -->
          <connect from_op="Read Document" from_port="output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Process Documents -&gt;Collection (3)" to_port="documents 1"/>
          <connect from_op="Process Documents -&gt;Collection (3)" from_port="example set" to_op="Generate Extract (2)" to_port="Example Set"/>
          <connect from_op="Generate Extract (2)" from_port="Example Set" to_op="Nominal to Date (2)" to_port="example set input"/>
          <connect from_op="Nominal to Date (2)" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_op="ExampleVisualizer" to_port="example set input"/>
          <connect from_op="ExampleVisualizer" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="432"/>
        </process>
      </operator>
    </process>
  • MariusHelfMariusHelf RapidMiner Certified Expert, Member Posts: 1,869 Unicorn
    Hi,

    please set a breakpoint after the Process Documents operator, and you will see that the data is not quite in the format that you obviously expected.
    But if you have a look at Marcin's process you will see that he did not use Process Documents at all.

    Best, Marius
Sign In or Register to comment.