Grok update.

- Add docs for feature-specific libgrok version requirements
- Add 'match' config; takes a hash.
- Allow anything that looks like a field name as a config attribute.
- Deprecate 'pattern' config. The following two settings are now equivalent:
    pattern => "mypattern"
    match => [ "@message", "mypattern" ]
- Add some new tests to verify new libgrok inline-patterns and such.
This commit is contained in:
Jordan Sissel 2011-07-01 00:02:10 -07:00
parent f37c8b8a9e
commit 919122dcbc
2 changed files with 134 additions and 61 deletions

View file

@ -21,16 +21,35 @@ require "set"
# * libpcre >= 7.6
# * libevent >= 1.3 (though older versions may work)
#
# Feature requirements:
#
# * Int/float coercion requires >= 1.20110223.*
# * In-line pattern definitions >= 1.20110630.*
#
#
# Note:
# CentOS 5 ships with an ancient version of pcre that does not work with grok.
class LogStash::Filters::Grok < LogStash::Filters::Base
config_name "grok"
# Specify a pattern to parse with.
# Specify a pattern to parse with. This will match the '@message' field.
# Multiple patterns are fine; matching stops at the first one that matches.
config :pattern, :validate => :array, :required => true
config :pattern, :validate => :array, :deprecated => true
# Specify a path to a directory with grok pattern files in it
# A hash of matches, mapping field name => pattern(s) to match against that field
config :match, :validate => :hash, :default => {}
# Any existing field name can be used as a config name here for matching
# against.
#
# # this config:
# foo => "some pattern"
#
# # same as:
# match => [ "foo", "some pattern" ]
config /[A-Za-z0-9_-]+/, :validate => :string
#
# logstash ships by default with a bunch of patterns, so you don't
# necessarily need to define this yourself unless you are adding additional
@ -43,7 +62,7 @@ class LogStash::Filters::Grok < LogStash::Filters::Base
# For example:
#
# NUMBER \d+
config :patterns_dir, :validate => :array
config :patterns_dir, :validate => :array, :default => []
# Drop if matched. Note, this feature may not stay. It is preferable to combine
# grok + grep filters to do parsing + dropping.
@ -67,16 +86,12 @@ class LogStash::Filters::Grok < LogStash::Filters::Base
@@patterns_path += val.split(":")
end
@@grokpiles = Hash.new { |h, k| h[k] = [] }
@@grokpiles_lock = Mutex.new
public
def register
gem "jls-grok", ">=0.4.3"
require "grok" # rubygem 'jls-grok'
@pile = Grok::Pile.new
@patterns_dir ||= []
@patternfiles = []
@patterns_dir += @@patterns_path.to_a
@logger.info("Grok patterns path: #{@patterns_dir.join(":")}")
@patterns_dir.each do |path|
@ -92,77 +107,93 @@ class LogStash::Filters::Grok < LogStash::Filters::Base
end
Dir.glob(path).each do |file|
@logger.info("Grok loading patterns from #{file}")
add_patterns_from_file(file)
#@logger.info("Grok loading patterns from #{file}")
@patternfiles << file
end
end
@pattern.each do |pattern|
groks = @pile.compile(pattern)
@logger.debug(["Compiled pattern", pattern, groks[-1].expanded_pattern])
end
@patterns = Hash.new { |h,k| h[k] = [] }
@@grokpiles_lock.synchronize do
@@grokpiles[@type] << @pile
end
@match["@message"] = []
@match["@message"] += @pattern if @pattern # the config 'pattern' value (array)
# TODO(sissel): Hash.merge actually overrides, not merges arrays.
# Work around it by implementing our own?
# TODO(sissel): Check if 'match' is empty?
@match.merge(@config).each do |field, patterns|
# Skip known config names
next if ["add_tag", "add_field", "type", "match", "patterns_dir",
"drop_if_match", "named_captures_only", "pattern" ].include?(field)
if !@patterns.include?(field)
@patterns[field] = Grok::Pile.new
add_patterns_from_files(@patternfiles, @patterns[field])
end
patterns.each do |pattern|
@logger.debug(["regexp: #{@type}/#{field}", pattern])
@patterns[field].compile(pattern)
end
end # @config.each
end # def register
public
def filter(event)
# parse it with grok
match = false
matched = false
# Only filter events we are configured for
if event.type != @type
return
end
if @@grokpiles[event.type].length == 0
@logger.debug("Skipping grok for event type=#{event.type} (no grokpiles defined)")
if @type != event.type
@logger.debug("Skipping grok for event type=#{event.type} (wanted '#{@type}')")
return
end
if !event.message.is_a?(Array)
messages = [event.message]
else
messages = event.message
end
messages.each do |message|
@logger.debug(["Running grok filter", event])
@@grokpiles[event.type].each do |pile|
@logger.debug(["Trying pattern for type #{event.type}", pile])
grok, match = @pile.match(message)
@logger.debug(["Result", { :grok => grok, :match => match }])
break if match
@logger.debug(["Running grok filter", event])
@patterns.each do |field, pile|
if !event[field]
@logger.debug(["Skipping match object, field not present", field,
event, event[field]])
next
end
if match
@logger.debug(["Trying pattern for type #{event.type}", pile])
(event[field].is_a?(Array) ? event[field] : [event[field]]).each do |fieldvalue|
grok, match = pile.match(fieldvalue)
next unless match
matched = true
match.each_capture do |key, value|
match_type = nil
type_coerce = nil
if key.include?(":")
name, key, match_type = key.split(":")
name, key, type_coerce = key.split(":")
end
# http://code.google.com/p/logstash/issues/detail?id=45
# Permit typing of captures by giving an additional colon and a type,
# like: %{FOO:name:int} for int coercion.
case match_type
if type_coerce
@logger.info("Match type coerce: #{type_coerce}")
@logger.info("Patt: #{grok.pattern}")
end
case type_coerce
when "int"
value = value.to_i
when "float"
value = value.to_f
end
if event.message == value
# Skip patterns that match the entire line
if fieldvalue == value and field == "@message"
# Skip patterns that match the entire message
@logger.debug("Skipping capture '#{key}' since it matches the whole line.")
next
end
if @named_captures_only && key.upcase == key
@logger.debug("Skipping capture '#{key}' since it is not a named capture and named_captures_only is true.")
if @named_captures_only && key =~ /^[A-Z]+/
@logger.debug("Skipping capture '#{key}' since it is not a named " \
"capture and named_captures_only is true.")
next
end
@ -177,38 +208,43 @@ class LogStash::Filters::Grok < LogStash::Filters::Base
if !value.nil? && (!value.empty? rescue true)
event.fields[key] << value
end
end
filter_matched(event)
else
# Tag this event if we can't parse it. We can use this later to
# reparse+reindex logs if we improve the patterns given.
event.tags << "_grokparsefailure"
end
end # message.each
end # match.each_capture
filter_matched(event)
end # event[field]
end # patterns.each
if !matched
# Tag this event if we can't parse it. We can use this later to
# reparse+reindex logs if we improve the patterns given.
event.tags << "_grokparsefailure"
end
#if !event.cancelled?
#filter_matched(event)
#end
@logger.debug(["Event now: ", event.to_hash])
end # def filter
private
def add_patterns_from_file(file)
# Loads every pattern file listed in +paths+ into +pile+ by handing each
# path, in order, to add_patterns_from_file.
def add_patterns_from_files(paths, pile)
  paths.each do |pattern_path|
    add_patterns_from_file(pattern_path, pile)
  end
end
private
def add_patterns_from_file(path, pile)
# Check if the file path is a jar, if so, we'll have to read it ourselves
# since libgrok won't know what to do with it.
if file =~ /file:\/.*\.jar!.*/
File.new(file).each do |line|
if path =~ /file:\/.*\.jar!.*/
File.new(path).each do |line|
next if line =~ /^(?:\s*#|\s*$)/
# In some cases I have seen 'file.each' yield lines with newlines at
# the end. I don't know if this is a bug or intentional, but we need
# to chomp it.
name, pattern = line.chomp.split(/\s+/, 2)
@logger.debug "Adding pattern '#{name}' from file #{file}"
@logger.debug "Adding pattern '#{name}' from file #{path}"
@logger.debug name => pattern
@pile.add_pattern(name, pattern)
pile.add_pattern(name, pattern)
end
else
@pile.add_patterns_from_file(file)
pile.add_patterns_from_file(path)
end
end # def add_patterns
end # class LogStash::Filters::Grok

View file

@ -24,7 +24,10 @@ class TestFilterGrok < Test::Unit::TestCase
end
@filter = LogStash::Filters::Grok.new(cfg)
p :config => cfg, :id => @filter.object_id
p :fizzle => @filter.pattern
@filter.register
#p :newfilter => @filter
end
def test_grok_normal
@ -110,8 +113,12 @@ class TestFilterGrok < Test::Unit::TestCase
event.message = "#{expect}"
@filter.filter(event)
assert_equal(expect.class, event.fields["foo"].first.class, "Expected field 'foo' to be of type #{expect.class.name} but got #{event.fields["foo"].first.class.name}")
assert_equal([expect], event.fields["foo"], "Expected field 'foo' to be [#{expect.inspect}], is #{event.fields["expect"].inspect}")
assert_equal(expect.class, event.fields["foo"].first.class,
"Expected field 'foo' to be of type #{expect.class.name} " \
"but got #{event.fields["foo"].first.class.name}")
assert_equal([expect], event.fields["foo"],
"Expected field 'foo' to be [#{expect.inspect}], is " \
"#{event.fields["expect"].inspect}")
end # def test_grok_type_hinting_int
def test_grok_type_hinting_float
@ -128,4 +135,34 @@ class TestFilterGrok < Test::Unit::TestCase
assert_equal(expect.class, event.fields["foo"].first.class, "Expected field 'foo' to be of type #{expect.class.name} but got #{event.fields["foo"].first.class.name}")
assert_equal([expect], event.fields["foo"], "Expected field 'foo' to be [#{expect.inspect}], is #{event.fields["expect"].inspect}")
end # def test_grok_type_hinting_float
# Checks that an in-line pattern definition (%{FIZZLE=\d+}) supplied through
# the deprecated 'pattern' config stores the matched text in the 'FIZZLE'
# field as a String.
def test_grok_inline_define
  test_name "grok_inline_define"
  config "pattern" => [ "%{FIZZLE=\\d+}" ]

  event = LogStash::Event.new
  event.type = @typename
  expect = "1234"
  event.message = "hello #{expect}"
  @filter.filter(event)

  assert_equal(expect.class, event.fields["FIZZLE"].first.class,
               "Expected field 'FIZZLE' to be of type #{expect.class.name} " \
               "but got #{event.fields["FIZZLE"].first.class.name}")
  # Report the field actually under test; the message previously inspected
  # the never-set "expect" key and so always printed nil on failure.
  assert_equal([expect], event.fields["FIZZLE"],
               "Expected field 'FIZZLE' to be [#{expect.inspect}], is " \
               "#{event.fields["FIZZLE"].inspect}")
end # def test_grok_inline_define
# Checks that a config attribute named after an event field ('rum') is
# matched against that field's value, and that the in-line pattern
# %{FIZZLE=\d+} stores the matched text in the 'FIZZLE' field as a String.
def test_grok_field_name_attribute
  test_name "grok_field_name_attribute"
  config "rum" => [ "%{FIZZLE=\\d+}" ]

  event = LogStash::Event.new
  event.type = @typename
  expect = "1234"
  event.fields["rum"] = "hello #{expect}"
  @filter.filter(event)

  assert_equal(expect.class, event.fields["FIZZLE"].first.class,
               "Expected field 'FIZZLE' to be of type #{expect.class.name} " \
               "but got #{event.fields["FIZZLE"].first.class.name}")
  # Report the field actually under test; the message previously inspected
  # the never-set "expect" key and so always printed nil on failure.
  assert_equal([expect], event.fields["FIZZLE"],
               "Expected field 'FIZZLE' to be [#{expect.inspect}], is " \
               "#{event.fields["FIZZLE"].inspect}")
end # def test_grok_field_name_attribute
end