Skip to content

Commit b137907

Browse files
committed
enhancement(regex_parser transform): Add RegexSet support to regex (fixes #2469)
This allows multiple regular expressions to be specified; they are matched against the input using regex::RegexSet. Signed-off-by: Matthias Endler <[email protected]>
1 parent 9bd8264 commit b137907

File tree

12 files changed

+143
-75
lines changed

12 files changed

+143
-75
lines changed

.meta/transforms/regex_parser.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,17 @@ If `target_field` is set and the log contains a field of the same name \
3838
as the target, it will only be overwritten if this is set to `true`.\
3939
"""
4040

41-
[transforms.regex_parser.options.regex]
41+
[transforms.regex_parser.options.regexes]
4242
type = "string"
4343
common = true
4444
examples = [
4545
"""\
46-
^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$\
46+
['^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$']\
4747
"""
4848
]
4949
required = true
5050
description = """\
51-
The Regular Expression to apply. Do not include the leading or trailing `/`.\
51+
The Regular Expressions to apply. Do not include the leading or trailing `/` in any of the expressions.\
5252
"""
5353

5454
[transforms.regex_parser.options.target_field]
@@ -85,7 +85,7 @@ And the following configuration:
8585
[transforms.<transform-id>]
8686
type = "regex_parser"
8787
field = "message"
88-
regex = '^(?P<host>[\w\.]+) - (?P<user>[\w]+) (?P<bytes_in>[\d]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$'
88+
regexes = ['^(?P<host>[\w\.]+) - (?P<user>[\w]+) (?P<bytes_in>[\d]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$']
8989
9090
[transforms.<transform-id>.types]
9191
bytes_in = "int"

benches/bench.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ fn benchmark_transforms(c: &mut Criterion) {
346346
"parser",
347347
&["in"],
348348
transforms::regex_parser::RegexParserConfig {
349-
regex: r"status=(?P<status>\d+)".to_string(),
349+
regexes: r#"['status=(?P<status>\d+)']"#.to_string(),
350350
field: None,
351351
..Default::default()
352352
},
@@ -410,7 +410,7 @@ fn benchmark_regex(c: &mut Criterion) {
410410
let rt = vector::runtime::Runtime::single_threaded().unwrap();
411411
let parser =transforms::regex_parser::RegexParserConfig {
412412
// Many captures to stress the regex parser
413-
regex: r#"^(?P<addr>\d+\.\d+\.\d+\.\d+) (?P<user>\S+) (?P<auth>\S+) \[(?P<date>\d+/[A-Za-z]+/\d+:\d+:\d+:\d+ [+-]\d{4})\] "(?P<method>[A-Z]+) (?P<uri>[^"]+) HTTP/\d\.\d" (?P<code>\d+) (?P<size>\d+) "(?P<referrer>[^"]+)" "(?P<browser>[^"]+)""#.into(),
413+
regex: r#"['^(?P<addr>\d+\.\d+\.\d+\.\d+) (?P<user>\S+) (?P<auth>\S+) \[(?P<date>\d+/[A-Za-z]+/\d+:\d+:\d+:\d+ [+-]\d{4})\] "(?P<method>[A-Z]+) (?P<uri>[^"]+) HTTP/\d\.\d" (?P<code>\d+) (?P<size>\d+) "(?P<referrer>[^"]+)" "(?P<browser>[^"]+)"']"#.into(),
414414
field: None,
415415
drop_failed: true,
416416
..Default::default()
@@ -465,7 +465,7 @@ fn benchmark_complex(c: &mut Criterion) {
465465
"parser",
466466
&["in1", "in2"],
467467
transforms::regex_parser::RegexParserConfig {
468-
regex: r"status=(?P<status>\d+)".to_string(),
468+
regex: r#"['status=(?P<status>\d+)']"#.to_string(),
469469
field: None,
470470
..Default::default()
471471
},

config/examples/file_to_cloudwatch_metrics.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ start_at_beginning = true
1414
[transforms.regex_parser]
1515
inputs = ["file"]
1616
type = "regex_parser"
17-
regex = '^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$'
17+
regexes = ['^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$']
1818

1919
# Transform into metrics
2020
[transforms.log_to_metric]

config/examples/file_to_prometheus.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ start_at_beginning = true
1414
[transforms.regex_parser]
1515
inputs = ["file"]
1616
type = "regex_parser"
17-
regex = '^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$'
17+
regexes = ['^(?P<host>[\w\.]+) - (?P<user>[\w-]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$']
1818

1919
# Transform into metrics
2020
[transforms.log_to_metric]

config/vector.spec.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2182,11 +2182,11 @@ require('custom_module')
21822182
overwrite_target = true
21832183
overwrite_target = false
21842184

2185-
# The Regular Expression to apply. Do not include the leading or trailing `/`.
2185+
# The Regular Expressions to apply. Do not include the leading or trailing `/`.
21862186
#
21872187
# * required
21882188
# * type: string
2189-
regex = "^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$"
2189+
regex = "['^(?P<timestamp>[\\w\\-:\\+]+) (?P<level>\\w+) (?P<message>.*)$']"
21902190

21912191
# If this setting is present, the parsed fields will be inserted into the log
21922192
# as a sub-object with this name. If a field with the same name already exists,

scripts/test-unit.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,8 @@
88

99
set -euo pipefail
1010

11-
cargo test --all --no-default-features --target ${TARGET}
11+
if [ -z "${TARGET:-}" ]; then
12+
cargo test --all --no-default-features
13+
else
14+
cargo test --all --no-default-features --target "${TARGET}"
15+
fi

src/sources/kubernetes/message_parser.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,11 @@ impl Transform for DockerMessageTransformer {
8989
fn transform_cri_message() -> crate::Result<Box<dyn Transform>> {
9090
let mut rp_config = RegexParserConfig::default();
9191
// message field
92-
rp_config.regex =
92+
rp_config.regexes = vec![
9393
r"^(?P<timestamp>.*) (?P<stream>(stdout|stderr)) (?P<multiline_tag>(P|F)) (?P<message>.*)$"
94-
.to_owned();
94+
.to_owned(),
95+
];
96+
9597
// drop field
9698
rp_config.types.insert(
9799
event::log_schema().timestamp_key().clone(),

src/sources/kubernetes/mod.rs

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ lazy_static! {
3939

4040
#[derive(Debug, Snafu)]
4141
enum BuildError {
42-
#[snafu(display("To large UID: {:?}", uid))]
42+
#[snafu(display("Too large UID: {:?}", uid))]
4343
UidToLarge { uid: String },
4444
#[snafu(display("UID contains illegal characters: {:?}", uid))]
4545
IllegalCharacterInUid { uid: String },
@@ -65,7 +65,7 @@ impl SourceConfig for KubernetesConfig {
6565
// Kubernetes source uses 'file source' and various transforms to implement
6666
// gathering of logs over Kubernetes CRI supported container runtimes.
6767

68-
// Side goal is to make kubernetes source behave as simillarly to docker source
68+
// Side goal is to make kubernetes source behave as similarly to Docker source
6969
// as possible to set a default behavior for all container related sources.
7070
// This will help with interchangeability.
7171

@@ -127,7 +127,7 @@ impl TimeFilter {
127127
if let Some(Value::Timestamp(ts)) = event.as_log().get(&event::log_schema().timestamp_key())
128128
{
129129
if ts < &self.start {
130-
trace!(message = "Recieved older log.", from = %ts.to_rfc3339());
130+
trace!(message = "Received older log.", from = %ts.to_rfc3339());
131131
return None;
132132
}
133133
}
@@ -152,9 +152,11 @@ fn transform_file() -> crate::Result<Box<dyn Transform>> {
152152

153153
config.field = Some("file".into());
154154

155-
config.regex = r"^".to_owned()
156-
+ LOG_DIRECTORY
157-
+ r"(?P<pod_uid>[^/]*)/(?P<container_name>[^/]*)/[0-9]*[.]log$";
155+
config.regexes = vec![
156+
r"^".to_owned()
157+
+ LOG_DIRECTORY
158+
+ r"(?P<pod_uid>[^/]*)/(?P<container_name>[^/]*)/[0-9]*[.]log$",
159+
];
158160

159161
// this field is implementation dependent so remove it
160162
config.drop_field = true;
@@ -172,7 +174,7 @@ fn transform_file() -> crate::Result<Box<dyn Transform>> {
172174

173175
/// Contains several regexes that can parse common forms of pod_uid.
174176
/// On the first message, regexes are tried out one after the other until
175-
/// first succesfull one has been found. After that that regex will be
177+
/// first successful one has been found. After that that regex will be
176178
/// always used.
177179
///
178180
/// If nothing succeeds the message is still passed.
@@ -208,7 +210,7 @@ fn transform_pod_uid() -> crate::Result<ApplicableTransform> {
208210
let mut config = RegexParserConfig::default();
209211

210212
config.field = Some("pod_uid".into());
211-
config.regex = regex;
213+
config.regexes = vec![regex];
212214
// Remove pod_uid as it isn't usable anywhere else.
213215
config.drop_field = true;
214216
config.drop_failed = true;

0 commit comments

Comments
 (0)