From d68d01e1310904f03941687607a9710f570ef03f Mon Sep 17 00:00:00 2001 From: odubajDT Date: Tue, 25 Feb 2025 17:14:20 +0100 Subject: [PATCH 1/4] [processor/redaction] Support hashing instead of masking values via parameter Signed-off-by: odubajDT --- .chloggen/redaction-hash.yaml | 27 ++++++ processor/redactionprocessor/README.md | 9 ++ processor/redactionprocessor/config.go | 53 ++++++++++++ processor/redactionprocessor/config_test.go | 37 ++++++++ processor/redactionprocessor/factory_test.go | 1 + processor/redactionprocessor/go.mod | 1 + processor/redactionprocessor/go.sum | 2 + processor/redactionprocessor/processor.go | 37 +++++++- .../redactionprocessor/processor_test.go | 86 +++++++++++++++++++ .../redactionprocessor/testdata/config.yaml | 4 + 10 files changed, 254 insertions(+), 3 deletions(-) create mode 100644 .chloggen/redaction-hash.yaml diff --git a/.chloggen/redaction-hash.yaml b/.chloggen/redaction-hash.yaml new file mode 100644 index 0000000000000..7acebb47c8b65 --- /dev/null +++ b/.chloggen/redaction-hash.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: processor/redaction + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: "Support hashing instead of masking values via 'hash_function' parameter" + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [35830] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. 
+subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [] diff --git a/processor/redactionprocessor/README.md b/processor/redactionprocessor/README.md index 2839fc4500f60..baf064df350b7 100644 --- a/processor/redactionprocessor/README.md +++ b/processor/redactionprocessor/README.md @@ -87,6 +87,10 @@ processors: # blocked span attributes. Values that match are not masked. allowed_values: - ".+@mycompany.com" + # hash_function defines the function for hashing the values instead of + # masking them with a fixed string. By default, no hash function is used + # and masking with a fixed string is performed. + hash_function: md5 # summary controls the verbosity level of the diagnostic attributes that # the processor adds to the spans/logs/datapoints when it redacts or masks other # attributes. In some contexts a list of redacted attributes leaks @@ -119,6 +123,11 @@ part of the value is masked with a fixed length of asterisks. `blocked_key_patterns` applies to the values of the keys matching one of the patterns. The value is then masked according to the configuration. +`hash_function` defines the function for hashing the values (or substrings of values) +instead of masking them with a fixed string. By default, no hash function is used +and masking with a fixed string is performed. The supported hash functions +are `md5`, `sha1` and `sha3`. + For example, if `notes` is on the list of allowed keys, then the `notes` attribute is retained. 
However, if there is a value such as a credit card number in the `notes` field that matched a regular expression on the list of diff --git a/processor/redactionprocessor/config.go b/processor/redactionprocessor/config.go index f6076088fb9a2..ccfa4678643da 100644 --- a/processor/redactionprocessor/config.go +++ b/processor/redactionprocessor/config.go @@ -3,6 +3,26 @@ package redactionprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor" +import ( + "encoding" + "errors" + "fmt" + "strings" +) + +type HashFunction string + +const ( + None HashFunction = "" + SHA1 HashFunction = "sha1" + SHA3 HashFunction = "sha3" + MD5 HashFunction = "md5" +) + +var ( + _ encoding.TextUnmarshaler = (*HashFunction)(nil) +) + type Config struct { // AllowAllKeys is a flag to allow all span attribute keys. Setting this // to true disables the AllowedKeys list. The list of BlockedValues is @@ -18,6 +38,11 @@ type Config struct { // matching the regexes on the list are masked. BlockedKeyPatterns []string `mapstructure:"blocked_key_patterns"` + // HashFunction defines the function for hashing the values instead of + // masking them with a fixed string. By default, no hash function is used + // and masking with a fixed string is performed. + HashFunction HashFunction `mapstructure:"hash_function"` + // IgnoredKeys is a list of span attribute keys that are not redacted. // Span attributes in this list are allowed to pass through the filter // without being changed or removed. @@ -38,3 +63,31 @@ type Config struct { // configuration. Possible values are `debug`, `info`, and `silent`. Summary string `mapstructure:"summary"` } + +func (u HashFunction) String() string { + return string(u) +} + +// UnmarshalText unmarshalls text to a HashFunction. 
+func (u *HashFunction) UnmarshalText(text []byte) error { + if u == nil { + return errors.New("cannot unmarshal to a nil *HashFunction") + } + + str := strings.ToLower(string(text)) + switch str { + case strings.ToLower(SHA1.String()): + *u = SHA1 + return nil + case strings.ToLower(MD5.String()): + *u = MD5 + return nil + case strings.ToLower(SHA3.String()): + *u = SHA3 + return nil + case strings.ToLower(None.String()): + *u = None + return nil + } + return fmt.Errorf("unknown HashFunction %s, allowed functions are %s, %s and %s", str, SHA1, SHA3, MD5) +} diff --git a/processor/redactionprocessor/config_test.go b/processor/redactionprocessor/config_test.go index 2b1de123994a7..057e5c1ad0781 100644 --- a/processor/redactionprocessor/config_test.go +++ b/processor/redactionprocessor/config_test.go @@ -4,6 +4,7 @@ package redactionprocessor import ( + "errors" "path/filepath" "testing" @@ -31,6 +32,7 @@ func TestLoadConfig(t *testing.T) { IgnoredKeys: []string{"safe_attribute"}, BlockedValues: []string{"4[0-9]{12}(?:[0-9]{3})?", "(5[1-5][0-9]{14})"}, BlockedKeyPatterns: []string{".*token.*", ".*api_key.*"}, + HashFunction: MD5, AllowedValues: []string{".+@mycompany.com"}, Summary: debug, }, @@ -58,3 +60,38 @@ func TestLoadConfig(t *testing.T) { }) } } + +func TestValidateConfig(t *testing.T) { + tests := []struct { + name string + hash HashFunction + expected error + }{ + { + name: "valid", + hash: MD5, + }, + { + name: "empty", + hash: None, + }, + { + name: "invalid", + hash: "hash", + expected: errors.New("unknown HashFunction hash, allowed functions are sha1, sha3 and md5"), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var h HashFunction + err := h.UnmarshalText([]byte(tt.hash)) + if tt.expected != nil { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.hash, h) + } + }) + } +} diff --git a/processor/redactionprocessor/factory_test.go b/processor/redactionprocessor/factory_test.go index 
96858eb57a145..fdcace9374e58 100644 --- a/processor/redactionprocessor/factory_test.go +++ b/processor/redactionprocessor/factory_test.go @@ -20,6 +20,7 @@ func TestDefaultConfiguration(t *testing.T) { assert.Empty(t, c.BlockedValues) assert.Empty(t, c.AllowedValues) assert.Empty(t, c.BlockedKeyPatterns) + assert.Empty(t, c.HashFunction) } func TestCreateTestProcessor(t *testing.T) { diff --git a/processor/redactionprocessor/go.mod b/processor/redactionprocessor/go.mod index 45ae464c5e9ce..ee36e1c025c27 100644 --- a/processor/redactionprocessor/go.mod +++ b/processor/redactionprocessor/go.mod @@ -15,6 +15,7 @@ require ( go.opentelemetry.io/collector/processor/processortest v0.120.1-0.20250224010654-18e18b21da7a go.uber.org/goleak v1.3.0 go.uber.org/zap v1.27.0 + golang.org/x/crypto v0.31.0 ) require ( diff --git a/processor/redactionprocessor/go.sum b/processor/redactionprocessor/go.sum index 284f90b37a55b..d74805527b53f 100644 --- a/processor/redactionprocessor/go.sum +++ b/processor/redactionprocessor/go.sum @@ -101,6 +101,8 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= diff --git a/processor/redactionprocessor/processor.go b/processor/redactionprocessor/processor.go index 
2806a5081cbcd..f84a12a749d3f 100644 --- a/processor/redactionprocessor/processor.go +++ b/processor/redactionprocessor/processor.go @@ -3,9 +3,14 @@ package redactionprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor" +//nolint:gosec import ( "context" + "crypto/md5" + "crypto/sha1" + "encoding/hex" "fmt" + "hash" "regexp" "sort" "strings" @@ -15,6 +20,7 @@ import ( "go.opentelemetry.io/collector/pdata/pmetric" "go.opentelemetry.io/collector/pdata/ptrace" "go.uber.org/zap" + "golang.org/x/crypto/sha3" ) const attrValuesSeparator = "," @@ -30,6 +36,8 @@ type redaction struct { allowRegexList map[string]*regexp.Regexp // Attribute keys blocked in a span blockKeyRegexList map[string]*regexp.Regexp + // Hash function to hash blocked values + hashFunction HashFunction // Redaction processor configuration config *Config // Logger @@ -63,6 +71,7 @@ func newRedaction(ctx context.Context, config *Config, logger *zap.Logger) (*red blockRegexList: blockRegexList, allowRegexList: allowRegexList, blockKeyRegexList: blockKeysRegexList, + hashFunction: config.HashFunction, config: config, logger: logger, }, nil @@ -217,7 +226,7 @@ func (s *redaction) processAttrs(_ context.Context, attributes pcommon.Map) { for _, compiledRE := range s.blockKeyRegexList { if match := compiledRE.MatchString(k); match { toBlock = append(toBlock, k) - maskedValue := compiledRE.ReplaceAllString(strVal, "****") + maskedValue := s.maskValue(strVal, regexp.MustCompile(".*")) value.SetStr(maskedValue) return true } @@ -226,13 +235,13 @@ func (s *redaction) processAttrs(_ context.Context, attributes pcommon.Map) { // Mask any blocked values for the other attributes var matched bool for _, compiledRE := range s.blockRegexList { - if match := compiledRE.MatchString(strVal); match { + if compiledRE.MatchString(strVal) { if !matched { matched = true toBlock = append(toBlock, k) } - maskedValue := compiledRE.ReplaceAllString(strVal, "****") + maskedValue 
:= s.maskValue(strVal, compiledRE) value.SetStr(maskedValue) strVal = maskedValue } @@ -251,6 +260,28 @@ func (s *redaction) processAttrs(_ context.Context, attributes pcommon.Map) { s.addMetaAttrs(ignoring, attributes, "", ignoredKeyCount) } +//nolint:gosec +func (s *redaction) maskValue(val string, regex *regexp.Regexp) string { + hashFunc := func(match string) string { + switch s.hashFunction { + case SHA1: + return hashString(match, sha1.New()) + case SHA3: + return hashString(match, sha3.New256()) + case MD5: + return hashString(match, md5.New()) + default: + return "****" + } + } + return regex.ReplaceAllStringFunc(val, hashFunc) +} + +func hashString(input string, hasher hash.Hash) string { + hasher.Write([]byte(input)) + return hex.EncodeToString(hasher.Sum(nil)) +} + // addMetaAttrs adds diagnostic information about redacted or masked attribute keys func (s *redaction) addMetaAttrs(redactedAttrs []string, attributes pcommon.Map, valuesAttr, countAttr string) { redactedCount := int64(len(redactedAttrs)) diff --git a/processor/redactionprocessor/processor_test.go b/processor/redactionprocessor/processor_test.go index edf2282e16d1f..e30db07a5b783 100644 --- a/processor/redactionprocessor/processor_test.go +++ b/processor/redactionprocessor/processor_test.go @@ -271,6 +271,92 @@ func TestRedactSummaryDebug(t *testing.T) { } } +func TestRedactSummaryDebugHashMD5(t *testing.T) { + testConfig := TestConfig{ + config: &Config{ + AllowedKeys: []string{"id", "group", "name", "group.id", "member (id)", "token_some", "api_key_some", "email"}, + BlockedValues: []string{"4[0-9]{12}(?:[0-9]{3})?"}, + HashFunction: MD5, + IgnoredKeys: []string{"safe_attribute"}, + BlockedKeyPatterns: []string{".*token.*", ".*api_key.*"}, + Summary: "debug", + }, + allowed: map[string]pcommon.Value{ + "id": pcommon.NewValueInt(5), + "group.id": pcommon.NewValueStr("some.valid.id"), + "member (id)": pcommon.NewValueStr("some other valid id"), + }, + masked: map[string]pcommon.Value{ + 
"name": pcommon.NewValueStr("placeholder 4111111111111111"), + }, + ignored: map[string]pcommon.Value{ + "safe_attribute": pcommon.NewValueStr("harmless 4111111111111112"), + }, + redacted: map[string]pcommon.Value{ + "credit_card": pcommon.NewValueStr("4111111111111111"), + }, + blockedKeys: map[string]pcommon.Value{ + "token_some": pcommon.NewValueStr("tokenize"), + "api_key_some": pcommon.NewValueStr("apinize"), + }, + allowedValues: map[string]pcommon.Value{ + "email": pcommon.NewValueStr("user@mycompany.com"), + }, + } + + outTraces := runTest(t, testConfig) + outLogs := runLogsTest(t, testConfig) + outMetricsGauge := runMetricsTest(t, testConfig, pmetric.MetricTypeGauge) + outMetricsSum := runMetricsTest(t, testConfig, pmetric.MetricTypeSum) + outMetricsHistogram := runMetricsTest(t, testConfig, pmetric.MetricTypeHistogram) + outMetricsExponentialHistogram := runMetricsTest(t, testConfig, pmetric.MetricTypeExponentialHistogram) + outMetricsSummary := runMetricsTest(t, testConfig, pmetric.MetricTypeSummary) + + attrs := []pcommon.Map{ + outTraces.ResourceSpans().At(0).ScopeSpans().At(0).Spans().At(0).Attributes(), + outLogs.ResourceLogs().At(0).ScopeLogs().At(0).LogRecords().At(0).Attributes(), + outMetricsGauge.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes(), + outMetricsSum.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Sum().DataPoints().At(0).Attributes(), + outMetricsHistogram.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Histogram().DataPoints().At(0).Attributes(), + outMetricsExponentialHistogram.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).ExponentialHistogram().DataPoints().At(0).Attributes(), + outMetricsSummary.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Summary().DataPoints().At(0).Attributes(), + } + + for _, attr := range attrs { + deleted := make([]string, 0, len(testConfig.redacted)) + for k := range testConfig.redacted { + _, ok 
:= attr.Get(k) + assert.False(t, ok) + deleted = append(deleted, k) + } + maskedKeys, ok := attr.Get(redactedKeys) + assert.True(t, ok) + sort.Strings(deleted) + assert.Equal(t, strings.Join(deleted, ","), maskedKeys.Str()) + maskedKeyCount, ok := attr.Get(redactedKeyCount) + assert.True(t, ok) + assert.Equal(t, int64(len(deleted)), maskedKeyCount.Int()) + + ignoredKeyCount, ok := attr.Get(ignoredKeyCount) + assert.True(t, ok) + assert.Equal(t, int64(len(testConfig.ignored)), ignoredKeyCount.Int()) + + blockedKeys := []string{"api_key_some", "name", "token_some"} + maskedValues, ok := attr.Get(maskedValues) + assert.True(t, ok) + assert.Equal(t, strings.Join(blockedKeys, ","), maskedValues.Str()) + maskedValueCount, ok := attr.Get(maskedValueCount) + assert.True(t, ok) + assert.Equal(t, int64(3), maskedValueCount.Int()) + value, _ := attr.Get("name") + assert.Equal(t, "placeholder 5910f4ea0062a0e29afd3dccc741e3ce", value.Str()) + value, _ = attr.Get("api_key_some") + assert.Equal(t, "93a699237950bde9eb9d25c7ead025f3", value.Str()) + value, _ = attr.Get("token_some") + assert.Equal(t, "77e9ef3680c5518785ef0121d3884c3d", value.Str()) + } +} + // TestRedactSummaryInfo validates that the processor writes a verbose summary // of any attributes it deleted to the new redaction.redacted.count span // attribute (but not to redaction.redacted.keys) when set to the info level diff --git a/processor/redactionprocessor/testdata/config.yaml b/processor/redactionprocessor/testdata/config.yaml index e657bede3a276..8d64c213c7e1a 100644 --- a/processor/redactionprocessor/testdata/config.yaml +++ b/processor/redactionprocessor/testdata/config.yaml @@ -30,6 +30,10 @@ redaction: # blocked span attributes. Values that match are not masked. allowed_values: - ".+@mycompany.com" + # hash_function defines the function for hashing the values instead of + # masking them with a fixed string. By default, no hash function is used + # and masking with a fixed string is performed. 
+ hash_function: md5 # Summary controls the verbosity level of the diagnostic attributes that # the processor adds to the spans when it redacts or masks other # attributes. In some contexts a list of redacted attributes leaks From 10cdcfebcf835c8d2a8ab384e915fcb016762d30 Mon Sep 17 00:00:00 2001 From: odubajDT Date: Tue, 25 Feb 2025 17:25:19 +0100 Subject: [PATCH 2/4] polishing Signed-off-by: odubajDT --- processor/redactionprocessor/README.md | 2 +- processor/redactionprocessor/config.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/processor/redactionprocessor/README.md b/processor/redactionprocessor/README.md index baf064df350b7..33f12cb1df4b9 100644 --- a/processor/redactionprocessor/README.md +++ b/processor/redactionprocessor/README.md @@ -126,7 +126,7 @@ The value is then masked according to the configuration. `hash_function` defines the function for hashing the values (or substrings of values) instead of masking them with a fixed string. By default, no hash function is used and masking with a fixed string is performed. The supported hash functions -are `md5`, `sha1` and `sha3`. +are `md5`, `sha1` and `sha3` (SHA3-256). For example, if `notes` is on the list of allowed keys, then the `notes` attribute is retained. However, if there is a value such as a credit card diff --git a/processor/redactionprocessor/config.go b/processor/redactionprocessor/config.go index ccfa4678643da..119a6b793f6b1 100644 --- a/processor/redactionprocessor/config.go +++ b/processor/redactionprocessor/config.go @@ -10,6 +10,10 @@ import ( "strings" ) +var ( + _ encoding.TextUnmarshaler = (*HashFunction)(nil) +) + type HashFunction string const ( @@ -19,10 +23,6 @@ const ( MD5 HashFunction = "md5" ) -var ( - _ encoding.TextUnmarshaler = (*HashFunction)(nil) -) - type Config struct { // AllowAllKeys is a flag to allow all span attribute keys. Setting this // to true disables the AllowedKeys list. 
The list of BlockedValues is From 507171ca14b89433ed041e9ffc4189a90f015ce1 Mon Sep 17 00:00:00 2001 From: odubajDT Date: Tue, 25 Feb 2025 17:30:46 +0100 Subject: [PATCH 3/4] fix lint Signed-off-by: odubajDT --- processor/redactionprocessor/config.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/processor/redactionprocessor/config.go b/processor/redactionprocessor/config.go index 119a6b793f6b1..2c4993eb97e93 100644 --- a/processor/redactionprocessor/config.go +++ b/processor/redactionprocessor/config.go @@ -10,9 +10,7 @@ import ( "strings" ) -var ( - _ encoding.TextUnmarshaler = (*HashFunction)(nil) -) +var _ encoding.TextUnmarshaler = (*HashFunction)(nil) type HashFunction string From 792c593dc387d5b2542ad6d4c17bbeb2443f2ea0 Mon Sep 17 00:00:00 2001 From: odubajDT <93584209+odubajDT@users.noreply.github.com> Date: Mon, 3 Mar 2025 16:51:12 +0100 Subject: [PATCH 4/4] Update processor/redactionprocessor/README.md Co-authored-by: Evan Bradley <11745660+evan-bradley@users.noreply.github.com> --- processor/redactionprocessor/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processor/redactionprocessor/README.md b/processor/redactionprocessor/README.md index 33f12cb1df4b9..7a5cb62104d92 100644 --- a/processor/redactionprocessor/README.md +++ b/processor/redactionprocessor/README.md @@ -123,7 +123,7 @@ part of the value is masked with a fixed length of asterisks. `blocked_key_patterns` applies to the values of the keys matching one of the patterns. The value is then masked according to the configuration. -`hash_function` defines the function for hashing the values (or substrings of values) +`hash_function` defines the function for hashing values of matched keys or matches in values instead of masking them with a fixed string. By default, no hash function is used and masking with a fixed string is performed. The supported hash functions are `md5`, `sha1` and `sha3` (SHA3-256).