From 612cc3f5f9f36f9b40a2e5dcee5ff007c8b49aa8 Mon Sep 17 00:00:00 2001 From: Federico Di Pierro Date: Wed, 10 Sep 2025 14:29:47 +0200 Subject: [PATCH 1/2] new(tests/e2e): introduce a metricschecker package for e2e tests. Signed-off-by: Federico Di Pierro --- go.mod | 2 +- tests/e2e/metricschecker/doc.go | 8 + tests/e2e/metricschecker/metricschecker.go | 218 +++++++++++++++++++++ 3 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 tests/e2e/metricschecker/doc.go create mode 100644 tests/e2e/metricschecker/metricschecker.go diff --git a/go.mod b/go.mod index d77b8cf6188..1970085287f 100644 --- a/go.mod +++ b/go.mod @@ -29,6 +29,7 @@ require ( github.com/opencontainers/runtime-spec v1.2.1 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 + github.com/prometheus/common v0.66.1 github.com/prometheus/procfs v0.17.0 github.com/sirupsen/logrus v1.9.3 github.com/spf13/cobra v1.10.1 @@ -116,7 +117,6 @@ require ( github.com/pelletier/go-toml/v2 v2.2.3 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/common v0.66.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sagikazarmark/locafero v0.7.0 // indirect github.com/sourcegraph/conc v0.3.0 // indirect diff --git a/tests/e2e/metricschecker/doc.go b/tests/e2e/metricschecker/doc.go new file mode 100644 index 00000000000..7641cc786df --- /dev/null +++ b/tests/e2e/metricschecker/doc.go @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Authors of Tetragon + +//go:build !windows + +// This package provides a Tetragon Metrics Checker that +// uses http GET requests to get metrics from all Tetragon pods. +package metricschecker diff --git a/tests/e2e/metricschecker/metricschecker.go b/tests/e2e/metricschecker/metricschecker.go new file mode 100644 index 00000000000..fd5ff717a7a --- /dev/null +++ b/tests/e2e/metricschecker/metricschecker.go @@ -0,0 +1,218 @@ +//go:build !windows + +package metricschecker + +import ( + "context" + "errors" + "fmt" + "net/http" + "testing" + "time" + + clmo "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" + "github.com/stretchr/testify/assert" + "k8s.io/klog/v2" + "sigs.k8s.io/e2e-framework/pkg/envconf" + "sigs.k8s.io/e2e-framework/pkg/features" + + "github.com/cilium/tetragon/tests/e2e/state" +) + +// MetricsChecker checks prometheus metrics from one or more events streams. +type MetricsChecker struct { + name string + timeLimit time.Duration +} + +// NewMetricsChecker constructs a new Metrics from a MultiEventChecker. +func NewMetricsChecker(name string) *MetricsChecker { + rc := &MetricsChecker{ + name: name, + timeLimit: 30 * time.Second, + } + return rc +} + +// WithTimeLimit sets the time limit for a MetricsChecker http GET calls. +func (rc *MetricsChecker) WithTimeLimit(limit time.Duration) *MetricsChecker { + rc.timeLimit = limit + return rc +} + +type metricOp int + +const ( + opEqual metricOp = 1 + opGreater = 1 << iota + opLess +) + +type metricValue struct { + tp clmo.MetricType + val float64 +} +type metricCheck struct { + value metricValue + op metricOp +} + +func (rc *MetricsChecker) Equal(metric string, val int) features.Func { + return rc.checkWithOp(metric, metricCheck{ + value: metricValue{ + tp: clmo.MetricType_COUNTER, + val: float64(val), + }, + op: opEqual, + }) +} + +func (rc *MetricsChecker) Less(metric string, val int) features.Func { + return rc.checkWithOp(metric, metricCheck{ + value: metricValue{ + tp: clmo.MetricType_COUNTER, + val: float64(val), + }, + op: opLess, + }) +} + +func (rc *MetricsChecker) LessThanOrEqual(metric string, val int) features.Func { + return rc.checkWithOp(metric, metricCheck{ + value: metricValue{ + tp: clmo.MetricType_COUNTER, + val: float64(val), + }, + op: opEqual | opLess, + }) +} + +func (rc *MetricsChecker) Greater(metric string, val int) features.Func { + return rc.checkWithOp(metric, metricCheck{ + value: metricValue{ + tp: clmo.MetricType_COUNTER, + val: float64(val), + }, + op: opGreater, + }) +} + +func (rc *MetricsChecker) GreaterOrEqual(metric string, val int) features.Func { + return rc.checkWithOp(metric, metricCheck{ + value: metricValue{ + tp: clmo.MetricType_COUNTER, + val: float64(val), + }, + op: opEqual | opGreater, + }) +} + +// getAndParseMetrics fetches metrics from a URL and parses them into MetricFamily objects. +func getAndParseMetrics(ctx context.Context, metricsURL string) (map[string]*clmo.MetricFamily, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, metricsURL, nil) + if err != nil { + return nil, err + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to fetch metrics from %s: %w", metricsURL, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("received non-OK status code %d from %s", resp.StatusCode, metricsURL) + } + + parser := expfmt.NewTextParser(model.UTF8Validation) + mf, err := parser.TextToMetricFamilies(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to parse metrics body: %w", err) + } + return mf, nil +} + +// Collect metrics from all forwarded metrics ports for all pods +func (rc *MetricsChecker) checkWithOp(metric string, check metricCheck) features.Func { + return func(ctx context.Context, t *testing.T, _ *envconf.Config) context.Context { + klog.InfoS("Gathering metrics clients", "metricschecker", rc.name) + ports, ok := ctx.Value(state.PromForwardedPorts).(map[string]int) + if !ok { + assert.Fail(t, "failed to find forwarded prometheus ports") + return ctx + } + + timedCtx, cancel := context.WithTimeout(ctx, rc.timeLimit) + defer cancel() + + results := make(map[string]*clmo.MetricFamily) + for podName, port := range ports { + metricFamilies, err := getAndParseMetrics(timedCtx, fmt.Sprintf("http://localhost:%d/metrics", port)) + if err != nil { + assert.Fail(t, "Failed to fetch metrics families from metrics endpoint", "err", err, "podName", podName) + return ctx + } + + if mf, ok := metricFamilies[metric]; ok { + results[podName] = mf + } else { + klog.InfoS("No metric found on pod", "metricschecker", rc.name, "metric", metric, "podName", podName) + } + } + + // No metrics endpoint exported the requested metric. This is an error. + if len(results) == 0 { + assert.Fail(t, "Failed to fetch requested metrics from any pod", "metricName", metric) + return ctx + } + + if err := rc.check(results, metric, check); !assert.NoError(t, err, "checks should pass") { + return ctx + } + return ctx + } +} + +// Real check helper that implements the check logic for each metric type. +// For now, only COUNTER types are supported. +func (rc *MetricsChecker) check(results map[string]*clmo.MetricFamily, metric string, check metricCheck) error { + klog.InfoS("Running metrics checks", "metricschecker", rc.name) + + for podName, result := range results { + if result.GetType() != check.value.tp { + return errors.New("result type and check type mismatch") + } + switch result.GetType() { + case clmo.MetricType_COUNTER: + var sum float64 + // Accumulate values from all labels + for _, mf := range result.GetMetric() { + sum += mf.GetCounter().GetValue() + } + + success := false + if check.op&opEqual != 0 { + success = sum == check.value.val + } + if check.op&opGreater != 0 { + success = success || (sum > check.value.val) + } + if check.op&opLess != 0 { + success = success || (sum < check.value.val) + } + if !success { + return fmt.Errorf("failed metricscheck for metric '%s' on pod '%s'", metric, podName) + } + default: + // TODO implement check logic for non-counter types + return errors.New("metrics checker unsupported for non-counter types") + } + } + return nil +} + +// Name returns the name of the checker +func (rc *MetricsChecker) Name() string { + return rc.name +} From 162d3994e3abac371b0d7601aaf69d4d21a18d35 Mon Sep 17 00:00:00 2001 From: Federico Di Pierro Date: Wed, 10 Sep 2025 14:30:11 +0200 Subject: [PATCH 2/2] new(tests/e2e): use metricschecker in e2e tests. Signed-off-by: Federico Di Pierro --- tests/e2e/tests/labels/labels_test.go | 7 +++++++ tests/e2e/tests/policyfilter/policyfilter_test.go | 4 ++++ tests/e2e/tests/skeleton/skeleton_test.go | 5 +++++ 3 files changed, 16 insertions(+) diff --git a/tests/e2e/tests/labels/labels_test.go b/tests/e2e/tests/labels/labels_test.go index b4ee248bd7c..648a701747c 100644 --- a/tests/e2e/tests/labels/labels_test.go +++ b/tests/e2e/tests/labels/labels_test.go @@ -16,6 +16,8 @@ import ( "sigs.k8s.io/e2e-framework/pkg/features" "sigs.k8s.io/e2e-framework/third_party/helm" + "github.com/cilium/tetragon/tests/e2e/metricschecker" + ec "github.com/cilium/tetragon/api/v1/tetragon/codegen/eventchecker" sm "github.com/cilium/tetragon/pkg/matchers/stringmatcher" "github.com/cilium/tetragon/tests/e2e/checker" @@ -115,8 +117,13 @@ func TestLabelsDemoApp(t *testing.T) { uninstall := features.New("Uninstall Demo App"). Assess("Uninstall", uninstallDemoApp()).Feature() + metricsChecker := metricschecker.NewMetricsChecker("labelsMetricsChecker") + metrics := features.New("Run Metrics Checks"). + Assess("Run Metrics Checks", metricsChecker.Greater("tetragon_events_total", 0)).Feature() + // Spawn workload and run checker runner.TestInParallel(t, runEventChecker, runWorkload) + runner.Test(t, metrics) runner.Test(t, uninstall) } diff --git a/tests/e2e/tests/policyfilter/policyfilter_test.go b/tests/e2e/tests/policyfilter/policyfilter_test.go index 4d7296252fa..a593bfec3e8 100644 --- a/tests/e2e/tests/policyfilter/policyfilter_test.go +++ b/tests/e2e/tests/policyfilter/policyfilter_test.go @@ -18,6 +18,8 @@ import ( "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" + "github.com/cilium/tetragon/tests/e2e/metricschecker" + "github.com/cilium/tetragon/api/v1/tetragon" ec "github.com/cilium/tetragon/api/v1/tetragon/codegen/eventchecker" "github.com/cilium/tetragon/tests/e2e/checker" @@ -85,6 +87,7 @@ func TestMain(m *testing.M) { func TestNamespacedPolicy(t *testing.T) { checker := nsChecker().WithTimeLimit(30 * time.Second).WithEventLimit(20) + metricsChecker := metricschecker.NewMetricsChecker("policyMetricsChecker") runEventChecker := features.New("Run Event Checks"). Assess("Run Event Checks", checker.CheckWithFilters( @@ -127,6 +130,7 @@ func TestNamespacedPolicy(t *testing.T) { } return ctx }). + Assess("Run Metrics Checks", metricsChecker.Greater("tetragon_policy_events_total", 0)). Assess("Uninstall policy", func(ctx context.Context, _ *testing.T, c *envconf.Config) context.Context { ctx, err := helpers.UnloadCRDString(policyNamespace, namespacedPolicy, false)(ctx, c) if err != nil { diff --git a/tests/e2e/tests/skeleton/skeleton_test.go b/tests/e2e/tests/skeleton/skeleton_test.go index 431a54660cd..8d13379163f 100644 --- a/tests/e2e/tests/skeleton/skeleton_test.go +++ b/tests/e2e/tests/skeleton/skeleton_test.go @@ -18,6 +18,8 @@ import ( "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" + "github.com/cilium/tetragon/tests/e2e/metricschecker" + ec "github.com/cilium/tetragon/api/v1/tetragon/codegen/eventchecker" sm "github.com/cilium/tetragon/pkg/matchers/stringmatcher" "github.com/cilium/tetragon/tests/e2e/checker" @@ -88,6 +90,8 @@ func TestSkeletonBasic(t *testing.T) { // Create an curl event checker with a limit or 10 events or 30 seconds, whichever comes first curlChecker := curlEventChecker(kversion).WithEventLimit(100).WithTimeLimit(30 * time.Second) + metricsChecker := metricschecker.NewMetricsChecker("skeletonMetricsChecker") + // Define test features here. These can be used to perform actions like: // - Spawning an event checker and running checks // - Modifying resources in the cluster @@ -110,6 +114,7 @@ func TestSkeletonBasic(t *testing.T) { } return ctx }). + Assess("Run Metrics Checks", metricsChecker.Greater("tetragon_events_total", 0)). Assess("Uninstall policy", func(ctx context.Context, _ *testing.T, c *envconf.Config) context.Context { ctx, err := helpers.UnloadCRDString(namespace, curlPod, true)(ctx, c) if err != nil {