Skip to content

Commit 88f1be0

Browse files
yvrhdn authored and chengchuanpeng committed
[processor/tailsampling] Fixed sampling decision metrics (open-telemetry#37212)
#### Description Fixes some of the metrics emitted from sampling decisions. I believe `otelcol_processor_tail_sampling_sampling_trace_dropped_too_early` and `otelcol_processor_tail_sampling_sampling_policy_evaluation_error_total` are sometimes overcounted. The bug: `samplingPolicyOnTick` creates a struct `policyMetrics` to hold on to some counters. This struct is shared for all the traces that are evaluated during that tick: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/22c647a3ae134697d90b67c45879227ea54d63be/processor/tailsamplingprocessor/processor.go#L324 Each loop, the values of the counters are added to the metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/22c647a3ae134697d90b67c45879227ea54d63be/processor/tailsamplingprocessor/processor.go#L340-L344 But the counters are not reset in between loops, so if the first evaluated trace could not be found this would set `idNotFoundOnMapCount` to `1`. Every loop after this will add `1` to the `otelcol_processor_tail_sampling_sampling_trace_dropped_too_early` metric, even though the trace was found. I've moved the metrics outside of the for loop so the counters are only added once. #### Testing I have added a dedicated test for each metric, processing multiple traces in one tick. ~~I've added a test for `otelcol_processor_tail_sampling_sampling_trace_dropped_too_early`. I can add one for `sampling_policy_evaluation_error` too, just not sure how to deliberately fail a policy.~~
1 parent 0d98576 commit 88f1be0

File tree

3 files changed

+167
-1
lines changed

3 files changed

+167
-1
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: bug_fix
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: tailsamplingprocessor
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: Fixed sampling decision metrics `otelcol_processor_tail_sampling_sampling_trace_dropped_too_early` and `otelcol_processor_tail_sampling_sampling_policy_evaluation_error_total`, these were sometimes overcounted.
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [37212]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext: |
19+
As a result of this change non-zero values of `otelcol_processor_tail_sampling_sampling_trace_dropped_too_early`
20+
and `otelcol_processor_tail_sampling_sampling_policy_evaluation_error_total` metrics will be lower.
21+
Before this fix, errors got counted several times depending on the number of traces being processed
22+
that tick and where in the batch the error happened.
23+
Zero values are unaffected.
24+
25+
# If your change doesn't affect end users or the exported elements of any package,
26+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
27+
# Optional: The change log or logs in which this entry should be included.
28+
# e.g. '[user]' or '[user, api]'
29+
# Include 'user' if the change is relevant to end users.
30+
# Include 'api' if there is a change to a library API.
31+
# Default: '[user]'
32+
change_logs: [user]

processor/tailsamplingprocessor/processor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,6 @@ func (tsp *tailSamplingSpanProcessor) samplingPolicyOnTick() {
344344
decision := tsp.makeDecision(id, trace, &metrics)
345345

346346
tsp.telemetry.ProcessorTailSamplingSamplingDecisionTimerLatency.Record(tsp.ctx, int64(time.Since(startTime)/time.Microsecond))
347-
tsp.telemetry.ProcessorTailSamplingSamplingTracesOnMemory.Record(tsp.ctx, int64(tsp.numTracesOnMap.Load()))
348347
tsp.telemetry.ProcessorTailSamplingGlobalCountTracesSampled.Add(tsp.ctx, 1, decisionToAttribute[decision])
349348

350349
// Sampled or not, remove the batches
@@ -362,6 +361,7 @@ func (tsp *tailSamplingSpanProcessor) samplingPolicyOnTick() {
362361
}
363362
}
364363

364+
tsp.telemetry.ProcessorTailSamplingSamplingTracesOnMemory.Record(tsp.ctx, int64(tsp.numTracesOnMap.Load()))
365365
tsp.telemetry.ProcessorTailSamplingSamplingTraceDroppedTooEarly.Add(tsp.ctx, metrics.idNotFoundOnMapCount)
366366
tsp.telemetry.ProcessorTailSamplingSamplingPolicyEvaluationError.Add(tsp.ctx, metrics.evaluateErrorCount)
367367

processor/tailsamplingprocessor/processor_telemetry_test.go

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ import (
2020
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
2121
"go.opentelemetry.io/otel/sdk/metric/metricdata"
2222
"go.opentelemetry.io/otel/sdk/metric/metricdata/metricdatatest"
23+
24+
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl"
2325
)
2426

2527
func TestMetricsAfterOneEvaluation(t *testing.T) {
@@ -522,6 +524,138 @@ func TestProcessorTailSamplingSamplingLateSpanAge(t *testing.T) {
522524
metricdatatest.AssertEqual(t, m, got, metricdatatest.IgnoreTimestamp())
523525
}
524526

527+
func TestProcessorTailSamplingSamplingTraceDroppedTooEarly(t *testing.T) {
528+
// prepare
529+
s := setupTestTelemetry()
530+
b := newSyncIDBatcher()
531+
syncBatcher := b.(*syncIDBatcher)
532+
533+
cfg := Config{
534+
DecisionWait: 1,
535+
NumTraces: 2,
536+
PolicyCfgs: []PolicyCfg{
537+
{
538+
sharedPolicyCfg: sharedPolicyCfg{
539+
Name: "always",
540+
Type: AlwaysSample,
541+
},
542+
},
543+
},
544+
}
545+
cs := &consumertest.TracesSink{}
546+
ct := s.newSettings()
547+
proc, err := newTracesProcessor(context.Background(), ct, cs, cfg, withDecisionBatcher(syncBatcher))
548+
require.NoError(t, err)
549+
defer func() {
550+
err = proc.Shutdown(context.Background())
551+
require.NoError(t, err)
552+
}()
553+
554+
err = proc.Start(context.Background(), componenttest.NewNopHost())
555+
require.NoError(t, err)
556+
557+
// test
558+
_, batches := generateIDsAndBatches(3)
559+
for _, batch := range batches {
560+
err = proc.ConsumeTraces(context.Background(), batch)
561+
require.NoError(t, err)
562+
}
563+
564+
tsp := proc.(*tailSamplingSpanProcessor)
565+
tsp.policyTicker.OnTick() // the first tick always gets an empty batch
566+
tsp.policyTicker.OnTick()
567+
568+
// verify
569+
var md metricdata.ResourceMetrics
570+
require.NoError(t, s.reader.Collect(context.Background(), &md))
571+
572+
m := metricdata.Metrics{
573+
Name: "otelcol_processor_tail_sampling_sampling_trace_dropped_too_early",
574+
Description: "Count of traces that needed to be dropped before the configured wait time",
575+
Unit: "{traces}",
576+
Data: metricdata.Sum[int64]{
577+
IsMonotonic: true,
578+
Temporality: metricdata.CumulativeTemporality,
579+
DataPoints: []metricdata.DataPoint[int64]{
580+
{
581+
Value: 1,
582+
},
583+
},
584+
},
585+
}
586+
587+
got := s.getMetric(m.Name, md)
588+
metricdatatest.AssertEqual(t, m, got, metricdatatest.IgnoreTimestamp())
589+
}
590+
591+
func TestProcessorTailSamplingSamplingPolicyEvaluationError(t *testing.T) {
592+
// prepare
593+
s := setupTestTelemetry()
594+
b := newSyncIDBatcher()
595+
syncBatcher := b.(*syncIDBatcher)
596+
597+
cfg := Config{
598+
DecisionWait: 1,
599+
NumTraces: 100,
600+
PolicyCfgs: []PolicyCfg{
601+
{
602+
sharedPolicyCfg: sharedPolicyCfg{
603+
Name: "ottl",
604+
Type: OTTLCondition,
605+
OTTLConditionCfg: OTTLConditionCfg{
606+
ErrorMode: ottl.PropagateError,
607+
SpanConditions: []string{"attributes[1] == \"test\""},
608+
},
609+
},
610+
},
611+
},
612+
}
613+
cs := &consumertest.TracesSink{}
614+
ct := s.newSettings()
615+
proc, err := newTracesProcessor(context.Background(), ct, cs, cfg, withDecisionBatcher(syncBatcher))
616+
require.NoError(t, err)
617+
defer func() {
618+
err = proc.Shutdown(context.Background())
619+
require.NoError(t, err)
620+
}()
621+
622+
err = proc.Start(context.Background(), componenttest.NewNopHost())
623+
require.NoError(t, err)
624+
625+
// test
626+
_, batches := generateIDsAndBatches(2)
627+
for _, batch := range batches {
628+
err = proc.ConsumeTraces(context.Background(), batch)
629+
require.NoError(t, err)
630+
}
631+
632+
tsp := proc.(*tailSamplingSpanProcessor)
633+
tsp.policyTicker.OnTick() // the first tick always gets an empty batch
634+
tsp.policyTicker.OnTick()
635+
636+
// verify
637+
var md metricdata.ResourceMetrics
638+
require.NoError(t, s.reader.Collect(context.Background(), &md))
639+
640+
m := metricdata.Metrics{
641+
Name: "otelcol_processor_tail_sampling_sampling_policy_evaluation_error",
642+
Description: "Count of sampling policy evaluation errors",
643+
Unit: "{errors}",
644+
Data: metricdata.Sum[int64]{
645+
IsMonotonic: true,
646+
Temporality: metricdata.CumulativeTemporality,
647+
DataPoints: []metricdata.DataPoint[int64]{
648+
{
649+
Value: 2,
650+
},
651+
},
652+
},
653+
}
654+
655+
got := s.getMetric(m.Name, md)
656+
metricdatatest.AssertEqual(t, m, got, metricdatatest.IgnoreTimestamp())
657+
}
658+
525659
type testTelemetry struct {
526660
reader *sdkmetric.ManualReader
527661
meterProvider *sdkmetric.MeterProvider

0 commit comments

Comments
 (0)