[processor/tailsampling] Add first policy match decision to tailsampling processor (#39655)

stollero · web-flow · commit 31fcfbd20af9 · 2025-05-22T15:12:19.000-07:00
#### Description Adds an opt-in `sample_on_first_match` option: when enabled, the policies are evaluated in order and evaluation stops at the first policy that decides to sample the trace. Drop policies are always evaluated first.  #### Link to tracking issue #36795  #### Testing Unit tests added  #### Documentation ✅ new config option added to README
diff --git a/.chloggen/first-policy-match.yaml b/.chloggen/first-policy-match.yaml
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: enhancement
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: processor/tailsampling
+
+# A brief description of the change.  Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Add first policy match decision to tailsampling processor
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [36795]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [user]
diff --git a/processor/tailsamplingprocessor/README.md b/processor/tailsamplingprocessor/README.md
@@ -60,6 +60,7 @@ The following configuration options can also be modified:
   - `non_sampled_cache_size` (default = 0) Configures amount of trace IDs to be kept in an LRU cache,
     persisting the "drop" decisions for traces that may have already been released from memory.
     By default, the size is 0 and the cache is inactive.
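+- `sample_on_first_match` (default = false): Make the sampling decision as soon as a policy decides to sample the trace, skipping the remaining policies. Drop policies are always evaluated first.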
 
 
 Each policy will result in a decision, and the processor will evaluate them to make a final decision:
diff --git a/processor/tailsamplingprocessor/config.go b/processor/tailsamplingprocessor/config.go
@@ -264,4 +264,6 @@ type Config struct {
 	DecisionCache DecisionCacheConfig `mapstructure:"decision_cache"`
 	// Options allows for additional configuration of the tail-based sampling processor in code.
 	Options []Option `mapstructure:"-"`
+	// Make decision as soon as a policy matches
+	SampleOnFirstMatch bool `mapstructure:"sample_on_first_match"`
 }
diff --git a/processor/tailsamplingprocessor/factory.go b/processor/tailsamplingprocessor/factory.go
@@ -27,8 +27,9 @@ func NewFactory() processor.Factory {
 
 func createDefaultConfig() component.Config {
 	return &Config{
-		DecisionWait: 30 * time.Second,
-		NumTraces:    50000,
+		DecisionWait:       30 * time.Second,
+		NumTraces:          50000,
+		SampleOnFirstMatch: false,
 	}
 }
 
diff --git a/processor/tailsamplingprocessor/processor.go b/processor/tailsamplingprocessor/processor.go
@@ -9,6 +9,7 @@ import (
 	"fmt"
 	"math"
 	"runtime"
+	"slices"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -50,20 +51,21 @@ type tailSamplingSpanProcessor struct {
 	telemetry *metadata.TelemetryBuilder
 	logger    *zap.Logger
 
-	nextConsumer      consumer.Traces
-	maxNumTraces      uint64
-	policies          []*policy
-	idToTrace         sync.Map
-	policyTicker      timeutils.TTicker
-	tickerFrequency   time.Duration
-	decisionBatcher   idbatcher.Batcher
-	sampledIDCache    cache.Cache[bool]
-	nonSampledIDCache cache.Cache[bool]
-	deleteChan        chan pcommon.TraceID
-	numTracesOnMap    *atomic.Uint64
-	recordPolicy      bool
-	setPolicyMux      sync.Mutex
-	pendingPolicy     []PolicyCfg
+	nextConsumer       consumer.Traces
+	maxNumTraces       uint64
+	policies           []*policy
+	idToTrace          sync.Map
+	policyTicker       timeutils.TTicker
+	tickerFrequency    time.Duration
+	decisionBatcher    idbatcher.Batcher
+	sampledIDCache     cache.Cache[bool]
+	nonSampledIDCache  cache.Cache[bool]
+	deleteChan         chan pcommon.TraceID
+	numTracesOnMap     *atomic.Uint64
+	recordPolicy       bool
+	setPolicyMux       sync.Mutex
+	pendingPolicy      []PolicyCfg
+	sampleOnFirstMatch bool
 }
 
 // spanAndScope a structure for holding information about span and its instrumentation scope.
@@ -113,16 +115,17 @@ func newTracesProcessor(ctx context.Context, set processor.Settings, nextConsume
 	}
 
 	tsp := &tailSamplingSpanProcessor{
-		ctx:               ctx,
-		set:               set,
-		telemetry:         telemetry,
-		nextConsumer:      nextConsumer,
-		maxNumTraces:      cfg.NumTraces,
-		sampledIDCache:    sampledDecisions,
-		nonSampledIDCache: nonSampledDecisions,
-		logger:            telemetrySettings.Logger,
-		numTracesOnMap:    &atomic.Uint64{},
-		deleteChan:        make(chan pcommon.TraceID, cfg.NumTraces),
+		ctx:                ctx,
+		set:                set,
+		telemetry:          telemetry,
+		nextConsumer:       nextConsumer,
+		maxNumTraces:       cfg.NumTraces,
+		sampledIDCache:     sampledDecisions,
+		nonSampledIDCache:  nonSampledDecisions,
+		logger:             telemetrySettings.Logger,
+		numTracesOnMap:     &atomic.Uint64{},
+		deleteChan:         make(chan pcommon.TraceID, cfg.NumTraces),
+		sampleOnFirstMatch: cfg.SampleOnFirstMatch,
 	}
 	tsp.policyTicker = &timeutils.PolicyTicker{OnTickFunc: tsp.samplingPolicyOnTick}
 
@@ -263,6 +266,7 @@ func (tsp *tailSamplingSpanProcessor) loadSamplingPolicy(cfgs []PolicyCfg) error
 
 	cLen := len(cfgs)
 	policies := make([]*policy, 0, cLen)
+	dropPolicies := make([]*policy, 0, cLen)
 	policyNames := make(map[string]struct{}, cLen)
 
 	for _, cfg := range cfgs {
@@ -285,14 +289,20 @@ func (tsp *tailSamplingSpanProcessor) loadSamplingPolicy(cfgs []PolicyCfg) error
 			uniquePolicyName = fmt.Sprintf("%s.%s", componentID, cfg.Name)
 		}
 
-		policies = append(policies, &policy{
+		p := &policy{
 			name:      cfg.Name,
 			evaluator: eval,
 			attribute: metric.WithAttributes(attribute.String("policy", uniquePolicyName)),
-		})
-	}
+		}
 
-	tsp.policies = policies
+		if cfg.Type == Drop {
+			dropPolicies = append(dropPolicies, p)
+		} else {
+			policies = append(policies, p)
+		}
+	}
+	// Dropped decision takes precedence over all others, therefore we evaluate them first.
+	tsp.policies = slices.Concat(dropPolicies, policies)
 
 	tsp.logger.Debug("Loaded sampling policy", zap.Int("policies.len", len(policies)))
 
@@ -430,6 +440,10 @@ func (tsp *tailSamplingSpanProcessor) makeDecision(id pcommon.TraceID, trace *sa
 		if decision == sampling.Dropped {
 			break
 		}
+		// If sampleOnFirstMatch is enabled, make decision as soon as a policy matches
+		if tsp.sampleOnFirstMatch && decision == sampling.Sampled {
+			break
+		}
 	}
 
 	var sampledPolicy *policy
diff --git a/processor/tailsamplingprocessor/processor_decisions_test.go b/processor/tailsamplingprocessor/processor_decisions_test.go
@@ -636,3 +636,61 @@ func TestLateSpanUsesNonSampledDecisionCache(t *testing.T) {
 	require.Equal(t, 1, mpe.EvaluationCount)
 	require.Equal(t, 0, nextConsumer.SpanCount(), "original final decision not honored")
 }
+
+func TestSampleOnFirstMatch(t *testing.T) {
+	nextConsumer := new(consumertest.TracesSink)
+	idb := newSyncIDBatcher()
+
+	mpe1 := &mockPolicyEvaluator{}
+	mpe2 := &mockPolicyEvaluator{}
+	mpe3 := &mockPolicyEvaluator{}
+
+	policies := []*policy{
+		{name: "mock-policy-1", evaluator: mpe1, attribute: metric.WithAttributes(attribute.String("policy", "mock-policy-1"))},
+		{name: "mock-policy-2", evaluator: mpe2, attribute: metric.WithAttributes(attribute.String("policy", "mock-policy-2"))},
+		{name: "mock-policy-3", evaluator: mpe2, attribute: metric.WithAttributes(attribute.String("policy", "mock-policy-3"))},
+	}
+
+	cfg := Config{
+		DecisionWait:       defaultTestDecisionWait,
+		NumTraces:          defaultNumTraces,
+		SampleOnFirstMatch: true,
+		Options: []Option{
+			withDecisionBatcher(idb),
+			withPolicies(policies),
+		},
+	}
+	p, err := newTracesProcessor(context.Background(), processortest.NewNopSettings(metadata.Type), nextConsumer, cfg)
+	require.NoError(t, err)
+
+	require.NoError(t, p.Start(context.Background(), componenttest.NewNopHost()))
+	defer func() {
+		require.NoError(t, p.Shutdown(context.Background()))
+	}()
+
+	// Second policy matches, last policy should not be evaluated
+	mpe1.NextDecision = sampling.NotSampled
+	mpe2.NextDecision = sampling.Sampled
+
+	// Generate and deliver first span
+	require.NoError(t, p.ConsumeTraces(context.Background(), simpleTraces()))
+
+	tsp := p.(*tailSamplingSpanProcessor)
+
+	// The first tick won't do anything
+	tsp.policyTicker.OnTick()
+	require.Equal(t, 0, mpe1.EvaluationCount)
+	require.Equal(t, 0, mpe2.EvaluationCount)
+	require.Equal(t, 0, mpe3.EvaluationCount)
+
+	// This will cause policy evaluations on the first span
+	tsp.policyTicker.OnTick()
+
+	// Only the first policy should have been evaluated
+	require.Equal(t, 1, mpe1.EvaluationCount)
+	require.Equal(t, 1, mpe2.EvaluationCount)
+	require.Equal(t, 0, mpe3.EvaluationCount)
+
+	// The final decision SHOULD be Sampled.
+	require.Equal(t, 1, nextConsumer.SpanCount())
+}
diff --git a/processor/tailsamplingprocessor/processor_test.go b/processor/tailsamplingprocessor/processor_test.go
@@ -669,6 +669,39 @@ func TestDecisionPolicyMetrics(t *testing.T) {
 	assert.EqualValues(t, 0, metrics.evaluateErrorCount)
 }
 
+// TestDropPolicyIsFirstInPolicyList checks that a drop policy sits at the head of the
+// processor's policy list even when it is configured after a non-drop policy.
+func TestDropPolicyIsFirstInPolicyList(t *testing.T) {
+	// The drop policy is deliberately listed last in the configuration.
+	policyCfgs := []PolicyCfg{
+		{sharedPolicyCfg: sharedPolicyCfg{Name: "regular-policy", Type: AlwaysSample}},
+		{sharedPolicyCfg: sharedPolicyCfg{Name: "drop-policy", Type: Drop}},
+	}
+
+	batcher := newSyncIDBatcher()
+	sink := new(consumertest.TracesSink)
+	settings := processortest.NewNopSettings(metadata.Type)
+
+	proc, err := newTracesProcessor(
+		context.Background(),
+		settings,
+		sink,
+		Config{
+			DecisionWait: defaultTestDecisionWait,
+			NumTraces:    defaultNumTraces,
+			PolicyCfgs:   policyCfgs,
+			Options:      []Option{withDecisionBatcher(batcher)},
+		},
+	)
+	require.NoError(t, err)
+
+	// Inspect the processor's policy list directly: the position of the drop policy
+	// within it is what is under test.
+	loaded := proc.(*tailSamplingSpanProcessor).policies
+	require.GreaterOrEqual(t, len(loaded), 2)
+	assert.Equal(t, "drop-policy", loaded[0].name)
+}
+
 func collectSpanIDs(trace ptrace.Traces) []pcommon.SpanID {
 	var spanIDs []pcommon.SpanID
 

Original file line number	Diff line number	Diff line change
`@@ -264,4 +264,6 @@ type Config struct {`
`264`	`264`	DecisionCache DecisionCacheConfig `mapstructure:"decision_cache"`
`265`	`265`	`// Options allows for additional configuration of the tail-based sampling processor in code.`
`266`	`266`	Options []Option `mapstructure:"-"`
	`267`	`+ // Make decision as soon as a policy matches`
	`268`	+ SampleOnFirstMatch bool `mapstructure:"sample_on_first_match"`
`267`	`269`	`}`
Original file line number	Diff line number	Diff line change
`@@ -27,8 +27,9 @@ func NewFactory() processor.Factory {`
`27`	`27`
`28`	`28`	`func createDefaultConfig() component.Config {`
`29`	`29`	`return &Config{`
`30`		`- DecisionWait: 30 * time.Second,`
`31`		`- NumTraces: 50000,`
	`30`	`+ DecisionWait: 30 * time.Second,`
	`31`	`+ NumTraces: 50000,`
	`32`	`+ SampleOnFirstMatch: false,`
`32`	`33`	`}`
`33`	`34`	`}`
`34`	`35`