Skip to content

Commit 5f9d943

Browse files
portertech and mwear authored
[processor/tailsampling] Support hot sampling policy loading (#37014)
#### Description

Adding a feature. This pull request adds support for hot sampling policy loading to the tail sampling processor. This allows the collector (or another service using the processor) to dynamically update the tail sampling policy without needing to restart the processor (or the entire collector). This greatly minimizes the impact of sampling policy modifications on pipeline availability and processing. Changes to the policy are safely applied on the next tick loop. A collector (and/or other service) could use OpAMP to remotely manage sampling policy with little to no negative impact on pipeline availability and performance. This is what the https://tailctrl.io/ agent did.

#### Usage

Currently, callers need to define a custom interface in order to set the sampling policy.

``` go
type SamplingProcessor interface {
	processor.Traces
	SetSamplingPolicy(cfgs []tailsamplingprocessor.PolicyCfg)
}

factory := tailsamplingprocessor.NewFactory()
tsp, _ := factory.CreateTraces()
sp := tsp.(SamplingProcessor)
sp.SetSamplingPolicy(cfgs)
```

#### Testing

Added a test to ensure changes to policy are loaded. Using the changes in a private project.

---------

Signed-off-by: Sean Porter <[email protected]>
Co-authored-by: Matthew Wear <[email protected]>
1 parent 9226667 commit 5f9d943

File tree

3 files changed

+198
-30
lines changed

3 files changed

+198
-30
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: enhancement
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: tailsamplingprocessor
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: Support hot sampling policy loading
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [37014]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext:
19+
20+
# If your change doesn't affect end users or the exported elements of any package,
21+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
22+
# Optional: The change log or logs in which this entry should be included.
23+
# e.g. '[user]' or '[user, api]'
24+
# Include 'user' if the change is relevant to end users.
25+
# Include 'api' if there is a change to a library API.
26+
# Default: '[user]'
27+
change_logs: [user]

processor/tailsamplingprocessor/processor.go

Lines changed: 84 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ type policy struct {
4545
type tailSamplingSpanProcessor struct {
4646
ctx context.Context
4747

48+
set processor.Settings
4849
telemetry *metadata.TelemetryBuilder
4950
logger *zap.Logger
5051

@@ -59,6 +60,9 @@ type tailSamplingSpanProcessor struct {
5960
nonSampledIDCache cache.Cache[bool]
6061
deleteChan chan pcommon.TraceID
6162
numTracesOnMap *atomic.Uint64
63+
64+
setPolicyMux sync.Mutex
65+
pendingPolicy []PolicyCfg
6266
}
6367

6468
// spanAndScope a structure for holding information about span and its instrumentation scope.
@@ -108,6 +112,7 @@ func newTracesProcessor(ctx context.Context, set processor.Settings, nextConsume
108112

109113
tsp := &tailSamplingSpanProcessor{
110114
ctx: ctx,
115+
set: set,
111116
telemetry: telemetry,
112117
nextConsumer: nextConsumer,
113118
maxNumTraces: cfg.NumTraces,
@@ -128,31 +133,9 @@ func newTracesProcessor(ctx context.Context, set processor.Settings, nextConsume
128133
}
129134

130135
if tsp.policies == nil {
131-
policyNames := map[string]bool{}
132-
tsp.policies = make([]*policy, len(cfg.PolicyCfgs))
133-
componentID := set.ID.Name()
134-
for i := range cfg.PolicyCfgs {
135-
policyCfg := &cfg.PolicyCfgs[i]
136-
137-
if policyNames[policyCfg.Name] {
138-
return nil, fmt.Errorf("duplicate policy name %q", policyCfg.Name)
139-
}
140-
policyNames[policyCfg.Name] = true
141-
142-
eval, err := getPolicyEvaluator(telemetrySettings, policyCfg)
143-
if err != nil {
144-
return nil, err
145-
}
146-
uniquePolicyName := policyCfg.Name
147-
if componentID != "" {
148-
uniquePolicyName = fmt.Sprintf("%s.%s", componentID, policyCfg.Name)
149-
}
150-
p := &policy{
151-
name: policyCfg.Name,
152-
evaluator: eval,
153-
attribute: metric.WithAttributes(attribute.String("policy", uniquePolicyName)),
154-
}
155-
tsp.policies[i] = p
136+
err := tsp.loadSamplingPolicy(cfg.PolicyCfgs)
137+
if err != nil {
138+
return nil, err
156139
}
157140
}
158141

@@ -262,7 +245,82 @@ type policyMetrics struct {
262245
idNotFoundOnMapCount, evaluateErrorCount, decisionSampled, decisionNotSampled int64
263246
}
264247

248+
func (tsp *tailSamplingSpanProcessor) loadSamplingPolicy(cfgs []PolicyCfg) error {
249+
telemetrySettings := tsp.set.TelemetrySettings
250+
componentID := tsp.set.ID.Name()
251+
252+
policyNames := map[string]bool{}
253+
tsp.policies = make([]*policy, len(cfgs))
254+
255+
for i := range cfgs {
256+
policyCfg := &cfgs[i]
257+
258+
if policyNames[policyCfg.Name] {
259+
return fmt.Errorf("duplicate policy name %q", policyCfg.Name)
260+
}
261+
policyNames[policyCfg.Name] = true
262+
263+
eval, err := getPolicyEvaluator(telemetrySettings, policyCfg)
264+
if err != nil {
265+
return err
266+
}
267+
uniquePolicyName := policyCfg.Name
268+
if componentID != "" {
269+
uniquePolicyName = fmt.Sprintf("%s.%s", componentID, policyCfg.Name)
270+
}
271+
p := &policy{
272+
name: policyCfg.Name,
273+
evaluator: eval,
274+
attribute: metric.WithAttributes(attribute.String("policy", uniquePolicyName)),
275+
}
276+
tsp.policies[i] = p
277+
}
278+
279+
tsp.logger.Debug("Loaded sampling policy", zap.Int("policies.len", len(tsp.policies)))
280+
281+
return nil
282+
}
283+
284+
func (tsp *tailSamplingSpanProcessor) SetSamplingPolicy(cfgs []PolicyCfg) {
285+
tsp.logger.Debug("Setting pending sampling policy", zap.Int("pending.len", len(cfgs)))
286+
287+
tsp.setPolicyMux.Lock()
288+
defer tsp.setPolicyMux.Unlock()
289+
290+
tsp.pendingPolicy = cfgs
291+
}
292+
293+
func (tsp *tailSamplingSpanProcessor) loadPendingSamplingPolicy() {
294+
tsp.setPolicyMux.Lock()
295+
defer tsp.setPolicyMux.Unlock()
296+
297+
// Nothing pending, do nothing.
298+
pLen := len(tsp.pendingPolicy)
299+
if pLen == 0 {
300+
return
301+
}
302+
303+
tsp.logger.Debug("Loading pending sampling policy", zap.Int("pending.len", pLen))
304+
305+
// In case something goes wrong.
306+
prev := tsp.policies
307+
308+
err := tsp.loadSamplingPolicy(tsp.pendingPolicy)
309+
310+
// Empty pending regardless of error. If policy is invalid, it will fail on
311+
// every tick, no need to do extra work and flood the log with errors.
312+
tsp.pendingPolicy = nil
313+
314+
if err != nil {
315+
tsp.logger.Error("Failed to load pending sampling policy", zap.Error(err))
316+
tsp.logger.Debug("Falling back to previous sampling policy")
317+
tsp.policies = prev
318+
}
319+
}
320+
265321
func (tsp *tailSamplingSpanProcessor) samplingPolicyOnTick() {
322+
tsp.loadPendingSamplingPolicy()
323+
266324
metrics := policyMetrics{}
267325

268326
startTime := time.Now()
@@ -401,11 +459,7 @@ func (tsp *tailSamplingSpanProcessor) processTraces(resourceSpans ptrace.Resourc
401459
}
402460

403461
lenSpans := int64(len(spans))
404-
lenPolicies := len(tsp.policies)
405-
initialDecisions := make([]sampling.Decision, lenPolicies)
406-
for i := 0; i < lenPolicies; i++ {
407-
initialDecisions[i] = sampling.Pending
408-
}
462+
409463
d, loaded := tsp.idToTrace.Load(id)
410464
if !loaded {
411465
spanCount := &atomic.Int64{}

processor/tailsamplingprocessor/processor_test.go

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,93 @@ func TestMultipleBatchesAreCombinedIntoOne(t *testing.T) {
449449
}
450450
}
451451

452+
func TestSetSamplingPolicy(t *testing.T) {
453+
cfg := Config{
454+
DecisionWait: defaultTestDecisionWait,
455+
NumTraces: defaultNumTraces,
456+
PolicyCfgs: []PolicyCfg{
457+
{
458+
sharedPolicyCfg: sharedPolicyCfg{
459+
Name: "always",
460+
Type: AlwaysSample,
461+
},
462+
},
463+
},
464+
}
465+
s := setupTestTelemetry()
466+
ct := s.NewSettings()
467+
idb := newSyncIDBatcher()
468+
msp := new(consumertest.TracesSink)
469+
470+
p, err := newTracesProcessor(context.Background(), ct, msp, cfg, withDecisionBatcher(idb))
471+
require.NoError(t, err)
472+
473+
require.NoError(t, p.Start(context.Background(), componenttest.NewNopHost()))
474+
defer func() {
475+
require.NoError(t, p.Shutdown(context.Background()))
476+
}()
477+
478+
tsp := p.(*tailSamplingSpanProcessor)
479+
480+
assert.Len(t, tsp.policies, 1)
481+
482+
tsp.policyTicker.OnTick()
483+
484+
assert.Len(t, tsp.policies, 1)
485+
486+
cfgs := []PolicyCfg{
487+
{
488+
sharedPolicyCfg: sharedPolicyCfg{
489+
Name: "always",
490+
Type: AlwaysSample,
491+
},
492+
},
493+
{
494+
sharedPolicyCfg: sharedPolicyCfg{
495+
Name: "everything",
496+
Type: AlwaysSample,
497+
},
498+
},
499+
}
500+
tsp.SetSamplingPolicy(cfgs)
501+
502+
assert.Len(t, tsp.policies, 1)
503+
504+
tsp.policyTicker.OnTick()
505+
506+
assert.Len(t, tsp.policies, 2)
507+
508+
// Duplicate policy name.
509+
cfgs = []PolicyCfg{
510+
{
511+
sharedPolicyCfg: sharedPolicyCfg{
512+
Name: "always",
513+
Type: AlwaysSample,
514+
},
515+
},
516+
{
517+
sharedPolicyCfg: sharedPolicyCfg{
518+
Name: "everything",
519+
Type: AlwaysSample,
520+
},
521+
},
522+
{
523+
sharedPolicyCfg: sharedPolicyCfg{
524+
Name: "everything",
525+
Type: AlwaysSample,
526+
},
527+
},
528+
}
529+
tsp.SetSamplingPolicy(cfgs)
530+
531+
assert.Len(t, tsp.policies, 2)
532+
533+
tsp.policyTicker.OnTick()
534+
535+
// Should revert sampling policy.
536+
assert.Len(t, tsp.policies, 2)
537+
}
538+
452539
func TestSubSecondDecisionTime(t *testing.T) {
453540
// prepare
454541
msp := new(consumertest.TracesSink)

0 commit comments

Comments
 (0)