Skip to content

Commit c12b3cc

Browse files
committed
Persist trace context encoded as w3c trace-context
1 parent 4074b72 commit c12b3cc

File tree

6 files changed

+1010
-81
lines changed

6 files changed

+1010
-81
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: 'enhancement'
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. otlpreceiver)
7+
component: 'exporterhelper'
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: "Add `exporter.PropagateSpanContext` to enable propagating SpanContext along with telemetry requests in the persistent queue"
11+
12+
# One or more tracking issues or pull requests related to the change
13+
issues: [11740, 12212, 12934]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext: |
19+
This change will allow internal telemetry spans to be processed when using persistent queue/storage.
20+
When enabled, requests will use approximately 128 bytes more in persistent storage.
21+
22+
# Optional: The change log or logs in which this entry should be included.
23+
# e.g. '[user]' or '[user, api]'
24+
# Include 'user' if the change is relevant to end users.
25+
# Include 'api' if there is a change to a library API.
26+
# Default: '[user]'
27+
change_logs: [user]

exporter/exporterhelper/internal/queuebatch/persistent_queue.go

Lines changed: 80 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -253,17 +253,22 @@ func (pq *persistentQueue[T]) putInternal(ctx context.Context, req T) error {
253253
return err
254254
}
255255
}
256+
// Operations will include item and write index (and context if spancontext feature enabled)
257+
ops := make([]*storage.Operation, 2, 3)
258+
ops[0] = storage.SetOperation(writeIndexKey, itemIndexToBytes(pq.metadata.WriteIndex+1))
256259

257260
reqBuf, err := pq.set.encoding.Marshal(req)
258261
if err != nil {
259262
return err
260263
}
264+
ops[1] = storage.SetOperation(getItemKey(pq.metadata.WriteIndex), reqBuf)
261265

262-
// Carry out a transaction where we both add the item and update the write index
263-
ops := []*storage.Operation{
264-
storage.SetOperation(writeIndexKey, itemIndexToBytes(pq.metadata.WriteIndex+1)),
265-
storage.SetOperation(getItemKey(pq.metadata.WriteIndex), reqBuf),
266+
contextBuf := marshalSpanContext(ctx)
267+
if len(contextBuf) > 0 {
268+
ops = append(ops, storage.SetOperation(getContextKey(pq.metadata.WriteIndex), contextBuf))
266269
}
270+
271+
// Carry out a transaction where we add the item/context and update the write index
267272
if err = pq.client.Batch(ctx, ops...); err != nil {
268273
return err
269274
}
@@ -295,7 +300,13 @@ func (pq *persistentQueue[T]) Read(ctx context.Context) (context.Context, T, Don
295300

296301
// Read until either a successful retrieved element or no more elements in the storage.
297302
for pq.metadata.ReadIndex != pq.metadata.WriteIndex {
298-
index, req, consumed := pq.getNextItem(ctx)
303+
index, req, consumed, restoredContext, err := pq.getNextItem(ctx)
304+
if err != nil {
305+
pq.logger.Debug("Failed to dispatch item", zap.Error(err))
306+
if err = pq.itemDispatchingFinish(ctx, index); err != nil {
307+
pq.logger.Error("Error deleting item from queue", zap.Error(err))
308+
}
309+
}
299310
// Ensure the used size and the channel size are in sync.
300311
if pq.metadata.ReadIndex == pq.metadata.WriteIndex {
301312
pq.metadata.QueueSize = 0
@@ -304,7 +315,7 @@ func (pq *persistentQueue[T]) Read(ctx context.Context) (context.Context, T, Don
304315
if consumed {
305316
id := indexDonePool.Get().(*indexDone)
306317
id.reset(index, pq.set.sizer.Sizeof(req), pq)
307-
return context.Background(), req, id, true
318+
return restoredContext, req, id, true
308319
}
309320
}
310321

@@ -317,37 +328,44 @@ func (pq *persistentQueue[T]) Read(ctx context.Context) (context.Context, T, Don
317328
// getNextItem pulls the next available item from the persistent storage along with its index. Once processing is
318329
// finished, the index should be called with onDone to clean up the storage. If no new item is available,
319330
// returns false.
320-
func (pq *persistentQueue[T]) getNextItem(ctx context.Context) (uint64, T, bool) {
331+
func (pq *persistentQueue[T]) getNextItem(ctx context.Context) (uint64, T, bool, context.Context, error) {
321332
index := pq.metadata.ReadIndex
322333
// Increase here, so even if errors happen below, it always iterates
323334
pq.metadata.ReadIndex++
324335
pq.metadata.CurrentlyDispatchedItems = append(pq.metadata.CurrentlyDispatchedItems, index)
325336
getOp := storage.GetOperation(getItemKey(index))
326-
err := pq.client.Batch(ctx,
327-
storage.SetOperation(readIndexKey, itemIndexToBytes(pq.metadata.ReadIndex)),
328-
storage.SetOperation(currentlyDispatchedItemsKey, itemIndexArrayToBytes(pq.metadata.CurrentlyDispatchedItems)),
329-
getOp)
337+
ops := make([]*storage.Operation, 3, 4)
338+
ops[0] = storage.SetOperation(readIndexKey, itemIndexToBytes(pq.metadata.ReadIndex))
339+
ops[1] = storage.SetOperation(currentlyDispatchedItemsKey, itemIndexArrayToBytes(pq.metadata.CurrentlyDispatchedItems))
340+
ops[2] = getOp
330341

331-
var request T
332-
if err == nil {
333-
request, err = pq.set.encoding.Unmarshal(getOp.Value)
342+
// Only add context operation if feature gate is enabled
343+
var ctxOp *storage.Operation
344+
if persistRequestContextFeatureGate.IsEnabled() {
345+
ctxOp = storage.GetOperation(getContextKey(index))
346+
ops = append(ops, ctxOp)
334347
}
335348

349+
var request T
350+
restoredContext := context.Background()
351+
err := pq.client.Batch(ctx, ops...)
336352
if err != nil {
337-
pq.logger.Debug("Failed to dispatch item", zap.Error(err))
338-
// We need to make sure that currently dispatched items list is cleaned
339-
if err = pq.itemDispatchingFinish(ctx, index); err != nil {
340-
pq.logger.Error("Error deleting item from queue", zap.Error(err))
341-
}
342-
343-
return 0, request, false
353+
return 0, request, false, restoredContext, err
354+
}
355+
request, err = pq.set.encoding.Unmarshal(getOp.Value)
356+
if err != nil {
357+
return 0, request, false, restoredContext, err
344358
}
345359

360+
// Only try to restore context if feature gate is enabled
361+
if persistRequestContextFeatureGate.IsEnabled() {
362+
restoredContext = unmarshalSpanContext(ctxOp.Value)
363+
}
346364
// Increase the reference count, so the client is not closed while the request is being processed.
347365
// The client cannot be closed because we hold the lock since last we checked `stopped`.
348366
pq.refClient++
349367

350-
return index, request, true
368+
return index, request, true, restoredContext, nil
351369
}
352370

353371
// onDone should be called to remove the item of the given index from the queue once processing is finished.
@@ -414,13 +432,29 @@ func (pq *persistentQueue[T]) retrieveAndEnqueueNotDispatchedReqs(ctx context.Co
414432

415433
pq.logger.Info("Fetching items left for dispatch by consumers", zap.Int(zapNumberOfItems,
416434
len(dispatchedItems)))
417-
retrieveBatch := make([]*storage.Operation, len(dispatchedItems))
418-
cleanupBatch := make([]*storage.Operation, len(dispatchedItems))
435+
436+
// Calculate batch sizes based on whether context persistence is enabled
437+
batchSize := len(dispatchedItems)
438+
if persistRequestContextFeatureGate.IsEnabled() {
439+
batchSize *= 2
440+
}
441+
442+
retrieveBatch := make([]*storage.Operation, batchSize)
443+
cleanupBatch := make([]*storage.Operation, batchSize)
444+
419445
for i, it := range dispatchedItems {
420-
key := getItemKey(it)
421-
retrieveBatch[i] = storage.GetOperation(key)
422-
cleanupBatch[i] = storage.DeleteOperation(key)
446+
reqKey := getItemKey(it)
447+
retrieveBatch[i] = storage.GetOperation(reqKey)
448+
cleanupBatch[i] = storage.DeleteOperation(reqKey)
449+
450+
if persistRequestContextFeatureGate.IsEnabled() {
451+
// store the context keys at the end of the batch
452+
ctxKey := getContextKey(it)
453+
retrieveBatch[len(dispatchedItems)+i] = storage.GetOperation(ctxKey)
454+
cleanupBatch[len(dispatchedItems)+i] = storage.DeleteOperation(ctxKey)
455+
}
423456
}
457+
424458
retrieveErr := pq.client.Batch(ctx, retrieveBatch...)
425459
cleanupErr := pq.client.Batch(ctx, cleanupBatch...)
426460

@@ -434,18 +468,27 @@ func (pq *persistentQueue[T]) retrieveAndEnqueueNotDispatchedReqs(ctx context.Co
434468
}
435469

436470
errCount := 0
437-
for _, op := range retrieveBatch {
471+
// only need to iterate over first half of batch if spancontext is persisted as these items
472+
// are at corresponding index in the second half of retrieveBatch
473+
for idx := 0; idx < len(dispatchedItems); idx++ {
474+
op := retrieveBatch[idx]
438475
if op.Value == nil {
439476
pq.logger.Warn("Failed retrieving item", zap.String(zapKey, op.Key), zap.Error(errValueNotSet))
440477
continue
441478
}
479+
restoredContext := ctx
442480
req, err := pq.set.encoding.Unmarshal(op.Value)
443481
// If error happened or item is nil, it will be efficiently ignored
444482
if err != nil {
445483
pq.logger.Warn("Failed unmarshalling item", zap.String(zapKey, op.Key), zap.Error(err))
446484
continue
447485
}
448-
if pq.putInternal(ctx, req) != nil {
486+
// We will then retrieve the context from the back half of the batch list, see above
487+
if persistRequestContextFeatureGate.IsEnabled() {
488+
ctxOp := retrieveBatch[len(dispatchedItems)+idx]
489+
restoredContext = unmarshalSpanContext(ctxOp.Value)
490+
}
491+
if pq.putInternal(restoredContext, req) != nil {
449492
errCount++
450493
}
451494
}
@@ -470,9 +513,12 @@ func (pq *persistentQueue[T]) itemDispatchingFinish(ctx context.Context, index u
470513
}
471514
}
472515

473-
setOp := storage.SetOperation(currentlyDispatchedItemsKey, itemIndexArrayToBytes(pq.metadata.CurrentlyDispatchedItems))
474-
deleteOp := storage.DeleteOperation(getItemKey(index))
475-
if err := pq.client.Batch(ctx, setOp, deleteOp); err != nil {
516+
setOps := []*storage.Operation{storage.SetOperation(currentlyDispatchedItemsKey, itemIndexArrayToBytes(pq.metadata.CurrentlyDispatchedItems))}
517+
deleteOps := []*storage.Operation{storage.DeleteOperation(getItemKey(index))}
518+
if persistRequestContextFeatureGate.IsEnabled() {
519+
deleteOps = append(deleteOps, storage.DeleteOperation(getContextKey(index)))
520+
}
521+
if err := pq.client.Batch(ctx, append(setOps, deleteOps...)...); err != nil {
476522
// got an error, try to gracefully handle it
477523
pq.logger.Warn("Failed updating currently dispatched items, trying to delete the item first",
478524
zap.Error(err))
@@ -481,12 +527,12 @@ func (pq *persistentQueue[T]) itemDispatchingFinish(ctx context.Context, index u
481527
return nil
482528
}
483529

484-
if err := pq.client.Batch(ctx, deleteOp); err != nil {
530+
if err := pq.client.Batch(ctx, deleteOps...); err != nil {
485531
// Return an error here, as this indicates an issue with the underlying storage medium
486532
return fmt.Errorf("failed deleting item from queue, got error from storage: %w", err)
487533
}
488534

489-
if err := pq.client.Batch(ctx, setOp); err != nil {
535+
if err := pq.client.Batch(ctx, setOps...); err != nil {
490536
// even if this fails, we still have the right dispatched items in memory
491537
// at worst, we'll have the wrong list in storage, and we'll discard the nonexistent items during startup
492538
return fmt.Errorf("failed updating currently dispatched items, but deleted item successfully: %w", err)
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
// Copyright The OpenTelemetry Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package queuebatch // import "go.opentelemetry.io/collector/exporter/exporterhelper/internal/queuebatch"
5+
6+
import (
7+
"context"
8+
"strconv"
9+
10+
"go.opentelemetry.io/otel/propagation"
11+
12+
"go.opentelemetry.io/collector/featuregate"
13+
)
14+
15+
// persistRequestContextFeatureGate controls whether request context should be persisted in the queue.
// Registered in alpha stage: disabled by default, enable with
// --feature-gates=exporter.PersistRequestContext.
var persistRequestContextFeatureGate = featuregate.GlobalRegistry().MustRegister(
	"exporter.PersistRequestContext",
	featuregate.StageAlpha,
	featuregate.WithRegisterFromVersion("v0.127.0"),
	featuregate.WithRegisterDescription("controls whether context should be stored alongside requests in the persistent queue"),
	featuregate.WithRegisterReferenceURL("https://github.com/open-telemetry/opentelemetry-collector/pull/12934"),
)

// tracePropagator encodes/decodes span context in the W3C trace-context
// format (traceparent/tracestate key-value pairs).
var tracePropagator = propagation.TraceContext{}
25+
26+
func marshalSpanContext(ctx context.Context) []byte {
27+
if !persistRequestContextFeatureGate.IsEnabled() {
28+
return nil
29+
}
30+
carrier := newByteMapCarrier()
31+
tracePropagator.Inject(ctx, carrier)
32+
return carrier.Bytes()
33+
}
34+
35+
func unmarshalSpanContext(b []byte) context.Context {
36+
ctx := context.Background()
37+
if !persistRequestContextFeatureGate.IsEnabled() || b == nil {
38+
return ctx
39+
}
40+
carrier := &byteMapCarrier{buf: b}
41+
tracePropagator.Extract(ctx, carrier)
42+
return ctx
43+
}
44+
45+
// getContextKey builds the storage key for the serialized span context of
// the queue item at the given index: the decimal index followed by the
// "_context" suffix, keeping it distinct from the item key for that index.
func getContextKey(index uint64) string {
	const suffix = "_context"
	return strconv.FormatUint(index, 10) + suffix
}
48+
49+
// byteMapCarrier implements propagation.TextMapCarrier on top of a byte slice.
// The format is a sequence of key-value pairs encoded as:
// - 1 byte length of key
// - key string
// - '=' character
// - value string
// - NUL terminator (0 byte)
//
// The 1-byte length limits keys to 255 bytes, and values must not contain a
// NUL byte; W3C traceparent/tracestate pairs satisfy both constraints.
type byteMapCarrier struct{ buf []byte }

// Compile-time check that byteMapCarrier satisfies propagation.TextMapCarrier.
var _ propagation.TextMapCarrier = (*byteMapCarrier)(nil)
59+
60+
// defaultCarrierCap is a capacity for the byteMapCarrier buffer that should fit typical `traceparent` and `tracestate`
61+
// keys and values.
62+
const defaultCarrierCap = 128
63+
64+
func newByteMapCarrier() *byteMapCarrier {
65+
return &byteMapCarrier{buf: make([]byte, 0, defaultCarrierCap)}
66+
}
67+
68+
func (c *byteMapCarrier) Set(k, v string) {
69+
c.buf = append(c.buf, byte(len(k)))
70+
c.buf = append(c.buf, k...)
71+
c.buf = append(c.buf, '=')
72+
c.buf = append(c.buf, v...)
73+
c.buf = append(c.buf, 0) // NUL terminator
74+
}
75+
76+
func (c *byteMapCarrier) Get(k string) string {
77+
for i := 0; i < len(c.buf); {
78+
l := int(c.buf[i])
79+
i++
80+
if i+l > len(c.buf) {
81+
return ""
82+
}
83+
key := string(c.buf[i : i+l])
84+
i += l
85+
if i >= len(c.buf) || c.buf[i] != '=' {
86+
return ""
87+
}
88+
i++
89+
valStart := i
90+
for i < len(c.buf) && c.buf[i] != 0 {
91+
i++
92+
}
93+
val := string(c.buf[valStart:i])
94+
i++ // skip NUL
95+
if key == k {
96+
return val
97+
}
98+
}
99+
return ""
100+
}
101+
102+
func (c *byteMapCarrier) Keys() []string {
103+
var out []string
104+
for i := 0; i < len(c.buf); {
105+
l := int(c.buf[i])
106+
i++
107+
out = append(out, string(c.buf[i:i+l]))
108+
i += l
109+
for i < len(c.buf) && c.buf[i] != 0 {
110+
i++
111+
}
112+
i++ // skip '=' / NUL block
113+
}
114+
return out
115+
}
116+
117+
func (c *byteMapCarrier) Bytes() []byte { return c.buf }

0 commit comments

Comments
 (0)