Skip to content

Commit 39bdda7

Browse files
authored
[connector/spanmetrics] Discard counter span metric exemplars after flushing (#32210)
**Description:** Discard counter span metric exemplars after flushing to avoid unbounded memory growth when exemplars are enabled. This is needed because #28671 added exemplars to counter span metrics, but they are not removed after each flush interval like they are for histogram span metrics. Note: this may change behaviour if using the undocumented `exemplars.max_per_data_point` configuration option, since exemplars would no longer be accumulated up until that count. However, i'm unclear on the value of that feature since there's no mechanism to replace old exemplars with newer ones once the maximum is reached. Maybe a follow-up enhancement is only discarding exemplars once the maximum is reached, or using a circular buffer to replace them. That could be useful for pull-based exporters like `prometheusexporter`, as retaining exemplars for longer would decrease the chance of them getting discarded before being scraped. **Link to tracking Issue:** Closes #31683 **Testing:** - Unit tests - Running the collector and setting a breakpoint to verify the exemplars are being cleared in-between flushes. Before the change I could see the exemplar count continually growing **Documentation:** <Describe the documentation added.> Updated the documentation to mention that exemplars are added to all span metrics. Also mentioned when they are discarded
1 parent c5d1d59 commit 39bdda7

File tree

5 files changed

+151
-36
lines changed

5 files changed

+151
-36
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: bug_fix
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: spanmetrics
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: Discard counter span metric exemplars after each flush interval to avoid unbounded memory growth
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [31683]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext: This aligns exemplar discarding for counter span metrics with the existing logic for histogram span metrics
19+
20+
# If your change doesn't affect end users or the exported elements of any package,
21+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
22+
# Optional: The change log or logs in which this entry should be included.
23+
# e.g. '[user]' or '[user, api]'
24+
# Include 'user' if the change is relevant to end users.
25+
# Include 'api' if there is a change to a library API.
26+
# Default: '[user]'
27+
change_logs: [user]

connector/spanmetricsconnector/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,8 @@ The following settings can be optionally configured:
114114
- `namespace`: Defines the namespace of the generated metrics. If `namespace` provided, generated metric name will be added `namespace.` prefix.
115115
- `metrics_flush_interval` (default: `60s`): Defines the flush interval of the generated metrics.
116116
- `metrics_expiration` (default: `0`): Defines the expiration time as `time.Duration`, after which, if no new spans are received, metrics will no longer be exported. Setting to `0` means the metrics will never expire (default behavior).
117-
- `exemplars`: Use to configure how to attach exemplars to histograms
118-
- `enabled` (default: `false`): enabling will add spans as Exemplars.
117+
- `exemplars`: Use to configure how to attach exemplars to metrics.
118+
- `enabled` (default: `false`): enabling will add spans as Exemplars to all metrics. Exemplars are only kept for one flush interval.
119119
- `events`: Use to configure the events metric.
120120
- `enabled`: (default: `false`): enabling will add the events metric.
121121
- `dimensions`: (mandatory if `enabled`) the list of the span's event attributes to add as dimensions to the events metric, which will be included _on top of_ the common and configured `dimensions` for span and resource attributes.

connector/spanmetricsconnector/connector.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -291,17 +291,21 @@ func (p *connectorImp) resetState() {
291291
p.resourceMetrics.RemoveEvictedItems()
292292
p.metricKeyToDimensions.RemoveEvictedItems()
293293

294-
// If no histogram and no metrics expiration is configured, we can skip the remaining operations.
294+
// If none of these features are enabled then we can skip the remaining operations.
295295
// Enabling either of these features requires to go over resource metrics and do operation on each.
296-
if p.config.Histogram.Disable && p.config.MetricsExpiration == 0 {
296+
if p.config.Histogram.Disable && p.config.MetricsExpiration == 0 && !p.config.Exemplars.Enabled {
297297
return
298298
}
299299

300300
now := time.Now()
301301
p.resourceMetrics.ForEach(func(k resourceKey, m *resourceMetrics) {
302302
// Exemplars are only relevant to this batch of traces, so must be cleared within the lock
303-
if !p.config.Histogram.Disable {
304-
m.histograms.Reset(true)
303+
if p.config.Exemplars.Enabled {
304+
m.sums.ClearExemplars()
305+
m.events.ClearExemplars()
306+
if !p.config.Histogram.Disable {
307+
m.histograms.ClearExemplars()
308+
}
305309
}
306310

307311
// If metrics expiration is configured, remove metrics that haven't been seen for longer than the expiration period.

connector/spanmetricsconnector/connector_test.go

Lines changed: 103 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1519,18 +1519,89 @@ func TestSpanMetrics_Events(t *testing.T) {
15191519
})
15201520
}
15211521
}
1522-
func TestExemplarsForSumMetrics(t *testing.T) {
1523-
p, _, err := newConnectorImp(stringp("defaultNullValue"), explicitHistogramsConfig, enabledExemplarsConfig, enabledEventsConfig, cumulative, 0, []string{})
1524-
require.NoError(t, err)
1525-
traces := buildSampleTrace()
1522+
func TestExemplarsAreDiscardedAfterFlushing(t *testing.T) {
1523+
tests := []struct {
1524+
name string
1525+
temporality string
1526+
histogramConfig func() HistogramConfig
1527+
}{
1528+
{
1529+
name: "cumulative explicit histogram",
1530+
temporality: cumulative,
1531+
histogramConfig: explicitHistogramsConfig,
1532+
},
1533+
{
1534+
name: "cumulative exponential histogram",
1535+
temporality: cumulative,
1536+
histogramConfig: exponentialHistogramsConfig,
1537+
},
1538+
{
1539+
name: "delta explicit histogram",
1540+
temporality: delta,
1541+
histogramConfig: explicitHistogramsConfig,
1542+
},
1543+
}
1544+
for _, tt := range tests {
1545+
t.Run(tt.name, func(t *testing.T) {
1546+
p, _, err := newConnectorImp(stringp("defaultNullValue"), tt.histogramConfig, enabledExemplarsConfig, enabledEventsConfig, tt.temporality, 0, []string{})
1547+
p.metricsConsumer = &consumertest.MetricsSink{}
1548+
require.NoError(t, err)
15261549

1527-
// Test
1528-
ctx := metadata.NewIncomingContext(context.Background(), nil)
1550+
traces := ptrace.NewTraces()
1551+
trace1ID := [16]byte{0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x10}
1552+
initServiceSpans(
1553+
serviceSpans{
1554+
serviceName: "service-b",
1555+
spans: []span{
1556+
{
1557+
name: "/ping",
1558+
kind: ptrace.SpanKindServer,
1559+
statusCode: ptrace.StatusCodeError,
1560+
traceID: trace1ID,
1561+
spanID: [8]byte{0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18},
1562+
},
1563+
},
1564+
}, traces.ResourceSpans().AppendEmpty())
15291565

1530-
err = p.ConsumeTraces(ctx, traces)
1531-
require.NoError(t, err)
1532-
metrics := p.buildMetrics()
1566+
// Test
1567+
ctx := metadata.NewIncomingContext(context.Background(), nil)
1568+
1569+
// Verify exactly 1 exemplar is added to all data points when flushing
1570+
err = p.ConsumeTraces(ctx, traces)
1571+
require.NoError(t, err)
1572+
1573+
p.exportMetrics(ctx)
1574+
m := p.metricsConsumer.(*consumertest.MetricsSink).AllMetrics()[0]
1575+
assertDataPointsHaveExactlyOneExemplarForTrace(t, m, trace1ID)
1576+
1577+
// Verify exemplars from previous batch's trace are replaced with exemplars for the new batch's trace
1578+
traces = ptrace.NewTraces()
1579+
trace2ID := [16]byte{0x00, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x10}
1580+
initServiceSpans(
1581+
serviceSpans{
1582+
serviceName: "service-b",
1583+
spans: []span{
1584+
{
1585+
name: "/ping",
1586+
kind: ptrace.SpanKindServer,
1587+
statusCode: ptrace.StatusCodeError,
1588+
traceID: trace2ID,
1589+
spanID: [8]byte{0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18},
1590+
},
1591+
},
1592+
}, traces.ResourceSpans().AppendEmpty())
1593+
1594+
err = p.ConsumeTraces(ctx, traces)
1595+
require.NoError(t, err)
1596+
1597+
p.exportMetrics(ctx)
1598+
m = p.metricsConsumer.(*consumertest.MetricsSink).AllMetrics()[1]
1599+
assertDataPointsHaveExactlyOneExemplarForTrace(t, m, trace2ID)
1600+
})
1601+
}
1602+
}
15331603

1604+
func assertDataPointsHaveExactlyOneExemplarForTrace(t *testing.T, metrics pmetric.Metrics, traceID pcommon.TraceID) {
15341605
for i := 0; i < metrics.ResourceMetrics().Len(); i++ {
15351606
rm := metrics.ResourceMetrics().At(i)
15361607
ism := rm.ScopeMetrics()
@@ -1539,12 +1610,33 @@ func TestExemplarsForSumMetrics(t *testing.T) {
15391610
m := ism.At(ilmC).Metrics()
15401611
for mC := 0; mC < m.Len(); mC++ {
15411612
metric := m.At(mC)
1542-
if metric.Type() == pmetric.MetricTypeSum {
1613+
switch metric.Type() {
1614+
case pmetric.MetricTypeSum:
15431615
dps := metric.Sum().DataPoints()
1616+
assert.Greater(t, dps.Len(), 0)
1617+
for dpi := 0; dpi < dps.Len(); dpi++ {
1618+
dp := dps.At(dpi)
1619+
assert.Equal(t, dp.Exemplars().Len(), 1)
1620+
assert.Equal(t, dp.Exemplars().At(0).TraceID(), traceID)
1621+
}
1622+
case pmetric.MetricTypeHistogram:
1623+
dps := metric.Histogram().DataPoints()
1624+
assert.Greater(t, dps.Len(), 0)
1625+
for dpi := 0; dpi < dps.Len(); dpi++ {
1626+
dp := dps.At(dpi)
1627+
assert.Equal(t, dp.Exemplars().Len(), 1)
1628+
assert.Equal(t, dp.Exemplars().At(0).TraceID(), traceID)
1629+
}
1630+
case pmetric.MetricTypeExponentialHistogram:
1631+
dps := metric.ExponentialHistogram().DataPoints()
1632+
assert.Greater(t, dps.Len(), 0)
15441633
for dpi := 0; dpi < dps.Len(); dpi++ {
15451634
dp := dps.At(dpi)
1546-
assert.Greater(t, dp.Exemplars().Len(), 0)
1635+
assert.Equal(t, dp.Exemplars().Len(), 1)
1636+
assert.Equal(t, dp.Exemplars().At(0).TraceID(), traceID)
15471637
}
1638+
default:
1639+
t.Fatalf("Unexpected metric type %s", metric.Type())
15481640
}
15491641
}
15501642
}

connector/spanmetricsconnector/internal/metrics/metrics.go

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ type Key string
1717
type HistogramMetrics interface {
1818
GetOrCreate(key Key, attributes pcommon.Map) Histogram
1919
BuildMetrics(pmetric.Metric, pcommon.Timestamp, pmetric.AggregationTemporality)
20-
Reset(onlyExemplars bool)
20+
ClearExemplars()
2121
}
2222

2323
type Histogram interface {
@@ -116,15 +116,10 @@ func (m *explicitHistogramMetrics) BuildMetrics(
116116
}
117117
}
118118

119-
func (m *explicitHistogramMetrics) Reset(onlyExemplars bool) {
120-
if onlyExemplars {
121-
for _, h := range m.metrics {
122-
h.exemplars = pmetric.NewExemplarSlice()
123-
}
124-
return
119+
func (m *explicitHistogramMetrics) ClearExemplars() {
120+
for _, h := range m.metrics {
121+
h.exemplars = pmetric.NewExemplarSlice()
125122
}
126-
127-
m.metrics = make(map[Key]*explicitHistogram)
128123
}
129124

130125
func (m *exponentialHistogramMetrics) GetOrCreate(key Key, attributes pcommon.Map) Histogram {
@@ -202,15 +197,10 @@ func expoHistToExponentialDataPoint(agg *structure.Histogram[float64], dp pmetri
202197
}
203198
}
204199

205-
func (m *exponentialHistogramMetrics) Reset(onlyExemplars bool) {
206-
if onlyExemplars {
207-
for _, m := range m.metrics {
208-
m.exemplars = pmetric.NewExemplarSlice()
209-
}
210-
return
200+
func (m *exponentialHistogramMetrics) ClearExemplars() {
201+
for _, m := range m.metrics {
202+
m.exemplars = pmetric.NewExemplarSlice()
211203
}
212-
213-
m.metrics = make(map[Key]*exponentialHistogram)
214204
}
215205

216206
func (h *explicitHistogram) Observe(value float64) {
@@ -316,6 +306,8 @@ func (m *SumMetrics) BuildMetrics(
316306
}
317307
}
318308

319-
func (m *SumMetrics) Reset() {
320-
m.metrics = make(map[Key]*Sum)
309+
func (m *SumMetrics) ClearExemplars() {
310+
for _, sum := range m.metrics {
311+
sum.exemplars = pmetric.NewExemplarSlice()
312+
}
321313
}

0 commit comments

Comments
 (0)