Skip to content

Commit d67a526

Browse files
committed
receiver/prometheusreceiver: add option to fall back to collector start time
This change adds an option to the metric adjuster to use an approximation of the collector start time as a fallback for the start time of scraped cumulative metrics. This is useful when no start time is found and the collector starts up alongside its targets (as in serverless environments or sidecar deployments). Signed-off-by: Ridwan Sharif <[email protected]>
1 parent 6db5d1a commit d67a526

11 files changed

+190
-26
lines changed

.chloggen/starttime-fallback.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: enhancement
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: prometheusreceiver
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: Add `UseCollectorStartTimeFallback` option for the start time metric adjuster to fall back to the collector start time as an approximation of the process start time.
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [36364]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext:
19+
20+
# If your change doesn't affect end users or the exported elements of any package,
21+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
22+
# Optional: The change log or logs in which this entry should be included.
23+
# e.g. '[user]' or '[user, api]'
24+
# Include 'user' if the change is relevant to end users.
25+
# Include 'api' if there is a change to a library API.
26+
# Default: '[user]'
27+
change_logs: []

receiver/prometheusreceiver/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ The prometheus receiver also supports additional top-level options:
119119
- **trim_metric_suffixes**: [**Experimental**] When set to true, this enables trimming unit and some counter type suffixes from metric names. For example, it would cause `singing_duration_seconds_total` to be trimmed to `singing_duration`. This can be useful when trying to restore the original metric names used in OpenTelemetry instrumentation. Defaults to false.
120120
- **use_start_time_metric**: When set to true, this enables retrieving the start time of all counter metrics from the process_start_time_seconds metric. This is only correct if all counters on that endpoint started after the process start time, and the process is the only actor exporting the metric after the process started. It should not be used in "exporters" which export counters that may have started before the process itself. Use only if you know what you are doing, as this may result in incorrect rate calculations. Defaults to false.
121121
- **start_time_metric_regex**: The regular expression for the start time metric, and is only applied when use_start_time_metric is enabled. Defaults to process_start_time_seconds.
122-
122+
- **use_collector_start_time_fallback**: When set to true, this option enables using the collector start time as the metric start time if no start time metric is found (for example, if targets expose no `process_start_time_seconds` metric or no metric matching `start_time_metric_regex`). This is useful when the collector start time is a good approximation of the process start time - for example, in serverless workloads when the collector is deployed as a sidecar. This is only applied when use_start_time_metric is enabled. Defaults to false.
123123
For example,
124124

125125
```yaml
@@ -128,6 +128,7 @@ receivers:
128128
trim_metric_suffixes: true
129129
use_start_time_metric: true
130130
start_time_metric_regex: foo_bar_.*
131+
use_collector_start_time_fallback: true
131132
config:
132133
scrape_configs:
133134
- job_name: 'otel-collector'

receiver/prometheusreceiver/config.go

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,21 +23,43 @@ import (
2323
type Config struct {
2424
PrometheusConfig *PromConfig `mapstructure:"config"`
2525
TrimMetricSuffixes bool `mapstructure:"trim_metric_suffixes"`
26-
// UseStartTimeMetric enables retrieving the start time of all counter metrics
27-
// from the process_start_time_seconds metric. This is only correct if all counters on that endpoint
28-
// started after the process start time, and the process is the only actor exporting the metric after
29-
// the process started. It should not be used in "exporters" which export counters that may have
30-
// started before the process itself. Use only if you know what you are doing, as this may result
31-
// in incorrect rate calculations.
32-
UseStartTimeMetric bool `mapstructure:"use_start_time_metric"`
33-
StartTimeMetricRegex string `mapstructure:"start_time_metric_regex"`
26+
27+
// Settings for adjusting metrics. Will default to using an InitialPointAdjuster
28+
// which will use the first scraped point to define the start time for the timeseries.
29+
AdjustOpts MetricAdjusterOpts `mapstructure:",squash"` // squash ensures fields are correctly decoded in embedded struct.
3430

3531
// ReportExtraScrapeMetrics - enables reporting of additional metrics for Prometheus client like scrape_body_size_bytes
3632
ReportExtraScrapeMetrics bool `mapstructure:"report_extra_scrape_metrics"`
3733

3834
TargetAllocator *targetallocator.Config `mapstructure:"target_allocator"`
3935
}
4036

37+
type MetricAdjusterOpts struct {
38+
// UseStartTimeMetric enables retrieving the start time of all counter
39+
// metrics from the process_start_time_seconds metric. This is only correct
40+
// if all counters on that endpoint started after the process start time,
41+
// and the process is the only actor exporting the metric after the process
42+
// started. It should not be used in "exporters" which export counters that
43+
// may have started before the process itself. Use only if you know what you
44+
// are doing, as this may result in incorrect rate calculations.
45+
UseStartTimeMetric bool `mapstructure:"use_start_time_metric"`
46+
StartTimeMetricRegex string `mapstructure:"start_time_metric_regex"`
47+
48+
// UseCollectorStartTimeFallback enables using a fallback start time if a
49+
// start time is otherwise unavailable when adjusting metrics. This would
50+
// happen if UseStartTimeMetric is enabled but the application doesn't emit
51+
// a process_start_time_seconds metric or a metric that matches the
52+
// StartTimeMetricRegex provided.
53+
//
54+
// If enabled, the fallback start time used for adjusted metrics is an
55+
// approximation of the collector start time.
56+
//
57+
// This option should be used when the collector start time is a good
58+
// approximation of the process start time - for example in serverless
59+
// workloads when the collector is deployed as a sidecar.
60+
UseCollectorStartTimeFallback bool `mapstructure:"use_collector_start_time_fallback"`
61+
}
62+
4163
// Validate checks the receiver configuration is valid.
4264
func (cfg *Config) Validate() error {
4365
if !containsScrapeConfig(cfg) && cfg.TargetAllocator == nil {

receiver/prometheusreceiver/config_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ func TestLoadConfig(t *testing.T) {
4343
r1 := cfg.(*Config)
4444
assert.Equal(t, "demo", r1.PrometheusConfig.ScrapeConfigs[0].JobName)
4545
assert.Equal(t, 5*time.Second, time.Duration(r1.PrometheusConfig.ScrapeConfigs[0].ScrapeInterval))
46-
assert.True(t, r1.UseStartTimeMetric)
46+
assert.True(t, r1.AdjustOpts.UseStartTimeMetric)
4747
assert.True(t, r1.TrimMetricSuffixes)
48-
assert.Equal(t, "^(.+_)*process_start_time_seconds$", r1.StartTimeMetricRegex)
48+
assert.Equal(t, "^(.+_)*process_start_time_seconds$", r1.AdjustOpts.StartTimeMetricRegex)
4949
assert.True(t, r1.ReportExtraScrapeMetrics)
5050

5151
assert.Equal(t, "http://my-targetallocator-service", r1.TargetAllocator.Endpoint)

receiver/prometheusreceiver/internal/appendable.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ func NewAppendable(
3737
useStartTimeMetric bool,
3838
startTimeMetricRegex *regexp.Regexp,
3939
useCreatedMetric bool,
40+
useCollectorStartTimeFallback bool,
4041
enableNativeHistograms bool,
4142
externalLabels labels.Labels,
4243
trimSuffixes bool,
@@ -45,7 +46,7 @@ func NewAppendable(
4546
if !useStartTimeMetric {
4647
metricAdjuster = NewInitialPointAdjuster(set.Logger, gcInterval, useCreatedMetric)
4748
} else {
48-
metricAdjuster = NewStartTimeMetricAdjuster(set.Logger, startTimeMetricRegex)
49+
metricAdjuster = NewStartTimeMetricAdjuster(set.Logger, startTimeMetricRegex, useCollectorStartTimeFallback)
4950
}
5051

5152
obsrecv, err := receiverhelper.NewObsReport(receiverhelper.ObsReportSettings{ReceiverID: set.ID, Transport: transport, ReceiverCreateSettings: set})

receiver/prometheusreceiver/internal/starttimemetricadjuster.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package internal // import "github.com/open-telemetry/opentelemetry-collector-co
66
import (
77
"errors"
88
"regexp"
9+
"time"
910

1011
"go.opentelemetry.io/collector/pdata/pmetric"
1112
"go.uber.org/zap"
@@ -19,21 +20,32 @@ var (
1920

2021
type startTimeMetricAdjuster struct {
2122
startTimeMetricRegex *regexp.Regexp
23+
fallbackStartTime *time.Time
2224
logger *zap.Logger
2325
}
2426

2527
// NewStartTimeMetricAdjuster returns a new MetricsAdjuster that adjusts metrics' start times based on a start time metric.
26-
func NewStartTimeMetricAdjuster(logger *zap.Logger, startTimeMetricRegex *regexp.Regexp) MetricsAdjuster {
28+
func NewStartTimeMetricAdjuster(logger *zap.Logger, startTimeMetricRegex *regexp.Regexp, useCollectorStartTimeFallback bool) MetricsAdjuster {
29+
var fallbackStartTime *time.Time
30+
if useCollectorStartTimeFallback {
31+
now := time.Now()
32+
fallbackStartTime = &now
33+
}
2734
return &startTimeMetricAdjuster{
2835
startTimeMetricRegex: startTimeMetricRegex,
36+
fallbackStartTime: fallbackStartTime,
2937
logger: logger,
3038
}
3139
}
3240

3341
func (stma *startTimeMetricAdjuster) AdjustMetrics(metrics pmetric.Metrics) error {
3442
startTime, err := stma.getStartTime(metrics)
3543
if err != nil {
36-
return err
44+
if stma.fallbackStartTime == nil {
45+
return err
46+
}
47+
stma.logger.Warn("Couldn't get start time for metrics. Using fallback start time.", zap.Error(err))
48+
startTime = float64(stma.fallbackStartTime.Unix())
3749
}
3850

3951
startTimeTs := timestampFromFloat64(startTime)

receiver/prometheusreceiver/internal/starttimemetricadjuster_test.go

Lines changed: 99 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package internal
66
import (
77
"regexp"
88
"testing"
9+
"time"
910

1011
"github.com/stretchr/testify/assert"
1112
"go.opentelemetry.io/collector/pdata/pcommon"
@@ -113,7 +114,7 @@ func TestStartTimeMetricMatch(t *testing.T) {
113114

114115
for _, tt := range tests {
115116
t.Run(tt.name, func(t *testing.T) {
116-
stma := NewStartTimeMetricAdjuster(zap.NewNop(), tt.startTimeMetricRegex)
117+
stma := NewStartTimeMetricAdjuster(zap.NewNop(), tt.startTimeMetricRegex, false)
117118
if tt.expectedErr != nil {
118119
assert.ErrorIs(t, stma.AdjustMetrics(tt.inputs), tt.expectedErr)
119120
return
@@ -154,3 +155,100 @@ func TestStartTimeMetricMatch(t *testing.T) {
154155
})
155156
}
156157
}
158+
159+
func TestStartTimeMetricFallback(t *testing.T) {
160+
const startTime = pcommon.Timestamp(123 * 1e9)
161+
const currentTime = pcommon.Timestamp(126 * 1e9)
162+
mockStartTime := time.Now().Add(-10 * time.Hour)
163+
mockStartTimeSeconds := float64(mockStartTime.Unix())
164+
processStartTime := mockStartTime.Add(-10 * time.Hour)
165+
processStartTimeSeconds := float64(processStartTime.Unix())
166+
167+
tests := []struct {
168+
name string
169+
inputs pmetric.Metrics
170+
startTimeMetricRegex *regexp.Regexp
171+
expectedStartTime pcommon.Timestamp
172+
expectedErr error
173+
}{
174+
{
175+
name: "regexp_match_sum_metric_no_fallback",
176+
inputs: metrics(
177+
sumMetric("test_sum_metric", doublePoint(nil, startTime, currentTime, 16)),
178+
histogramMetric("test_histogram_metric", histogramPoint(nil, startTime, currentTime, []float64{1, 2}, []uint64{2, 3, 4})),
179+
summaryMetric("test_summary_metric", summaryPoint(nil, startTime, currentTime, 10, 100, []float64{10, 50, 90}, []float64{9, 15, 48})),
180+
sumMetric("example_process_start_time_seconds", doublePoint(nil, startTime, currentTime, processStartTimeSeconds)),
181+
sumMetric("process_start_time_seconds", doublePoint(nil, startTime, currentTime, processStartTimeSeconds)),
182+
exponentialHistogramMetric("test_exponential_histogram_metric", exponentialHistogramPointSimplified(nil, startTime, currentTime, 3, 1, -5, 3)),
183+
),
184+
startTimeMetricRegex: regexp.MustCompile("^.*_process_start_time_seconds$"),
185+
expectedStartTime: timestampFromFloat64(processStartTimeSeconds),
186+
},
187+
{
188+
name: "regexp_match_sum_metric_fallback",
189+
inputs: metrics(
190+
sumMetric("test_sum_metric", doublePoint(nil, startTime, currentTime, 16)),
191+
histogramMetric("test_histogram_metric", histogramPoint(nil, startTime, currentTime, []float64{1, 2}, []uint64{2, 3, 4})),
192+
summaryMetric("test_summary_metric", summaryPoint(nil, startTime, currentTime, 10, 100, []float64{10, 50, 90}, []float64{9, 15, 48})),
193+
),
194+
startTimeMetricRegex: regexp.MustCompile("^.*_process_start_time_seconds$"),
195+
expectedStartTime: timestampFromFloat64(mockStartTimeSeconds),
196+
},
197+
{
198+
name: "match_default_sum_start_time_metric_fallback",
199+
inputs: metrics(
200+
sumMetric("test_sum_metric", doublePoint(nil, startTime, currentTime, 16)),
201+
histogramMetric("test_histogram_metric", histogramPoint(nil, startTime, currentTime, []float64{1, 2}, []uint64{2, 3, 4})),
202+
summaryMetric("test_summary_metric", summaryPoint(nil, startTime, currentTime, 10, 100, []float64{10, 50, 90}, []float64{9, 15, 48})),
203+
),
204+
expectedStartTime: timestampFromFloat64(mockStartTimeSeconds),
205+
},
206+
}
207+
208+
for _, tt := range tests {
209+
t.Run(tt.name, func(t *testing.T) {
210+
stma := NewStartTimeMetricAdjuster(zap.NewNop(), tt.startTimeMetricRegex, true)
211+
if tt.expectedErr != nil {
212+
assert.ErrorIs(t, stma.AdjustMetrics(tt.inputs), tt.expectedErr)
213+
return
214+
}
215+
216+
// Make sure the right adjuster type is returned and that it has the fallback time set.
217+
metricAdjuster, ok := stma.(*startTimeMetricAdjuster)
218+
assert.True(t, ok)
219+
assert.NotNil(t, metricAdjuster.fallbackStartTime)
220+
221+
// To test that the adjuster uses the fallback correctly, override the fallback time with a
222+
// known value.
223+
metricAdjuster.fallbackStartTime = &mockStartTime
224+
225+
assert.NoError(t, stma.AdjustMetrics(tt.inputs))
226+
for i := 0; i < tt.inputs.ResourceMetrics().Len(); i++ {
227+
rm := tt.inputs.ResourceMetrics().At(i)
228+
for j := 0; j < rm.ScopeMetrics().Len(); j++ {
229+
ilm := rm.ScopeMetrics().At(j)
230+
for k := 0; k < ilm.Metrics().Len(); k++ {
231+
metric := ilm.Metrics().At(k)
232+
switch metric.Type() {
233+
case pmetric.MetricTypeSum:
234+
dps := metric.Sum().DataPoints()
235+
for l := 0; l < dps.Len(); l++ {
236+
assert.Equal(t, tt.expectedStartTime, dps.At(l).StartTimestamp())
237+
}
238+
case pmetric.MetricTypeSummary:
239+
dps := metric.Summary().DataPoints()
240+
for l := 0; l < dps.Len(); l++ {
241+
assert.Equal(t, tt.expectedStartTime, dps.At(l).StartTimestamp())
242+
}
243+
case pmetric.MetricTypeHistogram:
244+
dps := metric.Histogram().DataPoints()
245+
for l := 0; l < dps.Len(); l++ {
246+
assert.Equal(t, tt.expectedStartTime, dps.At(l).StartTimestamp())
247+
}
248+
}
249+
}
250+
}
251+
}
252+
})
253+
}
254+
}

receiver/prometheusreceiver/metrics_receiver.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,8 @@ func (r *pReceiver) initPrometheusComponents(ctx context.Context, logger log.Log
123123
}()
124124

125125
var startTimeMetricRegex *regexp.Regexp
126-
if r.cfg.StartTimeMetricRegex != "" {
127-
startTimeMetricRegex, err = regexp.Compile(r.cfg.StartTimeMetricRegex)
126+
if r.cfg.AdjustOpts.StartTimeMetricRegex != "" {
127+
startTimeMetricRegex, err = regexp.Compile(r.cfg.AdjustOpts.StartTimeMetricRegex)
128128
if err != nil {
129129
return err
130130
}
@@ -134,9 +134,10 @@ func (r *pReceiver) initPrometheusComponents(ctx context.Context, logger log.Log
134134
r.consumer,
135135
r.settings,
136136
gcInterval(r.cfg.PrometheusConfig),
137-
r.cfg.UseStartTimeMetric,
137+
r.cfg.AdjustOpts.UseStartTimeMetric,
138138
startTimeMetricRegex,
139139
useCreatedMetricGate.IsEnabled(),
140+
r.cfg.AdjustOpts.UseCollectorStartTimeFallback,
140141
enableNativeHistogramsGate.IsEnabled(),
141142
r.cfg.PrometheusConfig.GlobalConfig.ExternalLabels,
142143
r.cfg.TrimMetricSuffixes,

receiver/prometheusreceiver/metrics_receiver_helper_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -687,8 +687,8 @@ func testComponent(t *testing.T, targets []*testData, alterConfig func(*Config),
687687
defer mp.Close()
688688

689689
config := &Config{
690-
PrometheusConfig: cfg,
691-
StartTimeMetricRegex: "",
690+
PrometheusConfig: cfg,
691+
AdjustOpts: MetricAdjusterOpts{StartTimeMetricRegex: ""},
692692
}
693693
if alterConfig != nil {
694694
alterConfig(config)

receiver/prometheusreceiver/metrics_receiver_report_extra_scrape_metrics_test.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,11 @@ func testScraperMetrics(t *testing.T, targets []*testData, reportExtraScrapeMetr
5252

5353
cms := new(consumertest.MetricsSink)
5454
receiver := newPrometheusReceiver(receivertest.NewNopSettings(), &Config{
55-
PrometheusConfig: cfg,
56-
UseStartTimeMetric: false,
57-
StartTimeMetricRegex: "",
55+
PrometheusConfig: cfg,
56+
AdjustOpts: MetricAdjusterOpts{
57+
UseStartTimeMetric: false,
58+
StartTimeMetricRegex: "",
59+
},
5860
ReportExtraScrapeMetrics: reportExtraScrapeMetrics,
5961
}, cms)
6062

receiver/prometheusreceiver/metrics_receiver_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1424,7 +1424,7 @@ func TestStartTimeMetric(t *testing.T) {
14241424
},
14251425
}
14261426
testComponent(t, targets, func(c *Config) {
1427-
c.UseStartTimeMetric = true
1427+
c.AdjustOpts.UseStartTimeMetric = true
14281428
})
14291429
}
14301430

@@ -1475,8 +1475,8 @@ func TestStartTimeMetricRegex(t *testing.T) {
14751475
},
14761476
}
14771477
testComponent(t, targets, func(c *Config) {
1478-
c.StartTimeMetricRegex = "^(.+_)*process_start_time_seconds$"
1479-
c.UseStartTimeMetric = true
1478+
c.AdjustOpts.StartTimeMetricRegex = "^(.+_)*process_start_time_seconds$"
1479+
c.AdjustOpts.UseStartTimeMetric = true
14801480
})
14811481
}
14821482

0 commit comments

Comments
 (0)