Skip to content

Commit 4dff345

Browse files
NickAngeArthurSens
andauthored
[exporter/prometheusremotewrite] : WAL read success/failure telemetry (#40272)
<!--Ex. Fixing a bug - Describe the bug and how this fixes the issue. Ex. Adding a feature - Explain what this achieves.--> #### Description This PR introduces below metrics for WAL: - `otelcol_exporter_prometheusremotewrite_wal_reads`: Number of WAL reads - `otelcol_exporter_prometheusremotewrite_wal_reads_failures`: Number of WAL reads that failed <!-- Issue number (e.g. #1234) or full URL to issue, if applicable. --> #### Link to tracking issue Part of #39556 <!--Describe what testing was performed and which tests were added.--> #### Testing Added a UT `TestWALRead_Telemetry`. During WAL startup it tries to read from the WAL but it fails because there is nothing to read. For that reason both metrics have failed <!--Describe the documentation added.--> #### Documentation <!--Please delete paragraphs that you did not use before submitting.--> --------- Co-authored-by: Arthur Silva Sens <[email protected]>
1 parent a691692 commit 4dff345

File tree

8 files changed

+204
-2
lines changed

8 files changed

+204
-2
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: enhancement
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: prometheusremotewriteexproter
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: |
11+
Adds wal metrics to the Prometheus Remote Write Exporter. The new metrics are:
12+
- `otelcol_exporter_prometheusremotewrite_wal_reads`: The total number of WAL reads.
13+
- `otelcol_exporter_prometheusremotewrite_wal_reads_failures`: The total number of WAL reads failures.
14+
15+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
16+
issues: [39556]
17+
18+
# (Optional) One or more lines of additional information to render under the primary note.
19+
# These lines will be padded with 2 spaces and then inserted directly into the document.
20+
# Use pipe (|) for multiline entries.
21+
subtext:
22+
23+
# If your change doesn't affect end users or the exported elements of any package,
24+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
25+
# Optional: The change log or logs in which this entry should be included.
26+
# e.g. '[user]' or '[user, api]'
27+
# Include 'user' if the change is relevant to end users.
28+
# Include 'api' if there is a change to a library API.
29+
# Default: '[user]'
30+
change_logs: [user]

exporter/prometheusremotewriteexporter/documentation.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,22 @@ Number of Prometheus time series that were translated from OTel metrics
3838
| ---- | ----------- | ---------- | --------- |
3939
| 1 | Sum | Int | true |
4040

41+
### otelcol_exporter_prometheusremotewrite_wal_reads
42+
43+
Number of WAL reads
44+
45+
| Unit | Metric Type | Value Type | Monotonic |
46+
| ---- | ----------- | ---------- | --------- |
47+
| 1 | Sum | Int | true |
48+
49+
### otelcol_exporter_prometheusremotewrite_wal_reads_failures
50+
51+
Number of WAL reads that failed
52+
53+
| Unit | Metric Type | Value Type | Monotonic |
54+
| ---- | ----------- | ---------- | --------- |
55+
| 1 | Sum | Int | true |
56+
4157
### otelcol_exporter_prometheusremotewrite_wal_writes
4258

4359
Number of WAL writes

exporter/prometheusremotewriteexporter/internal/metadata/generated_telemetry.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exporter/prometheusremotewriteexporter/internal/metadatatest/generated_telemetrytest.go

Lines changed: 32 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exporter/prometheusremotewriteexporter/internal/metadatatest/generated_telemetrytest_test.go

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exporter/prometheusremotewriteexporter/metadata.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,17 @@ telemetry:
6060
sum:
6161
value_type: int
6262
monotonic: true
63+
exporter_prometheusremotewrite_wal_reads:
64+
enabled: true
65+
description: Number of WAL reads
66+
unit: "1"
67+
sum:
68+
value_type: int
69+
monotonic: true
70+
exporter_prometheusremotewrite_wal_reads_failures:
71+
enabled: true
72+
description: Number of WAL reads that failed
73+
unit: "1"
74+
sum:
75+
value_type: int
76+
monotonic: true

exporter/prometheusremotewriteexporter/wal.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ import (
2727
type prwWalTelemetry interface {
2828
recordWALWrites(ctx context.Context)
2929
recordWALWritesFailures(ctx context.Context)
30+
recordWALReads(ctx context.Context)
31+
recordWALReadsFailures(ctx context.Context)
3032
}
3133

3234
type prwWalTelemetryOTel struct {
@@ -42,6 +44,14 @@ func (p *prwWalTelemetryOTel) recordWALWritesFailures(ctx context.Context) {
4244
p.telemetryBuilder.ExporterPrometheusremotewriteWalWritesFailures.Add(ctx, 1, metric.WithAttributes(p.otelAttrs...))
4345
}
4446

47+
func (p *prwWalTelemetryOTel) recordWALReads(ctx context.Context) {
48+
p.telemetryBuilder.ExporterPrometheusremotewriteWalReads.Add(ctx, 1, metric.WithAttributes(p.otelAttrs...))
49+
}
50+
51+
func (p *prwWalTelemetryOTel) recordWALReadsFailures(ctx context.Context) {
52+
p.telemetryBuilder.ExporterPrometheusremotewriteWalReadsFailures.Add(ctx, 1, metric.WithAttributes(p.otelAttrs...))
53+
}
54+
4555
func newPRWWalTelemetry(set exporter.Settings) (prwWalTelemetry, error) {
4656
telemetryBuilder, err := metadata.NewTelemetryBuilder(set.TelemetrySettings)
4757
if err != nil {
@@ -389,6 +399,7 @@ func (prweWAL *prweWAL) readPrompbFromWAL(ctx context.Context, index uint64) (wr
389399
if prweWAL.wal == nil {
390400
return nil, errors.New("attempt to read from closed WAL")
391401
}
402+
prweWAL.telemetry.recordWALReads(ctx)
392403
protoBlob, err = prweWAL.wal.Read(index)
393404
if err == nil { // The read succeeded.
394405
req := new(prompb.WriteRequest)
@@ -403,7 +414,6 @@ func (prweWAL *prweWAL) readPrompbFromWAL(ctx context.Context, index uint64) (wr
403414
return req, nil
404415
}
405416
prweWAL.mu.Unlock()
406-
407417
// If WAL was empty, let's wait for a notification from
408418
// the writer go routine.
409419
if errors.Is(err, wal.ErrNotFound) {
@@ -417,6 +427,8 @@ func (prweWAL *prweWAL) readPrompbFromWAL(ctx context.Context, index uint64) (wr
417427
}
418428

419429
if !errors.Is(err, wal.ErrNotFound) {
430+
// record all failures apart ErrNotFound
431+
prweWAL.telemetry.recordWALReadsFailures(ctx)
420432
return nil, err
421433
}
422434
}

exporter/prometheusremotewriteexporter/wal_test.go

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ import (
88
"io"
99
"net/http"
1010
"net/http/httptest"
11+
"os"
12+
"path/filepath"
1113
"sort"
1214
"testing"
1315
"time"
@@ -231,7 +233,7 @@ func TestExportWithWALEnabled(t *testing.T) {
231233
assert.NoError(t, err)
232234
}
233235

234-
func TestWAL_Telemetry(t *testing.T) {
236+
func TestWALWrite_Telemetry(t *testing.T) {
235237
tel := componenttest.NewTelemetry()
236238
t.Cleanup(func() {
237239
require.NoError(t, tel.Shutdown(context.Background()))
@@ -291,3 +293,77 @@ func TestWAL_Telemetry(t *testing.T) {
291293
[]metricdata.DataPoint[int64]{{Value: 1}},
292294
metricdatatest.IgnoreTimestamp())
293295
}
296+
297+
func TestWALRead_Telemetry(t *testing.T) {
298+
// Skip flaky test in CI, because it's flaky and hard to reliably test; still useful for local testing.
299+
t.Skip("Skipping in CI: test is flaky;still useful for local testing")
300+
tel := componenttest.NewTelemetry()
301+
t.Cleanup(func() {
302+
require.NoError(t, tel.Shutdown(context.Background()))
303+
})
304+
set := metadatatest.NewSettings(tel)
305+
306+
// Create a temporary directory for the WAL
307+
tempDir := t.TempDir()
308+
cfg := &Config{
309+
WAL: &WALConfig{
310+
BufferSize: 1,
311+
Directory: tempDir,
312+
},
313+
TargetInfo: &TargetInfo{}, // Declared just to avoid nil pointer dereference.
314+
RemoteWriteProtoMsg: config.RemoteWriteProtoMsgV2,
315+
}
316+
317+
server := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
318+
// Do nothing
319+
}))
320+
defer server.Close()
321+
322+
clientConfig := confighttp.NewDefaultClientConfig()
323+
clientConfig.Endpoint = server.URL
324+
cfg.ClientConfig = clientConfig
325+
326+
prw, err := newPRWExporter(cfg, set)
327+
require.NotNil(t, prw)
328+
require.NoError(t, err)
329+
330+
err = prw.Start(context.Background(), componenttest.NewNopHost())
331+
require.NoError(t, err)
332+
t.Cleanup(func() {
333+
assert.NoError(t, prw.Shutdown(context.Background()))
334+
})
335+
336+
// Verify initial WAL reads metric
337+
metadatatest.AssertEqualExporterPrometheusremotewriteWalReads(t, tel,
338+
[]metricdata.DataPoint[int64]{{Value: 1}},
339+
metricdatatest.IgnoreTimestamp())
340+
wal := prw.wal
341+
342+
// Create some test data
343+
metrics := map[string]*prompb.TimeSeries{
344+
"test_metric": {
345+
Labels: []prompb.Label{{Name: "__name__", Value: "test_metric"}},
346+
Samples: []prompb.Sample{{Value: 1, Timestamp: 100}},
347+
},
348+
}
349+
350+
// Write a successful WAL write first
351+
err = prw.handleExport(context.Background(), metrics, nil)
352+
require.NoError(t, err)
353+
err = wal.wal.Close()
354+
require.NoError(t, err)
355+
356+
// Write corrupted data
357+
corruptedData := []byte{0x80}
358+
firstWalFile := filepath.Join(wal.walPath, "00000000000000000001")
359+
err = os.WriteFile(firstWalFile, corruptedData, 0o600)
360+
require.NoError(t, err)
361+
// write the corrupted data and start reading from the index
362+
363+
err = prw.Start(context.Background(), componenttest.NewNopHost())
364+
// Unable to start the WAL cause there is a corrupted entry
365+
require.Error(t, err)
366+
_, err = tel.GetMetric("otelcol_exporter_prometheusremotewrite_wal_reads_failures")
367+
// verify that the metric exists, so it's incremented
368+
require.NoError(t, err)
369+
}

0 commit comments

Comments
 (0)