[prometheuswritereceiver] target info #38812

Merged 32 commits on Apr 24, 2025

Commits
131a3b8
test cases
perebaj Mar 19, 2025
1ca4ec8
improve testcase
perebaj Mar 19, 2025
b9dfc9f
mock consumer metrics
perebaj Mar 21, 2025
288c6bd
clean interRequestCache for each run
perebaj Mar 21, 2025
f2c0671
add target info cache
perebaj Mar 21, 2025
bd80734
Update receiver/prometheusremotewritereceiver/receiver.go
perebaj Mar 21, 2025
07fe080
Update receiver/prometheusremotewritereceiver/receiver.go
perebaj Mar 21, 2025
82d52a7
Update receiver/prometheusremotewritereceiver/receiver.go
perebaj Mar 21, 2025
2ddf810
Update receiver/prometheusremotewritereceiver/receiver.go
perebaj Mar 21, 2025
9f19aed
metric name target_info
perebaj Apr 21, 2025
499f0de
Merge branch 'target-info' of github.com:perebaj/opentelemetry-collec…
perebaj Apr 21, 2025
ddce741
remove t.Run
perebaj Apr 21, 2025
14f989d
fix conflict on go.modules and go.sum
perebaj Apr 21, 2025
5a08906
validate expected metric in the end of request
perebaj Apr 21, 2025
2451c27
add receiverhelper.ObsReport
perebaj Apr 21, 2025
f262c7c
Merge branch 'main' into target-info
perebaj Apr 21, 2025
42390ff
Update .chloggen/target-info.yaml
perebaj Apr 22, 2025
3026ee4
Update .chloggen/target-info.yaml
perebaj Apr 22, 2025
7739e72
remove obsreport
perebaj Apr 22, 2025
5afb0bc
Merge branch 'target-info' of github.com:perebaj/opentelemetry-collec…
perebaj Apr 22, 2025
05a8845
remove useless validation ls.Has(labels.MetricName)
perebaj Apr 22, 2025
d2ae2f4
remove useless ReplaceAll _ .
perebaj Apr 22, 2025
bc70d0c
remove old comments
perebaj Apr 22, 2025
3ce627e
replace METRIC_TYPE_INFO by GAUGE
perebaj Apr 22, 2025
14fbe4c
assert stats
perebaj Apr 22, 2025
c4e8771
tests sending target info in different orders
perebaj Apr 22, 2025
2922d19
Merge branch 'main' into target-info
perebaj Apr 22, 2025
da08b4c
conflict go.sum/mod
perebaj Apr 22, 2025
130c2b8
remove replace library & add unit test for target info multiple requests
perebaj Apr 23, 2025
d13f346
improve comments
perebaj Apr 23, 2025
eceb950
Merge branch 'main' into target-info
perebaj Apr 23, 2025
892426a
rename cache variable
perebaj Apr 23, 2025
27 changes: 27 additions & 0 deletions .chloggen/target-info.yaml
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: 'enhancement'

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: prometheusremotewritereceiver

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Cache `target_info` metrics so they can be used to populate metrics' Resource Attributes.

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [37277]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: You can read more about resource attributes handling in https://opentelemetry.io/docs/specs/otel/compatibility/prometheus_and_openmetrics/#resource-attributes-1

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: [user]
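
As a concrete illustration of this entry (the series below is hypothetical, modeled on the test cases in this PR): a `target_info` sample such as

```
target_info{job="production/service_a", instance="host1", machine_type="n1-standard-1"} 1
```

is not emitted as an OTLP metric. Its labels instead become resource attributes on every metric that carries the same `job`/`instance` pair, with `job` and `instance` mapped per the compatibility spec:

```
service.namespace   = "production"
service.name        = "service_a"
service.instance.id = "host1"
machine_type        = "n1-standard-1"
```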
50 changes: 37 additions & 13 deletions receiver/prometheusremotewritereceiver/receiver.go
@@ -37,16 +37,18 @@ func newRemoteWriteReceiver(settings receiver.Settings, cfg *Config, nextConsume
server: &http.Server{
ReadTimeout: 60 * time.Second,
},
rmCache: make(map[uint64]pmetric.ResourceMetrics),
}, nil
}

type prometheusRemoteWriteReceiver struct {
settings receiver.Settings
nextConsumer consumer.Metrics

config *Config
server *http.Server
wg sync.WaitGroup
config *Config
server *http.Server
wg sync.WaitGroup
rmCache map[uint64]pmetric.ResourceMetrics
}

// MetricIdentity contains all the components that uniquely identify a metric
@@ -163,14 +165,16 @@ func (prw *prometheusRemoteWriteReceiver) handlePRW(w http.ResponseWriter, req *
return
}

_, stats, err := prw.translateV2(req.Context(), &prw2Req)
m, stats, err := prw.translateV2(req.Context(), &prw2Req)
stats.SetHeaders(w)
if err != nil {
http.Error(w, err.Error(), http.StatusBadRequest) // Following instructions at https://prometheus.io/docs/specs/remote_write_spec_2_0/#invalid-samples
return
}

w.WriteHeader(http.StatusNoContent)
// TODO(@perebaj): Evaluate if we should use the obsreport here. Ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/pull/38812#discussion_r2053094391
_ = prw.nextConsumer.ConsumeMetrics(req.Context(), m)
}
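
The translated metrics are handed directly to the next consumer here, and the returned error is discarded for now. A hedged sketch of how the TODO above could be resolved with `receiverhelper.ObsReport` (not part of this PR; it presumes an `obsReport` field added to the receiver):

```go
// Hypothetical sketch only, assuming an obsReport field of type
// *receiverhelper.ObsReport, built in newRemoteWriteReceiver via
// receiverhelper.NewObsReport(receiverhelper.ObsReportSettings{
//     ReceiverID: settings.ID, Transport: "http", ReceiverCreateSettings: settings,
// }).
obsCtx := prw.obsReport.StartMetricsOp(req.Context())
err = prw.nextConsumer.ConsumeMetrics(obsCtx, m)
prw.obsReport.EndMetricsOp(obsCtx, "prometheusremotewrite", m.DataPointCount(), err)
```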

// parseProto parses the content-type header and returns the version of the remote-write protocol.
@@ -213,11 +217,6 @@ func (prw *prometheusRemoteWriteReceiver) translateV2(_ context.Context, req *wr
otelMetrics = pmetric.NewMetrics()
labelsBuilder = labels.NewScratchBuilder(0)
stats = promremote.WriteResponseStats{}
// Prometheus Remote-Write can send multiple time series with the same labels in the same request.
// Instead of creating a whole new OTLP metric, we just append the new sample to the existing OTLP metric.
// This cache is called "intra" because in the future we'll have a "interRequestCache" to cache resourceAttributes
// between requests based on the metric "target_info".
intraRequestCache = make(map[uint64]pmetric.ResourceMetrics)
// The key is composed by: resource_hash:scope_name:scope_version:metric_name:unit:type
metricCache = make(map[uint64]pmetric.Metric)
)
@@ -232,16 +231,41 @@ func (prw *prometheusRemoteWriteReceiver) translateV2(_ context.Context, req *wr
continue
}

// If the metric name is equal to target_info, we use its labels as attributes of the resource
// Ref: https://opentelemetry.io/docs/specs/otel/compatibility/prometheus_and_openmetrics/#resource-attributes-1
if ls.Get(labels.MetricName) == "target_info" {
var rm pmetric.ResourceMetrics
hashedLabels := xxhash.Sum64String(ls.Get("job") + string([]byte{'\xff'}) + ls.Get("instance"))

if existingRM, ok := prw.rmCache[hashedLabels]; ok {
rm = existingRM
} else {
rm = otelMetrics.ResourceMetrics().AppendEmpty()
}

attrs := rm.Resource().Attributes()
parseJobAndInstance(attrs, ls.Get("job"), ls.Get("instance"))

// Add the remaining labels as resource attributes
for _, l := range ls {
if l.Name != "job" && l.Name != "instance" && l.Name != labels.MetricName {
attrs.PutStr(l.Name, l.Value)
}
}
prw.rmCache[hashedLabels] = rm
continue
}

// For metrics other than target_info, we need to follow the standard process of creating a metric.
var rm pmetric.ResourceMetrics
hashedLabels := xxhash.Sum64String(ls.Get("job") + string([]byte{'\xff'}) + ls.Get("instance"))
intraCacheEntry, ok := intraRequestCache[hashedLabels]
existingRM, ok := prw.rmCache[hashedLabels]
if ok {
// We found the same time series in the same request, so we should append to the same OTLP metric.
rm = intraCacheEntry
rm = existingRM
} else {
rm = otelMetrics.ResourceMetrics().AppendEmpty()
parseJobAndInstance(rm.Resource().Attributes(), ls.Get("job"), ls.Get("instance"))
intraRequestCache[hashedLabels] = rm
prw.rmCache[hashedLabels] = rm
}

scopeName, scopeVersion := prw.extractScopeInfo(ls)
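Two details from the receiver.go hunks above are worth spelling out. `parseJobAndInstance` is defined elsewhere in the package and is not part of this diff; the sketch below is a hypothetical reconstruction, inferred from the Prometheus compatibility spec and the attributes asserted in the tests (assumed imports: `strings`, `github.com/cespare/xxhash/v2`, `go.opentelemetry.io/collector/pdata/pcommon`). `resourceKey` is likewise an invented name that only factors out the inline `rmCache` key expression:

```go
// Sketch (assumed, not this PR's actual code): a job of the form "ns/name"
// maps to service.namespace + service.name, a job without "/" maps to
// service.name alone, and instance maps to service.instance.id.
func parseJobAndInstance(attrs pcommon.Map, job, instance string) {
	if job != "" {
		if namespace, name, found := strings.Cut(job, "/"); found {
			attrs.PutStr("service.namespace", namespace)
			attrs.PutStr("service.name", name)
		} else {
			attrs.PutStr("service.name", job)
		}
	}
	if instance != "" {
		attrs.PutStr("service.instance.id", instance)
	}
}

// The rmCache key: job and instance joined by a 0xFF byte, which can never
// occur inside a valid UTF-8 label value, then hashed with xxhash. Series
// sharing job+instance therefore land on the same ResourceMetrics entry.
func resourceKey(job, instance string) uint64 {
	return xxhash.Sum64String(job + string([]byte{'\xff'}) + instance)
}
```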
234 changes: 234 additions & 0 deletions receiver/prometheusremotewritereceiver/receiver_test.go
@@ -7,8 +7,10 @@ import (
"bytes"
"context"
"fmt"
"io"
"net/http"
"net/http/httptest"
"sync"
"testing"
"time"

@@ -18,6 +20,7 @@ import (
writev2 "github.com/prometheus/prometheus/prompb/io/prometheus/write/v2"
"github.com/prometheus/prometheus/storage/remote"
"github.com/stretchr/testify/assert"
"go.opentelemetry.io/collector/consumer"
"go.opentelemetry.io/collector/consumer/consumertest"
"go.opentelemetry.io/collector/pdata/pcommon"
"go.opentelemetry.io/collector/pdata/pmetric"
@@ -406,8 +409,72 @@ func TestTranslateV2(t *testing.T) {
return expected
}(),
},
{
name: "service with target_info metric",
request: &writev2.Request{
Symbols: []string{
"",
"job", "production/service_a", // 1, 2
"instance", "host1", // 3, 4
"machine_type", "n1-standard-1", // 5, 6
"cloud_provider", "gcp", // 7, 8
"region", "us-central1", // 9, 10
"datacenter", "sdc", // 11, 12
"__name__", "normal_metric", // 13, 14
"d", "e", // 15, 16
"__name__", "target_info", // 17, 18
},
Timeseries: []writev2.TimeSeries{
// Generating 2 metrics: one is named target_info and the other is a normal gauge.
// The target_info metric's labels should populate the resource attributes.
// The normal_metric should be translated as usual.
{
// target_info metric
Metadata: writev2.Metadata{Type: writev2.Metadata_METRIC_TYPE_GAUGE},
LabelsRefs: []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 17, 18},
},
{
// normal metric
Metadata: writev2.Metadata{Type: writev2.Metadata_METRIC_TYPE_GAUGE},
LabelsRefs: []uint32{13, 14, 1, 2, 3, 4, 15, 16},
Samples: []writev2.Sample{{Value: 1, Timestamp: 1}},
},
},
},
expectedMetrics: func() pmetric.Metrics {
metrics := pmetric.NewMetrics()

rm := metrics.ResourceMetrics().AppendEmpty()
attrs := rm.Resource().Attributes()
attrs.PutStr("service.namespace", "production")
attrs.PutStr("service.name", "service_a")
attrs.PutStr("service.instance.id", "host1")
attrs.PutStr("machine_type", "n1-standard-1")
attrs.PutStr("cloud_provider", "gcp")
attrs.PutStr("region", "us-central1")
attrs.PutStr("datacenter", "sdc")

sm := rm.ScopeMetrics().AppendEmpty()
sm.Scope().SetName("OpenTelemetry Collector")
sm.Scope().SetVersion("latest")

m := sm.Metrics().AppendEmpty()
m.SetName("normal_metric")
m.SetUnit("")
m.SetDescription("")

dp := m.SetEmptyGauge().DataPoints().AppendEmpty()
dp.SetDoubleValue(1.0)
dp.SetTimestamp(pcommon.Timestamp(1 * int64(time.Millisecond)))
dp.Attributes().PutStr("d", "e")

return metrics
}(),
},
} {
t.Run(tc.name, func(t *testing.T) {
// Since rmCache persists values across requests, reset it for each test case so one case cannot affect the next.
prwReceiver.rmCache = make(map[uint64]pmetric.ResourceMetrics)
metrics, stats, err := prwReceiver.translateV2(ctx, tc.request)
if tc.expectError != "" {
assert.ErrorContains(t, err, tc.expectError)
@@ -420,3 +487,170 @@ func TestTranslateV2(t *testing.T) {
})
}
}

type nonMutatingConsumer struct{}

// Capabilities returns the base consumer capabilities.
func (bc nonMutatingConsumer) Capabilities() consumer.Capabilities {
return consumer.Capabilities{MutatesData: false}
}

type MockConsumer struct {
nonMutatingConsumer
mu sync.Mutex
metrics []pmetric.Metrics
dataPoints int
}

func (m *MockConsumer) ConsumeMetrics(_ context.Context, md pmetric.Metrics) error {
m.mu.Lock()
defer m.mu.Unlock()
m.metrics = append(m.metrics, md)
m.dataPoints += md.DataPointCount()
return nil
}

func TestTargetInfoWithMultipleRequests(t *testing.T) {
tests := []struct {
name string
requests []*writev2.Request
}{
{
name: "target_info first, normal metric second",
requests: []*writev2.Request{
{
Symbols: []string{
"",
"job", "production/service_a", // 1, 2
"instance", "host1", // 3, 4
"machine_type", "n1-standard-1", // 5, 6
"cloud_provider", "gcp", // 7, 8
"region", "us-central1", // 9, 10
"__name__", "target_info", // 11, 12
},
Timeseries: []writev2.TimeSeries{
{
Metadata: writev2.Metadata{Type: writev2.Metadata_METRIC_TYPE_GAUGE},
LabelsRefs: []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
},
},
},
{
Symbols: []string{
"",
"job", "production/service_a", // 1, 2
"instance", "host1", // 3, 4
"__name__", "normal_metric", // 5, 6
"foo", "bar", // 7, 8
},
Timeseries: []writev2.TimeSeries{
{
Metadata: writev2.Metadata{Type: writev2.Metadata_METRIC_TYPE_GAUGE},
LabelsRefs: []uint32{5, 6, 1, 2, 3, 4, 7, 8},
Samples: []writev2.Sample{{Value: 2, Timestamp: 2}},
},
},
},
},
},
{
name: "normal metric first, target_info second",
requests: []*writev2.Request{
{
Symbols: []string{
"",
"job", "production/service_a", // 1, 2
"instance", "host1", // 3, 4
"__name__", "normal_metric", // 5, 6
"foo", "bar", // 7, 8
},
Timeseries: []writev2.TimeSeries{
{
Metadata: writev2.Metadata{Type: writev2.Metadata_METRIC_TYPE_GAUGE},
LabelsRefs: []uint32{5, 6, 1, 2, 3, 4, 7, 8},
Samples: []writev2.Sample{{Value: 2, Timestamp: 2}},
},
},
},
{
Symbols: []string{
"",
"job", "production/service_a", // 1, 2
"instance", "host1", // 3, 4
"machine_type", "n1-standard-1", // 5, 6
"cloud_provider", "gcp", // 7, 8
"region", "us-central1", // 9, 10
"__name__", "target_info", // 11, 12
},
Timeseries: []writev2.TimeSeries{
{
Metadata: writev2.Metadata{Type: writev2.Metadata_METRIC_TYPE_GAUGE},
LabelsRefs: []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
},
},
},
},
},
}

// Use the same expected metrics for both cases: the test only verifies that the order of the requests does not change the result.
expectedMetrics := func() pmetric.Metrics {
metrics := pmetric.NewMetrics()
rm := metrics.ResourceMetrics().AppendEmpty()
attrs := rm.Resource().Attributes()
attrs.PutStr("service.namespace", "production")
attrs.PutStr("service.name", "service_a")
attrs.PutStr("service.instance.id", "host1")
attrs.PutStr("machine_type", "n1-standard-1")
attrs.PutStr("cloud_provider", "gcp")
attrs.PutStr("region", "us-central1")

sm := rm.ScopeMetrics().AppendEmpty()
sm.Scope().SetName("OpenTelemetry Collector")
sm.Scope().SetVersion("latest")
m1 := sm.Metrics().AppendEmpty()
m1.SetName("normal_metric")
m1.SetUnit("")
m1.SetDescription("")
dp1 := m1.SetEmptyGauge().DataPoints().AppendEmpty()
dp1.SetDoubleValue(2.0)
dp1.SetTimestamp(pcommon.Timestamp(2 * int64(time.Millisecond)))
dp1.Attributes().PutStr("foo", "bar")

return metrics
}()

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
mockConsumer := new(MockConsumer)
prwReceiver := setupMetricsReceiver(t)
prwReceiver.nextConsumer = mockConsumer

ts := httptest.NewServer(http.HandlerFunc(prwReceiver.handlePRW))
defer ts.Close()

for _, req := range tt.requests {
pBuf := proto.NewBuffer(nil)
// We don't need to snappy-compress the body in this unit test: the snappy
// decompression is only wired up when the receiver's real HTTP server is
// initialized, so the handler under test accepts the uncompressed body.
err := pBuf.Marshal(req)
assert.NoError(t, err)

resp, err := http.Post(
ts.URL,
fmt.Sprintf("application/x-protobuf;proto=%s", promconfig.RemoteWriteProtoMsgV2),
bytes.NewBuffer(pBuf.Bytes()),
)
assert.NoError(t, err)
defer resp.Body.Close()

body, err := io.ReadAll(resp.Body)
assert.NoError(t, err)
assert.Equal(t, http.StatusNoContent, resp.StatusCode, string(body))
}

assert.NoError(t, pmetrictest.CompareMetrics(expectedMetrics, mockConsumer.metrics[0]))
})
}
}
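
A follow-up to the compression comment in the loop above: against a real collector endpoint (rather than this in-process `httptest` handler), the remote-write 2.0 spec requires the body to be snappy-compressed. A minimal hedged sketch of that client-side step, assuming `github.com/golang/snappy` (the helper name `postRemoteWriteV2` is invented for illustration):

```go
// postRemoteWriteV2 is a hypothetical helper, not part of this PR. It shows
// the snappy block-format compression step the unit test deliberately skips.
func postRemoteWriteV2(url string, marshaled []byte) (*http.Response, error) {
	compressed := snappy.Encode(nil, marshaled)
	req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(compressed))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", fmt.Sprintf("application/x-protobuf;proto=%s", promconfig.RemoteWriteProtoMsgV2))
	req.Header.Set("Content-Encoding", "snappy")
	return http.DefaultClient.Do(req)
}
```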