
Commit 6131199

feat: add memory limiter to drop data when a soft limit is reached (#1827)
## Problem

If there is pressure in the pipeline for any reason and batches fail to export, they start building up in the exporter queues of the collector and memory grows without bound. Since we don't set any memory request or limit on the node collectors DaemonSet, the collectors will keep consuming more and more of the available memory on the node:

1. Cluster metrics show a spike in resource consumption.
2. Other pods on the same node are starved, since they now have less spare memory to grow into.
3. If the issue is not transient, memory keeps increasing over time.
4. The data sitting in the retry buffers keeps the CPU busy retrying the rejected or unsuccessful batches.

## Levels of Protection

To prevent the above issues, we apply a few levels of protection, listed from first line of defense to last resort (see the sketch after this message for how the numbers fit together):

1. Set GOMEMLIMIT to a (currently hardcoded) `352MiB`. At this point the Go runtime GC should kick in and start reclaiming memory aggressively.
2. Set the otel collector soft limit to a (currently hardcoded) `384MiB`. When heap allocations reach this amount, the collector starts dropping batches of data after they leave the `batch` processor, instead of streaming them down the pipeline.
3. Set the otel collector hard limit to `512MiB`. When the heap reaches this number, a forced GC is performed.
4. Set the memory request to `256MiB`. This guarantees at least this amount of memory to handle normal traffic, with some slack for spikes, without running into OOM. The rest of the memory comes from whatever is available on the node, which is handy for extra buffering but may also cause an OOM kill if the node has no spare resources.

## Future Work

- Add configuration options to set these values, preferably as a spectrum of trade-offs: "resource stability" vs. "spike capacity".
- Drop the data as it is received, not after it is batched - open-telemetry/opentelemetry-collector#11726
- Drop data at the receiver once that is implemented in the collector - open-telemetry/opentelemetry-collector#9591
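To make the relationship between these thresholds concrete, here is a minimal sketch (not part of the commit) that mirrors the hardcoded values from `getMemorySettings` in the diff below and derives the soft limit and GOMEMLIMIT from them:

```go
package main

import "fmt"

func main() {
	// Hardcoded values from this commit (all in MiB).
	memoryRequest := 256       // k8s memory request: guaranteed per node collector
	hardLimit := 512           // memory_limiter limit_mib: forced GC above this heap size
	spikeLimit := 128          // memory_limiter spike_limit_mib
	k8sLimit := hardLimit + 64 // k8s memory limit: OOM-kill ceiling

	// Derived thresholds.
	softLimit := hardLimit - spikeLimit // collector starts dropping batches here
	gomemlimit := softLimit - 32        // aggressive GC kicks in 32MiB before data is dropped

	fmt.Printf("request=%dMiB softLimit=%dMiB hardLimit=%dMiB GOMEMLIMIT=%dMiB k8sLimit=%dMiB\n",
		memoryRequest, softLimit, hardLimit, gomemlimit, k8sLimit)
	// request=256MiB softLimit=384MiB hardLimit=512MiB GOMEMLIMIT=352MiB k8sLimit=576MiB
}
```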
1 parent 5d612b2 commit 6131199

7 files changed: +103 −23 lines

Makefile

Lines changed: 6 additions & 1 deletion
```diff
@@ -1,6 +1,6 @@
 TAG ?= $(shell odigos version --cluster)
 ODIGOS_CLI_VERSION ?= $(shell odigos version --cli)
-ORG := keyval
+ORG ?= keyval
 
 .PHONY: build-odiglet
 build-odiglet:
@@ -62,13 +62,18 @@ push-scheduler:
 push-collector:
 	docker buildx build --platform linux/amd64,linux/arm64/v8 --push -t $(ORG)/odigos-collector:$(TAG) collector -f collector/Dockerfile
 
+.PHONY: push-ui
+push-ui:
+	docker buildx build --platform linux/amd64,linux/arm64/v8 --push -t $(ORG)/odigos-ui:$(TAG) . -f frontend/Dockerfile
+
 .PHONY: push-images
 push-images:
 	make push-autoscaler TAG=$(TAG)
 	make push-scheduler TAG=$(TAG)
 	make push-odiglet TAG=$(TAG)
 	make push-instrumentor TAG=$(TAG)
 	make push-collector TAG=$(TAG)
+	make push-ui TAG=$(TAG)
 
 .PHONY: load-to-kind-odiglet
 load-to-kind-odiglet:
```
Lines changed: 17 additions & 0 deletions
```diff
@@ -0,0 +1,17 @@
+package common
+
+import (
+	odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1"
+	"github.com/odigos-io/odigos/common/config"
+)
+
+func GetMemoryLimiterConfig(memorySettings odigosv1.CollectorsGroupResourcesSettings) config.GenericMap {
+	// check_interval is currently hardcoded to 1s.
+	// This seems to be a reasonable value for the memory limiter and is what the processor docs use.
+	// Performing memory checks is expensive, so we trade off some performance for fast reaction time to memory pressure.
+	return config.GenericMap{
+		"check_interval":  "1s",
+		"limit_mib":       memorySettings.MemoryLimiterLimitMiB,
+		"spike_limit_mib": memorySettings.MemoryLimiterSpikeLimitMiB,
+	}
+}
```
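A usage sketch for the new helper (assuming the repository's modules; the settings literal below simply mirrors the defaults introduced in `getMemorySettings`, it is not part of the diff):

```go
package main

import (
	"fmt"

	odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1"
	"github.com/odigos-io/odigos/autoscaler/controllers/common"
)

func main() {
	// Resource settings mirroring the hardcoded defaults from getMemorySettings in this commit.
	settings := odigosv1.CollectorsGroupResourcesSettings{
		MemoryRequestMiB:           256,
		MemoryLimitMiB:             576,
		MemoryLimiterLimitMiB:      512,
		MemoryLimiterSpikeLimitMiB: 128,
		GomemlimitMiB:              352,
	}

	// Renders the "memory_limiter" processor settings shared by the gateway and
	// node-collector config builders.
	fmt.Println(common.GetMemoryLimiterConfig(settings))
	// prints roughly: map[check_interval:1s limit_mib:512 spike_limit_mib:128]
}
```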

autoscaler/controllers/datacollection/configmap.go

Lines changed: 23 additions & 13 deletions
```diff
@@ -9,9 +9,10 @@ import (
 
 	"github.com/ghodss/yaml"
 	odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1"
+	"github.com/odigos-io/odigos/autoscaler/controllers/common"
 	commonconf "github.com/odigos-io/odigos/autoscaler/controllers/common"
 	"github.com/odigos-io/odigos/autoscaler/controllers/datacollection/custom"
-	"github.com/odigos-io/odigos/common"
+	odigoscommon "github.com/odigos-io/odigos/common"
 	"github.com/odigos-io/odigos/common/config"
 	"github.com/odigos-io/odigos/common/consts"
 	constsK8s "github.com/odigos-io/odigos/k8sutils/pkg/consts"
@@ -124,23 +125,26 @@ func getDesiredConfigMap(apps *odigosv1.InstrumentedApplicationList, dests *odig
 	return &desired, nil
 }
 
-func calculateConfigMapData(collectorsGroup *odigosv1.CollectorsGroup, apps *odigosv1.InstrumentedApplicationList, dests *odigosv1.DestinationList, processors []*odigosv1.Processor,
+func calculateConfigMapData(nodeCG *odigosv1.CollectorsGroup, apps *odigosv1.InstrumentedApplicationList, dests *odigosv1.DestinationList, processors []*odigosv1.Processor,
 	setTracesLoadBalancer bool, disableNameProcessor bool) (string, error) {
 
-	ownMetricsPort := collectorsGroup.Spec.CollectorOwnMetricsPort
+	ownMetricsPort := nodeCG.Spec.CollectorOwnMetricsPort
 
 	empty := struct{}{}
 
 	processorsCfg, tracesProcessors, metricsProcessors, logsProcessors, errs := config.GetCrdProcessorsConfigMap(commonconf.ToProcessorConfigurerArray(processors))
 	for name, err := range errs {
-		log.Log.V(0).Info(err.Error(), "processor", name)
+		log.Log.V(0).Error(err, "processor", name)
 	}
 
 	if !disableNameProcessor {
 		processorsCfg["odigosresourcename"] = empty
 	}
 
+	memoryLimiterConfiguration := common.GetMemoryLimiterConfig(nodeCG.Spec.ResourcesSettings)
+
 	processorsCfg["batch"] = empty
+	processorsCfg["memory_limiter"] = memoryLimiterConfiguration
 	processorsCfg["resource"] = config.GenericMap{
 		"attributes": []config.GenericMap{{
 			"key": "k8s.node.name",
@@ -266,13 +270,13 @@ func calculateConfigMapData(collectorsGroup *odigosv1.CollectorsGroup, apps *odi
 	collectLogs := false
 	for _, dst := range dests.Items {
 		for _, s := range dst.Spec.Signals {
-			if s == common.LogsObservabilitySignal && !custom.DestRequiresCustom(dst.Spec.Type) {
+			if s == odigoscommon.LogsObservabilitySignal && !custom.DestRequiresCustom(dst.Spec.Type) {
 				collectLogs = true
 			}
-			if s == common.TracesObservabilitySignal || dst.Spec.Type == common.PrometheusDestinationType {
+			if s == odigoscommon.TracesObservabilitySignal || dst.Spec.Type == odigoscommon.PrometheusDestinationType {
 				collectTraces = true
 			}
-			if s == common.MetricsObservabilitySignal && !custom.DestRequiresCustom(dst.Spec.Type) {
+			if s == odigoscommon.MetricsObservabilitySignal && !custom.DestRequiresCustom(dst.Spec.Type) {
 				collectMetrics = true
 			}
 		}
@@ -364,7 +368,7 @@ func getConfigMap(ctx context.Context, c client.Client, namespace string) (*v1.C
 	return configMap, nil
 }
 
-func getSignalsFromOtelcolConfig(otelcolConfigContent string) ([]common.ObservabilitySignal, error) {
+func getSignalsFromOtelcolConfig(otelcolConfigContent string) ([]odigoscommon.ObservabilitySignal, error) {
 	config := config.Config{}
 	err := yaml.Unmarshal([]byte(otelcolConfigContent), &config)
 	if err != nil {
@@ -389,22 +393,28 @@ func getSignalsFromOtelcolConfig(otelcolConfigContent string) ([]common.Observab
 		}
 	}
 
-	signals := []common.ObservabilitySignal{}
+	signals := []odigoscommon.ObservabilitySignal{}
 	if tracesEnabled {
-		signals = append(signals, common.TracesObservabilitySignal)
+		signals = append(signals, odigoscommon.TracesObservabilitySignal)
 	}
 	if metricsEnabled {
-		signals = append(signals, common.MetricsObservabilitySignal)
+		signals = append(signals, odigoscommon.MetricsObservabilitySignal)
 	}
 	if logsEnabled {
-		signals = append(signals, common.LogsObservabilitySignal)
+		signals = append(signals, odigoscommon.LogsObservabilitySignal)
 	}
 
 	return signals, nil
 }
 
 func getCommonProcessors(disableNameProcessor bool) []string {
-	processors := []string{"batch"}
+	// The memory limiter is placed right after the batch processor, not as the first processor in the pipeline.
+	// This way the instrumented application always succeeds in sending data to the collector
+	// (once it is added to a batch), and the memory limit is checked later, after the batch,
+	// where memory rejection drops the data instead of backpressuring the application.
+	// Read more about it here: https://github.com/open-telemetry/opentelemetry-collector/issues/11726
+	// Also related: https://github.com/open-telemetry/opentelemetry-collector/issues/9591
+	processors := []string{"batch", "memory_limiter"}
 	if !disableNameProcessor {
 		processors = append(processors, "odigosresourcename")
 	}
```
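To illustrate the ordering rationale from the comment above, here is a simplified, self-contained sketch of the common processor list (the real `getCommonProcessors` appends further processors that are elided here):

```go
package main

import "fmt"

// commonProcessors mimics the start of getCommonProcessors from this commit: the batch
// processor comes first so instrumented applications always hand data off successfully,
// and the memory limiter sits right after it, dropping whole batches under memory
// pressure instead of backpressuring the application SDKs.
func commonProcessors(disableNameProcessor bool) []string {
	processors := []string{"batch", "memory_limiter"}
	if !disableNameProcessor {
		processors = append(processors, "odigosresourcename")
	}
	// ...the remaining common processors are elided in this sketch.
	return processors
}

func main() {
	fmt.Println(commonProcessors(false)) // [batch memory_limiter odigosresourcename]
}
```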

autoscaler/controllers/datacollection/daemonset.go

Lines changed: 16 additions & 0 deletions
```diff
@@ -16,6 +16,7 @@ import (
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
@@ -191,6 +192,9 @@ func getDesiredDaemonSet(datacollection *odigosv1.CollectorsGroup, configData st
 		rollingUpdate.MaxSurge = &maxSurge
 	}
 
+	requestMemoryRequestQuantity := resource.MustParse(fmt.Sprintf("%dMi", datacollection.Spec.ResourcesSettings.MemoryRequestMiB))
+	requestMemoryLimitQuantity := resource.MustParse(fmt.Sprintf("%dMi", datacollection.Spec.ResourcesSettings.MemoryLimitMiB))
+
 	desiredDs := &appsv1.DaemonSet{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: consts.OdigosNodeCollectorDaemonSetName,
@@ -302,6 +306,10 @@ func getDesiredDaemonSet(datacollection *odigosv1.CollectorsGroup, configData st
 								},
 							},
 						},
+						{
+							Name:  "GOMEMLIMIT",
+							Value: fmt.Sprintf("%dMiB", datacollection.Spec.ResourcesSettings.GomemlimitMiB),
+						},
 					},
 					LivenessProbe: &corev1.Probe{
 						ProbeHandler: corev1.ProbeHandler{
@@ -319,6 +327,14 @@ func getDesiredDaemonSet(datacollection *odigosv1.CollectorsGroup, configData st
 							},
 						},
 					},
+					Resources: corev1.ResourceRequirements{
+						Requests: corev1.ResourceList{
+							corev1.ResourceMemory: requestMemoryRequestQuantity,
+						},
+						Limits: corev1.ResourceList{
+							corev1.ResourceMemory: requestMemoryLimitQuantity,
+						},
+					},
 					SecurityContext: &corev1.SecurityContext{
 						Privileged: boolPtr(true),
 					},
```
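One small detail in the DaemonSet change worth calling out: Kubernetes resource quantities use the `Mi` suffix, while the Go runtime's `GOMEMLIMIT` environment variable expects `MiB`. A minimal sketch (not part of the diff) of the two formats used above:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Kubernetes quantities ("Mi") for the container memory request and limit.
	memoryRequest := resource.MustParse(fmt.Sprintf("%dMi", 256))
	memoryLimit := resource.MustParse(fmt.Sprintf("%dMi", 576))

	// Go runtime soft memory limit ("MiB") passed via the GOMEMLIMIT env var.
	gomemlimit := fmt.Sprintf("%dMiB", 352)

	fmt.Println(memoryRequest.String(), memoryLimit.String(), gomemlimit) // 256Mi 576Mi 352MiB
}
```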

autoscaler/controllers/gateway/configmap.go

Lines changed: 1 addition & 6 deletions
```diff
@@ -113,12 +113,7 @@ func addSelfTelemetryPipeline(c *config.Config, ownTelemetryPort int32) error {
 
 func syncConfigMap(dests *odigosv1.DestinationList, allProcessors *odigosv1.ProcessorList, gateway *odigosv1.CollectorsGroup, ctx context.Context, c client.Client, scheme *runtime.Scheme) (string, []odigoscommon.ObservabilitySignal, error) {
 	logger := log.FromContext(ctx)
-
-	memoryLimiterConfiguration := config.GenericMap{
-		"check_interval":  "1s",
-		"limit_mib":       gateway.Spec.ResourcesSettings.MemoryLimiterLimitMiB,
-		"spike_limit_mib": gateway.Spec.ResourcesSettings.MemoryLimiterSpikeLimitMiB,
-	}
+	memoryLimiterConfiguration := common.GetMemoryLimiterConfig(gateway.Spec.ResourcesSettings)
 
 	processors := common.FilterAndSortProcessorsByOrderHint(allProcessors, odigosv1.CollectorsGroupRoleClusterGateway)
```

scheduler/controllers/nodecollectorsgroup/common.go

Lines changed: 31 additions & 3 deletions
```diff
@@ -6,17 +6,44 @@ import (
 
 	odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1"
 	"github.com/odigos-io/odigos/common"
-	"github.com/odigos-io/odigos/k8sutils/pkg/consts"
 	k8sutilsconsts "github.com/odigos-io/odigos/k8sutils/pkg/consts"
 	"github.com/odigos-io/odigos/k8sutils/pkg/env"
 	"github.com/odigos-io/odigos/k8sutils/pkg/utils"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
+func getMemorySettings(odigosConfig common.OdigosConfiguration) odigosv1.CollectorsGroupResourcesSettings {
+	// TODO: currently using hardcoded values; these should be configurable.
+	//
+	// A memory request is expensive on daemonsets since it consumes this memory
+	// on each node in the cluster. Setting it to 256MiB, but allowing memory to spike higher
+	// to consume more available memory on the node.
+	// If the node has memory to spare, we can use it to buffer more data before dropping,
+	// but it also means that if no memory is available, the collector might get killed by the OOM killer.
+	//
+	// We can trade off the memory request:
+	// - more memory request: more memory allocated per collector on each node, but a larger buffer for bursts and transient failures.
+	// - less memory request: more efficient use of cluster resources, but data might be dropped earlier on spikes.
+	// Currently choosing 256MiB as a balance (~200MiB left for the heap to handle batches and export queues).
+	//
+	// We can trade off how high the memory limit is set above the request:
+	// - limit equal to the request: the collector is most stable (no OOM) but has a smaller buffer for bursts and drops data earlier.
+	// - limit set way above the request: on a memory spike, the collector will use extra memory available on the node to buffer data, but might get killed by the OOM killer if this memory is not available.
+	// Currently choosing 512MiB as a balance (200MiB guaranteed for the heap, and the rest, ~300MiB, as buffer from the node before dropping starts).
+	//
+	return odigosv1.CollectorsGroupResourcesSettings{
+		MemoryRequestMiB:           256,
+		MemoryLimitMiB:             512 + 64,
+		MemoryLimiterLimitMiB:      512,
+		MemoryLimiterSpikeLimitMiB: 128,            // meaning the collector will start dropping data at 512-128=384MiB
+		GomemlimitMiB:              512 - 128 - 32, // start aggressive GC 32MiB before the soft limit where dropping begins
+	}
+}
+
 func newNodeCollectorGroup(odigosConfig common.OdigosConfiguration) *odigosv1.CollectorsGroup {
 
-	ownMetricsPort := consts.OdigosNodeCollectorOwnTelemetryPortDefault
+	ownMetricsPort := k8sutilsconsts.OdigosNodeCollectorOwnTelemetryPortDefault
 	if odigosConfig.CollectorNode != nil && odigosConfig.CollectorNode.CollectorOwnMetricsPort != 0 {
 		ownMetricsPort = odigosConfig.CollectorNode.CollectorOwnMetricsPort
 	}
@@ -27,12 +54,13 @@ func newNodeCollectorGroup(odigosConfig common.OdigosConfiguration) *odigosv1.Co
 			APIVersion: "odigos.io/v1alpha1",
 		},
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      consts.OdigosNodeCollectorDaemonSetName,
+			Name:      k8sutilsconsts.OdigosNodeCollectorDaemonSetName,
 			Namespace: env.GetCurrentNamespace(),
 		},
 		Spec: odigosv1.CollectorsGroupSpec{
 			Role:                    odigosv1.CollectorsGroupRoleNodeCollector,
 			CollectorOwnMetricsPort: ownMetricsPort,
+			ResourcesSettings:       getMemorySettings(odigosConfig),
 		},
 	}
 }
```
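The trade-off described in these comments maps directly onto the `CollectorsGroupResourcesSettings` fields. As a purely hypothetical illustration of the configurability mentioned under Future Work (values invented, not part of this commit), a profile tuned for more burst headroom could look like:

```go
package main

import (
	"fmt"

	odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1"
)

func main() {
	// Hypothetical "spike capacity" profile: larger request and limiter thresholds,
	// keeping the same relationships between the fields as getMemorySettings.
	burstier := odigosv1.CollectorsGroupResourcesSettings{
		MemoryRequestMiB:           512,
		MemoryLimitMiB:             1024 + 64,
		MemoryLimiterLimitMiB:      1024,
		MemoryLimiterSpikeLimitMiB: 256,             // dropping would start at 1024-256=768MiB
		GomemlimitMiB:              1024 - 256 - 32, // aggressive GC just below that soft limit
	}
	fmt.Printf("%+v\n", burstier)
}
```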

tests/common/assert/pipeline-ready.yaml

Lines changed: 9 additions & 0 deletions
```diff
@@ -59,6 +59,8 @@ spec:
       resources:
         requests:
           (memory != null): true
+        limits:
+          (memory != null): true
       volumeMounts:
       - mountPath: /conf
         name: collector-conf
@@ -150,6 +152,13 @@ spec:
           fieldRef:
             apiVersion: v1
             fieldPath: metadata.name
+      - name: GOMEMLIMIT
+        (value != null): true
+      resources:
+        requests:
+          (memory != null): true
+        limits:
+          (memory != null): true
       hostNetwork: true
       nodeSelector:
         kubernetes.io/os: linux
```
