
Commit e42da8b

[receiver/kubeletstats] Add k8s.{container,pod}.memory.node.utilization metrics (#33591)
**Description:**
Similar to #32295 and #33390, this PR adds the `k8s.{container,pod}.memory.node.utilization` metrics.

**Link to tracking issue:** #27885

**Testing:** Added unit tests.

**Documentation:** Added.

### Manual testing

1. Using the following target Pod:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: memory-demo
spec:
  containers:
  - name: memory-demo-ctr
    image: polinux/stress
    resources:
      requests:
        memory: "8070591Ki"
      limits:
        memory: "9070591Ki"
    command: ["stress"]
    args: ["--vm", "1", "--vm-bytes", "800M", "--vm-hang", "4"]
```

2. ![memGood](https://github.com/open-telemetry/opentelemetry-collector-contrib/assets/11754898/fae04b30-59ca-4d70-8446-f54b5a085cf7)

   On a node with 32.5G of memory, the 800M container/Pod reports a utilization of `0.8/32.5 = 0.0246... ≈ 0.025`.

---------

Signed-off-by: ChrsMark <[email protected]>
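For reference when reproducing the manual test: the new metrics are disabled by default. A minimal sketch of enabling them in the kubeletstats receiver — assuming the endpoint, authentication, `k8s_api_config`, and node-name settings are already configured as the receiver README describes — could look like:

```yaml
receivers:
  kubeletstats:
    metrics:
      # new metrics added by this PR; both are off unless explicitly enabled
      k8s.container.memory.node.utilization:
        enabled: true
      k8s.pod.memory.node.utilization:
        enabled: true
```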
1 parent 948fa91 commit e42da8b

24 files changed: +926 −47 lines changed
New changelog entry — Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: enhancement
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: kubeletstatsreceiver
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: "Add `k8s.pod.memory.node.utilization` and `k8s.container.memory.node.utilization` metrics"
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [33591]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [user]

receiver/kubeletstatsreceiver/README.md

Lines changed: 7 additions & 2 deletions
@@ -218,9 +218,10 @@ receivers:
       - pod
 ```

-### Collect k8s.container.cpu.node.utilization, `k8s.pod.cpu.node.utilization` as ratio of total node's capacity
+### Collect `k8s.{container,pod}.{cpu,memory}.node.utilization` as ratio of total node's capacity

-In order to calculate the `k8s.container.cpu.node.utilization` or `k8s.pod.cpu.node.utilization` metrics, the
+In order to calculate the `k8s.container.cpu.node.utilization`, `k8s.pod.cpu.node.utilization`,
+`k8s.container.memory.node.utilization` and `k8s.pod.memory.node.utilization` metrics, the
 information of the node's capacity must be retrieved from the k8s API. In this, the `k8s_api_config` needs to be set.
 In addition, the node name must be identified properly. The `K8S_NODE_NAME` env var can be set using the
 downward API inside the collector pod spec as follows:
@@ -248,6 +249,10 @@ receivers:
         enabled: true
       k8s.pod.cpu.node.utilization:
         enabled: true
+      k8s.container.memory.node.utilization:
+        enabled: true
+      k8s.pod.memory.node.utilization:
+        enabled: true
 ```

 ### Optional parameters
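The downward API snippet referenced above lies outside this hunk. For illustration, the usual way to expose the node name to the collector Pod is an environment variable sourced from `spec.nodeName` — a sketch of the standard Kubernetes downward API pattern, not copied verbatim from the README:

```yaml
# collector pod spec (container section), assumed layout for illustration
env:
  - name: K8S_NODE_NAME
    valueFrom:
      fieldRef:
        fieldPath: spec.nodeName
```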

receiver/kubeletstatsreceiver/config.go

Lines changed: 11 additions & 4 deletions
@@ -120,10 +120,17 @@ func (cfg *Config) Unmarshal(componentParser *confmap.Conf) error {
 }

 func (cfg *Config) Validate() error {
-	if cfg.Metrics.K8sContainerCPUNodeUtilization.Enabled && cfg.NodeName == "" {
-		return errors.New("for k8s.container.cpu.node.utilization node setting is required. Check the readme on how to set the required setting")
-	} else if cfg.Metrics.K8sPodCPUNodeUtilization.Enabled && cfg.NodeName == "" {
-		return errors.New("for k8s.pod.cpu.node.utilization node setting is required. Check the readme on how to set the required setting")
+	if cfg.NodeName == "" {
+		switch {
+		case cfg.Metrics.K8sContainerCPUNodeUtilization.Enabled:
+			return errors.New("for k8s.container.cpu.node.utilization node setting is required. Check the readme on how to set the required setting")
+		case cfg.Metrics.K8sPodCPUNodeUtilization.Enabled:
+			return errors.New("for k8s.pod.cpu.node.utilization node setting is required. Check the readme on how to set the required setting")
+		case cfg.Metrics.K8sContainerMemoryNodeUtilization.Enabled:
+			return errors.New("for k8s.container.memory.node.utilization node setting is required. Check the readme on how to set the required setting")
+		case cfg.Metrics.K8sPodMemoryNodeUtilization.Enabled:
+			return errors.New("for k8s.pod.memory.node.utilization node setting is required. Check the readme on how to set the required setting")
+		}
 	}
 	return nil
 }

receiver/kubeletstatsreceiver/config_test.go

Lines changed: 56 additions & 0 deletions
@@ -229,6 +229,62 @@ func TestLoadConfig(t *testing.T) {
 			},
 			expectedValidationErr: "for k8s.pod.cpu.node.utilization node setting is required. Check the readme on how to set the required setting",
 		},
+		{
+			id: component.NewIDWithName(metadata.Type, "container_memory_node_utilization"),
+			expected: &Config{
+				ControllerConfig: scraperhelper.ControllerConfig{
+					CollectionInterval: duration,
+					InitialDelay: time.Second,
+				},
+				ClientConfig: kube.ClientConfig{
+					APIConfig: k8sconfig.APIConfig{
+						AuthType: "tls",
+					},
+				},
+				MetricGroupsToCollect: []kubelet.MetricGroup{
+					kubelet.ContainerMetricGroup,
+					kubelet.PodMetricGroup,
+					kubelet.NodeMetricGroup,
+				},
+				MetricsBuilderConfig: metadata.MetricsBuilderConfig{
+					Metrics: metadata.MetricsConfig{
+						K8sContainerMemoryNodeUtilization: metadata.MetricConfig{
+							Enabled: true,
+						},
+					},
+					ResourceAttributes: metadata.DefaultResourceAttributesConfig(),
+				},
+			},
+			expectedValidationErr: "for k8s.container.memory.node.utilization node setting is required. Check the readme on how to set the required setting",
+		},
+		{
+			id: component.NewIDWithName(metadata.Type, "pod_memory_node_utilization"),
+			expected: &Config{
+				ControllerConfig: scraperhelper.ControllerConfig{
+					CollectionInterval: duration,
+					InitialDelay: time.Second,
+				},
+				ClientConfig: kube.ClientConfig{
+					APIConfig: k8sconfig.APIConfig{
+						AuthType: "tls",
+					},
+				},
+				MetricGroupsToCollect: []kubelet.MetricGroup{
+					kubelet.ContainerMetricGroup,
+					kubelet.PodMetricGroup,
+					kubelet.NodeMetricGroup,
+				},
+				MetricsBuilderConfig: metadata.MetricsBuilderConfig{
+					Metrics: metadata.MetricsConfig{
+						K8sPodMemoryNodeUtilization: metadata.MetricConfig{
+							Enabled: true,
+						},
+					},
+					ResourceAttributes: metadata.DefaultResourceAttributesConfig(),
+				},
+			},
+			expectedValidationErr: "for k8s.pod.memory.node.utilization node setting is required. Check the readme on how to set the required setting",
+		},
 	}

 	for _, tt := range tests {

receiver/kubeletstatsreceiver/documentation.md

Lines changed: 16 additions & 0 deletions
@@ -426,6 +426,14 @@ Container cpu utilization as a ratio of the container's requests
 | ---- | ----------- | ---------- |
 | 1 | Gauge | Double |

+### k8s.container.memory.node.utilization
+
+Container memory utilization as a ratio of the node's capacity
+
+| Unit | Metric Type | Value Type |
+| ---- | ----------- | ---------- |
+| 1 | Gauge | Double |
+
 ### k8s.container.memory_limit_utilization

 Container memory utilization as a ratio of the container's limits
@@ -490,6 +498,14 @@ Pod cpu utilization as a ratio of the pod's total container requests. If any con
 | ---- | ----------- | ---------- |
 | 1 | Gauge | Double |

+### k8s.pod.memory.node.utilization
+
+Pod memory utilization as a ratio of the node's capacity
+
+| Unit | Metric Type | Value Type |
+| ---- | ----------- | ---------- |
+| 1 | Gauge | Double |
+
 ### k8s.pod.memory_limit_utilization

 Pod memory utilization as a ratio of the pod's total container limits. If any container is missing a limit the metric is not emitted.
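As a worked example of the new ratios, using the numbers from the manual test in the commit message: a Pod using roughly 0.8G of memory on a node with about 32.5G of capacity reports `k8s.pod.memory.node.utilization ≈ 0.8 / 32.5 ≈ 0.025`.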

receiver/kubeletstatsreceiver/internal/kubelet/accumulator.go

Lines changed: 5 additions & 5 deletions
@@ -57,7 +57,7 @@ func (a *metricDataAccumulator) nodeStats(s stats.NodeStats) {
 	currentTime := pcommon.NewTimestampFromTime(a.time)
 	addUptimeMetric(a.mbs.NodeMetricsBuilder, metadata.NodeUptimeMetrics.Uptime, s.StartTime, currentTime)
 	addCPUMetrics(a.mbs.NodeMetricsBuilder, metadata.NodeCPUMetrics, s.CPU, currentTime, resources{}, 0)
-	addMemoryMetrics(a.mbs.NodeMetricsBuilder, metadata.NodeMemoryMetrics, s.Memory, currentTime, resources{})
+	addMemoryMetrics(a.mbs.NodeMetricsBuilder, metadata.NodeMemoryMetrics, s.Memory, currentTime, resources{}, 0)
 	addFilesystemMetrics(a.mbs.NodeMetricsBuilder, metadata.NodeFilesystemMetrics, s.Fs, currentTime)
 	addNetworkMetrics(a.mbs.NodeMetricsBuilder, metadata.NodeNetworkMetrics, s.Network, currentTime)
 	// todo s.Runtime.ImageFs
@@ -76,8 +76,8 @@ func (a *metricDataAccumulator) podStats(s stats.PodStats) {

 	currentTime := pcommon.NewTimestampFromTime(a.time)
 	addUptimeMetric(a.mbs.PodMetricsBuilder, metadata.PodUptimeMetrics.Uptime, s.StartTime, currentTime)
-	addCPUMetrics(a.mbs.PodMetricsBuilder, metadata.PodCPUMetrics, s.CPU, currentTime, a.metadata.podResources[s.PodRef.UID], a.metadata.cpuNodeLimit)
-	addMemoryMetrics(a.mbs.PodMetricsBuilder, metadata.PodMemoryMetrics, s.Memory, currentTime, a.metadata.podResources[s.PodRef.UID])
+	addCPUMetrics(a.mbs.PodMetricsBuilder, metadata.PodCPUMetrics, s.CPU, currentTime, a.metadata.podResources[s.PodRef.UID], a.metadata.nodeCapacity.CPUCapacity)
+	addMemoryMetrics(a.mbs.PodMetricsBuilder, metadata.PodMemoryMetrics, s.Memory, currentTime, a.metadata.podResources[s.PodRef.UID], a.metadata.nodeCapacity.MemoryCapacity)
 	addFilesystemMetrics(a.mbs.PodMetricsBuilder, metadata.PodFilesystemMetrics, s.EphemeralStorage, currentTime)
 	addNetworkMetrics(a.mbs.PodMetricsBuilder, metadata.PodNetworkMetrics, s.Network, currentTime)

@@ -110,8 +110,8 @@ func (a *metricDataAccumulator) containerStats(sPod stats.PodStats, s stats.Cont
 	currentTime := pcommon.NewTimestampFromTime(a.time)
 	resourceKey := sPod.PodRef.UID + s.Name
 	addUptimeMetric(a.mbs.ContainerMetricsBuilder, metadata.ContainerUptimeMetrics.Uptime, s.StartTime, currentTime)
-	addCPUMetrics(a.mbs.ContainerMetricsBuilder, metadata.ContainerCPUMetrics, s.CPU, currentTime, a.metadata.containerResources[resourceKey], a.metadata.cpuNodeLimit)
-	addMemoryMetrics(a.mbs.ContainerMetricsBuilder, metadata.ContainerMemoryMetrics, s.Memory, currentTime, a.metadata.containerResources[resourceKey])
+	addCPUMetrics(a.mbs.ContainerMetricsBuilder, metadata.ContainerCPUMetrics, s.CPU, currentTime, a.metadata.containerResources[resourceKey], a.metadata.nodeCapacity.CPUCapacity)
+	addMemoryMetrics(a.mbs.ContainerMetricsBuilder, metadata.ContainerMemoryMetrics, s.Memory, currentTime, a.metadata.containerResources[resourceKey], a.metadata.nodeCapacity.MemoryCapacity)
 	addFilesystemMetrics(a.mbs.ContainerMetricsBuilder, metadata.ContainerFilesystemMetrics, s.Rootfs, currentTime)

 	a.m = append(a.m, a.mbs.ContainerMetricsBuilder.Emit(

receiver/kubeletstatsreceiver/internal/kubelet/accumulator_test.go

Lines changed: 4 additions & 4 deletions
@@ -53,7 +53,7 @@ func TestMetadataErrorCases(t *testing.T) {
 						},
 					},
 				},
-			}, NodeLimits{}, nil),
+			}, NodeCapacity{}, nil),
 			testScenario: func(acc metricDataAccumulator) {
 				now := metav1.Now()
 				podStats := stats.PodStats{
@@ -79,7 +79,7 @@ func TestMetadataErrorCases(t *testing.T) {
 			metricGroupsToCollect: map[MetricGroup]bool{
 				VolumeMetricGroup: true,
 			},
-			metadata: NewMetadata([]MetadataLabel{MetadataLabelVolumeType}, nil, NodeLimits{}, nil),
+			metadata: NewMetadata([]MetadataLabel{MetadataLabelVolumeType}, nil, NodeCapacity{}, nil),
 			testScenario: func(acc metricDataAccumulator) {
 				podStats := stats.PodStats{
 					PodRef: stats.PodReference{
@@ -121,7 +121,7 @@ func TestMetadataErrorCases(t *testing.T) {
 						},
 					},
 				},
-			}, NodeLimits{}, nil),
+			}, NodeCapacity{}, nil),
 			testScenario: func(acc metricDataAccumulator) {
 				podStats := stats.PodStats{
 					PodRef: stats.PodReference{
@@ -165,7 +165,7 @@ func TestMetadataErrorCases(t *testing.T) {
 						},
 					},
 				},
-			}, NodeLimits{}, nil),
+			}, NodeCapacity{}, nil),
 			detailedPVCLabelsSetterOverride: func(*metadata.ResourceBuilder, string, string, string) error {
 				// Mock failure cases.
 				return errors.New("")

receiver/kubeletstatsreceiver/internal/kubelet/mem.go

Lines changed: 10 additions & 1 deletion
@@ -10,7 +10,13 @@ import (
 	"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/kubeletstatsreceiver/internal/metadata"
 )

-func addMemoryMetrics(mb *metadata.MetricsBuilder, memoryMetrics metadata.MemoryMetrics, s *stats.MemoryStats, currentTime pcommon.Timestamp, r resources) {
+func addMemoryMetrics(
+	mb *metadata.MetricsBuilder,
+	memoryMetrics metadata.MemoryMetrics,
+	s *stats.MemoryStats,
+	currentTime pcommon.Timestamp,
+	r resources,
+	nodeMemoryLimit float64) {
 	if s == nil {
 		return
 	}
@@ -29,5 +35,8 @@ func addMemoryMetrics(mb *metadata.MetricsBuilder, memoryMetrics metadata.Memory
 		if r.memoryRequest > 0 {
 			memoryMetrics.RequestUtilization(mb, currentTime, float64(*s.UsageBytes)/float64(r.memoryRequest))
 		}
+		if nodeMemoryLimit > 0 {
+			memoryMetrics.NodeUtilization(mb, currentTime, float64(*s.UsageBytes)/nodeMemoryLimit)
+		}
 	}
 }

receiver/kubeletstatsreceiver/internal/kubelet/metadata.go

Lines changed: 9 additions & 6 deletions
@@ -52,7 +52,7 @@ type Metadata struct {
 	DetailedPVCResourceSetter func(rb *metadata.ResourceBuilder, volCacheID, volumeClaim, namespace string) error
 	podResources map[string]resources
 	containerResources map[string]resources
-	cpuNodeLimit float64
+	nodeCapacity NodeCapacity
 }

 type resources struct {
@@ -62,9 +62,12 @@ type resources struct {
 	memoryLimit int64
 }

-type NodeLimits struct {
-	Name string
-	CPUNanoCoresLimit float64
+type NodeCapacity struct {
+	Name string
+	// node's CPU capacity in cores
+	CPUCapacity float64
+	// node's Memory capacity in bytes
+	MemoryCapacity float64
 }

 func getContainerResources(r *v1.ResourceRequirements) resources {
@@ -80,15 +83,15 @@ func getContainerResources(r *v1.ResourceRequirements) resources {
 	}
 }

-func NewMetadata(labels []MetadataLabel, podsMetadata *v1.PodList, nodeResourceLimits NodeLimits,
+func NewMetadata(labels []MetadataLabel, podsMetadata *v1.PodList, nodeCap NodeCapacity,
 	detailedPVCResourceSetter func(rb *metadata.ResourceBuilder, volCacheID, volumeClaim, namespace string) error) Metadata {
 	m := Metadata{
 		Labels: getLabelsMap(labels),
 		PodsMetadata: podsMetadata,
 		DetailedPVCResourceSetter: detailedPVCResourceSetter,
 		podResources: make(map[string]resources),
 		containerResources: make(map[string]resources),
-		cpuNodeLimit: nodeResourceLimits.CPUNanoCoresLimit,
+		nodeCapacity: nodeCap,
 	}

 	if podsMetadata != nil {
