Skip to content

Commit 00fe12d

Browse files
BinaryFissionGamesdjaglowski
authored and committed
[receiver/hostmetrics] Divide by logical cores when calculating process.cpu.utilization (open-telemetry#31378)
**Description:** When calculating the process.cpu.utilization metric, values over 1 were possible since the number of cores was not taken into account (a single process may run on multiple logical cores, thus effectively multiplying the maximum amount of CPU time the process may take). This PR adds a division by the number of logical cores to the calculation for cpu utilization. **Link to tracking Issue:** Closes open-telemetry#31368 **Testing:** * Added some unit tests * Tested locally on my system with the program I posted in the issue: ```json { "name": "process.cpu.utilization", "description": "Percentage of total CPU time used by the process since last scrape, expressed as a value between 0 and 1. On the first scrape, no data point is emitted for this metric.", "unit": "1", "gauge": { "dataPoints": [ { "attributes": [{ "key": "state", "value": { "stringValue": "user" } }], "startTimeUnixNano": "1708562810521000000", "timeUnixNano": "1708562890771386000", "asDouble": 0.8811268516953904 }, { "attributes": [ { "key": "state", "value": { "stringValue": "system" } } ], "startTimeUnixNano": "1708562810521000000", "timeUnixNano": "1708562890771386000", "asDouble": 0.0029471002907659667 }, { "attributes": [{ "key": "state", "value": { "stringValue": "wait" } }], "startTimeUnixNano": "1708562810521000000", "timeUnixNano": "1708562890771386000", "asDouble": 0 } ] } } ``` In activity monitor, this process was clocking in around ~1000% - ~1100% cpu, on my machine that has 12 logical cores. So the value of around 90% total utilization seems correct here. **Documentation:** N/A --------- Co-authored-by: Daniel Jaglowski <[email protected]>
1 parent 9ab861f commit 00fe12d

File tree

6 files changed

+163
-12
lines changed

6 files changed

+163
-12
lines changed
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: bug_fix
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: hostmetricsreceiver
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: Adds the receiver.hostmetrics.normalizeProcessCPUUtilization feature gate to optionally normalize process.cpu.utilization values.
11+
12+
subtext: >
13+
When enabled, the receiver.hostmetrics.normalizeProcessCPUUtilization feature gate will cause process.cpu.utilization
14+
values to be divided by the number of logical cores on the system. This is necessary to produce a value on the interval of
15+
[0-1], as the description of the process.cpu.utilization metric says.
16+
17+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
18+
issues: [31368]

receiver/hostmetricsreceiver/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,3 +192,18 @@ Currently, the hostmetrics receiver does not set any Resource attributes on the
192192
export OTEL_RESOURCE_ATTRIBUTES="service.name=<the name of your service>,service.namespace=<the namespace of your service>,service.instance.id=<uuid of the instance>"
193193
```
194194
195+
## Feature Gates
196+
197+
See the [Collector feature gates](https://github.com/open-telemetry/opentelemetry-collector/blob/main/featuregate/README.md#collector-feature-gates) for an overview of feature gates in the collector.
198+
199+
### `receiver.hostmetrics.normalizeProcessCPUUtilization`
200+
201+
When enabled, normalizes the `process.cpu.utilization` metric onto the interval [0-1] by dividing the value by the number of logical processors. With this feature gate disabled, the value of the `process.cpu.utilization` metric may exceed 1.
202+
203+
For example, if you have 4 logical cores on your system, and a process is occupying 2 logical cores for an entire scrape interval, with this feature gate disabled a `process.cpu.utilization` metric will be emitted with a value of 2. If this feature gate is enabled in the same scenario, the value of the emitted metric will be 0.5.
204+
205+
The schedule for this feature gate is:
206+
- Introduced in v0.97.0 (March 2024) as `alpha` - disabled by default.
207+
- Moved to `beta` in v0.99.0 (April 2024) - enabled by default.
208+
- Moved to `stable` in v0.101.0 (May 2024) - cannot be disabled.
209+
- Removed three releases after `stable`.

receiver/hostmetricsreceiver/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ require (
1515
go.opentelemetry.io/collector/component v0.96.1-0.20240306115632-b2693620eff6
1616
go.opentelemetry.io/collector/confmap v0.96.1-0.20240306115632-b2693620eff6
1717
go.opentelemetry.io/collector/consumer v0.96.1-0.20240306115632-b2693620eff6
18+
go.opentelemetry.io/collector/featuregate v1.3.1-0.20240306115632-b2693620eff6
1819
go.opentelemetry.io/collector/otelcol v0.96.1-0.20240306115632-b2693620eff6
1920
go.opentelemetry.io/collector/pdata v1.3.1-0.20240306115632-b2693620eff6
2021
go.opentelemetry.io/collector/receiver v0.96.1-0.20240306115632-b2693620eff6
@@ -101,7 +102,6 @@ require (
101102
go.opentelemetry.io/collector/connector v0.96.1-0.20240306115632-b2693620eff6 // indirect
102103
go.opentelemetry.io/collector/exporter v0.96.1-0.20240306115632-b2693620eff6 // indirect
103104
go.opentelemetry.io/collector/extension v0.96.1-0.20240306115632-b2693620eff6 // indirect
104-
go.opentelemetry.io/collector/featuregate v1.3.1-0.20240306115632-b2693620eff6 // indirect
105105
go.opentelemetry.io/collector/processor v0.96.1-0.20240306115632-b2693620eff6 // indirect
106106
go.opentelemetry.io/collector/service v0.96.1-0.20240306115632-b2693620eff6 // indirect
107107
go.opentelemetry.io/contrib/config v0.4.0 // indirect

receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"time"
1212

1313
"github.com/shirou/gopsutil/v3/common"
14+
"github.com/shirou/gopsutil/v3/cpu"
1415
"github.com/shirou/gopsutil/v3/process"
1516
"go.opentelemetry.io/collector/component"
1617
"go.opentelemetry.io/collector/pdata/pcommon"
@@ -48,6 +49,7 @@ type scraper struct {
4849
excludeFS filterset.FilterSet
4950
scrapeProcessDelay time.Duration
5051
ucals map[int32]*ucal.CPUUtilizationCalculator
52+
logicalCores int
5153

5254
// for mocking
5355
getProcessCreateTime func(p processHandle, ctx context.Context) (int64, error)
@@ -84,6 +86,13 @@ func newProcessScraper(settings receiver.CreateSettings, cfg *Config) (*scraper,
8486
}
8587
}
8688

89+
logicalCores, err := cpu.Counts(true)
90+
if err != nil {
91+
return nil, fmt.Errorf("error getting number of logical cores: %w", err)
92+
}
93+
94+
scraper.logicalCores = logicalCores
95+
8796
return scraper, nil
8897
}
8998

@@ -284,7 +293,7 @@ func (s *scraper) scrapeAndAppendCPUTimeMetric(ctx context.Context, now pcommon.
284293
s.ucals[pid] = &ucal.CPUUtilizationCalculator{}
285294
}
286295

287-
err = s.ucals[pid].CalculateAndRecord(now, times, s.recordCPUUtilization)
296+
err = s.ucals[pid].CalculateAndRecord(now, s.logicalCores, times, s.recordCPUUtilization)
288297
return err
289298
}
290299

receiver/hostmetricsreceiver/internal/scraper/processscraper/ucal/cpu_utilization_calculator.go

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,18 @@ import (
77
"time"
88

99
"github.com/shirou/gopsutil/v3/cpu"
10+
"go.opentelemetry.io/collector/featuregate"
1011
"go.opentelemetry.io/collector/pdata/pcommon"
1112
)
1213

14+
var normalizeProcessCPUUtilizationFeatureGate = featuregate.GlobalRegistry().MustRegister(
15+
"receiver.hostmetrics.normalizeProcessCPUUtilization",
16+
featuregate.StageAlpha,
17+
featuregate.WithRegisterDescription("When enabled, normalizes the process.cpu.utilization metric onto the interval [0-1] by dividing the value by the number of logical processors."),
18+
featuregate.WithRegisterFromVersion("v0.97.0"),
19+
featuregate.WithRegisterReferenceURL("https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/31368"),
20+
)
21+
1322
// CPUUtilization stores the utilization percents [0-1] for the different cpu states
1423
type CPUUtilization struct {
1524
User float64
@@ -27,9 +36,9 @@ type CPUUtilizationCalculator struct {
2736
// CalculateAndRecord calculates the cpu utilization for the different cpu states comparing previously
2837
// stored []cpu.TimesStat and time.Time and current []cpu.TimesStat and current time.Time
2938
// If no previous data is stored, the recorder is not called and no error is returned
30-
func (c *CPUUtilizationCalculator) CalculateAndRecord(now pcommon.Timestamp, currentCPUStats *cpu.TimesStat, recorder func(pcommon.Timestamp, CPUUtilization)) error {
39+
func (c *CPUUtilizationCalculator) CalculateAndRecord(now pcommon.Timestamp, logicalCores int, currentCPUStats *cpu.TimesStat, recorder func(pcommon.Timestamp, CPUUtilization)) error {
3140
if c.previousCPUStats != nil {
32-
recorder(now, cpuUtilization(c.previousCPUStats, c.previousReadTime, currentCPUStats, now))
41+
recorder(now, cpuUtilization(logicalCores, c.previousCPUStats, c.previousReadTime, currentCPUStats, now))
3342
}
3443
c.previousCPUStats = currentCPUStats
3544
c.previousReadTime = now
@@ -38,14 +47,26 @@ func (c *CPUUtilizationCalculator) CalculateAndRecord(now pcommon.Timestamp, cur
3847
}
3948

4049
// cpuUtilization calculates the difference between 2 cpu.TimesStat using spent time between them
41-
func cpuUtilization(startStats *cpu.TimesStat, startTime pcommon.Timestamp, endStats *cpu.TimesStat, endTime pcommon.Timestamp) CPUUtilization {
50+
func cpuUtilization(logicalCores int, startStats *cpu.TimesStat, startTime pcommon.Timestamp, endStats *cpu.TimesStat, endTime pcommon.Timestamp) CPUUtilization {
4251
elapsedTime := time.Duration(endTime - startTime).Seconds()
4352
if elapsedTime <= 0 {
4453
return CPUUtilization{}
4554
}
55+
56+
userUtilization := (endStats.User - startStats.User) / elapsedTime
57+
systemUtilization := (endStats.System - startStats.System) / elapsedTime
58+
ioWaitUtilization := (endStats.Iowait - startStats.Iowait) / elapsedTime
59+
60+
if normalizeProcessCPUUtilizationFeatureGate.IsEnabled() && logicalCores > 0 {
61+
// Normalize onto the [0-1] interval by dividing by the number of logical cores
62+
userUtilization /= float64(logicalCores)
63+
systemUtilization /= float64(logicalCores)
64+
ioWaitUtilization /= float64(logicalCores)
65+
}
66+
4667
return CPUUtilization{
47-
User: (endStats.User - startStats.User) / elapsedTime,
48-
System: (endStats.System - startStats.System) / elapsedTime,
49-
Iowait: (endStats.Iowait - startStats.Iowait) / elapsedTime,
68+
User: userUtilization,
69+
System: systemUtilization,
70+
Iowait: ioWaitUtilization,
5071
}
5172
}

receiver/hostmetricsreceiver/internal/scraper/processscraper/ucal/cpu_utilization_calculator_test.go

Lines changed: 92 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99

1010
"github.com/shirou/gopsutil/v3/cpu"
1111
"github.com/stretchr/testify/assert"
12+
"github.com/stretchr/testify/require"
13+
"go.opentelemetry.io/collector/featuregate"
1214
"go.opentelemetry.io/collector/pdata/pcommon"
1315
)
1416

@@ -24,21 +26,25 @@ func TestCpuUtilizationCalculator_Calculate(t *testing.T) {
2426
t.Parallel()
2527
testCases := []struct {
2628
name string
29+
logicalCores int
2730
currentReadTime pcommon.Timestamp
2831
currentCPUStat *cpu.TimesStat
2932
previousReadTime pcommon.Timestamp
3033
previousCPUStat *cpu.TimesStat
3134
expectedUtilization *CPUUtilization
35+
normalize bool
3236
}{
3337
{
34-
name: "no previous times",
38+
name: "no previous times",
39+
logicalCores: 1,
3540
currentCPUStat: &cpu.TimesStat{
3641
User: 8260.4,
3742
},
3843
expectedUtilization: &CPUUtilization{},
3944
},
4045
{
4146
name: "no delta time should return utilization=0",
47+
logicalCores: 1,
4248
previousReadTime: 1640097430772858000,
4349
currentReadTime: 1640097430772858000,
4450
previousCPUStat: &cpu.TimesStat{
@@ -51,6 +57,71 @@ func TestCpuUtilizationCalculator_Calculate(t *testing.T) {
5157
},
5258
{
5359
name: "one second time delta",
60+
logicalCores: 1,
61+
previousReadTime: 1640097430772858000,
62+
currentReadTime: 1640097431772858000,
63+
previousCPUStat: &cpu.TimesStat{
64+
User: 8258.4,
65+
System: 6193.3,
66+
Iowait: 34.201,
67+
},
68+
currentCPUStat: &cpu.TimesStat{
69+
User: 8258.5,
70+
System: 6193.6,
71+
Iowait: 34.202,
72+
},
73+
expectedUtilization: &CPUUtilization{
74+
User: 0.1,
75+
System: 0.3,
76+
Iowait: 0.001,
77+
},
78+
},
79+
{
80+
name: "one second time delta, 2 logical cores, normalized",
81+
logicalCores: 2,
82+
previousReadTime: 1640097430772858000,
83+
currentReadTime: 1640097431772858000,
84+
previousCPUStat: &cpu.TimesStat{
85+
User: 8258.4,
86+
System: 6193.3,
87+
Iowait: 34.201,
88+
},
89+
currentCPUStat: &cpu.TimesStat{
90+
User: 8258.5,
91+
System: 6193.6,
92+
Iowait: 34.202,
93+
},
94+
expectedUtilization: &CPUUtilization{
95+
User: 0.05,
96+
System: 0.15,
97+
Iowait: 0.0005,
98+
},
99+
normalize: true,
100+
},
101+
{
102+
name: "one second time delta, 2 logical cores, not normalized",
103+
logicalCores: 2,
104+
previousReadTime: 1640097430772858000,
105+
currentReadTime: 1640097431772858000,
106+
previousCPUStat: &cpu.TimesStat{
107+
User: 8258.4,
108+
System: 6193.3,
109+
Iowait: 34.201,
110+
},
111+
currentCPUStat: &cpu.TimesStat{
112+
User: 8258.5,
113+
System: 6193.6,
114+
Iowait: 34.202,
115+
},
116+
expectedUtilization: &CPUUtilization{
117+
User: 0.1,
118+
System: 0.3,
119+
Iowait: 0.001,
120+
},
121+
},
122+
{
123+
name: "0 logical cores",
124+
logicalCores: 0,
54125
previousReadTime: 1640097430772858000,
55126
currentReadTime: 1640097431772858000,
56127
previousCPUStat: &cpu.TimesStat{
@@ -73,13 +144,13 @@ func TestCpuUtilizationCalculator_Calculate(t *testing.T) {
73144
for _, test := range testCases {
74145
test := test
75146
t.Run(test.name, func(t *testing.T) {
76-
t.Parallel()
147+
setNormalizeProcessCPUUtilizationFeatureGate(t, test.normalize)
77148
recorder := inMemoryRecorder{}
78149
calculator := CPUUtilizationCalculator{
79150
previousReadTime: test.previousReadTime,
80151
previousCPUStats: test.previousCPUStat,
81152
}
82-
err := calculator.CalculateAndRecord(test.currentReadTime, test.currentCPUStat, recorder.record)
153+
err := calculator.CalculateAndRecord(test.currentReadTime, test.logicalCores, test.currentCPUStat, recorder.record)
83154
assert.NoError(t, err)
84155
assert.InDelta(t, test.expectedUtilization.System, recorder.cpuUtilization.System, 0.00001)
85156
assert.InDelta(t, test.expectedUtilization.User, recorder.cpuUtilization.User, 0.00001)
@@ -108,9 +179,26 @@ func Test_cpuUtilization(t *testing.T) {
108179
Iowait: 0.024,
109180
}
110181

111-
actualUtilization := cpuUtilization(startStat, startTime, endStat, halfSecondLater)
182+
actualUtilization := cpuUtilization(1, startStat, startTime, endStat, halfSecondLater)
112183
assert.InDelta(t, expectedUtilization.User, actualUtilization.User, 0.00001)
113184
assert.InDelta(t, expectedUtilization.System, actualUtilization.System, 0.00001)
114185
assert.InDelta(t, expectedUtilization.Iowait, actualUtilization.Iowait, 0.00001)
115186

116187
}
188+
189+
func setNormalizeProcessCPUUtilizationFeatureGate(t *testing.T, val bool) {
190+
wasEnabled := normalizeProcessCPUUtilizationFeatureGate.IsEnabled()
191+
err := featuregate.GlobalRegistry().Set(
192+
normalizeProcessCPUUtilizationFeatureGate.ID(),
193+
val,
194+
)
195+
require.NoError(t, err)
196+
197+
t.Cleanup(func() {
198+
err := featuregate.GlobalRegistry().Set(
199+
normalizeProcessCPUUtilizationFeatureGate.ID(),
200+
wasEnabled,
201+
)
202+
require.NoError(t, err)
203+
})
204+
}

0 commit comments

Comments
 (0)