Skip to content

Commit 548b8e2

Browse files
authored
Jitter certificate lifetimes (#361)
* Log cert lifetime limit violations * Apply jitter to issued certificate durations * Document jitter * Changelog * Make jitter configurable * Satisfy the clippy one * Update test to allow room for jitter
1 parent ccf2bb0 commit 548b8e2

File tree

8 files changed

+92
-11
lines changed

8 files changed

+92
-11
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@ All notable changes to this project will be documented in this file.
1515
- `autoTLS` certificate authorities will now be rotated regularly ([#350]).
1616
- [BREAKING] This changes the format of the CA secrets. Old secrets will be migrated automatically, but manual intervention will be required to downgrade back to 23.11.x.
1717
- `autoTLS` certificate authority lifetimes are now configurable ([#357]).
18+
- Certificate lifetimes are now jittered ([#361]).
1819

1920
[#333]: https://github.com/stackabletech/secret-operator/pull/333
2021
[#341]: https://github.com/stackabletech/secret-operator/pull/341
2122
[#350]: https://github.com/stackabletech/secret-operator/pull/350
2223
[#352]: https://github.com/stackabletech/secret-operator/pull/352
2324
[#357]: https://github.com/stackabletech/secret-operator/pull/357
25+
[#361]: https://github.com/stackabletech/secret-operator/pull/361
2426

2527

2628
## [23.11.0] - 2023-11-24

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/modules/secret-operator/pages/secretclass.adoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ We have spent a considerate amount of time thinking about this issue and decided
5050
3. Pods consuming the certificate can request a longer lifetime or shutdown expiration buffer via annotations
5151
on the xref:volume.adoc[Volume]. If they request a longer lifetime than the SecretClass allows,
5252
it will be shortened to the maximum allowed lifetime.
53+
4. To avoid stampeding herds during restarts and spread out the load, certificate durations are lowered by up to 20%.
5354
4. The Pods will be evicted `6h` before the certificate expires, to ensure that no Pods are left running with expired secrets. Consumers can override this buffer using Volume annotations.
5455
This buffer must be long enough that the product is guaranteed to gracefully shut down.
5556

docs/modules/secret-operator/pages/volume.adoc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,18 @@ in a rolling fashion.
7979

8080
The format is documented in xref:concepts:duration.adoc[].
8181

82+
=== `secrets.stackable.tech/backend.autotls.cert.jitter-factor`
83+
84+
*Required*: false
85+
86+
*Default value*: `0.2`
87+
88+
*Backends*: xref:secretclass.adoc#backend-autotls[]
89+
90+
Up to this part of the Certificate's lifetime may be removed for jittering.
91+
92+
Must be within 0.0 and 1.0.
93+
8294
=== `secrets.stackable.tech/kerberos.service.names`
8395

8496
*Required*: false

rust/operator-binary/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ tonic.workspace = true
3939
tracing.workspace = true
4040
uuid.workspace = true
4141
yasna.workspace = true
42+
rand.workspace = true
4243

4344
[dev-dependencies]
4445
serde_yaml.workspace = true

rust/operator-binary/src/backend/mod.rs

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ pub mod scope;
88
pub mod tls;
99

1010
use async_trait::async_trait;
11-
use serde::{Deserialize, Deserializer};
11+
use serde::{de::Unexpected, Deserialize, Deserializer};
1212
use snafu::{OptionExt, Snafu};
1313
use stackable_operator::{
1414
k8s_openapi::chrono::{DateTime, FixedOffset},
@@ -26,10 +26,7 @@ use scope::SecretScope;
2626

2727
use crate::format::{SecretData, SecretFormat};
2828

29-
use self::{
30-
pod_info::SchedulingPodInfo,
31-
tls::{DEFAULT_CERT_LIFETIME, DEFAULT_CERT_RESTART_BUFFER},
32-
};
29+
use self::pod_info::SchedulingPodInfo;
3330

3431
/// Configuration provided by the `Volume` selecting what secret data should be provided
3532
///
@@ -111,14 +108,27 @@ pub struct SecretVolumeSelector {
111108
default = "default_cert_restart_buffer"
112109
)]
113110
pub autotls_cert_restart_buffer: Duration,
111+
112+
/// The part of the certificate's lifetime that may be removed for jittering.
113+
/// Must be within 0.0 and 1.0.
114+
#[serde(
115+
rename = "secrets.stackable.tech/backend.autotls.cert.jitter-factor",
116+
deserialize_with = "SecretVolumeSelector::deserialize_str_as_f64",
117+
default = "default_cert_jitter_factor"
118+
)]
119+
pub autotls_cert_jitter_factor: f64,
114120
}
115121

116122
fn default_cert_restart_buffer() -> Duration {
117-
DEFAULT_CERT_RESTART_BUFFER
123+
tls::DEFAULT_CERT_RESTART_BUFFER
118124
}
119125

120126
fn default_cert_lifetime() -> Duration {
121-
DEFAULT_CERT_LIFETIME
127+
tls::DEFAULT_CERT_LIFETIME
128+
}
129+
130+
fn default_cert_jitter_factor() -> f64 {
131+
tls::DEFAULT_CERT_JITTER_FACTOR
122132
}
123133

124134
#[derive(Snafu, Debug)]
@@ -183,6 +193,16 @@ impl SecretVolumeSelector {
183193
let full_str = String::deserialize(de)?;
184194
Ok(full_str.split(',').map(str::to_string).collect())
185195
}
196+
197+
fn deserialize_str_as_f64<'de, D: Deserializer<'de>>(de: D) -> Result<f64, D::Error> {
198+
let str = String::deserialize(de)?;
199+
str.parse().map_err(|_| {
200+
<D::Error as serde::de::Error>::invalid_value(
201+
Unexpected::Str(&str),
202+
&"a string containing a f64",
203+
)
204+
})
205+
}
186206
}
187207

188208
#[derive(Debug)]

rust/operator-binary/src/backend/tls/mod.rs

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//! Dynamically provisions TLS certificates
22
3-
use std::cmp::min;
3+
use std::ops::Range;
44

55
use async_trait::async_trait;
66
use openssl::{
@@ -19,6 +19,7 @@ use openssl::{
1919
X509Builder, X509NameBuilder,
2020
},
2121
};
22+
use rand::Rng;
2223
use snafu::{OptionExt, ResultExt, Snafu};
2324
use stackable_operator::{
2425
k8s_openapi::chrono::{self, FixedOffset, TimeZone},
@@ -62,6 +63,9 @@ pub const DEFAULT_CERT_LIFETIME: Duration = Duration::from_hours_unchecked(24);
6263
/// - which is the purpose of the buffer.
6364
pub const DEFAULT_CERT_RESTART_BUFFER: Duration = Duration::from_hours_unchecked(6);
6465

66+
/// We randomize the certificate lifetimes slightly, in order to avoid all pods of a set restarting/failing at the same time.
67+
pub const DEFAULT_CERT_JITTER_FACTOR: f64 = 0.2;
68+
6569
#[derive(Debug, Snafu)]
6670
pub enum Error {
6771
#[snafu(display("failed to get addresses for scope {scope}"))]
@@ -96,6 +100,9 @@ pub enum Error {
96100
expires_at: OffsetDateTime,
97101
restart_at: OffsetDateTime,
98102
},
103+
104+
#[snafu(display("invalid jitter factor {requested} requested, must be within {range:?}"))]
105+
JitterOutOfRange { requested: f64, range: Range<f64> },
99106
}
100107
type Result<T, E = Error> = std::result::Result<T, E>;
101108

@@ -116,6 +123,7 @@ impl SecretBackendError for Error {
116123
Error::SerializeCertificate { .. } => tonic::Code::FailedPrecondition,
117124
Error::InvalidCertLifetime { .. } => tonic::Code::Internal,
118125
Error::TooShortCertLifetimeRequiresTimeTravel { .. } => tonic::Code::InvalidArgument,
126+
Error::JitterOutOfRange { .. } => tonic::Code::InvalidArgument,
119127
}
120128
}
121129
}
@@ -178,7 +186,41 @@ impl SecretBackend for TlsGenerate {
178186

179187
// We need to check that the cert lifetime it is not longer than allowed,
180188
// by capping it to the maximum configured at the SecretClass.
181-
let cert_lifetime = min(cert_lifetime, self.max_cert_lifetime);
189+
let cert_lifetime = if cert_lifetime > self.max_cert_lifetime {
190+
tracing::info!(
191+
certificate.lifetime.requested = %cert_lifetime,
192+
certificate.lifetime.maximum = %self.max_cert_lifetime,
193+
certificate.lifetime = %self.max_cert_lifetime,
194+
"Pod requested a certificate to have a longer lifetime than the configured maximum, reducing",
195+
);
196+
self.max_cert_lifetime
197+
} else {
198+
cert_lifetime
199+
};
200+
201+
// Jitter the certificate lifetimes
202+
let jitter_factor_cap = selector.autotls_cert_jitter_factor;
203+
let jitter_factor_allowed_range = 0.0..1.0;
204+
if !jitter_factor_allowed_range.contains(&jitter_factor_cap) {
205+
return JitterOutOfRangeSnafu {
206+
requested: jitter_factor_cap,
207+
range: jitter_factor_allowed_range,
208+
}
209+
.fail();
210+
}
211+
let jitter_factor = rand::thread_rng().gen_range(0.0..jitter_factor_cap);
212+
let jitter_amount = Duration::from(cert_lifetime.mul_f64(jitter_factor));
213+
let unjittered_cert_lifetime = cert_lifetime;
214+
let cert_lifetime = cert_lifetime - jitter_amount;
215+
tracing::info!(
216+
certificate.lifetime.requested = %unjittered_cert_lifetime,
217+
certificate.lifetime.jitter = %jitter_amount,
218+
certificate.lifetime.jitter.factor = jitter_factor,
219+
certificate.lifetime.jitter.factor.cap = jitter_factor_cap,
220+
certificate.lifetime = %cert_lifetime,
221+
"Applying jitter to certificate lifetime",
222+
);
223+
182224
let not_after = now + cert_lifetime;
183225
let expire_pod_after = not_after - cert_restart_buffer;
184226
if expire_pod_after <= now {

tests/templates/kuttl/tls/consumer.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,16 @@ spec:
2626
notAfterDate=`date -d "${notAfter}" '+%s'`
2727
nowDate=`date '+%s'`
2828
diff="$((${notAfterDate}-${nowDate}))"
29-
if test "${diff}" -lt "$((71*3600))"; then echo "Cert had a lifetime of less than 71 hours!" && exit 1; fi
29+
# Allow for 20% jitter
30+
if test "${diff}" -lt "$((57*3600))"; then echo "Cert had a lifetime of less than 57 hours!" && exit 1; fi
3031
if test "${diff}" -gt "$((72*3600))"; then echo "Cert had a lifetime of greater than 72 hours!" && exit 1; fi
3132
3233
notAfter=`cat /stackable/tls-42h/tls.crt | openssl x509 -noout -enddate| sed -e 's#notAfter=##'`
3334
notAfterDate=`date -d "${notAfter}" '+%s'`
3435
nowDate=`date '+%s'`
3536
diff="$((${notAfterDate}-${nowDate}))"
36-
if test "${diff}" -lt "$((41*3600))"; then echo "Cert had a lifetime of less than 41 hours!" && exit 1; fi
37+
# Allow for 20% jitter
38+
if test "${diff}" -lt "$((33*3600))"; then echo "Cert had a lifetime of less than 33 hours!" && exit 1; fi
3739
if test "${diff}" -gt "$((42*3600))"; then echo "Cert had a lifetime of greater than 42 hours!" && exit 1; fi
3840
3941
cat /stackable/tls-3d/tls.crt | openssl x509 -noout -text | grep "DNS:my-tls-service.$NAMESPACE.svc.cluster.local"

0 commit comments

Comments
 (0)