diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f9be01e..665fa3cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,12 +15,14 @@ All notable changes to this project will be documented in this file. - `autoTLS` certificate authorities will now be rotated regularly ([#350]). - [BREAKING] This changes the format of the CA secrets. Old secrets will be migrated automatically, but manual intervention will be required to downgrade back to 23.11.x. - `autoTLS` certificate authority lifetimes are now configurable ([#357]). +- Certificate lifetimes are now jittered ([#361]). [#333]: https://github.com/stackabletech/secret-operator/pull/333 [#341]: https://github.com/stackabletech/secret-operator/pull/341 [#350]: https://github.com/stackabletech/secret-operator/pull/350 [#352]: https://github.com/stackabletech/secret-operator/pull/352 [#357]: https://github.com/stackabletech/secret-operator/pull/357 +[#361]: https://github.com/stackabletech/secret-operator/pull/361 ## [23.11.0] - 2023-11-24 diff --git a/Cargo.lock b/Cargo.lock index cda70396..ffdbea00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2568,6 +2568,7 @@ dependencies = [ "pin-project", "prost", "prost-types", + "rand", "serde", "serde_json", "serde_yaml", diff --git a/docs/modules/secret-operator/pages/secretclass.adoc b/docs/modules/secret-operator/pages/secretclass.adoc index 8ed859c5..faf6295c 100644 --- a/docs/modules/secret-operator/pages/secretclass.adoc +++ b/docs/modules/secret-operator/pages/secretclass.adoc @@ -50,6 +50,7 @@ We have spent a considerate amount of time thinking about this issue and decided 3. Pods consuming the certificate can request a longer lifetime or shutdown expiration buffer via annotations on the xref:volume.adoc[Volume]. If they request a longer lifetime than the SecretClass allows, it will be shortened to the maximum allowed lifetime. +4. To avoid stampeding herds during restarts and spread out the load, certificate durations are lowered by up to 20%. 4. The Pods will be evicted `6h` before the certificate expires, to ensure that no Pods are left running with expired secrets. Consumers can override this buffer using Volume annotations. This buffer must be long enough that the product is guaranteed to gracefully shut down. diff --git a/docs/modules/secret-operator/pages/volume.adoc b/docs/modules/secret-operator/pages/volume.adoc index fe31b68e..4ee9c21f 100644 --- a/docs/modules/secret-operator/pages/volume.adoc +++ b/docs/modules/secret-operator/pages/volume.adoc @@ -79,6 +79,18 @@ in a rolling fashion. The format is documented in xref:concepts:duration.adoc[]. +=== `secrets.stackable.tech/backend.autotls.cert.jitter-factor` + +*Required*: false + +*Default value*: `0.2` + +*Backends*: xref:secretclass.adoc#backend-autotls[] + +Up to this part of the Certificate's lifetime may be removed for jittering. + +Must be within 0.0 and 1.0. + === `secrets.stackable.tech/kerberos.service.names` *Required*: false diff --git a/rust/operator-binary/Cargo.toml b/rust/operator-binary/Cargo.toml index 026dd2e0..3058dbe7 100644 --- a/rust/operator-binary/Cargo.toml +++ b/rust/operator-binary/Cargo.toml @@ -39,6 +39,7 @@ tonic.workspace = true tracing.workspace = true uuid.workspace = true yasna.workspace = true +rand.workspace = true [dev-dependencies] serde_yaml.workspace = true diff --git a/rust/operator-binary/src/backend/mod.rs b/rust/operator-binary/src/backend/mod.rs index 491e59e3..b53a071f 100644 --- a/rust/operator-binary/src/backend/mod.rs +++ b/rust/operator-binary/src/backend/mod.rs @@ -8,7 +8,7 @@ pub mod scope; pub mod tls; use async_trait::async_trait; -use serde::{Deserialize, Deserializer}; +use serde::{de::Unexpected, Deserialize, Deserializer}; use snafu::{OptionExt, Snafu}; use stackable_operator::{ k8s_openapi::chrono::{DateTime, FixedOffset}, @@ -26,10 +26,7 @@ use scope::SecretScope; use crate::format::{SecretData, SecretFormat}; -use self::{ - pod_info::SchedulingPodInfo, - tls::{DEFAULT_CERT_LIFETIME, DEFAULT_CERT_RESTART_BUFFER}, -}; +use self::pod_info::SchedulingPodInfo; /// Configuration provided by the `Volume` selecting what secret data should be provided /// @@ -111,14 +108,27 @@ pub struct SecretVolumeSelector { default = "default_cert_restart_buffer" )] pub autotls_cert_restart_buffer: Duration, + + /// The part of the certificate's lifetime that may be removed for jittering. + /// Must be within 0.0 and 1.0. + #[serde( + rename = "secrets.stackable.tech/backend.autotls.cert.jitter-factor", + deserialize_with = "SecretVolumeSelector::deserialize_str_as_f64", + default = "default_cert_jitter_factor" + )] + pub autotls_cert_jitter_factor: f64, } fn default_cert_restart_buffer() -> Duration { - DEFAULT_CERT_RESTART_BUFFER + tls::DEFAULT_CERT_RESTART_BUFFER } fn default_cert_lifetime() -> Duration { - DEFAULT_CERT_LIFETIME + tls::DEFAULT_CERT_LIFETIME +} + +fn default_cert_jitter_factor() -> f64 { + tls::DEFAULT_CERT_JITTER_FACTOR } #[derive(Snafu, Debug)] @@ -183,6 +193,16 @@ impl SecretVolumeSelector { let full_str = String::deserialize(de)?; Ok(full_str.split(',').map(str::to_string).collect()) } + + fn deserialize_str_as_f64<'de, D: Deserializer<'de>>(de: D) -> Result { + let str = String::deserialize(de)?; + str.parse().map_err(|_| { + ::invalid_value( + Unexpected::Str(&str), + &"a string containing a f64", + ) + }) + } } #[derive(Debug)] diff --git a/rust/operator-binary/src/backend/tls/mod.rs b/rust/operator-binary/src/backend/tls/mod.rs index e058e9c3..5d238e49 100644 --- a/rust/operator-binary/src/backend/tls/mod.rs +++ b/rust/operator-binary/src/backend/tls/mod.rs @@ -1,6 +1,6 @@ //! Dynamically provisions TLS certificates -use std::cmp::min; +use std::ops::Range; use async_trait::async_trait; use openssl::{ @@ -19,6 +19,7 @@ use openssl::{ X509Builder, X509NameBuilder, }, }; +use rand::Rng; use snafu::{OptionExt, ResultExt, Snafu}; use stackable_operator::{ k8s_openapi::chrono::{self, FixedOffset, TimeZone}, @@ -62,6 +63,9 @@ pub const DEFAULT_CERT_LIFETIME: Duration = Duration::from_hours_unchecked(24); /// - which is the purpose of the buffer. pub const DEFAULT_CERT_RESTART_BUFFER: Duration = Duration::from_hours_unchecked(6); +/// We randomize the certificate lifetimes slightly, in order to avoid all pods of a set restarting/failing at the same time. +pub const DEFAULT_CERT_JITTER_FACTOR: f64 = 0.2; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("failed to get addresses for scope {scope}"))] @@ -96,6 +100,9 @@ pub enum Error { expires_at: OffsetDateTime, restart_at: OffsetDateTime, }, + + #[snafu(display("invalid jitter factor {requested} requested, must be within {range:?}"))] + JitterOutOfRange { requested: f64, range: Range }, } type Result = std::result::Result; @@ -116,6 +123,7 @@ impl SecretBackendError for Error { Error::SerializeCertificate { .. } => tonic::Code::FailedPrecondition, Error::InvalidCertLifetime { .. } => tonic::Code::Internal, Error::TooShortCertLifetimeRequiresTimeTravel { .. } => tonic::Code::InvalidArgument, + Error::JitterOutOfRange { .. } => tonic::Code::InvalidArgument, } } } @@ -178,7 +186,41 @@ impl SecretBackend for TlsGenerate { // We need to check that the cert lifetime it is not longer than allowed, // by capping it to the maximum configured at the SecretClass. - let cert_lifetime = min(cert_lifetime, self.max_cert_lifetime); + let cert_lifetime = if cert_lifetime > self.max_cert_lifetime { + tracing::info!( + certificate.lifetime.requested = %cert_lifetime, + certificate.lifetime.maximum = %self.max_cert_lifetime, + certificate.lifetime = %self.max_cert_lifetime, + "Pod requested a certificate to have a longer lifetime than the configured maximum, reducing", + ); + self.max_cert_lifetime + } else { + cert_lifetime + }; + + // Jitter the certificate lifetimes + let jitter_factor_cap = selector.autotls_cert_jitter_factor; + let jitter_factor_allowed_range = 0.0..1.0; + if !jitter_factor_allowed_range.contains(&jitter_factor_cap) { + return JitterOutOfRangeSnafu { + requested: jitter_factor_cap, + range: jitter_factor_allowed_range, + } + .fail(); + } + let jitter_factor = rand::thread_rng().gen_range(0.0..jitter_factor_cap); + let jitter_amount = Duration::from(cert_lifetime.mul_f64(jitter_factor)); + let unjittered_cert_lifetime = cert_lifetime; + let cert_lifetime = cert_lifetime - jitter_amount; + tracing::info!( + certificate.lifetime.requested = %unjittered_cert_lifetime, + certificate.lifetime.jitter = %jitter_amount, + certificate.lifetime.jitter.factor = jitter_factor, + certificate.lifetime.jitter.factor.cap = jitter_factor_cap, + certificate.lifetime = %cert_lifetime, + "Applying jitter to certificate lifetime", + ); + let not_after = now + cert_lifetime; let expire_pod_after = not_after - cert_restart_buffer; if expire_pod_after <= now { diff --git a/tests/templates/kuttl/tls/consumer.yaml b/tests/templates/kuttl/tls/consumer.yaml index c5892397..1b168f6e 100644 --- a/tests/templates/kuttl/tls/consumer.yaml +++ b/tests/templates/kuttl/tls/consumer.yaml @@ -26,14 +26,16 @@ spec: notAfterDate=`date -d "${notAfter}" '+%s'` nowDate=`date '+%s'` diff="$((${notAfterDate}-${nowDate}))" - if test "${diff}" -lt "$((71*3600))"; then echo "Cert had a lifetime of less than 71 hours!" && exit 1; fi + # Allow for 20% jitter + if test "${diff}" -lt "$((57*3600))"; then echo "Cert had a lifetime of less than 57 hours!" && exit 1; fi if test "${diff}" -gt "$((72*3600))"; then echo "Cert had a lifetime of greater than 72 hours!" && exit 1; fi notAfter=`cat /stackable/tls-42h/tls.crt | openssl x509 -noout -enddate| sed -e 's#notAfter=##'` notAfterDate=`date -d "${notAfter}" '+%s'` nowDate=`date '+%s'` diff="$((${notAfterDate}-${nowDate}))" - if test "${diff}" -lt "$((41*3600))"; then echo "Cert had a lifetime of less than 41 hours!" && exit 1; fi + # Allow for 20% jitter + if test "${diff}" -lt "$((33*3600))"; then echo "Cert had a lifetime of less than 33 hours!" && exit 1; fi if test "${diff}" -gt "$((42*3600))"; then echo "Cert had a lifetime of greater than 42 hours!" && exit 1; fi cat /stackable/tls-3d/tls.crt | openssl x509 -noout -text | grep "DNS:my-tls-service.$NAMESPACE.svc.cluster.local"