Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Therefore, this application will not run into any issues if it is restarted, res
| METRICS | Expose metrics in Prometheus format at `:${METRICS_PORT}/metrics` | no | `""` |
| SLOW_MODE | If enabled, every time a node is terminated during an execution, the current execution will stop rather than continuing to the next ASG | no | `false` |
| EAGER_CORDONING | If enabled, all outdated nodes will get cordoned before any rolling update action. The default mode is to cordon a node just before draining it. See [#41](https://github.com/TwiN/aws-eks-asg-rolling-update-handler/issues/41) for possible consequences of enabling this. | no | `false` |
| EXCLUDE_FROM_EXTERNAL_LOAD_BALANCERS | If enabled, node label `node.kubernetes.io/exclude-from-external-load-balancers=true` will be added to nodes before draining. See [#131](https://github.com/TwiN/aws-eks-asg-rolling-update-handler/pull/131) for more information | no | `false` |

**NOTE:** Only one of `CLUSTER_NAME`, `AUTODISCOVERY_TAGS` or `AUTO_SCALING_GROUP_NAMES` must be set.

Expand Down
88 changes: 46 additions & 42 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,48 +12,51 @@ import (
var cfg *config

const (
EnvEnvironment = "ENVIRONMENT"
EnvDebug = "DEBUG"
EnvIgnoreDaemonSets = "IGNORE_DAEMON_SETS"
EnvDeleteLocalData = "DELETE_LOCAL_DATA" // Deprecated: in favor of DeleteEmptyDirData (DELETE_EMPTY_DIR_DATA)
EnvDeleteEmptyDirData = "DELETE_EMPTY_DIR_DATA"
EnvClusterName = "CLUSTER_NAME"
EnvAutodiscoveryTags = "AUTODISCOVERY_TAGS"
EnvAutoScalingGroupNames = "AUTO_SCALING_GROUP_NAMES"
EnvAwsRegion = "AWS_REGION"
EnvExecutionInterval = "EXECUTION_INTERVAL"
EnvExecutionTimeout = "EXECUTION_TIMEOUT"
EnvPodTerminationGracePeriod = "POD_TERMINATION_GRACE_PERIOD"
EnvMetrics = "METRICS"
EnvMetricsPort = "METRICS_PORT"
EnvSlowMode = "SLOW_MODE"
EnvEagerCordoning = "EAGER_CORDONING"
EnvEnvironment = "ENVIRONMENT"
EnvDebug = "DEBUG"
EnvIgnoreDaemonSets = "IGNORE_DAEMON_SETS"
EnvDeleteLocalData = "DELETE_LOCAL_DATA" // Deprecated: in favor of DeleteEmptyDirData (DELETE_EMPTY_DIR_DATA)
EnvDeleteEmptyDirData = "DELETE_EMPTY_DIR_DATA"
EnvClusterName = "CLUSTER_NAME"
EnvAutodiscoveryTags = "AUTODISCOVERY_TAGS"
EnvAutoScalingGroupNames = "AUTO_SCALING_GROUP_NAMES"
EnvAwsRegion = "AWS_REGION"
EnvExecutionInterval = "EXECUTION_INTERVAL"
EnvExecutionTimeout = "EXECUTION_TIMEOUT"
EnvPodTerminationGracePeriod = "POD_TERMINATION_GRACE_PERIOD"
EnvMetrics = "METRICS"
EnvMetricsPort = "METRICS_PORT"
EnvSlowMode = "SLOW_MODE"
EnvEagerCordoning = "EAGER_CORDONING"
EnvExcludeFromExternalLoadBalancers = "EXLUDE_FROM_EXTERNAL_LOAD_BALANCERS"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just noticed you made a typo in the environment variable 😂

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolved by 3b8647b

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh damn it :D thanks for fixing this!

)

type config struct {
Environment string // Optional
Debug bool // Defaults to false
AutoScalingGroupNames []string // Required if AutodiscoveryTags not provided
AutodiscoveryTags string // Required if AutoScalingGroupNames not provided
AwsRegion string // Defaults to us-west-2
IgnoreDaemonSets bool // Defaults to true
DeleteEmptyDirData bool // Defaults to true
ExecutionInterval time.Duration // Defaults to 20s
ExecutionTimeout time.Duration // Defaults to 900s
PodTerminationGracePeriod int // Defaults to -1
Metrics bool // Defaults to false
MetricsPort int // Defaults to 8080
SlowMode bool // Defaults to false
EagerCordoning bool // Defaults to false
Environment string // Optional
Debug bool // Defaults to false
AutoScalingGroupNames []string // Required if AutodiscoveryTags not provided
AutodiscoveryTags string // Required if AutoScalingGroupNames not provided
AwsRegion string // Defaults to us-west-2
IgnoreDaemonSets bool // Defaults to true
DeleteEmptyDirData bool // Defaults to true
ExecutionInterval time.Duration // Defaults to 20s
ExecutionTimeout time.Duration // Defaults to 900s
PodTerminationGracePeriod int // Defaults to -1
Metrics bool // Defaults to false
MetricsPort int // Defaults to 8080
SlowMode bool // Defaults to false
EagerCordoning bool // Defaults to false
ExcludeFromExternalLoadBalancers bool // Defaults to false
}

// Initialize is used to initialize the application's configuration
func Initialize() error {
cfg = &config{
Environment: strings.ToLower(os.Getenv(EnvEnvironment)),
Debug: strings.ToLower(os.Getenv(EnvDebug)) == "true",
SlowMode: strings.ToLower(os.Getenv(EnvSlowMode)) == "true",
EagerCordoning: strings.ToLower(os.Getenv(EnvEagerCordoning)) == "true",
Environment: strings.ToLower(os.Getenv(EnvEnvironment)),
Debug: strings.ToLower(os.Getenv(EnvDebug)) == "true",
SlowMode: strings.ToLower(os.Getenv(EnvSlowMode)) == "true",
EagerCordoning: strings.ToLower(os.Getenv(EnvEagerCordoning)) == "true",
ExcludeFromExternalLoadBalancers: strings.ToLower(os.Getenv(EnvExcludeFromExternalLoadBalancers)) == "true",
}
if clusterName := os.Getenv(EnvClusterName); len(clusterName) > 0 {
// See "Prerequisites" in https://docs.aws.amazon.com/eks/latest/userguide/autoscaling.html
Expand Down Expand Up @@ -135,21 +138,22 @@ func Initialize() error {

// Set sets the application's configuration and is intended to be used for testing purposes.
// See Initialize() for production
func Set(autoScalingGroupNames []string, ignoreDaemonSets, deleteEmptyDirData, eagerCordoning bool) {
func Set(autoScalingGroupNames []string, ignoreDaemonSets, deleteEmptyDirData, eagerCordoning bool, excludeFromExternalLoadBalancers bool) {
cfg = &config{
AutoScalingGroupNames: autoScalingGroupNames,
IgnoreDaemonSets: ignoreDaemonSets,
DeleteEmptyDirData: deleteEmptyDirData,
EagerCordoning: eagerCordoning,
ExecutionInterval: time.Second * 20,
ExecutionTimeout: time.Second * 900,
AutoScalingGroupNames: autoScalingGroupNames,
IgnoreDaemonSets: ignoreDaemonSets,
DeleteEmptyDirData: deleteEmptyDirData,
EagerCordoning: eagerCordoning,
ExcludeFromExternalLoadBalancers: excludeFromExternalLoadBalancers,
ExecutionInterval: time.Second * 20,
ExecutionTimeout: time.Second * 900,
}
}

func Get() *config {
if cfg == nil {
log.Println("Config wasn't initialized prior to being called. Assuming this is a test.")
Set(nil, true, true, false)
Set(nil, true, true, false, false)
}
return cfg
}
2 changes: 1 addition & 1 deletion config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func TestInitialize_withMissingRequiredValues(t *testing.T) {
}

func TestSet(t *testing.T) {
Set([]string{"asg-a", "asg-b", "asg-c"}, true, true, false)
Set([]string{"asg-a", "asg-b", "asg-c"}, true, true, false, false)
config := Get()
if len(config.AutoScalingGroupNames) != 3 {
t.Error()
Expand Down
4 changes: 3 additions & 1 deletion k8s/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
"github.com/TwiN/gocache/v2"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/autoscaling"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/kubectl/pkg/drain"
Expand All @@ -20,6 +20,8 @@ const (
AnnotationRollingUpdateDrainedTimestamp = "aws-eks-asg-rolling-update-handler.twin.sh/drained-at"
AnnotationRollingUpdateTerminatedTimestamp = "aws-eks-asg-rolling-update-handler.twin.sh/terminated-at"

LabelExcludeFromExternalLoadBalancers = "node.kubernetes.io/exclude-from-external-load-balancers"

nodesCacheKey = "nodes"
)

Expand Down
20 changes: 19 additions & 1 deletion k8s/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import (
"log"

"github.com/aws/aws-sdk-go/service/autoscaling"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
)

// CheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes calculates the resources available in the target nodes
Expand Down Expand Up @@ -103,3 +103,21 @@ func AnnotateNodeByAutoScalingInstance(client ClientAPI, instance *autoscaling.I
}
return nil
}

// Label Node adds an Label to the Kubernetes node represented by a given AWS instance
func LabelNodeByAutoScalingInstance(client ClientAPI, instance *autoscaling.Instance, key, value string) error {
node, err := client.GetNodeByAutoScalingInstance(instance)
if err != nil {
return err
}
labels := node.GetLabels()
if currentValue := labels[key]; currentValue != value {
labels[key] = value
node.SetLabels(labels)
err = client.UpdateNode(node)
if err != nil {
return err
}
}
return nil
}
3 changes: 2 additions & 1 deletion k8stest/k8stest.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/autoscaling"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
Expand Down Expand Up @@ -100,6 +100,7 @@ func CreateTestNode(name, availabilityZone, instanceId, allocatableCpu, allocata
}
node.SetName(name)
node.SetAnnotations(make(map[string]string))
node.SetLabels(make(map[string]string))
return node
}

Expand Down
4 changes: 4 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@ func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, au
if hasEnoughResources {
log.Printf("[%s][%s] Updated nodes have enough resources available", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId))
if minutesSinceDrained == -1 {
if config.Get().ExcludeFromExternalLoadBalancers {
log.Printf("[%s][%s] Label node to exlude from external load balancers", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId))
k8s.LabelNodeByAutoScalingInstance(client, outdatedInstance, k8s.LabelExcludeFromExternalLoadBalancers, "true")
}
log.Printf("[%s][%s] Draining node", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId))
err := client.Drain(node.Name, config.Get().IgnoreDaemonSets, config.Get().DeleteEmptyDirData, config.Get().PodTerminationGracePeriod)
if err != nil {
Expand Down
97 changes: 94 additions & 3 deletions main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -683,8 +683,8 @@ func TestHasAcceptableNumberOfUpdatedNonReadyNodes(t *testing.T) {
}

func TestHandleRollingUpgrade_withEagerCordoning(t *testing.T) {
config.Set(nil, true, true, true)
defer config.Set(nil, true, true, false)
config.Set(nil, true, true, true, false)
defer config.Set(nil, true, true, false, false)

oldInstance1 := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService")
oldInstance2 := cloudtest.CreateTestAutoScalingInstance("old-2", "v1", nil, "InService")
Expand Down Expand Up @@ -718,7 +718,8 @@ func TestHandleRollingUpgrade_withEagerCordoning(t *testing.T) {

func TestHandleRollingUpgrade_withEagerCordoningDisabled(t *testing.T) {
// explicitly setting this, but eager cordoning is disabled by default anyways
config.Set(nil, true, true, false)
config.Set(nil, true, true, false, true)
defer config.Set(nil, true, true, true, false)

oldInstance1 := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService")
oldInstance2 := cloudtest.CreateTestAutoScalingInstance("old-2", "v1", nil, "InService")
Expand Down Expand Up @@ -746,3 +747,93 @@ func TestHandleRollingUpgrade_withEagerCordoningDisabled(t *testing.T) {
t.Error("Eager cordoning is not enabled, so no node should have been cordoned on the first execution")
}
}

func TestHandleRollingUpgrade_withExcludeFromExternalLoadBalancers(t *testing.T) {
config.Set(nil, true, true, false, true)
defer config.Set(nil, true, true, false, false)

oldInstance := cloudtest.CreateTestAutoScalingInstance("old-1", "v1", nil, "InService")
asg := cloudtest.CreateTestAutoScalingGroup("asg", "v2", nil, []*autoscaling.Instance{oldInstance}, false)

oldNode := k8stest.CreateTestNode("old-node-1", aws.StringValue(oldInstance.AvailabilityZone), aws.StringValue(oldInstance.InstanceId), "1000m", "1000Mi")
oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "100m", "100Mi", false, v1.PodRunning)

mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod})
mockEc2Service := cloudtest.NewMockEC2Service(nil)
mockAutoScalingService := cloudtest.NewMockAutoScalingService([]*autoscaling.Group{asg})

// First run (Node rollout process gets marked as started)
err := HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg})
if err != nil {
t.Error("unexpected error:", err)
}
if mockClient.Counter["UpdateNode"] != 1 {
t.Error("Node should've been annotated, meaning that UpdateNode should've been called once")
}

// Second run (ASG's desired capacity gets increased)
err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg})
if err != nil {
t.Error("unexpected error:", err)
}
if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 {
t.Error("ASG should've been increased because there's no updated nodes yet")
}
asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)]
if aws.Int64Value(asg.DesiredCapacity) != 2 {
t.Error("The desired capacity of the ASG should've been increased to 2")
}

// Third run (Nothing changed)
err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg})
if err != nil {
t.Error("unexpected error:", err)
}
if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 {
t.Error("Desired capacity shouldn't have been updated")
}
asg = mockAutoScalingService.AutoScalingGroups[aws.StringValue(asg.AutoScalingGroupName)]
if aws.Int64Value(asg.DesiredCapacity) != 2 {
t.Error("The desired capacity of the ASG should've stayed at 2")
}

// Fourth run (new instance has been registered to ASG, but is pending)
newInstance := cloudtest.CreateTestAutoScalingInstance("new-1", "v2", nil, "Pending")
asg.Instances = append(asg.Instances, newInstance)
err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg})
if err != nil {
t.Error("unexpected error:", err)
}
if mockAutoScalingService.Counter["SetDesiredCapacity"] != 1 {
t.Error("Desired capacity shouldn't have been updated")
}

// Fifth run (new instance is now InService, but node has still not joined cluster (GetNodeByAutoScalingInstance should return not found))
newInstance.SetLifecycleState("InService")
err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg})
if err != nil {
t.Error("unexpected error:", err)
}

// Sixth run (new instance has joined the cluster, but Kubelet isn't ready to accept pods yet)
newNode := k8stest.CreateTestNode("new-node-1", aws.StringValue(newInstance.AvailabilityZone), aws.StringValue(newInstance.InstanceId), "1000m", "1000Mi")
newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}}
mockClient.Nodes[newNode.Name] = newNode
err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg})
if err != nil {
t.Error("unexpected error:", err)
}

// Seventh run (Kubelet is ready to accept new pods. Old node gets drained and terminated)
newNode = mockClient.Nodes[newNode.Name]
newNode.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}}
mockClient.Nodes[newNode.Name] = newNode
err = HandleRollingUpgrade(mockClient, mockEc2Service, mockAutoScalingService, []*autoscaling.Group{asg})
if err != nil {
t.Error("unexpected error:", err)
}
oldNode = mockClient.Nodes[oldNode.Name]
if _, ok := oldNode.GetLabels()[k8s.LabelExcludeFromExternalLoadBalancers]; !ok {
t.Error("Node should've been labeled")
}
}