Skip to content

Commit 4644bfb

Browse files
authored
[Feature] Force delete Pods which are stuck in init phase (#1181)
1 parent 2d005bf commit 4644bfb

File tree

3 files changed

+62
-1
lines changed

3 files changed

+62
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
- (Improvement) Use inspector for ArangoMember
1919
- (DebugPackage) Collect logs from pods
2020
- (Bugfix) Move Agency CommitIndex log message to Trace
21+
- (Feature) Force delete Pods which are stuck in init phase
2122

2223
## [1.2.20](https://github.com/arangodb/kube-arangodb/tree/1.2.20) (2022-10-25)
2324
- (Feature) Add action progress

pkg/deployment/resources/pod_inspector.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ const (
5656
// we will mark the pod as scheduled for termination
5757
recheckSoonPodInspectorInterval = util.Interval(time.Second) // Time between Pod inspection if we think something will change soon
5858
maxPodInspectorInterval = util.Interval(time.Hour) // Maximum time between Pod inspection (if nothing else happens)
59+
forcePodDeletionGracePeriod = 15 * time.Minute
5960
)
6061

6162
func (r *Resources) handleRestartedPod(pod *core.Pod, memberStatus *api.MemberStatus, wasTerminated, markAsTerminated *bool) {
@@ -396,12 +397,30 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
396397

397398
// Check if any additional deletion request is required
398399
if !k8sutil.IsPodAlive(pod) {
400+
var gps int64 = 10
401+
402+
forceDelete := false
403+
if t := k8sutil.PodStopTime(pod); !t.IsZero() {
404+
if time.Since(t) > forcePodDeletionGracePeriod {
405+
forceDelete = true
406+
}
407+
} else if t := pod.DeletionTimestamp; t != nil {
408+
if time.Since(t.Time) > forcePodDeletionGracePeriod {
409+
forceDelete = true
410+
}
411+
}
412+
413+
if forceDelete {
414+
gps = 0
415+
log.Str("pod-name", pod.GetName()).Warn("Enforcing deletion of Pod")
416+
}
417+
399418
// Pod is dead, but still not removed. Send additional deletion request
400419
nctx, c := globals.GetGlobals().Timeouts().Kubernetes().WithTimeout(ctx)
401420
defer c()
402421

403422
if err := cachedStatus.PodsModInterface().V1().Delete(nctx, pod.GetName(), meta.DeleteOptions{
404-
GracePeriodSeconds: util.NewInt64(10),
423+
GracePeriodSeconds: util.NewInt64(gps),
405424
Preconditions: meta.NewUIDPreconditions(string(pod.GetUID())),
406425
}); err != nil {
407426
if kerrors.IsNotFound(err) {

pkg/util/k8sutil/pods.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,47 @@ func IsContainerAlive(container core.ContainerStatus) bool {
384384
return container.State.Running != nil
385385
}
386386

387+
// PodStopTime returns time when pod has been stopped
388+
func PodStopTime(pod *core.Pod) time.Time {
389+
var t time.Time
390+
391+
if q := ContainersRecentStopTime(pod.Status.ContainerStatuses); q.After(t) {
392+
t = q
393+
}
394+
395+
if q := ContainersRecentStopTime(pod.Status.InitContainerStatuses); q.After(t) {
396+
t = q
397+
}
398+
399+
if q := ContainersRecentStopTime(pod.Status.EphemeralContainerStatuses); q.After(t) {
400+
t = q
401+
}
402+
403+
return t
404+
}
405+
406+
// ContainersRecentStopTime returns most recent termination time of pods
407+
func ContainersRecentStopTime(containers []core.ContainerStatus) time.Time {
408+
var t time.Time
409+
410+
for _, c := range containers {
411+
if v := ContainerStopTime(c); v.After(t) {
412+
t = v
413+
}
414+
}
415+
416+
return t
417+
}
418+
419+
// ContainerStopTime returns time of the Container stop. If container is running, time.Zero is returned
420+
func ContainerStopTime(container core.ContainerStatus) time.Time {
421+
if p := container.State.Terminated; p != nil {
422+
return p.FinishedAt.Time
423+
}
424+
425+
return time.Time{}
426+
}
427+
387428
// ClusterJWTVolumeMount creates a volume mount structure for a cluster JWT secret (token).
388429
func ClusterJWTVolumeMount() core.VolumeMount {
389430
return core.VolumeMount{

0 commit comments

Comments
 (0)