
Commit 3192ea5

Remove remote client of insecurely setup cluster
1 parent e9b3abf commit 3192ea5

3 files changed: +193 −0 lines changed


pkg/controller/admissionchecks/multikueue/multikueuecluster.go

Lines changed: 1 addition & 0 deletions
@@ -424,6 +424,7 @@ func (c *clustersReconciler) Reconcile(ctx context.Context, req reconcile.Request
 	err = validateKubeconfig(kubeConfig)
 	if err != nil {
 		log.Error(err, "validating kubeconfig failed")
+		c.stopAndRemoveCluster(req.Name)
 		if updateErr := c.updateStatus(ctx, cluster, false, "InsecureKubeConfig", fmt.Sprintf("insecure kubeconfig: %v", err)); updateErr != nil {
 			return reconcile.Result{}, fmt.Errorf("failed to update MultiKueueCluster status: %w after detecting insecure kubeconfig: %w", updateErr, err)
 		}
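
The one-line addition above makes the reconciler tear down any previously established remote client before it marks the cluster inactive, so a cluster whose kubeconfig turns insecure does not keep a live connection to the worker. Below is a minimal, self-contained sketch of that stop-and-remove pattern; the struct layout, field names, and locking are illustrative assumptions, not Kueue's actual implementation of stopAndRemoveCluster.

package main

import (
	"context"
	"sync"
)

// remoteClient stands in for a per-cluster client; only the cancel function
// of its watch context matters for this sketch.
type remoteClient struct {
	watchCancel context.CancelFunc
}

// clustersReconciler keeps one remote client per MultiKueueCluster name.
type clustersReconciler struct {
	lock          sync.Mutex
	remoteClients map[string]*remoteClient
}

// stopAndRemoveCluster cancels the client's watches (if one exists) and drops
// the map entry, so later reconciles start from a clean state.
func (c *clustersReconciler) stopAndRemoveCluster(clusterName string) {
	c.lock.Lock()
	defer c.lock.Unlock()
	if rc, found := c.remoteClients[clusterName]; found {
		rc.watchCancel()
		delete(c.remoteClients, clusterName)
	}
}

func main() {
	_, cancel := context.WithCancel(context.Background())
	r := &clustersReconciler{remoteClients: map[string]*remoteClient{
		"worker1": {watchCancel: cancel},
	}}
	r.stopAndRemoveCluster("worker1") // after this, "worker1" has no live client
}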

pkg/controller/admissionchecks/multikueue/multikueuecluster_test.go

Lines changed: 25 additions & 0 deletions
@@ -424,6 +424,31 @@ func TestUpdateConfig(t *testing.T) {
 			},
 			wantErr: fmt.Errorf("validating kubeconfig failed: %w", errors.New("tokenFile is not allowed")),
 		},
+		"remove client with invalid kubeconfig": {
+			reconcileFor: "worker1",
+			clusters: []kueue.MultiKueueCluster{
+				*utiltestingapi.MakeMultiKueueCluster("worker1").
+					KubeConfig(kueue.SecretLocationType, "worker1").
+					Generation(1).
+					Obj(),
+			},
+			secrets: []corev1.Secret{
+				makeTestSecret("worker1", testKubeconfigInsecure("worker1", ptr.To("/path/to/tokenfile"))),
+			},
+			remoteClients: map[string]*remoteClient{
+				"worker1": newTestClient(ctx, "worker1 old kubeconfig", cancelCalled),
+			},
+			wantClusters: []kueue.MultiKueueCluster{
+				*utiltestingapi.MakeMultiKueueCluster("worker1").
+					KubeConfig(kueue.SecretLocationType, "worker1").
+					Active(metav1.ConditionFalse, "InsecureKubeConfig", "insecure kubeconfig: tokenFile is not allowed", 1).
+					Generation(1).
+					Obj(),
+			},
+			wantRemoteClients: map[string]*remoteClient{},
+			wantCancelCalled: 1,
+			wantErr: fmt.Errorf("validating kubeconfig failed: %w", errors.New("tokenFile is not allowed")),
+		},
 		"skip insecure kubeconfig validation": {
 			reconcileFor: "worker1",
 			clusters: []kueue.MultiKueueCluster{
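
The new "remove client with invalid kubeconfig" case feeds the reconciler a secret whose kubeconfig uses a tokenFile and expects both the Active=False condition and an empty remote-client map. For orientation, here is a hedged sketch of the kind of check that would produce the error strings asserted in this test and in the integration test below ("tokenFile is not allowed", "certificate-authority file paths are not allowed ..."); validateKubeconfigSketch is an illustrative stand-in, not Kueue's actual validateKubeconfig.

package main

import (
	"errors"
	"fmt"

	clientcmdapi "k8s.io/client-go/tools/clientcmd/api"
)

// validateKubeconfigSketch rejects credentials that reference files on the
// manager's filesystem, since a hostile kubeconfig could otherwise make the
// controller read arbitrary paths from its own pod.
func validateKubeconfigSketch(cfg *clientcmdapi.Config) error {
	for _, auth := range cfg.AuthInfos {
		if auth.TokenFile != "" {
			return errors.New("tokenFile is not allowed")
		}
	}
	for name, cluster := range cfg.Clusters {
		if cluster.CertificateAuthority != "" {
			return fmt.Errorf("certificate-authority file paths are not allowed, use certificate-authority-data for cluster %s", name)
		}
	}
	return nil
}

func main() {
	cfg := clientcmdapi.NewConfig()
	cfg.AuthInfos["default"] = &clientcmdapi.AuthInfo{TokenFile: "/path/to/tokenfile"}
	fmt.Println(validateKubeconfigSketch(cfg)) // prints: tokenFile is not allowed
}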

test/integration/multikueue/setup_test.go

Lines changed: 167 additions & 0 deletions
@@ -27,15 +27,18 @@ import (
 	"github.com/onsi/gomega"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/tools/clientcmd"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/manager"
 
 	config "sigs.k8s.io/kueue/apis/config/v1beta2"
 	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta2"
+	workloadjob "sigs.k8s.io/kueue/pkg/controller/jobs/job"
 	"sigs.k8s.io/kueue/pkg/features"
 	utiltesting "sigs.k8s.io/kueue/pkg/util/testing"
 	utiltestingapi "sigs.k8s.io/kueue/pkg/util/testing/v1beta2"
+	testingjob "sigs.k8s.io/kueue/pkg/util/testingjobs/job"
 	"sigs.k8s.io/kueue/test/util"
 )
 
@@ -722,4 +725,168 @@ var _ = ginkgo.Describe("MultiKueue", ginkgo.Ordered, ginkgo.ContinueOnFailure,
 			})
 		})
 	})
+
+	ginkgo.It("Should properly detect insecure kubeconfig of MultiKueueClusters and remove remote client", func() {
+		var w1KubeconfigInvalidBytes []byte
+		ginkgo.By("Create a kubeconfig with an invalid certificate authority path", func() {
+			cfg, err := worker1TestCluster.kubeConfigBytes()
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+			w1KubeconfigInvalid, err := clientcmd.Load(cfg)
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+			gomega.Expect(w1KubeconfigInvalid).NotTo(gomega.BeNil())
+
+			w1KubeconfigInvalid.Clusters["default-cluster"].CertificateAuthority = "/some/random/path"
+			w1KubeconfigInvalidBytes, err = clientcmd.Write(*w1KubeconfigInvalid)
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+		})
+
+		secret := &corev1.Secret{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: "testing-secret",
+				Namespace: managersConfigNamespace.Name,
+			},
+			Data: map[string][]byte{
+				kueue.MultiKueueConfigSecretKey: w1KubeconfigInvalidBytes,
+			},
+		}
+
+		ginkgo.By("creating the secret, with insecure kubeconfig", func() {
+			gomega.Expect(managerTestCluster.client.Create(managerTestCluster.ctx, secret)).Should(gomega.Succeed())
+			ginkgo.DeferCleanup(func() error { return managerTestCluster.client.Delete(managerTestCluster.ctx, secret) })
+		})
+
+		secretKey := client.ObjectKeyFromObject(secret)
+		clusterKey := client.ObjectKeyFromObject(workerCluster1)
+		acKey := client.ObjectKeyFromObject(multiKueueAC)
+		cqKey := client.ObjectKeyFromObject(managerCq)
+
+		ginkgo.By("updating the cluster, the worker1 cluster becomes inactive", func() {
+			updatedCluster := kueue.MultiKueueCluster{}
+			ginkgo.By("updating the cluster spec", func() {
+				gomega.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, clusterKey, &updatedCluster)).To(gomega.Succeed())
+				updatedCluster.Spec.KubeConfig.LocationType = kueue.SecretLocationType
+				updatedCluster.Spec.KubeConfig.Location = secret.Name
+				gomega.Expect(managerTestCluster.client.Update(managerTestCluster.ctx, &updatedCluster)).To(gomega.Succeed())
+			})
+
+			ginkgo.By("wait for the status update", func() {
+				gomega.Eventually(func(g gomega.Gomega) {
+					g.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, clusterKey, &updatedCluster)).To(gomega.Succeed())
+					g.Expect(updatedCluster.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(metav1.Condition{
+						Type: kueue.MultiKueueClusterActive,
+						Status: metav1.ConditionFalse,
+						Reason: "InsecureKubeConfig",
+						Message: "insecure kubeconfig: certificate-authority file paths are not allowed, use certificate-authority-data for cluster default-cluster",
+					}, util.IgnoreConditionTimestampsAndObservedGeneration)))
+				}, util.Timeout, util.Interval).Should(gomega.Succeed())
+
+				gomega.Eventually(func(g gomega.Gomega) {
+					g.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, acKey, multiKueueAC)).To(gomega.Succeed())
+					g.Expect(multiKueueAC.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(metav1.Condition{
+						Type: kueue.AdmissionCheckActive,
+						Status: metav1.ConditionTrue,
+						Reason: "SomeActiveClusters",
+						Message: "Inactive clusters: [worker1]",
+					}, util.IgnoreConditionTimestampsAndObservedGeneration)))
+				}, util.Timeout, util.Interval).Should(gomega.Succeed())
+
+				gomega.Eventually(func(g gomega.Gomega) {
+					g.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, cqKey, managerCq)).To(gomega.Succeed())
+					g.Expect(managerCq.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(metav1.Condition{
+						Type: kueue.ClusterQueueActive,
+						Status: metav1.ConditionTrue,
+						Reason: "Ready",
+						Message: "Can admit new workloads",
+					}, util.IgnoreConditionTimestampsAndObservedGeneration)))
+				}, util.Timeout, util.Interval).Should(gomega.Succeed())
+			})
+		})
+
+		job := testingjob.MakeJob("job1", managerNs.Name).
+			Queue(kueue.LocalQueueName(managerLq.Name)).
+			Obj()
+		ginkgo.By("create a job and reserve quota, only existing remoteClients are part of the NominatedClusterNames", func() {
+			util.MustCreate(managerTestCluster.ctx, managerTestCluster.client, job)
+			wlLookupKey := types.NamespacedName{Name: workloadjob.GetWorkloadNameForJob(job.Name, job.UID), Namespace: managerNs.Name}
+
+			ginkgo.By("setting workload reservation in the management cluster", func() {
+				admission := utiltestingapi.MakeAdmission(managerCq.Name).Obj()
+				util.SetQuotaReservation(managerTestCluster.ctx, managerTestCluster.client, wlLookupKey, admission)
+			})
+
+			ginkgo.By("verify remote clients are managed correctly: worker1 was removed and worker2 is still active", func() {
+				managerWl := &kueue.Workload{}
+				gomega.Eventually(func(g gomega.Gomega) {
+					g.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, wlLookupKey, managerWl)).To(gomega.Succeed())
+					g.Expect(managerWl.Status.NominatedClusterNames).NotTo(gomega.ContainElements(workerCluster1.Name))
+					g.Expect(managerWl.Status.NominatedClusterNames).To(gomega.ContainElements(workerCluster2.Name))
+				}, util.Timeout, util.Interval).Should(gomega.Succeed())
+			})
+
+			ginkgo.By("unsetting workload reservation in the management cluster to cleanup workloads", func() {
+				util.SetQuotaReservation(managerTestCluster.ctx, managerTestCluster.client, wlLookupKey, nil)
+			})
+		})
+
+		ginkgo.By("updating the secret with a valid kubeconfig", func() {
+			w1Kubeconfig, err := worker1TestCluster.kubeConfigBytes()
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+			updatedSecret := &corev1.Secret{}
+			gomega.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, secretKey, updatedSecret)).To(gomega.Succeed())
+			updatedSecret.Data[kueue.MultiKueueConfigSecretKey] = w1Kubeconfig
+			gomega.Expect(managerTestCluster.client.Update(managerTestCluster.ctx, updatedSecret)).To(gomega.Succeed())
+		})
+
+		ginkgo.By("the worker1 cluster becomes active", func() {
+			cluster := kueue.MultiKueueCluster{}
+			gomega.EventuallyWithOffset(1, func(g gomega.Gomega) {
+				g.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, clusterKey, &cluster)).To(gomega.Succeed())
+				g.Expect(cluster.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(metav1.Condition{
+					Type: kueue.MultiKueueClusterActive,
+					Status: metav1.ConditionTrue,
+					Reason: "Active",
+					Message: "Connected",
+				}, util.IgnoreConditionTimestampsAndObservedGeneration)))
+			}, util.Timeout, util.Interval).Should(gomega.Succeed())
+
+			ac := kueue.AdmissionCheck{}
+			gomega.EventuallyWithOffset(1, func(g gomega.Gomega) {
+				g.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, acKey, &ac)).To(gomega.Succeed())
+				g.Expect(ac.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(metav1.Condition{
+					Type: kueue.AdmissionCheckActive,
+					Status: metav1.ConditionTrue,
+					Reason: "Active",
+					Message: "The admission check is active",
+				}, util.IgnoreConditionTimestampsAndObservedGeneration)))
+			}, util.Timeout, util.Interval).Should(gomega.Succeed())
+
+			cq := &kueue.ClusterQueue{}
+			gomega.EventuallyWithOffset(1, func(g gomega.Gomega) {
+				g.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, cqKey, cq)).To(gomega.Succeed())
+				g.Expect(cq.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(metav1.Condition{
+					Type: kueue.ClusterQueueActive,
+					Status: metav1.ConditionTrue,
+					Reason: "Ready",
+					Message: "Can admit new workloads",
+				}, util.IgnoreConditionTimestampsAndObservedGeneration)))
+			}, util.Timeout, util.Interval).Should(gomega.Succeed())
+		})
+
+		ginkgo.By("reserve quota again and verify that existing remoteClients are part of the NominatedClusterNames", func() {
+			wlLookupKey := types.NamespacedName{Name: workloadjob.GetWorkloadNameForJob(job.Name, job.UID), Namespace: managerNs.Name}
+			ginkgo.By("setting workload reservation in the management cluster", func() {
+				admission := utiltestingapi.MakeAdmission(managerCq.Name).Obj()
+				util.SetQuotaReservation(managerTestCluster.ctx, managerTestCluster.client, wlLookupKey, admission)
+			})
+
+			ginkgo.By("verify remote clients are managed correctly: both worker1 and worker2 are active", func() {
+				managerWl := &kueue.Workload{}
+				gomega.Eventually(func(g gomega.Gomega) {
+					g.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, wlLookupKey, managerWl)).To(gomega.Succeed())
+					g.Expect(managerWl.Status.NominatedClusterNames).To(gomega.ContainElements(workerCluster1.Name, workerCluster2.Name))
+				}, util.Timeout, util.Interval).Should(gomega.Succeed())
+			})
+		})
+	})
 })
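
The InsecureKubeConfig condition message above points operators at certificate-authority-data. As an illustrative aside (not part of this commit), a kubeconfig that references a CA file can be rewritten with client-go's clientcmd helpers so the CA bundle is inlined before the kubeconfig is stored in the MultiKueueCluster secret; the file names below are placeholders.

package main

import (
	"os"

	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Load the existing kubeconfig (placeholder path).
	raw, err := os.ReadFile("worker1.kubeconfig")
	if err != nil {
		panic(err)
	}
	cfg, err := clientcmd.Load(raw)
	if err != nil {
		panic(err)
	}
	// Inline every referenced CA file into certificate-authority-data and
	// drop the file path, which is what the validation rejects.
	for _, cluster := range cfg.Clusters {
		if cluster.CertificateAuthority == "" {
			continue
		}
		caBytes, err := os.ReadFile(cluster.CertificateAuthority)
		if err != nil {
			panic(err)
		}
		cluster.CertificateAuthorityData = caBytes
		cluster.CertificateAuthority = ""
	}
	out, err := clientcmd.Write(*cfg)
	if err != nil {
		panic(err)
	}
	// Write the sanitized kubeconfig (placeholder path) for use in the secret.
	if err := os.WriteFile("worker1-inline.kubeconfig", out, 0o600); err != nil {
		panic(err)
	}
}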
