Skip to content

Commit 0a13235

Browse files
committed
[YUNIKORN-2146] skip preempted placeholder in allocate (#711)
Skip preempted placeholders in tryPlaceholderAllocate() as the replacement will most likely never happen on time. Not skipping could also cause other allocations to keep waiting for the confirmation that preemption has worked. Closes: #711 Signed-off-by: Wilfred Spiegelenburg <[email protected]>
1 parent 51df651 commit 0a13235

File tree

2 files changed

+73
-2
lines changed

2 files changed

+73
-2
lines changed

pkg/scheduler/objects/application.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1119,9 +1119,9 @@ func (sa *Application) tryPlaceholderAllocate(nodeIterator func() NodeIterator,
11191119
// walk over the placeholders, allow for processing all as we can have multiple task groups
11201120
phAllocs := sa.getPlaceholderAllocations()
11211121
for _, ph := range phAllocs {
1122-
// we could have already released this placeholder and are waiting for the shim to confirm
1122+
// we could have already released preempted this placeholder and are waiting for the shim to confirm
11231123
// and check that we have the correct task group before trying to swap
1124-
if ph.IsReleased() || request.GetTaskGroup() != ph.GetTaskGroup() {
1124+
if ph.IsReleased() || ph.IsPreempted() || request.GetTaskGroup() != ph.GetTaskGroup() {
11251125
continue
11261126
}
11271127
// before we check anything we need to check the resources equality

pkg/scheduler/partition_test.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3003,6 +3003,77 @@ func TestPlaceholderMatch(t *testing.T) {
30033003
assertLimits(t, getTestUserGroup(), resources.Multiply(phRes, 3))
30043004
}
30053005

3006+
func TestPreemptedPlaceholderSkip(t *testing.T) {
3007+
setupUGM()
3008+
partition, err := newBasePartition()
3009+
assert.NilError(t, err, "partition create failed")
3010+
tgRes := resources.NewResourceFromMap(map[string]resources.Quantity{"first": 10, "second": 10})
3011+
phRes := resources.NewResourceFromMap(map[string]resources.Quantity{"first": 2, "second": 2})
3012+
3013+
// add node to allow allocation
3014+
setupNode(t, nodeID1, partition, tgRes)
3015+
3016+
// add the app with placeholder request
3017+
app := newApplicationTG(appID1, "default", "root.default", tgRes)
3018+
err = partition.AddApplication(app)
3019+
assert.NilError(t, err, "app-1 should have been added to the partition")
3020+
// add an ask for a placeholder and allocate
3021+
ask := newAllocationAskTG(phID, appID1, taskGroup, phRes, true)
3022+
err = app.AddAllocationAsk(ask)
3023+
assert.NilError(t, err, "failed to add placeholder ask ph-1 to app")
3024+
// try to allocate a placeholder via normal allocate
3025+
ph := partition.tryAllocate()
3026+
if ph == nil {
3027+
t.Fatal("expected placeholder ph-1 to be allocated")
3028+
}
3029+
phUUID := ph.GetUUID()
3030+
assert.Equal(t, phID, ph.GetAllocationKey(), "expected allocation of ph-1 to be returned")
3031+
assert.Equal(t, 1, len(app.GetAllPlaceholderData()), "placeholder data should be created on allocate")
3032+
3033+
// add a new ask the same task group as the placeholder
3034+
ask = newAllocationAskTG(allocID, appID1, taskGroup, phRes, false)
3035+
err = app.AddAllocationAsk(ask)
3036+
assert.NilError(t, err, "failed to add ask alloc-1 to app")
3037+
alloc := partition.tryAllocate()
3038+
if alloc != nil {
3039+
t.Fatal("expected ask not to be allocated (matched task group)")
3040+
}
3041+
3042+
// mark the placeholder as preempted (shortcut not interested in usage accounting etc.)
3043+
ph.MarkPreempted()
3044+
3045+
// replace the placeholder should NOT work
3046+
alloc = partition.tryPlaceholderAllocate()
3047+
if alloc != nil {
3048+
t.Fatal("allocation should not have matched placeholder")
3049+
}
3050+
3051+
// release placeholder: do what the context would do after the shim processing
3052+
release := &si.AllocationRelease{
3053+
PartitionName: "test",
3054+
ApplicationID: appID1,
3055+
UUID: phUUID,
3056+
TerminationType: si.TerminationType_PREEMPTED_BY_SCHEDULER,
3057+
}
3058+
released, confirmed := partition.removeAllocation(release)
3059+
assert.Equal(t, len(released), 0, "expecting no released allocation")
3060+
if confirmed != nil {
3061+
t.Fatal("confirmed allocation should be nil")
3062+
}
3063+
assert.Equal(t, int64(1), app.GetAllPlaceholderData()[0].TimedOut, "placeholder data should show the preemption")
3064+
3065+
// normal allocate should work as we have no placeholders left
3066+
alloc = partition.tryAllocate()
3067+
if alloc == nil {
3068+
t.Fatal("expected ask to be allocated (no placeholder left)")
3069+
}
3070+
assert.Equal(t, allocID, alloc.GetAllocationKey(), "expected allocation of alloc-1 to be returned")
3071+
assert.Equal(t, 1, len(app.GetAllPlaceholderData()), "placeholder data should not be updated")
3072+
assert.Equal(t, int64(1), app.GetAllPlaceholderData()[0].Count, "placeholder data should show 1 available placeholder")
3073+
assert.Equal(t, int64(0), app.GetAllPlaceholderData()[0].Replaced, "placeholder data should show no replacements")
3074+
assert.Equal(t, int64(1), app.GetAllPlaceholderData()[0].TimedOut, "placeholder data should show the preemption")
3075+
}
3076+
30063077
// simple direct replace with one node
30073078
func TestTryPlaceholderAllocate(t *testing.T) {
30083079
setupUGM()

0 commit comments

Comments
 (0)