Skip to content

Commit 381bc9a

Browse files
Yongjun Zhangwilfred-s
authored andcommitted
[YUNIKORN-2030] headroom check for reserved allocations (#793)
During the allocation cycle for reservations we fall back to trying all reservations on all nodes for an application. During this fallback a headroom check was missing which caused queue update failures to be logged. The allocation did not succeed but it caused scheduling delays and log spew. Closes: #793 Signed-off-by: Wilfred Spiegelenburg <[email protected]> (cherry picked from commit 908d1cb)
1 parent d24a25a commit 381bc9a

File tree

3 files changed

+48
-9
lines changed

3 files changed

+48
-9
lines changed

pkg/scheduler/objects/application.go

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1223,6 +1223,12 @@ func (sa *Application) tryPlaceholderAllocate(nodeIterator func() NodeIterator,
12231223
return allocResult
12241224
}
12251225

1226+
// check ask against both user headRoom and queue headRoom
1227+
func (sa *Application) checkHeadRooms(ask *AllocationAsk, userHeadroom *resources.Resource, headRoom *resources.Resource) bool {
1228+
// check if this fits in the users' headroom first, if that fits check the queues' headroom
1229+
return userHeadroom.FitInMaxUndef(ask.GetAllocatedResource()) && headRoom.FitInMaxUndef(ask.GetAllocatedResource())
1230+
}
1231+
12261232
// Try a reserved allocation of an outstanding reservation
12271233
func (sa *Application) tryReservedAllocate(headRoom *resources.Resource, nodeIterator func() NodeIterator) *Allocation {
12281234
sa.Lock()
@@ -1250,13 +1256,8 @@ func (sa *Application) tryReservedAllocate(headRoom *resources.Resource, nodeIte
12501256
alloc := newUnreservedAllocation(reserve.nodeID, unreserveAsk)
12511257
return alloc
12521258
}
1253-
// check if this fits in the users' headroom first, if that fits check the queues' headroom
1254-
if !userHeadroom.FitInMaxUndef(ask.GetAllocatedResource()) {
1255-
continue
1256-
}
12571259

1258-
// check if this fits in the queue's headroom
1259-
if !headRoom.FitInMaxUndef(ask.GetAllocatedResource()) {
1260+
if !sa.checkHeadRooms(ask, userHeadroom, headRoom) {
12601261
continue
12611262
}
12621263

@@ -1280,12 +1281,16 @@ func (sa *Application) tryReservedAllocate(headRoom *resources.Resource, nodeIte
12801281
// lets try this on all other nodes
12811282
for _, reserve := range sa.reservations {
12821283
// Other nodes cannot be tried if the ask has a required node
1283-
if reserve.ask.GetRequiredNode() != "" {
1284+
ask := reserve.ask
1285+
if ask.GetRequiredNode() != "" {
12841286
continue
12851287
}
12861288
iterator := nodeIterator()
12871289
if iterator != nil {
1288-
alloc := sa.tryNodesNoReserve(reserve.ask, iterator, reserve.nodeID)
1290+
if !sa.checkHeadRooms(ask, userHeadroom, headRoom) {
1291+
continue
1292+
}
1293+
alloc := sa.tryNodesNoReserve(ask, iterator, reserve.nodeID)
12891294
// have a candidate return it, including the node that was reserved
12901295
if alloc != nil {
12911296
return alloc
@@ -1490,7 +1495,7 @@ func (sa *Application) tryNode(node *Node, ask *AllocationAsk) *Allocation {
14901495
alloc := NewAllocation(node.NodeID, ask)
14911496
if node.AddAllocation(alloc) {
14921497
if err := sa.queue.IncAllocatedResource(alloc.GetAllocatedResource(), false); err != nil {
1493-
log.Log(log.SchedApplication).Warn("queue update failed unexpectedly",
1498+
log.Log(log.SchedApplication).DPanic("queue update failed unexpectedly",
14941499
zap.Error(err))
14951500
// revert the node update
14961501
node.RemoveAllocation(alloc.GetAllocationID())

pkg/scheduler/objects/application_test.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2601,6 +2601,39 @@ func TestGetRateLimitedAppLog(t *testing.T) {
26012601
assert.Check(t, l != nil)
26022602
}
26032603

2604+
func TestTryAllocateWithReservedHeadRoomChecking(t *testing.T) {
2605+
defer func() {
2606+
if r := recover(); r != nil {
2607+
t.Fatalf("reserved headroom test regression: %v", r)
2608+
}
2609+
}()
2610+
2611+
res, err := resources.NewResourceFromConf(map[string]string{"memory": "2G"})
2612+
assert.NilError(t, err, "failed to create basic resource")
2613+
var headRoom *resources.Resource
2614+
headRoom, err = resources.NewResourceFromConf(map[string]string{"memory": "1G"})
2615+
assert.NilError(t, err, "failed to create basic resource")
2616+
2617+
app := newApplication(appID1, "default", "root")
2618+
ask := newAllocationAsk(aKey, appID1, res)
2619+
var queue *Queue
2620+
queue, err = createRootQueue(map[string]string{"memory": "1G"})
2621+
assert.NilError(t, err, "queue create failed")
2622+
app.queue = queue
2623+
err = app.AddAllocationAsk(ask)
2624+
assert.NilError(t, err, "ask should have been added to app")
2625+
2626+
node1 := newNodeRes(nodeID1, res)
2627+
node2 := newNodeRes(nodeID2, res)
2628+
// reserve that works
2629+
err = app.Reserve(node1, ask)
2630+
assert.NilError(t, err, "reservation should not have failed")
2631+
2632+
iter := getNodeIteratorFn(node1, node2)
2633+
alloc := app.tryReservedAllocate(headRoom, iter)
2634+
assert.Assert(t, alloc == nil, "Alloc is expected to be nil due to insufficient headroom")
2635+
}
2636+
26042637
func TestUpdateRunnableStatus(t *testing.T) {
26052638
app := newApplication(appID0, "default", "root.unknown")
26062639
assert.Assert(t, app.runnableInQueue)

pkg/scheduler/objects/utilities_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ const (
4242
aKey = "alloc-1"
4343
aAllocationID = "alloc-allocationid-1"
4444
nodeID1 = "node-1"
45+
nodeID2 = "node-2"
4546
instType1 = "itype-1"
4647
)
4748

0 commit comments

Comments
 (0)