Skip to content

Commit 764606e

Browse files
steinsgatetedcraigcondit
authored andcommitted
[YUNIKORN-522] Smoke test for dynamic queue clean up (#404)
Closes: #404 Signed-off-by: Craig Condit <[email protected]>
1 parent 7493411 commit 764606e

File tree

2 files changed

+182
-0
lines changed

2 files changed

+182
-0
lines changed

pkg/scheduler/objects/application.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1607,3 +1607,8 @@ func (sa *Application) GetAllPlaceholderData() []*PlaceholderData {
16071607
}
16081608
return placeholders
16091609
}
1610+
1611+
// test only
1612+
func SetCompletingTimeout(duration time.Duration) {
1613+
completingTimeout = duration
1614+
}

pkg/scheduler/tests/smoke_test.go

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1447,3 +1447,180 @@ func TestDupReleasesInGangScheduling(t *testing.T) {
14471447
ms.scheduler.MultiStepSchedule(5)
14481448
ms.mockRM.waitForAllocations(t, 1, 1000)
14491449
}
1450+
1451+
//nolint:funlen
1452+
func TestDynamicQueueCleanUp(t *testing.T) {
1453+
configData := `
1454+
partitions:
1455+
- name: default
1456+
queues:
1457+
- name: root
1458+
submitacl: "*"
1459+
placementrules:
1460+
- name: fixed
1461+
value: cleanup_test
1462+
create: true
1463+
`
1464+
// Register RM
1465+
// Start all tests
1466+
ms := &mockScheduler{}
1467+
defer ms.Stop()
1468+
1469+
err := ms.Init(configData, false)
1470+
assert.NilError(t, err, "RegisterResourceManager failed")
1471+
1472+
// Check queues of cache and scheduler.
1473+
part := ms.scheduler.GetClusterContext().GetPartition(partition)
1474+
assert.Assert(t, part.GetTotalPartitionResource() == nil, "partition info max resource nil")
1475+
1476+
// Check the queue root
1477+
root := part.GetQueue("root")
1478+
assert.Assert(t, root.GetMaxResource() == nil, "root queue max resource should be nil")
1479+
1480+
leafName := ""
1481+
app1ID := appID1
1482+
1483+
// Register a node, and add apps
1484+
err = ms.proxy.UpdateNode(&si.NodeRequest{
1485+
Nodes: []*si.NodeInfo{
1486+
{
1487+
NodeID: "node-1:1234",
1488+
Attributes: map[string]string{},
1489+
SchedulableResource: &si.Resource{
1490+
Resources: map[string]*si.Quantity{
1491+
"memory": {Value: 100000000},
1492+
"vcore": {Value: 20000},
1493+
},
1494+
},
1495+
Action: si.NodeInfo_CREATE,
1496+
},
1497+
{
1498+
NodeID: "node-2:1234",
1499+
Attributes: map[string]string{},
1500+
SchedulableResource: &si.Resource{
1501+
Resources: map[string]*si.Quantity{
1502+
"memory": {Value: 100000000},
1503+
"vcore": {Value: 20000},
1504+
},
1505+
},
1506+
Action: si.NodeInfo_CREATE,
1507+
},
1508+
},
1509+
RmID: "rm:123",
1510+
})
1511+
1512+
assert.NilError(t, err, "NodeRequest failed")
1513+
1514+
err = ms.proxy.UpdateApplication(&si.ApplicationRequest{
1515+
New: newAddAppRequest(map[string]string{app1ID: leafName}),
1516+
RmID: "rm:123",
1517+
})
1518+
1519+
assert.NilError(t, err, "ApplicationRequest failed")
1520+
1521+
ms.mockRM.waitForAcceptedApplication(t, app1ID, 1000)
1522+
ms.mockRM.waitForAcceptedNode(t, "node-1:1234", 1000)
1523+
ms.mockRM.waitForAcceptedNode(t, "node-2:1234", 1000)
1524+
1525+
// Get the app
1526+
app := ms.getApplication(appID1)
1527+
1528+
// Get the queue cleanup_test
1529+
leafName = "root.cleanup_test"
1530+
leaf := part.GetQueue(leafName)
1531+
1532+
// Verify app initial state
1533+
var app01 *objects.Application
1534+
app01, err = getApplication(part, appID1)
1535+
assert.NilError(t, err, "application not found")
1536+
1537+
assert.Equal(t, app01.CurrentState(), objects.New.String())
1538+
1539+
err = ms.proxy.UpdateAllocation(&si.AllocationRequest{
1540+
Asks: []*si.AllocationAsk{
1541+
{
1542+
AllocationKey: "alloc-1",
1543+
ResourceAsk: &si.Resource{
1544+
Resources: map[string]*si.Quantity{
1545+
"memory": {Value: 10000000},
1546+
"vcore": {Value: 1000},
1547+
},
1548+
},
1549+
MaxAllocations: 2,
1550+
ApplicationID: appID1,
1551+
},
1552+
},
1553+
RmID: "rm:123",
1554+
})
1555+
assert.NilError(t, err, "AllocationRequest 2 failed")
1556+
1557+
// Wait pending resource of queue a and scheduler queue
1558+
// Both pending memory = 10 * 2 = 20;
1559+
waitForPendingQueueResource(t, leaf, 20000000, 1000)
1560+
waitForPendingQueueResource(t, root, 20000000, 1000)
1561+
waitForPendingAppResource(t, app, 20000000, 1000)
1562+
assert.Equal(t, app01.CurrentState(), objects.Accepted.String())
1563+
1564+
ms.scheduler.MultiStepSchedule(5)
1565+
1566+
ms.mockRM.waitForAllocations(t, 2, 1000)
1567+
1568+
// Make sure pending resource updated to 0
1569+
waitForPendingQueueResource(t, leaf, 0, 1000)
1570+
waitForPendingQueueResource(t, root, 0, 1000)
1571+
waitForPendingAppResource(t, app, 0, 1000)
1572+
1573+
// Check allocated resources of queues, apps
1574+
assert.Equal(t, int(leaf.GetAllocatedResource().Resources[resources.MEMORY]), 20000000, "leaf allocated memory incorrect")
1575+
assert.Equal(t, int(root.GetAllocatedResource().Resources[resources.MEMORY]), 20000000, "root allocated memory incorrect")
1576+
assert.Equal(t, int(app.GetAllocatedResource().Resources[resources.MEMORY]), 20000000, "app allocated memory incorrect")
1577+
1578+
// once we start to process allocation asks from this app, verify the state again
1579+
assert.Equal(t, app01.CurrentState(), objects.Running.String())
1580+
1581+
// Check allocated resources of nodes
1582+
waitForAllocatedNodeResource(t, ms.scheduler.GetClusterContext(), ms.partitionName, []string{"node-1:1234", "node-2:1234"}, 20000000, 1000)
1583+
1584+
updateRequest := &si.AllocationRequest{
1585+
Releases: &si.AllocationReleasesRequest{
1586+
AllocationsToRelease: make([]*si.AllocationRelease, 0),
1587+
},
1588+
RmID: "rm:123",
1589+
}
1590+
1591+
// Release all allocations
1592+
for _, v := range ms.mockRM.getAllocations() {
1593+
updateRequest.Releases.AllocationsToRelease = append(updateRequest.Releases.AllocationsToRelease, &si.AllocationRelease{
1594+
UUID: v.UUID,
1595+
ApplicationID: v.ApplicationID,
1596+
PartitionName: v.PartitionName,
1597+
})
1598+
}
1599+
1600+
// Before release allocations, shorten the completingTimeout first, otherwise it will take 30 seconds for the app to become completed state.
1601+
objects.SetCompletingTimeout(time.Millisecond * 100)
1602+
defer objects.SetCompletingTimeout(time.Second * 30)
1603+
1604+
// Release Allocations.
1605+
err = ms.proxy.UpdateAllocation(updateRequest)
1606+
assert.NilError(t, err, "AllocationRequest 3 failed")
1607+
1608+
ms.mockRM.waitForAllocations(t, 0, 1000)
1609+
1610+
// Check allocated resources of queues, apps should be 0 now
1611+
assert.Equal(t, int(leaf.GetAllocatedResource().Resources[resources.MEMORY]), 0, "leaf allocated memory incorrect")
1612+
assert.Equal(t, int(root.GetAllocatedResource().Resources[resources.MEMORY]), 0, "root allocated memory incorrect")
1613+
assert.Equal(t, int(app.GetAllocatedResource().Resources[resources.MEMORY]), 0, "app allocated memory incorrect")
1614+
1615+
// Check app to Completing status
1616+
assert.Equal(t, app01.CurrentState(), objects.Completing.String())
1617+
// the app changes from completing state to completed state
1618+
err = common.WaitFor(1*time.Millisecond, time.Millisecond*200, app.IsCompleted)
1619+
assert.NilError(t, err, "App should be in Completed state")
1620+
// partition manager should be able to clean up the dynamically created queue.
1621+
if err = common.WaitFor(1*time.Millisecond, time.Second*11, func() bool {
1622+
return part.GetQueue(leafName) == nil
1623+
}); err != nil {
1624+
t.Errorf("timeout waiting for queue is cleared %v", err)
1625+
}
1626+
}

0 commit comments

Comments
 (0)