@@ -1447,3 +1447,180 @@ func TestDupReleasesInGangScheduling(t *testing.T) {
1447
1447
ms .scheduler .MultiStepSchedule (5 )
1448
1448
ms .mockRM .waitForAllocations (t , 1 , 1000 )
1449
1449
}
1450
+
1451
+ //nolint:funlen
1452
+ func TestDynamicQueueCleanUp (t * testing.T ) {
1453
+ configData := `
1454
+ partitions:
1455
+ - name: default
1456
+ queues:
1457
+ - name: root
1458
+ submitacl: "*"
1459
+ placementrules:
1460
+ - name: fixed
1461
+ value: cleanup_test
1462
+ create: true
1463
+ `
1464
+ // Register RM
1465
+ // Start all tests
1466
+ ms := & mockScheduler {}
1467
+ defer ms .Stop ()
1468
+
1469
+ err := ms .Init (configData , false )
1470
+ assert .NilError (t , err , "RegisterResourceManager failed" )
1471
+
1472
+ // Check queues of cache and scheduler.
1473
+ part := ms .scheduler .GetClusterContext ().GetPartition (partition )
1474
+ assert .Assert (t , part .GetTotalPartitionResource () == nil , "partition info max resource nil" )
1475
+
1476
+ // Check the queue root
1477
+ root := part .GetQueue ("root" )
1478
+ assert .Assert (t , root .GetMaxResource () == nil , "root queue max resource should be nil" )
1479
+
1480
+ leafName := ""
1481
+ app1ID := appID1
1482
+
1483
+ // Register a node, and add apps
1484
+ err = ms .proxy .UpdateNode (& si.NodeRequest {
1485
+ Nodes : []* si.NodeInfo {
1486
+ {
1487
+ NodeID : "node-1:1234" ,
1488
+ Attributes : map [string ]string {},
1489
+ SchedulableResource : & si.Resource {
1490
+ Resources : map [string ]* si.Quantity {
1491
+ "memory" : {Value : 100000000 },
1492
+ "vcore" : {Value : 20000 },
1493
+ },
1494
+ },
1495
+ Action : si .NodeInfo_CREATE ,
1496
+ },
1497
+ {
1498
+ NodeID : "node-2:1234" ,
1499
+ Attributes : map [string ]string {},
1500
+ SchedulableResource : & si.Resource {
1501
+ Resources : map [string ]* si.Quantity {
1502
+ "memory" : {Value : 100000000 },
1503
+ "vcore" : {Value : 20000 },
1504
+ },
1505
+ },
1506
+ Action : si .NodeInfo_CREATE ,
1507
+ },
1508
+ },
1509
+ RmID : "rm:123" ,
1510
+ })
1511
+
1512
+ assert .NilError (t , err , "NodeRequest failed" )
1513
+
1514
+ err = ms .proxy .UpdateApplication (& si.ApplicationRequest {
1515
+ New : newAddAppRequest (map [string ]string {app1ID : leafName }),
1516
+ RmID : "rm:123" ,
1517
+ })
1518
+
1519
+ assert .NilError (t , err , "ApplicationRequest failed" )
1520
+
1521
+ ms .mockRM .waitForAcceptedApplication (t , app1ID , 1000 )
1522
+ ms .mockRM .waitForAcceptedNode (t , "node-1:1234" , 1000 )
1523
+ ms .mockRM .waitForAcceptedNode (t , "node-2:1234" , 1000 )
1524
+
1525
+ // Get the app
1526
+ app := ms .getApplication (appID1 )
1527
+
1528
+ // Get the queue cleanup_test
1529
+ leafName = "root.cleanup_test"
1530
+ leaf := part .GetQueue (leafName )
1531
+
1532
+ // Verify app initial state
1533
+ var app01 * objects.Application
1534
+ app01 , err = getApplication (part , appID1 )
1535
+ assert .NilError (t , err , "application not found" )
1536
+
1537
+ assert .Equal (t , app01 .CurrentState (), objects .New .String ())
1538
+
1539
+ err = ms .proxy .UpdateAllocation (& si.AllocationRequest {
1540
+ Asks : []* si.AllocationAsk {
1541
+ {
1542
+ AllocationKey : "alloc-1" ,
1543
+ ResourceAsk : & si.Resource {
1544
+ Resources : map [string ]* si.Quantity {
1545
+ "memory" : {Value : 10000000 },
1546
+ "vcore" : {Value : 1000 },
1547
+ },
1548
+ },
1549
+ MaxAllocations : 2 ,
1550
+ ApplicationID : appID1 ,
1551
+ },
1552
+ },
1553
+ RmID : "rm:123" ,
1554
+ })
1555
+ assert .NilError (t , err , "AllocationRequest 2 failed" )
1556
+
1557
+ // Wait pending resource of queue a and scheduler queue
1558
+ // Both pending memory = 10 * 2 = 20;
1559
+ waitForPendingQueueResource (t , leaf , 20000000 , 1000 )
1560
+ waitForPendingQueueResource (t , root , 20000000 , 1000 )
1561
+ waitForPendingAppResource (t , app , 20000000 , 1000 )
1562
+ assert .Equal (t , app01 .CurrentState (), objects .Accepted .String ())
1563
+
1564
+ ms .scheduler .MultiStepSchedule (5 )
1565
+
1566
+ ms .mockRM .waitForAllocations (t , 2 , 1000 )
1567
+
1568
+ // Make sure pending resource updated to 0
1569
+ waitForPendingQueueResource (t , leaf , 0 , 1000 )
1570
+ waitForPendingQueueResource (t , root , 0 , 1000 )
1571
+ waitForPendingAppResource (t , app , 0 , 1000 )
1572
+
1573
+ // Check allocated resources of queues, apps
1574
+ assert .Equal (t , int (leaf .GetAllocatedResource ().Resources [resources .MEMORY ]), 20000000 , "leaf allocated memory incorrect" )
1575
+ assert .Equal (t , int (root .GetAllocatedResource ().Resources [resources .MEMORY ]), 20000000 , "root allocated memory incorrect" )
1576
+ assert .Equal (t , int (app .GetAllocatedResource ().Resources [resources .MEMORY ]), 20000000 , "app allocated memory incorrect" )
1577
+
1578
+ // once we start to process allocation asks from this app, verify the state again
1579
+ assert .Equal (t , app01 .CurrentState (), objects .Running .String ())
1580
+
1581
+ // Check allocated resources of nodes
1582
+ waitForAllocatedNodeResource (t , ms .scheduler .GetClusterContext (), ms .partitionName , []string {"node-1:1234" , "node-2:1234" }, 20000000 , 1000 )
1583
+
1584
+ updateRequest := & si.AllocationRequest {
1585
+ Releases : & si.AllocationReleasesRequest {
1586
+ AllocationsToRelease : make ([]* si.AllocationRelease , 0 ),
1587
+ },
1588
+ RmID : "rm:123" ,
1589
+ }
1590
+
1591
+ // Release all allocations
1592
+ for _ , v := range ms .mockRM .getAllocations () {
1593
+ updateRequest .Releases .AllocationsToRelease = append (updateRequest .Releases .AllocationsToRelease , & si.AllocationRelease {
1594
+ UUID : v .UUID ,
1595
+ ApplicationID : v .ApplicationID ,
1596
+ PartitionName : v .PartitionName ,
1597
+ })
1598
+ }
1599
+
1600
+ // Before release allocations, shorten the completingTimeout first, otherwise it will take 30 seconds for the app to become completed state.
1601
+ objects .SetCompletingTimeout (time .Millisecond * 100 )
1602
+ defer objects .SetCompletingTimeout (time .Second * 30 )
1603
+
1604
+ // Release Allocations.
1605
+ err = ms .proxy .UpdateAllocation (updateRequest )
1606
+ assert .NilError (t , err , "AllocationRequest 3 failed" )
1607
+
1608
+ ms .mockRM .waitForAllocations (t , 0 , 1000 )
1609
+
1610
+ // Check allocated resources of queues, apps should be 0 now
1611
+ assert .Equal (t , int (leaf .GetAllocatedResource ().Resources [resources .MEMORY ]), 0 , "leaf allocated memory incorrect" )
1612
+ assert .Equal (t , int (root .GetAllocatedResource ().Resources [resources .MEMORY ]), 0 , "root allocated memory incorrect" )
1613
+ assert .Equal (t , int (app .GetAllocatedResource ().Resources [resources .MEMORY ]), 0 , "app allocated memory incorrect" )
1614
+
1615
+ // Check app to Completing status
1616
+ assert .Equal (t , app01 .CurrentState (), objects .Completing .String ())
1617
+ // the app changes from completing state to completed state
1618
+ err = common .WaitFor (1 * time .Millisecond , time .Millisecond * 200 , app .IsCompleted )
1619
+ assert .NilError (t , err , "App should be in Completed state" )
1620
+ // partition manager should be able to clean up the dynamically created queue.
1621
+ if err = common .WaitFor (1 * time .Millisecond , time .Second * 11 , func () bool {
1622
+ return part .GetQueue (leafName ) == nil
1623
+ }); err != nil {
1624
+ t .Errorf ("timeout waiting for queue is cleared %v" , err )
1625
+ }
1626
+ }
0 commit comments