Skip to content

Commit 2d4cf30

Browse files
himani2411 and Himani Anil Deshpande
authored
[SlurmTopo] Use --force-configuration option for creating topology configuration (#3008)
* [SlurmTopo] Use `--force-configuration` option for creating topology.conf * Ignore checks like Capacity Block and Matching Instance Type * [SlurmTopo] Update the not_if guard for pcluster_topology_generator.py * We do not update the file if queues are not updated and p6egb200_block_sizes does not exist * [SlurmTopo] cookstyle --------- Co-authored-by: Himani Anil Deshpande <[email protected]>
1 parent 94f83fd commit 2d4cf30

File tree

7 files changed

+177
-76
lines changed

7 files changed

+177
-76
lines changed

cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,6 @@
2525
# Pyxis
2626
default['cluster']['pyxis']['version'] = '0.20.0'
2727
default['cluster']['pyxis']['runtime_path'] = '/run/pyxis'
28+
29+
# Block Topology Plugin
30+
default['cluster']['slurm']['block_topology']['force_configuration'] = false

cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_topology_generator.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
log = logging.getLogger()
2424

25-
25+
P6E_GB200 = "p6e-gb200"
2626
CAPACITY_TYPE_MAP = {
2727
"ONDEMAND": "on-demand",
2828
"SPOT": "spot",
@@ -49,7 +49,17 @@ def _load_cluster_config(input_file_path):
4949
return yaml.load(input_file, Loader=yaml.SafeLoader)
5050

5151

52-
def generate_topology_config_file(output_file: str, input_file: str, block_sizes: str): # noqa: C901
52+
def _is_capacity_block(capacity_type):
53+
return capacity_type == CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK")
54+
55+
56+
def _is_gb200(instance_type):
57+
return instance_type is not None and instance_type.split(".")[0] == P6E_GB200
58+
59+
60+
def generate_topology_config_file( # noqa: C901
61+
output_file: str, input_file: str, block_sizes: str, force_configuration: bool
62+
):
5363
"""
5464
Generate Topology configuration file.
5565
@@ -74,7 +84,8 @@ def generate_topology_config_file(output_file: str, input_file: str, block_sizes
7484

7585
# Retrieve capacity info from the queue_name, if there
7686
queue_capacity_type = CAPACITY_TYPE_MAP.get(queue_config.get("CapacityType", "ONDEMAND"))
77-
if queue_capacity_type != CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK"):
87+
if not _is_capacity_block(queue_capacity_type) and not force_configuration:
88+
# We ignore this check when force_configuration option is used.
7889
log.info("ParallelCluster does not create topology for %s", queue_capacity_type)
7990
continue
8091

@@ -88,7 +99,7 @@ def generate_topology_config_file(output_file: str, input_file: str, block_sizes
8899
continue
89100

90101
# Check for if reservation is for NVLink and size matches min_block_size_list
91-
if compute_resource_config.get("InstanceType") == "p6e-gb200.36xlarge":
102+
if _is_gb200(compute_resource_config.get("InstanceType")) or force_configuration:
92103
if min_block_size_list == compute_min_count or max_block_size_list == compute_max_count:
93104
block_count += 1
94105
# Each Capacity Reservation ID is a Capacity Block,
@@ -149,6 +160,11 @@ def main():
149160
help="Yaml file containing pcluster CLI configuration file with default values",
150161
required=True,
151162
)
163+
parser.add_argument(
164+
"--force-configuration",
165+
help="Force creation of topology.conf by ignoring the checks of Capacity Block and Instance Type. ",
166+
action="store_true",
167+
)
152168
cleanup_or_generate_exclusive_group.add_argument("--block-sizes", help="Block Sizes of topology.conf")
153169
cleanup_or_generate_exclusive_group.add_argument(
154170
"--cleanup",
@@ -159,7 +175,7 @@ def main():
159175
if args.cleanup:
160176
cleanup_topology_config_file(args.output_file)
161177
else:
162-
generate_topology_config_file(args.output_file, args.input_file, args.block_sizes)
178+
generate_topology_config_file(args.output_file, args.input_file, args.block_sizes, args.force_configuration)
163179
log.info("Completed Execution of ParallelCluster Topology Config Generator")
164180
except Exception as e:
165181
log.exception("Failed to generate Topology.conf, exception: %s", e)

cookbooks/aws-parallelcluster-slurm/resources/block_topology/partial/_block_topology_common.rb

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@
2929
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
3030
" --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
3131
" --block-sizes #{node['cluster']['p6egb200_block_sizes']}"\
32-
" --input-file #{node['cluster']['cluster_config_path']}"
32+
" --input-file #{node['cluster']['cluster_config_path']}"\
33+
"#{topology_generator_extra_args}"
3334
not_if { node['cluster']['p6egb200_block_sizes'].nil? }
3435
end
3536
end
@@ -48,8 +49,9 @@
4849
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
4950
" --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
5051
" --input-file #{node['cluster']['cluster_config_path']}"\
51-
"#{topology_generator_command_args}"
52-
not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && topology_generator_command_args.nil? }
52+
"#{topology_generator_command_args}"\
53+
"#{topology_generator_extra_args}"
54+
not_if { topology_generator_command_args.nil? }
5355
end
5456
end
5557

@@ -68,3 +70,9 @@ def topology_generator_command_args
6870
" --block-sizes #{node['cluster']['p6egb200_block_sizes']}"
6971
end
7072
end
73+
74+
def topology_generator_extra_args
75+
if ['true', 'yes', true].include?(node['cluster']['slurm']['block_topology']['force_configuration'])
76+
" --force-configuration"
77+
end
78+
end

cookbooks/aws-parallelcluster-slurm/spec/unit/resources/block_topology_spec.rb

Lines changed: 85 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -23,57 +23,11 @@ def self.update(chef_run)
2323
block_sizes = '9,18'
2424
cluster_config = 'CONFIG_YAML'
2525
cookbook_env = 'FAKE_COOKBOOK_PATH'
26+
force_configuration_extra_args = ' --force-configuration'
2627

2728
describe 'block_topology:configure' do
28-
for_all_oses do |platform, version|
29-
context "on #{platform}#{version}" do
30-
cached(:chef_run) do
31-
runner = ChefSpec::SoloRunner.new(
32-
platform: platform,
33-
version: version,
34-
step_into: ['block_topology']
35-
) do |node|
36-
node.override['cluster']['node_type'] = 'HeadNode'
37-
node.override['cluster']['scripts_dir'] = script_dir
38-
node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
39-
node.override['cluster']['p6egb200_block_sizes'] = block_sizes
40-
node.override['cluster']['cluster_config_path'] = cluster_config
41-
end
42-
allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
43-
allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
44-
ConvergeBlockTopology.configure(runner)
45-
runner
46-
end
47-
48-
if platform == 'amazon' && version == '2'
49-
it 'does not configures block_topology' do
50-
expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
51-
expect(chef_run).not_to run_execute('generate_topology_config')
52-
end
53-
else
54-
it 'creates the topology configuration template' do
55-
expect(chef_run).to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
56-
.with(source: 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb')
57-
.with(user: 'root')
58-
.with(group: 'root')
59-
.with(mode: '0644')
60-
end
61-
62-
it 'generates topology config when block sizes are present' do
63-
expect(chef_run).to run_execute('generate_topology_config')
64-
.with(command: "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
65-
" --output-file #{slurm_install_dir}/etc/topology.conf" \
66-
" --block-sizes #{block_sizes}" \
67-
" --input-file #{cluster_config}")
68-
end
69-
end
70-
end
71-
end
72-
end
73-
74-
describe 'block_topology:update' do
75-
for_all_oses do |platform, version|
76-
['--cleannup', nil, "--block-sizes #{block_sizes}"].each do |topo_command_args|
29+
['false', false, 'no', 'true', true, 'yes'].each do |force_configuration|
30+
for_all_oses do |platform, version|
7731
context "on #{platform}#{version}" do
7832
cached(:chef_run) do
7933
runner = ChefSpec::SoloRunner.new(
@@ -86,18 +40,18 @@ def self.update(chef_run)
8640
node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
8741
node.override['cluster']['p6egb200_block_sizes'] = block_sizes
8842
node.override['cluster']['cluster_config_path'] = cluster_config
43+
node.override['cluster']['slurm']['block_topology']['force_configuration'] = force_configuration
8944
end
9045
allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
91-
allow_any_instance_of(Object).to receive(:topology_generator_command_args).and_return(topo_command_args)
9246
allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
93-
ConvergeBlockTopology.update(runner)
47+
ConvergeBlockTopology.configure(runner)
9448
runner
9549
end
9650

9751
if platform == 'amazon' && version == '2'
9852
it 'does not configures block_topology' do
9953
expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
100-
expect(chef_run).not_to run_execute('update or cleanup topology.conf')
54+
expect(chef_run).not_to run_execute('generate_topology_config')
10155
end
10256
else
10357
it 'creates the topology configuration template' do
@@ -107,13 +61,86 @@ def self.update(chef_run)
10761
.with(group: 'root')
10862
.with(mode: '0644')
10963
end
64+
command = "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
65+
" --output-file #{slurm_install_dir}/etc/topology.conf" \
66+
" --block-sizes #{block_sizes}" \
67+
" --input-file #{cluster_config}"
68+
command_to_exe = if ['true', 'yes', true].include?(force_configuration)
69+
"#{command}#{force_configuration_extra_args}"
70+
else
71+
"#{command}"
72+
end
73+
it 'generates topology config when block sizes are present' do
74+
expect(chef_run).to run_execute('generate_topology_config')
75+
.with(command: command_to_exe)
76+
end
77+
end
78+
end
79+
end
80+
end
81+
end
82+
83+
describe 'block_topology:update' do
84+
['false', false, 'no', 'true', true, 'yes'].each do |force_configuration|
85+
for_all_oses do |platform, version|
86+
['--cleannup', nil, "--block-sizes #{block_sizes}"].each do |topo_command_args|
87+
context "on #{platform}#{version}" do
88+
cached(:chef_run) do
89+
runner = ChefSpec::SoloRunner.new(
90+
platform: platform,
91+
version: version,
92+
step_into: ['block_topology']
93+
) do |node|
94+
node.override['cluster']['node_type'] = 'HeadNode'
95+
node.override['cluster']['scripts_dir'] = script_dir
96+
node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
97+
node.override['cluster']['p6egb200_block_sizes'] = block_sizes
98+
node.override['cluster']['cluster_config_path'] = cluster_config
99+
node.override['cluster']['slurm']['block_topology']['force_configuration'] = force_configuration
100+
end
101+
allow_any_instance_of(Object).to receive(:is_block_topology_supported).and_return(true)
102+
allow_any_instance_of(Object).to receive(:topology_generator_command_args).and_return(topo_command_args)
103+
allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
104+
ConvergeBlockTopology.update(runner)
105+
runner
106+
end
107+
108+
if platform == 'amazon' && version == '2'
109+
it 'does not configures block_topology' do
110+
expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
111+
expect(chef_run).not_to run_execute('update or cleanup topology.conf')
112+
end
113+
else
114+
command = "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
115+
" --output-file #{slurm_install_dir}/etc/topology.conf" \
116+
" --input-file #{cluster_config}"\
117+
"#{topo_command_args}"
118+
command_to_exe = if ['true', 'yes', true].include?(force_configuration)
119+
"#{command}#{force_configuration_extra_args}"
120+
else
121+
"#{command}"
122+
end
123+
124+
it 'creates the topology configuration template' do
125+
expect(chef_run).to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
126+
.with(source: 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb')
127+
.with(user: 'root')
128+
.with(group: 'root')
129+
.with(mode: '0644')
130+
end
131+
132+
if topo_command_args.nil?
133+
it 'update or cleanup topology.conf when block sizes are present' do
134+
expect(chef_run).not_to run_execute('update or cleanup topology.conf')
135+
.with(command: command_to_exe)
136+
end
137+
else
138+
it 'update or cleanup topology.conf when block sizes are present' do
139+
expect(chef_run).to run_execute('update or cleanup topology.conf')
140+
.with(command: command_to_exe)
141+
end
142+
end
110143

111-
it 'update or cleanup topology.conf when block sizes are present' do
112-
expect(chef_run).to run_execute('update or cleanup topology.conf')
113-
.with(command: "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
114-
" --output-file #{slurm_install_dir}/etc/topology.conf" \
115-
" --input-file #{cluster_config}"\
116-
"#{topo_command_args}")
117144
end
118145
end
119146
end

test/unit/slurm/test_topology_generator.py

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
import pytest
1515
from assertpy import assert_that
1616
from pcluster_topology_generator import (
17+
_is_capacity_block,
18+
_is_gb200,
1719
cleanup_topology_config_file,
1820
generate_topology_config_file,
1921
)
@@ -24,14 +26,23 @@ def _assert_files_are_equal(file, expected_file):
2426
assert_that(f.read()).is_equal_to(exp_f.read())
2527

2628

27-
@pytest.mark.parametrize("file_name_suffix", ["with_capacity_block", "no_capacity_block"])
28-
def test_generate_topology_config(test_datadir, tmpdir, file_name_suffix):
29+
@pytest.mark.parametrize(
30+
"file_name_suffix, force_configuration",
31+
[
32+
("with_capacity_block", False),
33+
("no_capacity_block", False),
34+
("with_capacity_block", True),
35+
("no_capacity_block", True),
36+
],
37+
)
38+
def test_generate_topology_config(test_datadir, tmpdir, file_name_suffix, force_configuration):
2939
block_sizes = "9,18" if "no" not in file_name_suffix else None
40+
force_suffix = "_force" if force_configuration else ""
3041
file_name = "sample_" + file_name_suffix + ".yaml"
3142
input_file_path = str(test_datadir / file_name)
32-
output_file_name = "topology_" + file_name_suffix + ".conf"
43+
output_file_name = "topology_" + file_name_suffix + force_suffix + ".conf"
3344
output_file_path = f"{tmpdir}/{output_file_name}"
34-
generate_topology_config_file(output_file_path, input_file_path, block_sizes)
45+
generate_topology_config_file(output_file_path, input_file_path, block_sizes, force_configuration)
3546
if "no" in file_name_suffix:
3647
assert_that(os.path.isfile(output_file_path)).is_equal_to(False)
3748
else:
@@ -48,3 +59,30 @@ def test_cleanup_topology_config_file(mocker, tmpdir, file_exists):
4859
mock_remove.assert_called_once_with(str(topology_file_path))
4960
else:
5061
mock_remove.assert_not_called()
62+
63+
64+
@pytest.mark.parametrize(
65+
"capacity_type, expected_result",
66+
[
67+
("capacity-block", True),
68+
("on-demand", False),
69+
("spot", False),
70+
("any-value", False),
71+
("bla-capacity-block-bla", False),
72+
],
73+
)
74+
def test_is_capacity_block(capacity_type, expected_result):
75+
assert_that(_is_capacity_block(capacity_type)).is_equal_to(expected_result)
76+
77+
78+
@pytest.mark.parametrize(
79+
"instance_type, expected_result",
80+
[
81+
("p6e-gb200.ANY_SIZE", True),
82+
("NOTp6e-gb200.ANY_SIZE", False),
83+
("p6e-b200.ANY_SIZE", False),
84+
("any-value", False),
85+
],
86+
)
87+
def test_is_gb200(instance_type, expected_result):
88+
assert_that(_is_gb200(instance_type)).is_equal_to(expected_result)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# This file is automatically generated by pcluster
2+
3+
BlockName=Block1 Nodes=multiple_spot-st-multiplespot-2-[1-9]
4+
BlockName=Block2 Nodes=gpu-st-gpu-p3dn24xlarge-[1-9]
5+
BlockName=Block3 Nodes=capacity-block-queue1-st-cb-gb200-1-[1-9]
6+
BlockName=Block4 Nodes=capacity-block-queue1-st-fleet-no-res-[1-18]
7+
BlockName=Block5 Nodes=capacity-block-queue2-st-cb-gb200-2-[1-18]
8+
BlockName=Block6 Nodes=capacity-block-queue2-st-cb-gb200-3-[1-9]
9+
BlockSizes=9,18

test/unit/slurm/test_topology_generator/test_generate_topology_config/sample_with_capacity_block.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ Scheduling:
1818
Enabled: false
1919
GdrSupport: false
2020
InstanceType: c5.2xlarge
21-
MaxCount: 5
22-
MinCount: 5
21+
MaxCount: 9
22+
MinCount: 9
2323
Name: multiplespot-2
2424
SpotPrice: 1.5
2525
StaticNodePriority: 1
@@ -69,8 +69,8 @@ Scheduling:
6969
Enabled: false
7070
GdrSupport: false
7171
InstanceType: p3dn.24xlarge
72-
MaxCount: 10
73-
MinCount: 10
72+
MaxCount: 9
73+
MinCount: 9
7474
Name: gpu-p3dn24xlarge
7575
SpotPrice: null
7676
StaticNodePriority: 1
@@ -106,8 +106,8 @@ Scheduling:
106106
Instances:
107107
- InstanceType: c5n.4xlarge
108108
- InstanceType: r5.4xlarge
109-
MaxCount: 10
110-
MinCount: 10
109+
MaxCount: 18
110+
MinCount: 18
111111
Name: fleet-no-res
112112
SchedulableMemory: null
113113
SpotPrice: null

0 commit comments

Comments (0)