Skip to content

Commit 6e9777c

Browse files
authored
Merge branch 'main' into patch-1
2 parents baabb59 + cc41de6 commit 6e9777c

File tree

10 files changed

+565
-62
lines changed

10 files changed

+565
-62
lines changed

dev-requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ torch>=2.7.0
2929
torchmetrics==1.6.3
3030
torchserve>=0.10.0
3131
torchtext==0.18.0
32-
torchvision==0.22.0
32+
torchvision==0.23.0
3333
typing-extensions
3434
ts==0.5.1
3535
ray[default]

torchx/cli/cmd_run.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,7 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> None:
207207
" (e.g. `local_cwd`)"
208208
)
209209

210-
scheduler_opts = runner.scheduler_run_opts(args.scheduler)
211-
cfg = scheduler_opts.cfg_from_str(args.scheduler_args)
210+
cfg = dict(runner.cfg_from_str(args.scheduler, args.scheduler_args))
212211
config.apply(scheduler=args.scheduler, cfg=cfg)
213212

214213
component, component_args = _parse_component_name_and_args(
@@ -263,12 +262,14 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> None:
263262
sys.exit(1)
264263
except specs.InvalidRunConfigException as e:
265264
error_msg = (
266-
f"Scheduler arg is incorrect or missing required option: `{e.cfg_key}`\n"
267-
f"Run `torchx runopts` to check configuration for `{args.scheduler}` scheduler\n"
268-
f"Use `-cfg` to specify run cfg as `key1=value1,key2=value2` pair\n"
269-
"of setup `.torchxconfig` file, see: https://pytorch.org/torchx/main/experimental/runner.config.html"
265+
"Invalid scheduler configuration: %s\n"
266+
"To configure scheduler options, either:\n"
267+
" 1. Use the `-cfg` command-line argument, e.g., `-cfg key1=value1,key2=value2`\n"
268+
" 2. Set up a `.torchxconfig` file. For more details, visit: https://pytorch.org/torchx/main/runner.config.html\n"
269+
"Run `torchx runopts %s` to check all available configuration options for the "
270+
"`%s` scheduler."
270271
)
271-
logger.error(error_msg)
272+
print(error_msg % (e, args.scheduler, args.scheduler), file=sys.stderr)
272273
sys.exit(1)
273274

274275
def run(self, args: argparse.Namespace) -> None:

torchx/runner/api.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,9 @@ def __init__(
129129
def _get_scheduler_params_from_env(self) -> Dict[str, str]:
130130
scheduler_params = {}
131131
for key, value in os.environ.items():
132-
lower_case_key = key.lower()
133-
if lower_case_key.startswith("torchx_"):
134-
scheduler_params[lower_case_key.strip("torchx_")] = value
132+
key = key.lower()
133+
if key.startswith("torchx_"):
134+
scheduler_params[key.removeprefix("torchx_")] = value
135135
return scheduler_params
136136

137137
def __enter__(self) -> "Self":
@@ -486,6 +486,27 @@ def scheduler_run_opts(self, scheduler: str) -> runopts:
486486
"""
487487
return self._scheduler(scheduler).run_opts()
488488

489+
def cfg_from_str(self, scheduler: str, *cfg_literal: str) -> Mapping[str, CfgVal]:
490+
"""
491+
Convenience function around the scheduler's ``runopts.cfg_from_str()`` method.
492+
493+
Usage:
494+
495+
.. doctest::
496+
497+
from torchx.runner import get_runner
498+
499+
runner = get_runner()
500+
cfg = runner.cfg_from_str("local_cwd", "log_dir=/tmp/foobar", "prepend_cwd=True")
501+
assert cfg == {"log_dir": "/tmp/foobar", "prepend_cwd": True, "auto_set_cuda_visible_devices": False}
502+
"""
503+
504+
opts = self._scheduler(scheduler).run_opts()
505+
cfg = {}
506+
for cfg_str in cfg_literal:
507+
cfg.update(opts.cfg_from_str(cfg_str))
508+
return cfg
509+
489510
def scheduler_backends(self) -> List[str]:
490511
"""
491512
Returns a list of all supported scheduler backends.

torchx/runner/config.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -278,14 +278,14 @@ def dump(
278278
continue
279279

280280
# serialize list elements with `;` delimiter (consistent with torchx cli)
281-
if opt.opt_type == List[str]:
281+
if opt.is_type_list_of_str:
282282
# deal with empty or None default lists
283283
if opt.default:
284284
# pyre-ignore[6] opt.default type checked already as List[str]
285285
val = ";".join(opt.default)
286286
else:
287287
val = _NONE
288-
elif opt.opt_type == Dict[str, str]:
288+
elif opt.is_type_dict_of_str:
289289
# deal with empty or None default lists
290290
if opt.default:
291291
# pyre-ignore[16] opt.default type checked already as Dict[str, str]
@@ -536,26 +536,26 @@ def load(scheduler: str, f: TextIO, cfg: Dict[str, CfgVal]) -> None:
536536
# this also handles empty or None lists
537537
cfg[name] = None
538538
else:
539-
runopt = runopts.get(name)
539+
opt = runopts.get(name)
540540

541-
if runopt is None:
541+
if opt is None:
542542
log.warning(
543543
f"`{name} = {value}` was declared in the [{section}] section "
544544
f" of the config file but is not a runopt of `{scheduler}` scheduler."
545545
f" Remove the entry from the config file to no longer see this warning"
546546
)
547547
else:
548-
if runopt.opt_type is bool:
548+
if opt.opt_type is bool:
549549
# need to handle bool specially since str -> bool is based on
550550
# str emptiness not value (e.g. bool("False") == True)
551551
cfg[name] = config.getboolean(section, name)
552-
elif runopt.opt_type is List[str]:
552+
elif opt.is_type_list_of_str:
553553
cfg[name] = value.split(";")
554-
elif runopt.opt_type is Dict[str, str]:
554+
elif opt.is_type_dict_of_str:
555555
cfg[name] = {
556556
s.split(":", 1)[0]: s.split(":", 1)[1]
557557
for s in value.replace(",", ";").split(";")
558558
}
559559
else:
560560
# pyre-ignore[29]
561-
cfg[name] = runopt.opt_type(value)
561+
cfg[name] = opt.opt_type(value)

torchx/runner/test/api_test.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
parse_app_handle,
2929
Resource,
3030
Role,
31+
runopts,
3132
UnknownAppException,
3233
)
3334
from torchx.specs.finder import ComponentNotFoundException
@@ -701,3 +702,36 @@ def test_runner_manual_close(self, _) -> None:
701702
def test_get_default_runner(self, _) -> None:
702703
runner = get_runner()
703704
self.assertEqual("torchx", runner._name)
705+
706+
def test_cfg_from_str(self, _) -> None:
707+
scheduler_mock = MagicMock()
708+
opts = runopts()
709+
opts.add("foo", type_=str, default="", help="")
710+
opts.add("test_key", type_=str, default="", help="")
711+
opts.add("default_time", type_=int, default=0, help="")
712+
opts.add("enable", type_=bool, default=True, help="")
713+
opts.add("disable", type_=bool, default=True, help="")
714+
opts.add("complex_list", type_=List[str], default=[], help="")
715+
scheduler_mock.run_opts.return_value = opts
716+
717+
with Runner(
718+
name=SESSION_NAME,
719+
scheduler_factories={"local_dir": lambda name, **kwargs: scheduler_mock},
720+
) as runner:
721+
self.assertDictEqual(
722+
{
723+
"foo": "bar",
724+
"test_key": "test_value",
725+
"default_time": 42,
726+
"enable": True,
727+
"disable": False,
728+
"complex_list": ["v1", "v2", "v3"],
729+
},
730+
runner.cfg_from_str(
731+
"local_dir",
732+
"foo=bar",
733+
"test_key=test_value",
734+
"default_time=42",
735+
"enable=True,disable=False,complex_list=v1;v2;v3",
736+
),
737+
)

torchx/runner/test/config_test.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,22 +95,34 @@ def _run_opts(self) -> runopts:
9595
)
9696
opts.add(
9797
"l",
98-
type_=List[str],
98+
type_=list[str],
9999
default=["a", "b", "c"],
100100
help="a list option",
101101
)
102102
opts.add(
103-
"l_none",
103+
"l_typing",
104104
type_=List[str],
105+
default=["a", "b", "c"],
106+
help="a typing.List option",
107+
)
108+
opts.add(
109+
"l_none",
110+
type_=list[str],
105111
default=None,
106112
help="a None list option",
107113
)
108114
opts.add(
109115
"d",
110-
type_=Dict[str, str],
116+
type_=dict[str, str],
111117
default={"foo": "bar"},
112118
help="a dict option",
113119
)
120+
opts.add(
121+
"d_typing",
122+
type_=Dict[str, str],
123+
default={"foo": "bar"},
124+
help="a typing.Dict option",
125+
)
114126
opts.add(
115127
"d_none",
116128
type_=Dict[str, str],
@@ -151,6 +163,10 @@ def _run_opts(self) -> runopts:
151163
[test]
152164
s = my_default
153165
i = 100
166+
l = abc;def
167+
l_typing = ghi;jkl
168+
d = a:b,c:d
169+
d_typing = e:f,g:h
154170
"""
155171

156172
_MY_CONFIG2 = """#
@@ -387,6 +403,10 @@ def test_apply_dirs(self, _) -> None:
387403
self.assertEqual("runtime_value", cfg.get("s"))
388404
self.assertEqual(100, cfg.get("i"))
389405
self.assertEqual(1.2, cfg.get("f"))
406+
self.assertEqual({"a": "b", "c": "d"}, cfg.get("d"))
407+
self.assertEqual({"e": "f", "g": "h"}, cfg.get("d_typing"))
408+
self.assertEqual(["abc", "def"], cfg.get("l"))
409+
self.assertEqual(["ghi", "jkl"], cfg.get("l_typing"))
390410

391411
def test_dump_invalid_scheduler(self) -> None:
392412
with self.assertRaises(ValueError):
@@ -460,7 +480,7 @@ def test_dump_and_load_all_runopt_types(self, _) -> None:
460480

461481
# all runopts in the TestScheduler have defaults, just check against those
462482
for opt_name, opt in TestScheduler("test").run_opts():
463-
self.assertEqual(cfg.get(opt_name), opt.default)
483+
self.assertEqual(opt.default, cfg.get(opt_name))
464484

465485
def test_dump_and_load_all_registered_schedulers(self) -> None:
466486
# dump all the runopts for all registered schedulers

torchx/schedulers/slurm_scheduler.py

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import shlex
1919
import subprocess
2020
import tempfile
21+
import warnings
2122
from dataclasses import dataclass
2223
from datetime import datetime
2324
from subprocess import CalledProcessError, PIPE
@@ -72,6 +73,55 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
7273
return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
7374

7475

76+
def version() -> Tuple[int, int]:
77+
"""
78+
Uses ``sinfo --version`` to get the slurm version. If the command fails, it
79+
assumes the version is ``slurm 24.05.8``.
80+
81+
Returns:
82+
-------
83+
Tuple[int, int] slurm version as a tuple of ints (major, minor).
84+
"""
85+
86+
cmd = ["sinfo", "--version"]
87+
try:
88+
out = subprocess.check_output(cmd, stderr=PIPE, encoding="utf-8")
89+
except (CalledProcessError, FileNotFoundError):
90+
out = "slurm 24.05.8"
91+
warnings.warn(
92+
"Error running: `sinfo --version` to get SLURM version. Are you running outside the "
93+
"cluster's login or head node? This typically happens when running in `--dryrun`"
94+
" mode. Assuming version is `slurm 24.05.8`.",
95+
RuntimeWarning,
96+
stacklevel=2,
97+
)
98+
99+
# sinfo --version returns in the form "slurm 24.1.0"
100+
_, version_literal = out.split(" ", maxsplit=2)
101+
major, minor = [int(v) for v in version_literal.split(".")][:2]
102+
103+
return (major, minor)
104+
105+
106+
def _should_use_gpus_per_node_from_version() -> bool:
107+
"""
108+
Determine whether to use gpus-per-node based on automatically detected slurm version.
109+
110+
Change Reference: https://fburl.com/sqwqzxn6
111+
> select/linear - Reject jobs asking for GRES per job|socket|task or cpus|mem per GRES.
112+
113+
Returns:
114+
``True`` in slurm ``version>=24.11.0``, ``False`` otherwise.
115+
"""
116+
117+
slurm_24_11_0 = (24, 11)
118+
slurm_version = version()
119+
120+
return slurm_version[0] > slurm_24_11_0[0] or ( # Major version is greater
121+
slurm_version[0] == slurm_24_11_0[0] and slurm_version[1] >= slurm_24_11_0[1]
122+
) # Major version is equal and minor version is greater or equal
123+
124+
75125
SBATCH_JOB_OPTIONS = {
76126
"comment",
77127
"mail-user",
@@ -81,6 +131,7 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
81131
"partition",
82132
"time",
83133
"constraint",
134+
"qos",
84135
}
85136

86137
log: logging.Logger = logging.getLogger(__name__)
@@ -106,6 +157,7 @@ def _apply_app_id_env(s: str) -> str:
106157
"mail-user": Optional[str],
107158
"mail-type": Optional[str],
108159
"job_dir": Optional[str],
160+
"qos": Optional[str],
109161
},
110162
total=False,
111163
)
@@ -126,7 +178,11 @@ class SlurmReplicaRequest:
126178

127179
@classmethod
128180
def from_role(
129-
cls, name: str, role: Role, cfg: SlurmOpts, nomem: bool
181+
cls,
182+
name: str,
183+
role: Role,
184+
cfg: SlurmOpts,
185+
nomem: bool,
130186
) -> "SlurmReplicaRequest":
131187
"""
132188
``from_role`` creates a SlurmReplicaRequest for the specific role and
@@ -149,7 +205,11 @@ def from_role(
149205
if not nomem and resource.memMB > 0:
150206
sbatch_opts.setdefault("mem", str(resource.memMB))
151207
if resource.gpu > 0:
152-
sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
208+
# Use smart GPU allocation based on automatically detected Slurm version
209+
if _should_use_gpus_per_node_from_version():
210+
sbatch_opts.setdefault("gpus-per-node", str(resource.gpu))
211+
else:
212+
sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
153213

154214
srun_opts = {
155215
"output": f"slurm-{macros.app_id}-{name}.out",
@@ -378,6 +438,11 @@ def _run_opts(self) -> runopts:
378438
iteration, jobs will be tracked in ``.torchxslurmjobdirs``.
379439
""",
380440
)
441+
opts.add(
442+
"qos",
443+
type_=str,
444+
help="Quality of Service (QoS) to assign to the job.",
445+
)
381446
return opts
382447

383448
def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:

0 commit comments

Comments
 (0)