Add Intel Gaudi as a supported device. (#2888)

vivekgoe · web-flow · commit b942406523dc · 2025-08-04T20:57:41.000+03:00
diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py
@@ -318,6 +318,9 @@ def __init__(
         self._batch_size = batch_size
         self._dtype = dtype
         self._enable_kv_cache = enable_kv_cache
+        # Set device explicitly here since HPU is not included in
+        # `device_list` in `HFLM` class
+        self._device = torch.device(device)
 
     @property
     def model(self):
diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py
@@ -55,7 +55,7 @@
 _DISTRIBUTED_STATE_DICT_API_IS_AVAILABLE = False
 
 # Valid backends for logging memory stats
-VALID_BACKENDS_FOR_MEMORY_STATS = ("cuda", "xpu", "npu")
+VALID_BACKENDS_FOR_MEMORY_STATS = ("cuda", "xpu", "npu", "hpu")
 
 
 @dataclass
@@ -221,7 +221,9 @@ def _broadcast_tensor(tensor: torch.Tensor, src: int = 0) -> torch.Tensor:
         elif dist.get_backend() == "xccl":
             tensor = tensor.to(get_device("xpu"))
         elif dist.get_backend() == "hccl":
-            tensor = tensor.to(get_device("npu"))
+            # NPU and HPU share the same backend name ("hccl"), so
+            # infer the device from the environment here.
+            tensor = tensor.to(get_device())
         dist.broadcast(tensor, src=src, group=None)
         return tensor.to(device)
     else:
diff --git a/torchtune/training/_profiler.py b/torchtune/training/_profiler.py
@@ -180,6 +180,7 @@ def setup_torch_profiler(
     cpu: bool = True,
     cuda: bool = True,
     xpu: bool = True,
+    hpu: bool = False,
     profile_memory: bool = DEFAULT_TRACE_OPTS["profile_memory"],
     with_stack: bool = DEFAULT_TRACE_OPTS["with_stack"],
     record_shapes: bool = DEFAULT_TRACE_OPTS["record_shapes"],
@@ -248,6 +249,7 @@ def setup_torch_profiler(
         cpu (bool): Enable cpu profiling. Default is True.
         cuda (bool): Enable cuda profiling. Default is True.
         xpu (bool): Enable xpu profiling. Default is True.
+        hpu (bool): Enable hpu profiling. Default is False.
         profile_memory (bool): Profile memory usage. Default is False.
         with_stack (bool): Profile stack. Default is False.
         record_shapes (bool): Record shapes. Default is True.
@@ -274,6 +276,8 @@ def setup_torch_profiler(
         activities.append(torch.profiler.ProfilerActivity.CUDA)
     if xpu:
         activities.append(torch.profiler.ProfilerActivity.XPU)
+    if hpu:
+        activities.append(torch.profiler.ProfilerActivity.HPU)
     if len(activities) == 0:
         _warn("No activities specified, defaulting to CPU + CUDA")
         activities = DEFAULT_PROFILER_ACTIVITIES
@@ -371,6 +375,7 @@ def setup_torch_profiler(
             "cpu": cpu,
             "cuda": cuda,
             "xpu": xpu,
+            "hpu": hpu,
             "profile_memory": profile_memory,
             "with_stack": with_stack,
             "record_shapes": record_shapes,
diff --git a/torchtune/training/memory.py b/torchtune/training/memory.py
@@ -48,7 +48,10 @@ def cleanup_before_training() -> None:
     Call gc collect, empty device cache, and reset peak memory stats.
     """
     gc.collect()
-    get_torch_device_namespace().empty_cache()
+    from torchtune.utils._device import is_hpu_available
+
+    if not is_hpu_available:
+        get_torch_device_namespace().empty_cache()
     get_torch_device_namespace().reset_peak_memory_stats()
 
 
diff --git a/torchtune/training/precision.py b/torchtune/training/precision.py
@@ -10,7 +10,7 @@
 import torch
 
 from torchtune.utils import get_logger
-from torchtune.utils._device import is_npu_available
+from torchtune.utils._device import is_hpu_available, is_npu_available
 
 log = get_logger()
 
@@ -69,7 +69,8 @@ def verify_bf16_support() -> bool:
     mps_support = torch.backends.mps.is_available() and torch.backends.mps.is_built()
     npu_support = is_npu_available and torch.npu.is_bf16_supported()
     xpu_support = torch.xpu.is_available() and torch.xpu.is_bf16_supported()
-    return cuda_support or mps_support or npu_support or xpu_support
+    hpu_support = is_hpu_available and torch.hpu.is_bf16_supported()
+    return cuda_support or mps_support or npu_support or xpu_support or hpu_support
 
 
 def get_dtype(
diff --git a/torchtune/utils/_device.py b/torchtune/utils/_device.py
@@ -47,6 +47,19 @@ def is_torch_npu_available() -> bool:
 is_npu_available = is_torch_npu_available()
 
 
+def is_torch_hpu_available() -> bool:
+    """Return True if habana_frameworks.torch imports and torch.hpu is available."""
+    try:
+        import habana_frameworks.torch  # noqa: F401
+
+        return torch.hpu.is_available()
+    except ImportError:
+        return False
+
+
+is_hpu_available = is_torch_hpu_available()
+
+
 def _get_local_rank() -> Optional[int]:
     """Function that gets the local rank from the environment.
 
@@ -78,7 +91,6 @@ def _setup_device(device: torch.device) -> torch.device:
     device_type = device_support.device_type
     device_name = device_support.device_name
     torch_device = get_torch_device_namespace()
-
     if device.index is None:
         device = torch.device(type=device_type, index=local_rank)
 
@@ -107,6 +119,8 @@ def _get_device_type_from_env() -> str:
         device = "cuda"
     elif is_npu_available:
         device = "npu"
+    elif is_hpu_available:
+        device = "hpu"
     elif torch.xpu.is_available():
         device = "xpu"
     elif torch.mps.is_available():
@@ -171,7 +185,7 @@ def get_device(device: Optional[str] = None) -> torch.device:
     if device is None:
         device = _get_device_type_from_env()
     device = torch.device(device)
-    if device.type in ["cuda", "npu", "xpu"]:
+    if device.type in ["cuda", "npu", "xpu", "hpu"]:
         device = _setup_device(device)
     _validate_device_from_env(device)
     return device
@@ -220,6 +234,7 @@ class DeviceSupport(Enum):
     NPU = ("npu", "NPU", "hccl")
     XPU = ("xpu", "XPU", "ccl")
     MPS = ("mps", "MPS", "gloo")
+    HPU = ("hpu", "HPU", "hccl")
 
     def __init__(
         self,