Commit 9f1222c

kdamaszk, Yu-Zhou, and jkaniecki authored
Fix OOM in fp8 w/ HPUGraph for llama3.2 (#1365) (#1479)
Signed-off-by: zhouyu5 <[email protected]>
Co-authored-by: Yu-Zhou <[email protected]>
Co-authored-by: Jan Kaniecki <[email protected]>
1 parent: 569c0aa · commit: 9f1222c

File tree: 2 files changed, +18 −6 lines


vllm/model_executor/models/mllama.py

Lines changed: 1 addition & 0 deletions
@@ -1350,6 +1350,7 @@ def forward(
         full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor,
                                                       torch.Tensor]],
         skip_cross_attention: bool,
+        **kwargs: object,
     ) -> torch.Tensor:
         hidden_states = self.model(
             input_ids=input_ids,
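The new `**kwargs: object` parameter lets this forward absorb the extra keyword arguments that the runner change below starts binding onto `language_model.forward` via `functools.partial` (`attn_metadata`, `bypass_hpu_graphs`); without a catch-all, those keywords would raise a `TypeError`. A minimal, self-contained sketch of that interaction, using hypothetical stand-in functions rather than the real Mllama classes:

from functools import partial


# Hypothetical stand-ins for language_model.forward, for illustration only.
def forward_strict(input_ids, positions):
    return f"strict: {input_ids}, {positions}"


def forward_permissive(input_ids, positions, **kwargs: object):
    # Extra keywords such as bypass_hpu_graphs are simply absorbed.
    return f"permissive: {input_ids}, {positions}, extras={sorted(kwargs)}"


# The runner binds graph-related kwargs up front, as the partial() call in the
# second file below does for the real language model.
bound_strict = partial(forward_strict, bypass_hpu_graphs=False)
bound_permissive = partial(forward_permissive, bypass_hpu_graphs=False)

print(bound_permissive(input_ids=[1, 2], positions=[0, 1]))  # works
try:
    bound_strict(input_ids=[1, 2], positions=[0, 1])
except TypeError as exc:
    print(f"without **kwargs: {exc}")  # unexpected keyword argument

As far as this hunk shows, the extra keywords are only absorbed here, not forwarded into `self.model`.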

vllm/worker/hpu_enc_dec_model_runner.py

Lines changed: 17 additions & 6 deletions
@@ -3,6 +3,7 @@
 import gc
 import itertools
 import math
+from functools import partial
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union, cast

 import habana_frameworks.torch as htorch
@@ -42,6 +43,13 @@ class HpuModelAdapterEncoderDecoder(HpuModelAdapter):
     def __init__(self, model, vllm_config, layer_names, is_causal, sampler):
         super().__init__(model, vllm_config, layer_names, is_causal, sampler)

+        # We only wrap the language model in HPU graph because some Ops in
+        # vision model will fallback to CPU and cause the graph building fail.
+        if htorch.utils.internal.is_lazy() and hasattr(self.model,
+                                                       "language_model"):
+            self.model.language_model = htorch.hpu.wrap_in_hpu_graph(
+                self.model.language_model, disable_tensor_cache=True)
+
     def _set_cross_block_mapping(self, metadata, batch_size, device, dtype):
         mask = torch.arange(0,
                             self.block_size,
@@ -110,6 +118,13 @@ def forward(self, *args, **kwargs):
         kwargs['attn_metadata'] = self._update_cross_metadata(
             kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1),
             input_ids.device, self.dtype)
+        if htorch.utils.internal.is_lazy() and hasattr(self.model,
+                                                       "language_model"):
+            bypass_hpu_graphs = kwargs.get('bypass_hpu_graphs', False)
+            self.model.language_model.forward = partial(
+                self.model.language_model.forward,
+                attn_metadata=kwargs['attn_metadata'],
+                bypass_hpu_graphs=bypass_hpu_graphs)
         # TODO: Change the input_ids to 1D to match the public vllm
         # implementation and avoid shape mismatch issues with some
         # models(i.e. Mllama). But currently this will cause graph
@@ -118,9 +133,9 @@ def forward(self, *args, **kwargs):
         virtual_engine = 0
         if 'virtual_engine' in kwargs:
             virtual_engine = kwargs.pop('virtual_engine')
+        attn_metadata = kwargs.pop('attn_metadata')
         if 'kv_caches' in kwargs:
             kwargs.pop('kv_caches')
-        attn_metadata = kwargs.pop("attn_metadata")
         with set_forward_context(attn_metadata, self.vllm_config,
                                  virtual_engine):
             hidden_states = self.model(*args, **kwargs)
@@ -193,11 +208,7 @@ def _flatten(self, in_list):
         return list(itertools.chain(*in_list))

     def _maybe_wrap_in_hpu_graph(self, *args, **kwargs):
-        return htorch.hpu.wrap_in_hpu_graph(
-            HpuModelAdapterEncoderDecoder(*args, **kwargs),
-            disable_tensor_cache=True,
-        ) if htorch.utils.internal.is_lazy(
-        ) else HpuModelAdapterEncoderDecoder(*args, **kwargs)
+        return HpuModelAdapterEncoderDecoder(*args, **kwargs)

     def prepare_model_input(
         self,
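The net effect of the runner changes: HPU graph capture moves from the whole `HpuModelAdapterEncoderDecoder` (the branch removed from `_maybe_wrap_in_hpu_graph`) down to `self.model.language_model` alone, so ops in the vision model that fall back to CPU never take part in graph building. Below is a toy sketch of that submodule-only wrapping pattern; it assumes a Gaudi machine with `habana_frameworks.torch` available in lazy mode, and `ToyMultimodalModel` is a hypothetical stand-in, not the real Mllama stack:

import habana_frameworks.torch as htorch
import torch
import torch.nn as nn


class ToyMultimodalModel(nn.Module):
    """Hypothetical stand-in with a vision tower and a language model."""

    def __init__(self):
        super().__init__()
        self.vision_model = nn.Linear(16, 32)    # imagine CPU-fallback ops here
        self.language_model = nn.Linear(32, 32)  # HPU-friendly decoder

    def forward(self, pixels):
        return self.language_model(self.vision_model(pixels))


model = ToyMultimodalModel().to("hpu")

# Capture only the HPU-friendly submodule, mirroring the adapter's __init__;
# the vision side stays eager, so its CPU fallbacks cannot break graph capture.
if htorch.utils.internal.is_lazy() and hasattr(model, "language_model"):
    model.language_model = htorch.hpu.wrap_in_hpu_graph(
        model.language_model, disable_tensor_cache=True)

out = model(torch.randn(4, 16, device="hpu"))
print(out.shape)  # torch.Size([4, 32])

Because the outer adapter is no longer the graph-captured callable, the `functools.partial` rebinding in `forward` (shown in the diff above) is what pins the per-step `attn_metadata` and `bypass_hpu_graphs` onto the wrapped submodule's call.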
