
Commit 2930e4a

[CI] Upgrade vllm to newest commit (#3182)
### What this PR does / why we need it?
Upgrade vLLM to the newest commit:
- Fix the problem where aclgraph doesn't work, caused by vllm-project/vllm@24fab45
- Fix the PoolerOutput import error, caused by vllm-project/vllm@755ed7b
- Fix the aclgraph weight-load error, keeping it consistent with the torchair fix: vllm-project/vllm@4492e3a

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
All tests should pass.

- vLLM version: v0.10.2
- vLLM main: vllm-project/vllm@52d0cb8

---------

Signed-off-by: wangxiyuan <[email protected]>
1 parent 0794f64 commit 2930e4a
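One of the fixes above is purely an import move: on vLLM main, `PoolerOutput` now lives in `vllm.v1.outputs` instead of `vllm.sequence`. Below is a minimal sketch of the version-gated import used to stay compatible with both the pinned v0.10.2 release and the new commit; it assumes `vllm_version_is` is the helper exposed by `vllm_ascend.utils`, as used throughout the diffs that follow.

```python
# Hedged sketch: pick the PoolerOutput import location based on the
# installed vLLM version, mirroring the gating done in model_runner_v1.py.
from vllm_ascend.utils import vllm_version_is  # assumed location of the helper

if not vllm_version_is("0.10.2"):
    from vllm.v1.outputs import PoolerOutput   # new location on vLLM main
else:
    from vllm.sequence import PoolerOutput     # old location in v0.10.2
```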

File tree

9 files changed: +49 −53 lines

.github/workflows/format_pr_body.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -36,7 +36,7 @@ jobs:
 
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=52d0cb845866869d587fc013a7c59e60a86ebcf2
+          VLLM_COMMIT=17b4c6685ce62d5652654784d6771a3d38e4273e
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
 
       - name: Checkout repository
```

.github/workflows/vllm_ascend_test.yaml

Lines changed: 3 additions & 3 deletions

```diff
@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/pre-commit.yml
     with:
-      vllm: 52d0cb845866869d587fc013a7c59e60a86ebcf2
+      vllm: 17b4c6685ce62d5652654784d6771a3d38e4273e
 
   changes:
     runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
+        vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
     steps:
       - name: Install packages
         run: |
@@ -138,7 +138,7 @@ jobs:
     name: e2e-light
     strategy:
      matrix:
-        vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
+        vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
```

.github/workflows/vllm_ascend_test_full.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -68,7 +68,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
+        vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
```

vllm_ascend/patch/worker/patch_common/patch_weight_loader.py

Lines changed: 5 additions & 21 deletions

```diff
@@ -1,9 +1,6 @@
 import torch
 from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
-# yapf: disable
-from vllm.model_executor.parameter import ModelWeightParameter
-# yapf: enable
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils import GiB_bytes
 
@@ -16,27 +13,15 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
                    output_partition_sizes: list[int], input_size: int,
                    output_size: int, params_dtype: torch.dtype,
                    **extra_weight_attrs):
-    from vllm_ascend.ascend_config import get_ascend_config
-    ascend_config = get_ascend_config()
     # This method creates unquantized linear weights.
     # The weights are not quantized, and they are not sharded.
     # The amount of memory allocated for the weights is
     # sum(output_partition_sizes) * input_size_per_partition.
     try:
-        if ascend_config.torchair_graph_config.enabled:
-            weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                           input_size_per_partition,
-                                           dtype=params_dtype),
-                               requires_grad=False)
-        else:
-            weight_loader = extra_weight_attrs.pop("weight_loader")
-            weight = ModelWeightParameter(data=torch.empty(
-                sum(output_partition_sizes),
-                input_size_per_partition,
-                dtype=params_dtype),
-                                          input_dim=1,
-                                          output_dim=0,
-                                          weight_loader=weight_loader)
+        weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                       input_size_per_partition,
+                                       dtype=params_dtype),
+                           requires_grad=False)
     except torch.cuda.OutOfMemoryError as e:
         logger.error("Failed to create unquantized linear weights: %s", e)
         if torch.cuda.is_available():
@@ -49,8 +34,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
             "Failed to create unquantized linear weights. "
             "This may be caused by insufficient memory to allocate "
             "the weight.") from e
-    if ascend_config.torchair_graph_config.enabled:
-        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+    set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
     layer.register_parameter("weight", weight)
    set_weight_attrs(weight, extra_weight_attrs)
 
```
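After this change the patched `create_weights` always allocates a plain, unquantized weight and tags it with its shard dimensions, regardless of whether torchair graph mode is enabled. A self-contained sketch of the resulting behaviour with made-up sizes; the attribute assignment at the end stands in for vLLM's `set_weight_attrs`:

```python
import torch
from torch.nn.parameter import Parameter

# Hypothetical sizes, for illustration only.
output_partition_sizes = [1024, 1024]   # e.g. two merged output partitions
input_size_per_partition = 4096
params_dtype = torch.bfloat16

# Allocate the unquantized weight exactly as the patched create_weights does.
weight = Parameter(torch.empty(sum(output_partition_sizes),
                               input_size_per_partition,
                               dtype=params_dtype),
                   requires_grad=False)

# Stand-in for set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0});
# the loader later reads these attributes to know how the weight is sharded.
for key, value in {"input_dim": 1, "output_dim": 0}.items():
    setattr(weight, key, value)
```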

vllm_ascend/platform.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -209,6 +209,11 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         # set cudaprah sizes before extending `compilation_config.splitting_ops`
         vllm_config._set_cudagraph_sizes()
 
+        # TODO: Full graph is fully supported later, and the default value will be set to full graph.
+        if not vllm_version_is("v0.10.2"):
+            if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
+                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+
         if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
             compilation_config.level = CompilationLevel.NO_COMPILATION
         # TODO: Currently MLA does not support FULL_DECODE_ONLY, remove the second condition
```
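The new guard only applies on vLLM main (the pinned v0.10.2 release keeps its old behaviour) and quietly downgrades `FULL_AND_PIECEWISE` to `PIECEWISE`, since full-graph capture is not yet fully supported on Ascend. A simplified sketch of that decision, with the enum import treated as an assumption about where vLLM main exposes `CUDAGraphMode`:

```python
# Hedged sketch of the downgrade logic; not the actual platform hook.
from vllm.config import CUDAGraphMode  # assumption: exported here on vLLM main


def select_cudagraph_mode(requested: CUDAGraphMode,
                          running_v0_10_2: bool) -> CUDAGraphMode:
    """Return the cudagraph mode vllm-ascend will actually use."""
    if not running_v0_10_2 and requested == CUDAGraphMode.FULL_AND_PIECEWISE:
        return CUDAGraphMode.PIECEWISE  # fall back to piecewise capture
    return requested
```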

vllm_ascend/quantization/quant_config.py

Lines changed: 1 addition & 4 deletions

```diff
@@ -33,7 +33,6 @@
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     UnquantizedEmbeddingMethod, VocabParallelEmbedding)
-from vllm.model_executor.parameter import PerTensorScaleParameter
 from vllm.model_executor.utils import set_weight_attrs
 
 from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group,
@@ -251,7 +250,6 @@ def create_weights(
         **extra_weight_attrs,
     ) -> None:
         output_size_per_partition = sum(output_partition_sizes)
-        weight_loader = extra_weight_attrs.get("weight_loader")
 
         weight_dict = self.quant_method.get_weight(input_size_per_partition,
                                                    output_size_per_partition,
@@ -264,8 +262,7 @@ def create_weights(
 
         pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
         for pertensor_name, pertensor_param in pertensor_dict.items():
-            param = PerTensorScaleParameter(data=pertensor_param,
-                                            weight_loader=weight_loader)
+            param = torch.nn.Parameter(pertensor_param, requires_grad=False)
             # disable warning
             param.ignore_warning = True
             layer.register_parameter(pertensor_name, param)
```

vllm_ascend/torchair/torchair_worker.py

Lines changed: 0 additions & 14 deletions

```diff
@@ -28,20 +28,6 @@
 class NPUTorchairWorker(NPUWorker):
     """Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class."""
 
-    def __init__(self,
-                 vllm_config,
-                 local_rank,
-                 rank,
-                 distributed_init_method,
-                 is_driver_worker=False,
-                 **kwargs):
-        super().__init__(vllm_config, local_rank, rank,
-                         distributed_init_method, is_driver_worker, **kwargs)
-        from vllm.model_executor.layers.linear import \
-            WEIGHT_LOADER_V2_SUPPORTED
-        if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
-            WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
-
     def determine_available_memory(self) -> int:
         """Override determine_available_memory to use cached torchair kv_cache_bytes."""
 
```

vllm_ascend/worker/model_runner_v1.py

Lines changed: 27 additions & 9 deletions

```diff
@@ -64,11 +64,12 @@
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingType
-from vllm.sequence import IntermediateTensors, PoolerOutput
+from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         LazyLoader, cdiv, get_dtype_size,
                         is_pin_memory_available)
+from vllm.utils.jsontree import json_map_leaves
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport, reorder_batch_to_split_decodes_and_prefills)
@@ -144,7 +145,9 @@
 
 if not vllm_version_is("0.10.2"):
     from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs
+    from vllm.v1.outputs import PoolerOutput
 else:
+    from vllm.sequence import PoolerOutput
     UniformTypeKVCacheSpecs = None
 
 
@@ -1806,18 +1809,30 @@ def _pool(
             device=hidden_states.device)
         seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]
 
-        # Pooling models D2H & synchronize occurs in pooler.py:build_output
-        raw_pooler_output = self.model.pooler(
-            hidden_states=hidden_states, pooling_metadata=pooling_metadata)
+        if vllm_version_is("0.10.2"):
+            # Pooling models D2H & synchronize occurs in pooler.py:build_output
+            raw_pooler_output = self.model.pooler(
+                hidden_states=hidden_states, pooling_metadata=pooling_metadata)
+        else:
+            model = cast(VllmModelForPooling, self.model)
+            raw_pooler_output = model.pooler(
+                hidden_states=hidden_states,
+                pooling_metadata=pooling_metadata,
+            )
+            raw_pooler_output = json_map_leaves(
+                lambda x: x.to("cpu", non_blocking=True),
+                raw_pooler_output,
+            )
+            torch.npu.synchronize()
 
         pooler_output: list[Optional[torch.Tensor]] = []
         for raw_output, seq_len, prompt_len in zip(
                 raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
-
-            if seq_len == prompt_len:
-                pooler_output.append(raw_output.data)
+            if vllm_version_is("0.10.2"):
+                output = raw_output.data if seq_len == prompt_len else None
             else:
-                pooler_output.append(None)
+                output = raw_output if seq_len == prompt_len else None
+            pooler_output.append(output)
 
         return ModelRunnerOutput(
             req_ids=self.input_batch.req_ids,
@@ -2582,7 +2597,10 @@ def _dummy_pooler_run(
         for task in self.get_supported_pooling_tasks():
             # Run a full batch with each task to ensure none of them OOMs
             output = self._dummy_pooler_run_task(hidden_states, task)
-            output_size[task] = output.get_data_nbytes()
+            if vllm_version_is("0.10.2"):
+                output_size[task] = output.get_data_nbytes()
+            else:
+                output_size[task] = sum(o.nbytes for o in output)
             del output  # Allow GC
 
         max_task = max(output_size.items(), key=lambda x: x[1])[0]
```
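On vLLM main the pooler no longer copies its outputs to the host, so the runner now does the device-to-host transfer itself: `json_map_leaves` applies a non-blocking `.to("cpu")` to every tensor leaf of the (possibly nested) pooler output, followed by an explicit `torch.npu.synchronize()`. A toy re-implementation of that leaf-mapping idea, using CPU tensors so it runs anywhere:

```python
import torch


def map_leaves(fn, obj):
    """Toy stand-in for vllm.utils.jsontree.json_map_leaves: walk nested
    dicts/lists/tuples and apply fn to every non-container leaf."""
    if isinstance(obj, dict):
        return {k: map_leaves(fn, v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(map_leaves(fn, v) for v in obj)
    return fn(obj)


# Pretend pooler output: one tensor per request, nested under a key.
raw_pooler_output = {"embedding": [torch.randn(4, 8), torch.randn(2, 8)]}
on_cpu = map_leaves(lambda t: t.to("cpu", non_blocking=True), raw_pooler_output)
# On the NPU path this would be followed by torch.npu.synchronize().
```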

vllm_ascend/worker/worker_v1.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -116,6 +116,12 @@ def __init__(
         # Buffers saved before sleep
         self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
 
+        # FixMe: this is a patch to fix the issue cause by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170
+        from vllm.model_executor.layers.linear import \
+            WEIGHT_LOADER_V2_SUPPORTED
+        if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
+            WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
+
     def sleep(self, level: int = 1) -> None:
         if not sleep_mode_enabled():
             raise ValueError(
```
