Skip to content

Commit 24fab45

Browse files
authored
[Perf] Change default CUDAGraphMode from PIECEWISE to FULL_AND_PIECEWISE (#25444)
Signed-off-by: mgoin <[email protected]>
1 parent 6340025 commit 24fab45

File tree

3 files changed

+32
-7
lines changed

3 files changed

+32
-7
lines changed

vllm/config/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -509,8 +509,15 @@ def __post_init__(self):
509509
if self.compilation_config.cudagraph_mode is None:
510510
if envs.VLLM_USE_V1 and self.compilation_config.level \
511511
== CompilationLevel.PIECEWISE:
512+
# default to full and piecewise for most models
512513
self.compilation_config.cudagraph_mode = \
513-
CUDAGraphMode.PIECEWISE
514+
CUDAGraphMode.FULL_AND_PIECEWISE
515+
516+
# pooling model does not support full cudagraphs
517+
if self.model_config is not None and \
518+
self.model_config.pooler_config is not None:
519+
self.compilation_config.cudagraph_mode = \
520+
CUDAGraphMode.PIECEWISE
514521
else:
515522
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
516523

vllm/config/compilation.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -228,15 +228,14 @@ class CompilationConfig:
228228
The mode of the cudagraph:
229229
230230
- NONE, no cudagraph capture.
231-
- PIECEWISE. (v1 default)
231+
- PIECEWISE.
232232
- FULL.
233233
- FULL_DECODE_ONLY.
234-
- FULL_AND_PIECEWISE.
234+
- FULL_AND_PIECEWISE. (v1 default)
235235
236236
PIECEWISE mode builds piecewise cudagraph only, keeping the cudagraph
237237
incompatible ops (i.e. some attention ops) outside the cudagraph
238238
for general flexibility.
239-
This is the default mode.
240239
241240
FULL mode: Capture full cudagraph for all batches. Can be good for small
242241
models or workloads with small prompts; not supported by many backends.
@@ -249,7 +248,7 @@ class CompilationConfig:
249248
250249
FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and
251250
piecewise cudagraph for prefill and mixed prefill-decode batches.
252-
This is like the most performant mode for most models.
251+
This is the most performant mode for most models and is the default.
253252
254253
Currently, the cudagraph mode is only used for the v1 engine.
255254
Note that the cudagraph logic is generally orthogonal to the

vllm/v1/worker/gpu_model_runner.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2947,8 +2947,7 @@ def _dummy_run(
29472947
# TODO(luka) better system for describing dummy batches
29482948
seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1]
29492949
else:
2950-
# Make sure max_model_len is used at the graph capture time.
2951-
seq_lens = self.max_model_len
2950+
seq_lens = max_query_len
29522951
self.seq_lens.np[:num_reqs] = seq_lens
29532952
self.seq_lens.np[num_reqs:] = 0
29542953
self.seq_lens.copy_to_gpu()
@@ -3541,6 +3540,26 @@ def initialize_cudagraph_capture(self) -> None:
35413540
CUDAGraphMode.FULL_DECODE_ONLY
35423541
logger.warning(msg)
35433542

3543+
# check that if we are doing decode full-cudagraphs it is supported
3544+
if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
3545+
and min_cg_support == AttentionCGSupport.NEVER):
3546+
msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported "
3547+
f"with {min_cg_builder_name} backend (support: "
3548+
f"{min_cg_support})")
3549+
if (self.compilation_config.level == CompilationLevel.PIECEWISE and
3550+
(self.compilation_config.splitting_ops_contain_attention()
3551+
or self.compilation_config.use_inductor_graph_partition)):
3552+
msg += "; setting cudagraph_mode=PIECEWISE because "\
3553+
"attention is compiled piecewise"
3554+
cudagraph_mode = self.compilation_config.cudagraph_mode = \
3555+
CUDAGraphMode.PIECEWISE
3556+
else:
3557+
msg += "; setting cudagraph_mode=NONE because "\
3558+
"attention is not compiled piecewise"
3559+
cudagraph_mode = self.compilation_config.cudagraph_mode = \
3560+
CUDAGraphMode.NONE
3561+
logger.warning(msg)
3562+
35443563
# check that if we are doing spec-decode + decode full-cudagraphs it is
35453564
# supported
35463565
if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL

0 commit comments

Comments (0)