Merge pull request #104 from coreweave/es/vllm-0.10.0

wbrown · web-flow · commit 8552fbcfe437 · 2025-07-29T19:17:45.000-04:00
feat(vllm-tensorizer): Update to vLLM v0.10.0 & `flashinfer` v0.2.8
diff --git a/.github/configurations/vllm-tensorizer.yml b/.github/configurations/vllm-tensorizer.yml
@@ -1,7 +1,7 @@
 vllm-commit:
-  - 'v0.9.2'
+  - 'v0.10.0'
 flashinfer-commit:
-  - 'v0.2.6.post1'
+  - 'v0.2.8'
 builder-base-image:
   - 'ghcr.io/coreweave/ml-containers/torch-extras:es-fa3-te-update-7a94157-nccl-cuda12.9.1-ubuntu22.04-nccl2.27.6-1-torch2.7.1-vision0.22.1-audio2.7.1-abi1'
 final-base-image:
diff --git a/vllm-tensorizer/Dockerfile b/vllm-tensorizer/Dockerfile
@@ -25,6 +25,28 @@ WORKDIR /workspace
 RUN --mount=type=bind,from=freezer,target=/tmp/frozen \
     /tmp/frozen/freeze.sh torch torchaudio torchvision xformers > /opt/constraints.txt
 
+COPY --link --chmod=755 nvcc-wrapper.py /opt/nvcc-wrapper.py
+ENV PYTORCH_NVCC='/opt/nvcc-wrapper.py' \
+    CMAKE_CUDA_COMPILER='/opt/nvcc-wrapper.py'
+
+ARG TARGETPLATFORM
+# Switch 9.0, 10.0, and 12.0 to -a variants; preserve originals for PTX
+# Flashinfer v0.2.8 in particular can only build for 12.0a but not 12.0
+RUN printf 'TORCH_CUDA_ARCH_LIST=' && \
+    echo "${TORCH_CUDA_ARCH_LIST}" \
+    | sed -E 's@\b(9|10|12)\.0\b@\1\.0a@g; s@\+PTX\b@@g' \
+    | tee /opt/arch_list.txt && \
+    printf 'NVCC_WRAPPER_FILTER_CODES=' && \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+      echo 'sm_80;sm_89;sm_90;sm_100;sm_120;compute_80;compute_89;compute_90;compute_100;compute_120'; \
+    else \
+      echo 'sm_90;sm_100;sm_120;compute_90;compute_100;compute_120'; \
+    fi \
+    | tee /opt/filter_codes.txt && \
+    printf '#!/bin/sh\nexport %s %s;\n' \
+      'TORCH_CUDA_ARCH_LIST="$(cat /opt/arch_list.txt)"' \
+      'NVCC_WRAPPER_FILTER_CODES="$(cat /opt/filter_codes.txt)"' \
+    | install -m 500 /dev/stdin /opt/arch_flags.sh
 
 FROM alpine/git:2.36.3 AS vllm-downloader
 WORKDIR /git
@@ -66,6 +88,7 @@ RUN git clone --filter=tree:0 --no-single-branch --no-checkout \
 
 FROM builder-base AS vllm-builder
 RUN --mount=type=bind,from=vllm-downloader,source=/git/vllm,target=/workspace,rw \
+    . /opt/arch_flags.sh && \
     if [ -z "$MAX_JOBS" ]; then unset MAX_JOBS; fi && \
     python3 -m pip install --no-cache-dir py-cpuinfo && \
     if [ -f 'use_existing_torch.py' ]; then \
@@ -88,7 +111,9 @@ WORKDIR /wheels
 
 FROM builder-base AS flashinfer-builder
 RUN --mount=type=bind,from=flashinfer-downloader,source=/git/flashinfer,target=/workspace,rw \
+    . /opt/arch_flags.sh && \
     export TORCH_CUDA_ARCH_LIST="$(echo "${TORCH_CUDA_ARCH_LIST}" | sed 's@[67]\.0 \+@@g')" && \
+    sed -i 's@torch\.cuda\.get_device_capability()@(12, 0)@' flashinfer/comm/trtllm_ar.py && \
     python3 -m flashinfer.aot && \
     python3 -m pip wheel -w /wheels \
       -v --no-cache-dir --no-build-isolation --no-deps \
@@ -101,6 +126,7 @@ WORKDIR /wheels
 FROM builder-base AS lmcache-builder
 # LMCache must be built from source as it doesn't have pre-built ARM binaries
 RUN --mount=type=bind,from=lmcache-downloader,source=/git/LMCache,target=/workspace,rw \
+    . /opt/arch_flags.sh && \
     python3 -m pip install --no-cache-dir 'xxhash==3.5.0' 'setuptools_scm>=8' && \
     sed -Ei \
       '/[ "]*(torch(vision|audio)?|xformers) *[<>=~]+/d' \
@@ -155,14 +181,13 @@ RUN --mount=type=bind,from=lmcache-builder,source=/wheels,target=/tmp/wheels \
 ARG TARGETPLATFORM
 
 RUN if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install --no-cache-dir \
-          accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' \
-          boto3 runai-model-streamer runai-model-streamer[s3] -c /tmp/constraints.txt; \
+      BITSANDBYTES_VER='0.42.0'; \
     else \
-        python3 -m pip install --no-cache-dir \
-          accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.46.1' 'timm==0.9.10' \
-          boto3 runai-model-streamer runai-model-streamer[s3] -c /tmp/constraints.txt; \
+      BITSANDBYTES_VER='0.46.1'; \
     fi && \
+    python3 -m pip install --no-cache-dir \
+      accelerate hf_transfer 'modelscope!=1.15.0' "bitsandbytes>=${BITSANDBYTES_VER:?}" 'timm==0.9.10' \
+      boto3 runai-model-streamer runai-model-streamer[s3] -c /tmp/constraints.txt && \
     rm /tmp/constraints.txt
 
 EXPOSE 8080
diff --git a/vllm-tensorizer/nvcc-wrapper.py b/vllm-tensorizer/nvcc-wrapper.py
@@ -0,0 +1,160 @@
+#!/bin/env python3
+
+"""
+Wraps invocations of ``nvcc``, watching for evidence of SIGKILL or SIGSEGV,
+and then re-running the ``nvcc`` command a configurable number of times.
+
+Checking for SIGKILL or SIGSEGV is implemented by checking for either:
+
+- A subprocess return code indicating either of these signals, or
+- The standard ``dash`` error messages for either signal.
+
+``dash`` status messages are checked as NVCC utilizes ``sh``
+subprocesses internally, and ``sh`` usually resolves to
+the ``dash`` shell within Ubuntu container images.
+
+This wrapper also has the ability to filter out some -gencode flags.
+Gencode flags to filter out should be identified by their code parameter
+in a semicolon-delimited list stored in the NVCC_WRAPPER_FILTER_CODES
+environment variable.
+"""
+
+import asyncio
+import os
+import re
+import shutil
+import signal
+import subprocess
+import sys
+from typing import BinaryIO, Final, FrozenSet, Iterable, List, Sequence, Set
+
+NVCC_PATH: Final[str] = shutil.which("nvcc")
+if NVCC_PATH is None:
+    raise SystemExit("NVCC wrapper: fatal: nvcc binary not found")
+
+WRAPPER_ATTEMPTS: Final[int] = int(os.getenv("NVCC_WRAPPER_ATTEMPTS") or 10)
+if WRAPPER_ATTEMPTS < 1:
+    raise SystemExit("NVCC wrapper: fatal: invalid value for NVCC_WRAPPER_ATTEMPTS")
+
+FILTER_CODES: Final[FrozenSet[str]] = frozenset(
+    filter(None, os.getenv("NVCC_WRAPPER_FILTER_CODES", "").split(";"))
+)
+if FILTER_CODES and not all(
+    re.fullmatch(r"(?:sm|compute|lto)_\d+[af]?", a) for a in FILTER_CODES
+):
+    raise SystemExit("NVCC wrapper: fatal: invalid value for NVCC_WRAPPER_FILTER_CODES")
+
+RETRY_RET_CODES: Final[FrozenSet[int]] = frozenset({
+    -signal.SIGSEGV,
+    -signal.SIGKILL,
+    128 + signal.SIGSEGV,
+    128 + signal.SIGKILL,
+    255,
+})
+
+
+async def main(args) -> int:
+    args = transform_args(args)
+    ret: int = 0
+    for attempt in range(1, WRAPPER_ATTEMPTS + 1):
+        if attempt > 1:
+            print(
+                "NVCC wrapper: info:"
+                f" Retrying [{attempt:d}/{WRAPPER_ATTEMPTS:d}]"
+                f" after exit code {ret:d}",
+                file=sys.stderr,
+                flush=True,
+            )
+            # Wait an exponentially increasing amount of time
+            # before trying again, up to one minute
+            await asyncio.sleep(min(60, int(1.5**attempt)))
+        proc = await asyncio.create_subprocess_exec(
+            NVCC_PATH, *args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        restart_signals: tuple = await asyncio.gather(
+            monitor_stream(proc.stdout, sys.stdout.buffer),
+            monitor_stream(proc.stderr, sys.stderr.buffer),
+        )
+        ret = await proc.wait()
+        del proc
+        if ret == 0 or not any(restart_signals) and ret not in RETRY_RET_CODES:
+            break
+    else:
+        print(
+            "NVCC wrapper: info:"
+            f" Maximum attempts reached, exiting with status {ret:d}",
+            file=sys.stderr,
+            flush=True,
+        )
+    return ret
+
+
+async def monitor_stream(
+    stream: asyncio.StreamReader,
+    output: BinaryIO,
+    watch_for: Iterable[bytes] = (
+        b"Segmentation fault",
+        b"Segmentation fault (core dumped)",
+        b"Killed",
+    ),
+) -> bool:
+    found: bool = False
+    while line := await stream.readline():
+        found = found or line.strip() in watch_for
+        output.write(line)
+        output.flush()
+    return found
+
+
+def transform_args(args: Sequence[str]) -> Sequence[str]:
+    # This filters out args of the form -gencode=arch=X,code=Y
+    # or -gencode arch=X,code=Y for any code in FILTER_CODES.
+    # This does not filter arguments specified using the
+    # --gpu-architecture and --gpu-code flags, nor codes specified
+    # among others in groups, like -gencode=arch=X,code=[Y,Z].
+    if not FILTER_CODES:
+        return args
+    transformed_args = []
+    partial: bool = False
+    gencode: Set[str] = {"-gencode", "--generate-code"}
+    for arg in args:
+        if not partial and arg in gencode:
+            partial = True
+            transformed_args.append(arg)
+            continue
+        if partial:
+            pattern: str = r"(arch=[^,]+,code=)(\S+)"
+        else:
+            pattern: str = r"((?:-gencode|--generate-code)=arch=\S+,code=)(\S+)"
+        m: re.Match = re.fullmatch(pattern, arg)
+        if m:
+            code: str = m.group(2)
+            if code in FILTER_CODES:
+                if partial:
+                    # There was a hanging `-gencode` arg before this, so delete it
+                    assert transformed_args[-1] in gencode
+                    del transformed_args[-1]
+            elif re.fullmatch(r"\[\S+]", code):
+                codes: List[str] = code[1:-1].split(",")
+                filtered_codes: List[str] = [c for c in codes if c not in FILTER_CODES]
+                if filtered_codes:
+                    filtered_code: str = ",".join(filtered_codes)
+                    if len(filtered_codes) > 1:
+                        filtered_code = f"[{filtered_code}]"
+                    transformed_args.append(m.group(1) + filtered_code)
+                elif partial:
+                    assert transformed_args[-1] in gencode
+                    del transformed_args[-1]
+            else:
+                transformed_args.append(arg)
+        else:
+            transformed_args.append(arg)
+        partial = False
+    return transformed_args
+
+
+if __name__ == "__main__":
+    try:
+        sys.exit(asyncio.run(main(sys.argv[1:])))
+    except KeyboardInterrupt:
+        sys.exit(130)