Merge pull request #102 from coreweave/es/fa3-te-update

wbrown · web-flow · commit 7ef4f053862b · 2025-07-21T13:55:04.000-04:00
feat: Update `torch` & `vllm-tensorizer` images, improve build process
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
@@ -5,5 +5,5 @@ include:
   - torch: 2.7.1
     vision: 0.22.1
     audio: 2.7.1
-    nccl: 2.27.5-1
-    nccl-tests-hash: '0120901'
+    nccl: 2.27.6-1
+    nccl-tests-hash: '7c12c62'
diff --git a/.github/configurations/vllm-tensorizer.yml b/.github/configurations/vllm-tensorizer.yml
@@ -1,8 +1,8 @@
 vllm-commit:
-  - 'b6553be1bc75f046b00046a4ad7576364d03c835'
+  - 'v0.9.2'
 flashinfer-commit:
   - 'v0.2.6.post1'
 builder-base-image:
-  - 'ghcr.io/coreweave/ml-containers/torch-extras:es-cuda-12.9.1-74755e9-nccl-cuda12.9.1-ubuntu22.04-nccl2.27.5-1-torch2.7.1-vision0.22.1-audio2.7.1-abi1'
+  - 'ghcr.io/coreweave/ml-containers/torch-extras:es-fa3-te-update-7a94157-nccl-cuda12.9.1-ubuntu22.04-nccl2.27.6-1-torch2.7.1-vision0.22.1-audio2.7.1-abi1'
 final-base-image:
-  - 'ghcr.io/coreweave/ml-containers/torch-extras:es-cuda-12.9.1-74755e9-base-cuda12.9.1-ubuntu22.04-torch2.7.1-vision0.22.1-audio2.7.1-abi1'
+  - 'ghcr.io/coreweave/ml-containers/torch-extras:es-fa3-te-update-7a94157-base-cuda12.9.1-ubuntu22.04-torch2.7.1-vision0.22.1-audio2.7.1-abi1'
diff --git a/.github/workflows/vllm-tensorizer.yml b/.github/workflows/vllm-tensorizer.yml
@@ -2,6 +2,7 @@ on:
   push:
     paths:
       - "vllm-tensorizer/**"
+      - ".github/configurations/vllm-tensorizer.yml"
       - ".github/workflows/vllm-tensorizer.yml"
       - ".github/workflows/build.yml"
 
diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
@@ -5,7 +5,7 @@ ARG DEEPSPEED_VERSION="0.14.4"
 ARG APEX_COMMIT="a1df80457ba67d60cbdb0d3ddfb08a2702c821a8"
 ARG DEEPSPEED_KERNELS_COMMIT="e77acc40b104696d4e73229b787d1ef29a9685b1"
 ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST="80;86;89;90"
-ARG XFORMERS_VERSION="0.0.30"
+ARG XFORMERS_VERSION="0.0.31.post1"
 ARG BUILD_MAX_JOBS=""
 
 FROM alpine/git:2.36.3 as apex-downloader
diff --git a/torch/Dockerfile b/torch/Dockerfile
@@ -5,9 +5,9 @@ ARG FINAL_BASE_IMAGE="nvidia/cuda:12.9.1-base-ubuntu22.04"
 ARG BUILD_TORCH_VERSION="2.7.1"
 ARG BUILD_TORCH_VISION_VERSION="0.22.1"
 ARG BUILD_TORCH_AUDIO_VERSION="2.7.1"
-ARG BUILD_TRANSFORMERENGINE_VERSION="1.13"
+ARG BUILD_TRANSFORMERENGINE_VERSION="2.4"
 ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1"
-ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1"
+ARG BUILD_FLASH_ATTN_3_VERSION="b36ad4ef767d2d5536ff8af2e3f720ae4eba731c"
 ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TRITON="1"
 ARG BUILD_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0 12.0+PTX"
@@ -90,7 +90,8 @@ RUN ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_VER
 FROM downloader-base AS flash-attn-3-downloader
 ARG BUILD_FLASH_ATTN_3_VERSION
 RUN if [ -n "$BUILD_FLASH_ATTN_3_VERSION" ]; then \
-      ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_3_VERSION}"; \
+      ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_3_VERSION}" && \
+      git -C flash-attention cherry-pick -n 3edf7e0daa62662cd2dd2ec8fd999dd7f254415c; \
     else \
       mkdir flash-attention; \
     fi
@@ -329,11 +330,16 @@ ARG BUILD_MAX_JOBS=""
 RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \
     --mount=type=cache,target=/ccache \
     if [ "$BUILD_TRITON" = '1' ]; then \
-      pip3 install --no-cache-dir pybind11 && \
+      pip3 install --no-cache-dir pybind11 lit && \
       export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
-      cd triton/python && \
-      python3 -m pip wheel -w ../../dist/ --no-build-isolation --no-deps -vv . && \
-      pip3 install ../../dist/*.whl; \
+      DIST_DIR="$(realpath -e ./dist)" && \
+      if [ -f 'triton/python/setup.py' ]; then \
+        cd triton/python; \
+      else \
+        cd triton; \
+      fi && \
+      python3 -m pip wheel -w "${DIST_DIR}/" --no-build-isolation --no-deps -vv . && \
+      pip3 install --no-cache-dir "${DIST_DIR}"/*.whl; \
     fi
 
 ARG BUILD_TORCH_VERSION
@@ -348,15 +354,22 @@ ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}"
 RUN printf 'Arch: %s\nTORCH_CUDA_ARCH_LIST=%s\n' "$(uname -m)" "${TORCH_CUDA_ARCH_LIST}"
 
 ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a"
-# Add sm_100a & sm_120 builds if NV_CUDA_LIB_VERSION matches 12.[89].*
+# Add sm_100a & sm_120a builds if NV_CUDA_LIB_VERSION matches 12.[89].*
 RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \
     case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \
       FLAGS="${FLAGS}$( \
-        printf -- ' -gencode=arch=compute_%s,code=sm_%s' 120 120 100 100 100a 100a \
+        printf -- ' -gencode=arch=compute_%s,code=sm_%s' 120a 120a 100a 100a \
       )" ;; \
     esac && \
     echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf
 
+COPY --link --chmod=755 nvcc-wrapper.py /build/nvcc-wrapper.py
+ENV PYTORCH_NVCC='/build/nvcc-wrapper.py' \
+    CMAKE_CUDA_COMPILER='/build/nvcc-wrapper.py'
+# Filter these codes because we already build for the architecture-specific
+# versions of them instead.
+ENV NVCC_WRAPPER_FILTER_CODES='sm_90;sm_100;sm_120;compute_90;compute_100'
+
 # If the directory /opt/nccl-tests exists,
 # the base image is assumed to be nccl-tests,
 # so it uses the system's special NCCL and UCC installations for the build.
@@ -534,10 +547,6 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE
       export NVTE_CUDA_ARCHS="${NVTE_CUDA_ARCHS%;100*}" ;; \
     esac && \
     cd TransformerEngine && \
-    if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \
-      sed -i "s/from functools import cache/from functools import lru_cache as cache/g" \
-        build_tools/utils.py; \
-    fi && \
     python3 setup.py bdist_wheel --dist-dir /build/dist
 
 FROM builder-base AS flash-attn-builder-base
@@ -550,8 +559,9 @@ COPY <<-"EOT" /build/fa-build.sh
 	#!/bin/bash
 	set -eo pipefail;
 	if [ -n "$1" ]; then cd "$1"; fi;
+	echo "Flash Attention build: building $(realpath -s .)";
 	python3 setup.py bdist_wheel --dist-dir /build/dist \
-	| grep -Ev --line-buffered '^ptxas (/tmp/|(info|warning)\s*:)|bytes spill stores'
+	| grep -Ev --line-buffered '^ptxas (/tmp/|(info|warning)\s*:)|bytes spill stores';
 EOT
 RUN chmod 755 /build/fa-build.sh
 
@@ -581,8 +591,10 @@ FROM flash-attn-builder-base AS flash-attn-3-builder
 # Artifically sequence this build stage after the previous one
 # to prevent parallelism, because these are both very resource-intensive
 RUN --mount=type=bind,from=flash-attn-builder,source=/build,target=/build :
+ARG BUILD_FLASH_ATTN_MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-3}"
 
 # Build flash-attn v3
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,target=flash-attention/,rw \
     --mount=type=cache,target=/ccache \
     if [ ! -d flash-attention/hopper ]; then \
@@ -592,8 +604,16 @@ RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,t
       MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 10 6)}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
     export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    if [ "$(uname -m)" = 'aarch64' ]; then \
+      export FLASH_ATTENTION_DISABLE_SM80=TRUE; \
+    else \
+      NVCC_APPEND_FLAGS="${NVCC_APPEND_FLAGS:+$NVCC_APPEND_FLAGS }-Xcompiler -mcmodel=medium"; \
+    fi && \
     echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
-    /build/fa-build.sh flash-attention/hopper
+    sed -i \
+      's@if bare_metal_version != Version("12.8"):@if bare_metal_version < Version("12.8"):@' \
+      flash-attention/hopper/setup.py && \
+    NVCC_THREADS=4 /build/fa-build.sh flash-attention/hopper
 
 FROM builder-base AS builder
 COPY --link --from=torchaudio-builder /build/dist/ /build/dist/
@@ -671,27 +691,27 @@ COPY --link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh
 # - libnvjitlink-X-Y only exists for CUDA versions >= 12-0.
 # - Don't mess with libnccl2 when using nccl-tests as a base,
 #   checked via the existence of the directory "/opt/nccl-tests".
-RUN export \
-      CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \
-      CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f2) && \
-    export \
-      CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
+RUN CUDA_MAJOR_VERSION="$(echo "$CUDA_VERSION" | cut -d. -f1)" && \
+    CUDA_MINOR_VERSION="$(echo "$CUDA_VERSION" | cut -d. -f2)" && \
+    CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
+    CUDART_VERSION_SPEC="${NV_CUDA_CUDART_VERSION:+=$NV_CUDA_CUDART_VERSION}" && \
     apt-get -qq update && \
     apt-get -qq install --no-upgrade -y \
       libcurand-${CUDA_PACKAGE_VERSION} \
       libcufft-${CUDA_PACKAGE_VERSION} \
       libcublas-${CUDA_PACKAGE_VERSION} \
       cuda-nvrtc-${CUDA_PACKAGE_VERSION} \
+      cuda-cudart-dev-${CUDA_PACKAGE_VERSION}"${CUDART_VERSION_SPEC}" \
       libcusparse-${CUDA_PACKAGE_VERSION} \
       libcusolver-${CUDA_PACKAGE_VERSION} \
       libcufile-${CUDA_PACKAGE_VERSION} \
       cuda-cupti-${CUDA_PACKAGE_VERSION} \
       libnvjpeg-${CUDA_PACKAGE_VERSION} \
       libnvtoolsext1 && \
-    { if [ $CUDA_MAJOR_VERSION -ge 12 ]; then \
+    { if [ "$CUDA_MAJOR_VERSION" -ge 12 ]; then \
       apt-get -qq install --no-upgrade -y libnvjitlink-${CUDA_PACKAGE_VERSION}; fi; } && \
     { if [ ! -d /opt/nccl-tests ]; then \
-      export NCCL_PACKAGE_VERSION="2.*+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}" && \
+      NCCL_PACKAGE_VERSION="2.*+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}" && \
       apt-get -qq install --no-upgrade -y "libnccl2=$NCCL_PACKAGE_VERSION"; fi; } && \
     /tmp/install_cudnn.sh "$CUDA_VERSION" runtime && \
     rm /tmp/install_cudnn.sh && \
@@ -717,7 +737,12 @@ RUN <<-"EOT" python3
 	from pathlib import Path
 	from py_compile import compile
 
-	dist = metadata.distribution("flashattn-hopper")
+	try:
+	    dist = metadata.distribution("flash-attn-3")
+	    record_pattern = "flash?attn?3-*.dist-info/RECORD"
+	except metadata.PackageNotFoundError:
+	    dist = metadata.distribution("flashattn-hopper")
+	    record_pattern = "flashattn?hopper-*.dist-info/RECORD"
 	p = dist.locate_file("flash_attn_interface.py")
 	print("flash_attn_interface:", p)
 	root = p.parent
@@ -727,7 +752,7 @@ RUN <<-"EOT" python3
 	if not p.is_file():
 	    raise SystemExit("flash_attn_interface path is not a file")
 
-	d = root / "flashattn_hopper"
+	d = root / "flash_attn_3"
 	if d.exists():
 	    raise SystemExit(f'"{d}" already exists')
 
@@ -747,7 +772,7 @@ RUN <<-"EOT" python3
 
 
 	for f in dist.files:
-	    if f.match("flashattn?hopper-*.dist-info/RECORD"):
+	    if f.match(record_pattern):
 	        with f.locate().open("a", encoding="utf-8", newline="") as record:
 	            for added in (new, compiled):
 	                record.write(record_entry(added))
diff --git a/torch/install_cudnn.sh b/torch/install_cudnn.sh
@@ -33,7 +33,9 @@ LIBCUDNN_VER="$(
 
 if [ -z "$LIBCUDNN_VER" ]; then
     apt-get -qq update && \
-    apt-get -qq install --no-upgrade -y "${DEV_PREFIX}cudnn9-cuda-${CUDA_MAJOR_VERSION}" && \
+    apt-get -qq install --no-upgrade -y \
+        "${DEV_PREFIX}cudnn9-cuda-${CUDA_MAJOR_VERSION}" \
+        "libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION}" && \
     apt-get clean && \
     ldconfig;
 else
diff --git a/torch/nvcc-wrapper.py b/torch/nvcc-wrapper.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+
+"""
+Wraps invocations of ``nvcc``, watching for evidence of SIGKILL or SIGSEGV,
+and then re-running the ``nvcc`` command a configurable number of times.
+
+Checking for SIGKILL or SIGSEGV is implemented by checking for either:
+
+- A subprocess return code indicating either of these signals, or
+- The standard ``dash`` error messages for either signal.
+
+``dash`` status messages are checked as NVCC utilizes ``sh``
+subprocesses internally, and ``sh`` usually resolves to
+the ``dash`` shell within Ubuntu container images.
+
+This wrapper also has the ability to filter out some -gencode flags.
+Gencode flags to filter out should be identified by their code parameter
+in a semicolon-delimited list stored in the NVCC_WRAPPER_FILTER_CODES
+environment variable.
+"""
+
+import asyncio
+import os
+import re
+import shutil
+import signal
+import subprocess
+import sys
+from typing import BinaryIO, Final, FrozenSet, Iterable, List, Sequence, Set
+
+NVCC_PATH: Final[str] = shutil.which("nvcc")
+if NVCC_PATH is None:
+    raise SystemExit("NVCC wrapper: fatal: nvcc binary not found")
+
+WRAPPER_ATTEMPTS: Final[int] = int(os.getenv("NVCC_WRAPPER_ATTEMPTS") or 10)
+if WRAPPER_ATTEMPTS < 1:
+    raise SystemExit("NVCC wrapper: fatal: invalid value for NVCC_WRAPPER_ATTEMPTS")
+
+FILTER_CODES: Final[FrozenSet[str]] = frozenset(
+    filter(None, os.getenv("NVCC_WRAPPER_FILTER_CODES", "").split(";"))
+)
+if FILTER_CODES and not all(
+    re.fullmatch(r"(?:sm|compute|lto)_\d+[af]?", a) for a in FILTER_CODES
+):
+    raise SystemExit("NVCC wrapper: fatal: invalid value for NVCC_WRAPPER_FILTER_CODES")
+
+RETRY_RET_CODES: Final[FrozenSet[int]] = frozenset({
+    -signal.SIGSEGV,
+    -signal.SIGKILL,
+    128 + signal.SIGSEGV,
+    128 + signal.SIGKILL,
+    255,
+})
+
+
+async def main(args) -> int:
+    args = transform_args(args)
+    ret: int = 0
+    for attempt in range(1, WRAPPER_ATTEMPTS + 1):
+        if attempt > 1:
+            print(
+                "NVCC wrapper: info:"
+                f" Retrying [{attempt:d}/{WRAPPER_ATTEMPTS:d}]"
+                f" after exit code {ret:d}",
+                file=sys.stderr,
+                flush=True,
+            )
+            # Wait an exponentially increasing amount of time
+            # before trying again, up to one minute
+            await asyncio.sleep(min(60, int(1.5**attempt)))
+        proc = await asyncio.create_subprocess_exec(
+            NVCC_PATH, *args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        restart_signals: tuple = await asyncio.gather(
+            monitor_stream(proc.stdout, sys.stdout.buffer),
+            monitor_stream(proc.stderr, sys.stderr.buffer),
+        )
+        ret = await proc.wait()
+        del proc
+        if ret == 0 or not any(restart_signals) and ret not in RETRY_RET_CODES:
+            break
+    else:
+        print(
+            "NVCC wrapper: info:"
+            f" Maximum attempts reached, exiting with status {ret:d}",
+            file=sys.stderr,
+            flush=True,
+        )
+    return ret
+
+
+async def monitor_stream(
+    stream: asyncio.StreamReader,
+    output: BinaryIO,
+    watch_for: Iterable[bytes] = (
+        b"Segmentation fault",
+        b"Segmentation fault (core dumped)",
+        b"Killed",
+    ),
+) -> bool:
+    found: bool = False
+    while line := await stream.readline():
+        found = found or line.strip() in watch_for
+        output.write(line)
+        output.flush()
+    return found
+
+
+def transform_args(args: Sequence[str]) -> Sequence[str]:
+    # This filters out args of the form -gencode=arch=X,code=Y
+    # or -gencode arch=X,code=Y for any code in FILTER_CODES, and removes
+    # filtered codes from bracketed groups like -gencode=arch=X,code=[Y,Z].
+    # This does not filter arguments specified using the --gpu-architecture
+    # and --gpu-code flags, nor unbracketed lists like code=Y,Z.
+    if not FILTER_CODES:
+        return args
+    transformed_args = []
+    partial: bool = False
+    gencode: Set[str] = {"-gencode", "--generate-code"}
+    for arg in args:
+        if not partial and arg in gencode:
+            partial = True
+            transformed_args.append(arg)
+            continue
+        if partial:
+            pattern: str = r"(arch=[^,]+,code=)(\S+)"
+        else:
+            pattern: str = r"((?:-gencode|--generate-code)=arch=\S+,code=)(\S+)"
+        m: re.Match = re.fullmatch(pattern, arg)
+        if m:
+            code: str = m.group(2)
+            if code in FILTER_CODES:
+                if partial:
+                    # There was a hanging `-gencode` arg before this, so delete it
+                    assert transformed_args[-1] in gencode
+                    del transformed_args[-1]
+            elif re.fullmatch(r"\[\S+]", code):
+                codes: List[str] = code[1:-1].split(",")
+                filtered_codes: List[str] = [c for c in codes if c not in FILTER_CODES]
+                if filtered_codes:
+                    filtered_code: str = ",".join(filtered_codes)
+                    if len(filtered_codes) > 1:
+                        filtered_code = f"[{filtered_code}]"
+                    transformed_args.append(m.group(1) + filtered_code)
+                elif partial:
+                    assert transformed_args[-1] in gencode
+                    del transformed_args[-1]
+            else:
+                transformed_args.append(arg)
+        else:
+            transformed_args.append(arg)
+        partial = False
+    return transformed_args
+
+
+if __name__ == "__main__":
+    try:
+        sys.exit(asyncio.run(main(sys.argv[1:])))
+    except KeyboardInterrupt:
+        sys.exit(130)
diff --git a/vllm-tensorizer/Dockerfile b/vllm-tensorizer/Dockerfile