Skip to content

Commit 6160769

Browse files
authored
Release 0.3.2
2 parents d91ccb8 + d25b6f9 commit 6160769

File tree

13 files changed

+35
-105
lines changed

13 files changed

+35
-105
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "vllm-base-image/vllm"]
2+
path = vllm-base-image/vllm
3+
url = https://github.com/runpod/vllm-fork-for-sls-worker.git

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
ARG WORKER_CUDA_VERSION=11.8.0
2-
FROM runpod/worker-vllm:base-0.3.1-cuda${WORKER_CUDA_VERSION} AS vllm-base
2+
FROM runpod/worker-vllm:base-0.3.2-cuda${WORKER_CUDA_VERSION} AS vllm-base
33

44
RUN apt-get update -y \
55
&& apt-get install -y python3-pip

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
Deploy Blazing-fast LLMs powered by [vLLM](https://github.com/vllm-project/vllm) on RunPod Serverless in a few clicks.
66

7-
<p>Worker Version: 0.3.1 | vLLM Version: 0.3.2</p>
7+
<p>Worker Version: 0.3.2 | vLLM Version: 0.3.3</p>
88

99
[![CD | Docker-Build-Release](https://github.com/runpod-workers/worker-vllm/actions/workflows/docker-build-release.yml/badge.svg)](https://github.com/runpod-workers/worker-vllm/actions/workflows/docker-build-release.yml)
1010

@@ -88,7 +88,7 @@ This table provides a quick reference to the image tags you should use based on
8888
**LLM Settings**
8989
| `MODEL_NAME`**\*** | - | `str` | Hugging Face Model Repository (e.g., `openchat/openchat-3.5-1210`). |
9090
| `MODEL_REVISION` | `None` | `str` | Model revision (branch) to load. |
91-
| `MAX_MODEL_LENGTH` | Model's maximum | `int` |Maximum number of tokens for the engine to handle per request. |
91+
| `MAX_MODEL_LEN` | Model's maximum | `int` |Maximum number of tokens for the engine to handle per request. |
9292
| `BASE_PATH` | `/runpod-volume` | `str` | Storage directory for the Hugging Face cache and model. When set to `/runpod-volume` and a network volume is attached, the model is downloaded once by a single worker and can then be loaded by all workers. If no network volume is present, a local directory is created within each worker. |
9393
| `LOAD_FORMAT` | `auto` | `str` |Format to load model in. |
9494
| `HF_TOKEN` | - | `str` |Hugging Face token for private and gated models. |

builder/download_model.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ def move_files(src_dir, dest_dir):
4545
with open("/local_model_path.txt", "w") as f:
4646
f.write(model_folder)
4747

48-
if tokenizer != model:
49-
tokenizer_folder = download_extras_or_tokenizer(tokenizer, download_dir, revisions["tokenizer"])
50-
with open("/local_tokenizer_path.txt", "w") as f:
51-
f.write(tokenizer_folder)
48+
tokenizer_folder = download_extras_or_tokenizer(tokenizer, download_dir, revisions["tokenizer"])
49+
with open("/local_tokenizer_path.txt", "w") as f:
50+
f.write(tokenizer_folder)

builder/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@ runpod==1.6.2
66
huggingface-hub
77
packaging
88
typing-extensions==4.7.1
9-
pydantic
9+
pydantic
10+
pydantic-settings

src/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def _initialize_config(self):
3939
"trust_remote_code": bool(int(os.getenv("TRUST_REMOTE_CODE", 0))),
4040
"gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.95)),
4141
"max_parallel_loading_workers": None if device_count() > 1 or not os.getenv("MAX_PARALLEL_LOADING_WORKERS") else int(os.getenv("MAX_PARALLEL_LOADING_WORKERS")),
42-
"max_model_len": int(os.getenv("MAX_MODEL_LENGTH")) if os.getenv("MAX_MODEL_LENGTH") else None,
42+
"max_model_len": int(os.getenv("MAX_MODEL_LEN")) if os.getenv("MAX_MODEL_LEN") else None,
4343
"tensor_parallel_size": device_count(),
4444
"seed": int(os.getenv("SEED")) if os.getenv("SEED") else None,
4545
"kv_cache_dtype": os.getenv("KV_CACHE_DTYPE"),

src/constants.py

Lines changed: 1 addition & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,4 @@
1-
from typing import Union
2-
31
DEFAULT_BATCH_SIZE = 50
42
DEFAULT_MAX_CONCURRENCY = 300
53
DEFAULT_BATCH_SIZE_GROWTH_FACTOR = 3
6-
DEFAULT_MIN_BATCH_SIZE = 1
7-
8-
SAMPLING_PARAM_TYPES = {
9-
"n": int,
10-
"best_of": int,
11-
"presence_penalty": float,
12-
"frequency_penalty": float,
13-
"repetition_penalty": float,
14-
"temperature": Union[float, int],
15-
"top_p": float,
16-
"top_k": int,
17-
"min_p": float,
18-
"use_beam_search": bool,
19-
"length_penalty": float,
20-
"early_stopping": Union[bool, str],
21-
"stop": Union[str, list],
22-
"stop_token_ids": list,
23-
"ignore_eos": bool,
24-
"max_tokens": int,
25-
"logprobs": int,
26-
"prompt_logprobs": int,
27-
"skip_special_tokens": bool,
28-
"spaces_between_special_tokens": bool,
29-
"include_stop_str_in_output": bool
30-
}
4+
DEFAULT_MIN_BATCH_SIZE = 1

src/engine.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from torch.cuda import device_count
77
from typing import AsyncGenerator
88

9-
from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
9+
from vllm import AsyncLLMEngine, AsyncEngineArgs
1010
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
1111
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
1212
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest, ErrorResponse
@@ -16,7 +16,6 @@
1616
from tokenizer import TokenizerWrapper
1717
from config import EngineConfig
1818

19-
2019
class vLLMEngine:
2120
def __init__(self, engine = None):
2221
load_dotenv() # For local development
@@ -35,7 +34,7 @@ async def generate(self, job_input: JobInput):
3534
try:
3635
async for batch in self._generate_vllm(
3736
llm_input=job_input.llm_input,
38-
validated_sampling_params=job_input.validated_sampling_params,
37+
validated_sampling_params=job_input.sampling_params,
3938
batch_size=job_input.max_batch_size,
4039
stream=job_input.stream,
4140
apply_chat_template=job_input.apply_chat_template,
@@ -45,12 +44,11 @@ async def generate(self, job_input: JobInput):
4544
):
4645
yield batch
4746
except Exception as e:
48-
yield create_error_response(str(e)).model_dump()
47+
yield {"error": create_error_response(str(e)).model_dump()}
4948

5049
async def _generate_vllm(self, llm_input, validated_sampling_params, batch_size, stream, apply_chat_template, request_id, batch_size_growth_factor, min_batch_size: str) -> AsyncGenerator[dict, None]:
5150
if apply_chat_template or isinstance(llm_input, list):
5251
llm_input = self.tokenizer.apply_chat_template(llm_input)
53-
validated_sampling_params = SamplingParams(**validated_sampling_params)
5452
results_generator = self.llm.generate(llm_input, validated_sampling_params, request_id)
5553
n_responses, n_input_tokens, is_first_output = validated_sampling_params.n, 0, True
5654
last_output_texts, token_counters = ["" for _ in range(n_responses)], {"batch": 0, "total": 0}

src/utils.py

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
import logging
22
from http import HTTPStatus
33
from typing import Any, Dict
4-
from constants import SAMPLING_PARAM_TYPES
54
from vllm.utils import random_uuid
65
from vllm.entrypoints.openai.protocol import ErrorResponse
7-
6+
from vllm import SamplingParams
87

98
logging.basicConfig(level=logging.INFO)
109

@@ -25,20 +24,6 @@ def count_physical_cores():
2524

2625
return len(cores)
2726

28-
def validate_sampling_params(params: Dict[str, Any]) -> Dict[str, Any]:
29-
validated_params = {}
30-
invalid_params = []
31-
for key, value in params.items():
32-
expected_type = SAMPLING_PARAM_TYPES.get(key)
33-
if expected_type and isinstance(value, expected_type):
34-
validated_params[key] = value
35-
else:
36-
invalid_params.append(key)
37-
38-
if len(invalid_params) > 0:
39-
logging.warning("Ignoring invalid sampling params: %s", invalid_params)
40-
41-
return validated_params
4227

4328
class JobInput:
4429
def __init__(self, job):
@@ -47,7 +32,7 @@ def __init__(self, job):
4732
self.max_batch_size = job.get("max_batch_size")
4833
self.apply_chat_template = job.get("apply_chat_template", False)
4934
self.use_openai_format = job.get("use_openai_format", False)
50-
self.validated_sampling_params = validate_sampling_params(job.get("sampling_params", {}))
35+
self.sampling_params = SamplingParams(**job.get("sampling_params", {}))
5136
self.request_id = random_uuid()
5237
batch_size_growth_factor = job.get("batch_size_growth_factor")
5338
self.batch_size_growth_factor = float(batch_size_growth_factor) if batch_size_growth_factor else None
@@ -78,4 +63,6 @@ def update(self):
7863
def create_error_response(message: str, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
7964
return ErrorResponse(message=message,
8065
type=err_type,
81-
code=status_code.value)
66+
code=status_code.value)
67+
68+

vllm-base/Dockerfile renamed to vllm-base-image/Dockerfile

Lines changed: 13 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -17,25 +17,16 @@ ARG WORKER_CUDA_VERSION
1717
RUN apt-get update -y \
1818
&& apt-get install -y python3-pip git
1919

20-
RUN if [ "${WORKER_CUDA_VERSION}" = "12.1.0" ]; then \
21-
ldconfig /usr/local/cuda-12.1/compat/; \
22-
fi
23-
2420
# Set working directory
2521
WORKDIR /vllm-installation
2622

2723
# Install build and runtime dependencies
28-
COPY vllm-${WORKER_CUDA_VERSION}/requirements.txt requirements.txt
24+
COPY vllm/requirements-${WORKER_CUDA_VERSION}.txt requirements.txt
2925
RUN --mount=type=cache,target=/root/.cache/pip \
3026
pip install -r requirements.txt
3127

32-
RUN --mount=type=cache,target=/root/.cache/pip \
33-
if [ "${WORKER_CUDA_VERSION}" = "11.8.0" ]; then \
34-
pip install -U --force-reinstall torch==2.1.2 xformers==0.0.23.post1 --index-url https://download.pytorch.org/whl/cu118; \
35-
fi
36-
3728
# Install development dependencies
38-
COPY vllm-${WORKER_CUDA_VERSION}/requirements-dev.txt requirements-dev.txt
29+
COPY vllm/requirements-dev.txt requirements-dev.txt
3930
RUN --mount=type=cache,target=/root/.cache/pip \
4031
pip install -r requirements-dev.txt
4132

@@ -45,25 +36,15 @@ FROM dev AS build
4536
ARG WORKER_CUDA_VERSION
4637

4738
# Install build dependencies
48-
COPY vllm-${WORKER_CUDA_VERSION}/requirements-build.txt requirements-build.txt
39+
COPY vllm/requirements-build.txt requirements-build.txt
4940
RUN --mount=type=cache,target=/root/.cache/pip \
5041
pip install -r requirements-build.txt
5142

5243
# Copy necessary files
53-
COPY vllm-${WORKER_CUDA_VERSION}/csrc csrc
54-
COPY vllm-${WORKER_CUDA_VERSION}/setup.py setup.py
55-
COPY vllm-12.1.0/pyproject.toml pyproject.toml
56-
COPY vllm-${WORKER_CUDA_VERSION}/vllm/__init__.py vllm/__init__.py
57-
58-
# Conditional installation based on CUDA version
59-
RUN --mount=type=cache,target=/root/.cache/pip \
60-
if [ "${WORKER_CUDA_VERSION}" = "11.8.0" ]; then \
61-
pip install -U --force-reinstall torch==2.1.2 xformers==0.0.23.post1 --index-url https://download.pytorch.org/whl/cu118; \
62-
rm pyproject.toml; \
63-
elif [ "${WORKER_CUDA_VERSION}" != "12.1.0" ]; then \
64-
echo "WORKER_CUDA_VERSION not supported"; \
65-
exit 1; \
66-
fi
44+
COPY vllm/csrc csrc
45+
COPY vllm/setup.py setup.py
46+
COPY vllm/pyproject.toml pyproject.toml
47+
COPY vllm/vllm/__init__.py vllm/__init__.py
6748

6849
# Set environment variables for building extensions
6950
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
@@ -72,8 +53,10 @@ ARG max_jobs=48
7253
ENV MAX_JOBS=${max_jobs}
7354
ARG nvcc_threads=1024
7455
ENV NVCC_THREADS=${nvcc_threads}
75-
56+
ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION}
57+
ENV VLLM_INSTALL_PUNICA_KERNELS=0
7658
# Build extensions
59+
RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/
7760
RUN python3 setup.py build_ext --inplace
7861

7962
FROM nvidia/cuda:${WORKER_CUDA_VERSION}-runtime-ubuntu22.04 AS vllm-base
@@ -88,19 +71,15 @@ RUN apt-get update -y \
8871
# Set working directory
8972
WORKDIR /vllm-installation
9073

74+
9175
# Install runtime dependencies
92-
COPY vllm-${WORKER_CUDA_VERSION}/requirements.txt requirements.txt
76+
COPY vllm/requirements-${WORKER_CUDA_VERSION}.txt requirements.txt
9377
RUN --mount=type=cache,target=/root/.cache/pip \
9478
pip install -r requirements.txt
9579

96-
RUN --mount=type=cache,target=/root/.cache/pip \
97-
if [ "${WORKER_CUDA_VERSION}" = "11.8.0" ]; then \
98-
pip install -U --force-reinstall torch==2.1.2 xformers==0.0.23.post1 --index-url https://download.pytorch.org/whl/cu118; \
99-
fi
100-
10180
# Copy built files from the build stage
10281
COPY --from=build /vllm-installation/vllm/*.so /vllm-installation/vllm/
103-
COPY vllm-${WORKER_CUDA_VERSION}/vllm vllm
82+
COPY vllm/vllm vllm
10483

10584
# Set PYTHONPATH environment variable
10685
ENV PYTHONPATH="/"

0 commit comments

Comments
 (0)