Skip to content

Commit 91167b8

Browse files
committed
v0.3.0: OpenAI Compatibility, Dynamic Stream Batching, Refactor, Error Catching
Merge commit 91167b8 (2 parents: 235d0d3 and 819102c)

File tree

13 files changed

+669
-313
lines changed

13 files changed

+669
-313
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
runpod.toml
33
*.pyc
44
.env
5-
test/*
5+
test/*
6+
vllm-base/vllm-*

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
ARG WORKER_CUDA_VERSION=11.8.0
2-
FROM runpod/worker-vllm:base-0.2.2-cuda${WORKER_CUDA_VERSION} AS vllm-base
2+
FROM runpod/worker-vllm:base-0.3.0-cuda${WORKER_CUDA_VERSION} AS vllm-base
33

44
RUN apt-get update -y \
55
&& apt-get install -y python3-pip

README.md

Lines changed: 416 additions & 127 deletions
Large diffs are not rendered by default.

builder/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ hf_transfer
22
ray
33
pandas
44
pyarrow
5-
runpod==1.5.3
5+
runpod==1.6.2
66
huggingface-hub
77
packaging
88
typing-extensions==4.7.1

src/__init__.py

Whitespace-only changes.

src/config.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import os
2+
from dotenv import load_dotenv
3+
from utils import count_physical_cores
4+
from torch.cuda import device_count
5+
6+
class EngineConfig:
    """Build the vLLM engine configuration from environment variables.

    The model and tokenizer may instead be baked into the image via marker
    files (``/local_model_path.txt`` / ``/local_tokenizer_path.txt``); when a
    marker file exists it takes precedence over the corresponding environment
    variable. The final ``self.config`` dict contains only explicitly-set
    values, so vLLM's own defaults apply to everything omitted.
    """

    def __init__(self):
        load_dotenv()  # pick up a .env file if one is present
        self.model_name_or_path, self.hf_home, self.model_revision = (
            self._get_local_or_env("/local_model_path.txt", "MODEL_NAME")
        )
        self.tokenizer_name_or_path, _, self.tokenizer_revision = (
            self._get_local_or_env("/local_tokenizer_path.txt", "TOKENIZER_NAME")
        )
        # Fall back to the model path when no separate tokenizer is given.
        self.tokenizer_name_or_path = self.tokenizer_name_or_path or self.model_name_or_path
        self.quantization = self._get_quantization()
        self.config = self._initialize_config()

    @staticmethod
    def _env_int(name, default=None):
        """Read an integer env var; unset or empty falls back to *default*."""
        value = os.getenv(name)
        return int(value) if value else default

    @staticmethod
    def _env_bool(name, default):
        """Read a 0/1 env var as bool; unset or empty falls back to *default*.

        The original inline form crashed with ValueError when the variable
        was set to an empty string; here that case uses the default instead.
        """
        value = os.getenv(name)
        return bool(int(value)) if value else bool(default)

    def _get_local_or_env(self, local_path, env_var):
        """Return ``(name_or_path, hf_home, revision)``.

        A baked-in marker file wins over the environment; it carries no
        HF_HOME or revision, so those come back as None in that case.
        """
        if os.path.exists(local_path):
            with open(local_path, "r") as file:
                return file.read().strip(), None, None
        return os.getenv(env_var), os.getenv("HF_HOME"), os.getenv(f"{env_var}_REVISION")

    def _get_quantization(self):
        """Return the requested quantization method, or None if unset/unsupported."""
        quantization = os.getenv("QUANTIZATION", "").lower()
        return quantization if quantization in ["awq", "squeezellm", "gptq"] else None

    def _initialize_config(self):
        """Assemble the engine kwargs, dropping every entry left unset (None)."""
        num_gpus = device_count()  # hoisted: was queried twice before
        args = {
            "model": self.model_name_or_path,
            "revision": self.model_revision,
            "download_dir": self.hf_home,
            "quantization": self.quantization,
            "load_format": os.getenv("LOAD_FORMAT", "auto"),
            # Quantized weights default to fp16; otherwise let vLLM decide.
            "dtype": os.getenv("DTYPE", "half" if self.quantization else "auto"),
            "tokenizer": self.tokenizer_name_or_path,
            "tokenizer_revision": self.tokenizer_revision,
            "disable_log_stats": self._env_bool("DISABLE_LOG_STATS", True),
            "disable_log_requests": self._env_bool("DISABLE_LOG_REQUESTS", True),
            "trust_remote_code": self._env_bool("TRUST_REMOTE_CODE", False),
            # `or` guards against an empty-string value, which float() rejects.
            "gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION") or 0.95),
            # Parallel loading workers are only honored on a single GPU.
            "max_parallel_loading_workers": (
                None if num_gpus > 1 else self._env_int("MAX_PARALLEL_LOADING_WORKERS")
            ),
            "max_model_len": self._env_int("MAX_MODEL_LENGTH"),
            "tensor_parallel_size": num_gpus,
            "seed": self._env_int("SEED"),
            "kv_cache_dtype": os.getenv("KV_CACHE_DTYPE"),
            "block_size": self._env_int("BLOCK_SIZE"),
            "swap_space": self._env_int("SWAP_SPACE"),
            "max_context_len_to_capture": self._env_int("MAX_CONTEXT_LEN_TO_CAPTURE"),
            "disable_custom_all_reduce": self._env_bool("DISABLE_CUSTOM_ALL_REDUCE", False),
            "enforce_eager": self._env_bool("ENFORCE_EAGER", False),
        }
        return {k: v for k, v in args.items() if v is not None}

src/constants.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
from typing import Union
22

3-
DEFAULT_BATCH_SIZE = 30
3+
DEFAULT_BATCH_SIZE = 50
44
DEFAULT_MAX_CONCURRENCY = 300
5+
DEFAULT_BATCH_SIZE_GROWTH_FACTOR = 3
6+
DEFAULT_MIN_BATCH_SIZE = 1
57

68
SAMPLING_PARAM_TYPES = {
79
"n": int,
@@ -25,4 +27,4 @@
2527
"skip_special_tokens": bool,
2628
"spaces_between_special_tokens": bool,
2729
"include_stop_str_in_output": bool
28-
}
30+
}

0 commit comments

Comments
 (0)