
Commit 6b64bb9

Merge pull request #147 from mohamednaji7/BitsAndBytes

Completing the "bitsandbytes" option, based on https://docs.vllm.ai/en/stable/quantization/bnb.html

2 parents: d77c53c + 0428824
File tree: 4 files changed (+9, -4 lines)

README.md (1 addition, 1 deletion)

@@ -125,7 +125,7 @@ Below is a summary of the available RunPod Worker images, categorized by image s
 | `MAX_NUM_SEQS` | 256 | `int` | Maximum number of sequences per iteration. |
 | `MAX_LOGPROBS` | 20 | `int` | Max number of log probs to return when logprobs is specified in SamplingParams. |
 | `DISABLE_LOG_STATS` | False | `bool` | Disable logging statistics. |
-| `QUANTIZATION` | None | ['awq', 'squeezellm', 'gptq'] | Method used to quantize the weights. |
+| `QUANTIZATION` | None | ['awq', 'squeezellm', 'gptq', 'bitsandbytes'] | Method used to quantize the weights. |
 | `ROPE_SCALING` | None | `dict` | RoPE scaling configuration in JSON format. |
 | `ROPE_THETA` | None | `float` | RoPE theta. Use with rope_scaling. |
 | `TOKENIZER_POOL_SIZE` | 0 | `int` | Size of tokenizer pool to use for asynchronous tokenization. |
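The README change adds 'bitsandbytes' to the values accepted by the `QUANTIZATION` environment variable. As a minimal sketch of how such an env var could be read and validated (the helper name and validation logic here are assumptions for illustration, not the worker's actual code):

```python
import os

# Methods the README table lists for QUANTIZATION after this commit.
ALLOWED_QUANTIZATION = {"awq", "squeezellm", "gptq", "bitsandbytes"}

def read_quantization(env=None):
    """Read and validate QUANTIZATION from the environment (hypothetical helper)."""
    env = os.environ if env is None else env
    value = env.get("QUANTIZATION")
    if value in (None, "", "None"):
        return None  # default from the table: no quantization
    if value.lower() not in ALLOWED_QUANTIZATION:
        raise ValueError(f"Unsupported quantization method: {value}")
    return value.lower()
```

For example, a worker started with `QUANTIZATION=bitsandbytes` would get `"bitsandbytes"` back, while an unset variable yields `None`.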

builder/requirements.txt (2 additions, 1 deletion)

@@ -4,8 +4,9 @@ pyarrow
 runpod~=1.7.7
 huggingface-hub
 packaging
-typing-extensions==4.7.1
+typing-extensions>=4.8.0
 pydantic
 pydantic-settings
 hf-transfer
 transformers
+bitsandbytes>=0.45.0

src/engine_args.py (3 additions, 0 deletions)

@@ -147,6 +147,9 @@ def get_engine_args():

     # Rename and match to vllm args
     args = match_vllm_args(args)
+
+    if args.get("load_format") == "bitsandbytes":
+        args["quantization"] = args["load_format"]

     # Set tensor parallel size and max parallel loading workers if more than 1 GPU is available
     num_gpus = device_count()
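The three added lines make `quantization` follow `load_format`, since per the linked vLLM bitsandbytes docs both options need to be set to 'bitsandbytes' for in-flight quantization. A standalone sketch of that rule, assuming the dict-shaped args seen in the diff (the function name is hypothetical):

```python
def apply_bitsandbytes_rule(args):
    """If load_format is 'bitsandbytes', force quantization to match (sketch of the diff's rule)."""
    args = dict(args)  # copy so the caller's dict is left untouched
    if args.get("load_format") == "bitsandbytes":
        # Mirrors the committed change: quantization is forced to the load format.
        args["quantization"] = args["load_format"]
    return args
```

Any other `load_format` value passes through unchanged, so the existing `QUANTIZATION` behavior is unaffected.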

worker-config.json (3 additions, 2 deletions)

@@ -802,14 +802,15 @@
       "env_var_name": "QUANTIZATION",
       "value": "",
       "title": "Quantization",
-      "description": "Method used to quantize the weights.",
+      "description": "Method used to quantize the weights.\nif the `Load Format` is 'bitsandbytes' then `Quantization` will be forced to 'bitsandbytes'",
       "required": false,
       "type": "select",
       "options": [
        { "value": "None", "label": "None" },
        { "value": "awq", "label": "AWQ" },
        { "value": "squeezellm", "label": "SqueezeLLM" },
-       { "value": "gptq", "label": "GPTQ" }
+       { "value": "gptq", "label": "GPTQ" },
+       { "value": "bitsandbytes", "label": "bitsandbytes" }
       ]
     },
     "ROPE_SCALING": {
