Add regex support to vLLM endpoint

tscholak · rlouf · commit 0032c659dad9 · 2023-12-29T18:53:57.000+01:00
diff --git a/outlines/serve/serve.py b/outlines/serve/serve.py
@@ -17,19 +17,23 @@
 
 import uvicorn
 import vllm.model_executor.layers.sampler as sampler
-
-from .vllm import JSONLogitsProcessor, _patched_apply_logits_processors
-
-# Patch the _apply_logits_processors so it is compatible with `JSONLogitsProcessor`
-sampler._apply_logits_processors = _patched_apply_logits_processors
-
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 
+from .vllm import (
+    JSONLogitsProcessor,
+    RegexLogitsProcessor,
+    _patched_apply_logits_processors,
+)
+
+# Patch `_apply_logits_processors` to support the JSON and regex logits processors
+sampler._apply_logits_processors = _patched_apply_logits_processors
+
+
 TIMEOUT_KEEP_ALIVE = 5  # seconds.
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds.
 app = FastAPI()
@@ -48,22 +52,28 @@ async def generate(request: Request) -> Response:
 
     The request should be a JSON object with the following fields:
     - prompt: the prompt to use for the generation.
-    - schema: the JSON schema to use for the generation
+    - schema: the JSON schema to use for the generation (takes precedence over regex).
+    - regex: the regex to use for the generation (used only if no schema is given).
     - stream: whether to stream the results or not.
     - other fields: the sampling parameters (See `SamplingParams` for details).
     """
+    assert engine is not None
+
     request_dict = await request.json()
     prompt = request_dict.pop("prompt")
     stream = request_dict.pop("stream", False)
 
     json_schema = request_dict.pop("schema", None)
+    regex_string = request_dict.pop("regex", None)
     if json_schema is not None:
-        logits_processors = [JSONLogitsProcessor(json_schema, engine.engine)]  # type: ignore
+        logits_processors = [JSONLogitsProcessor(json_schema, engine.engine)]
+    elif regex_string is not None:
+        logits_processors = [RegexLogitsProcessor(regex_string, engine.engine)]
     else:
         logits_processors = []
 
     sampling_params = SamplingParams(
-        **request_dict, logits_processors=logits_processors
+        **request_dict, logits_processors=logits_processors  # type: ignore
     )
     request_id = random_uuid()
 
@@ -107,7 +117,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
 
     # Adds the `engine_use_ray`,  `disable_log_requests` and `max_log_len`
     # arguments
-    engine_args = AsyncEngineArgs.from_cli_args(args)
+    engine_args: AsyncEngineArgs = AsyncEngineArgs.from_cli_args(args)  # type: ignore
 
     # Sets default for the model (`facebook/opt-125m`)
     engine = AsyncLLMEngine.from_engine_args(engine_args)
diff --git a/outlines/serve/vllm.py b/outlines/serve/vllm.py
@@ -39,25 +39,21 @@ def _patched_apply_logits_processors(
     return logits
 
 
-class JSONLogitsProcessor:
-    def __init__(self, schema, llm):
-        """Compile the FSM that drives the JSON-guided generation.
+class RegexLogitsProcessor:
+    def __init__(self, regex_string, llm):
+        """Compile the FSM that drives the regex-guided generation.
 
         Parameters
         ----------
-        pydantic_model
-            A Pydantic `BaseModel` that encodes the structure we want
-            the model to generate.
+        regex_string
+            A string that represents a regular expression
         llm
             An instance of `vllm.LLM`
 
         """
-        if isinstance(schema, dict):
-            schema = json.dumps(schema)
-        regex_str = build_regex_from_object(schema)
         tokenizer = self.adapt_tokenizer(llm.tokenizer)
 
-        fsm = RegexFSM(regex_str, tokenizer)
+        fsm = RegexFSM(regex_string, tokenizer)
         self.fsm = fsm
 
     def __call__(
@@ -106,3 +102,21 @@ def convert_token_to_string(token: str) -> str:
         tokenizer.convert_token_to_string = convert_token_to_string
 
         return tokenizer
+
+
+class JSONLogitsProcessor(RegexLogitsProcessor):
+    def __init__(self, schema, llm):
+        """Compile the FSM that drives the JSON-guided generation.
+
+        Parameters
+        ----------
+        schema
+            A JSON schema (a dict is serialized with `json.dumps`) describing the output
+        llm
+            An instance of `vllm.LLM`
+
+        """
+        if isinstance(schema, dict):
+            schema = json.dumps(schema)
+        regex_string = build_regex_from_object(schema)
+        super().__init__(regex_string, llm)
diff --git a/pyproject.toml b/pyproject.toml
@@ -53,7 +53,12 @@ test = [
     "datasets",
     "responses",
 ]
-serve = ["vllm==0.2.6"]
+serve = [
+    "vllm==0.2.6",
+    "ray==2.9.0",
+    "uvicorn",
+    "fastapi"
+]
 
 [project.urls]
 homepage = "https://github.com/outlines-dev/outlines"