@@ -2,24 +2,9 @@
 import json
 import math
 from collections import defaultdict
-from typing import DefaultDict, List, Optional
+from typing import DefaultDict, List

 import torch
-import torch.nn as nn
-from vllm.model_executor.layers.sampler import (
-    _SAMPLING_EPS,
-    _apply_min_p,
-    _apply_penalties,
-    _apply_top_p_top_k,
-    _build_sampler_output,
-    _get_logits,
-    _get_logprobs,
-    _get_penalties,
-    _get_temperatures,
-    _get_top_p_top_k_min_p,
-    _prune_hidden_states,
-    _sample,
-)

 from outlines.fsm.fsm import RegexFSM
 from outlines.fsm.json_schema import build_regex_from_object
@@ -54,98 +39,6 @@ def _patched_apply_logits_processors(
     return logits


-class PatchedSampler(nn.Module):
-    """This code is copied from vLLM and uses the patched logits processor.
-
-    Samples the next tokens from the model's outputs.
-
-    This layer does the following:
-    1. Discard the hidden states that are not used for sampling (i.e., all
-        tokens except the final one in each prompt).
-    2. Compute the logits for the next tokens.
-    3. Apply presence, frequency and repetition penalties.
-    4. Apply temperature scaling.
-    5. Apply top-p and top-k truncation.
-    6. Sample the next tokens.
-
-    Here, each sequence group within the batch can have different sampling
-    parameters (e.g., sampling method, temperature, top-p, top-k, etc.).
-    """
-
-    def __init__(self, vocab_size: int) -> None:
-        super().__init__()
-        self.vocab_size = vocab_size
-
-    def forward(
-        self,
-        embedding: torch.Tensor,
-        hidden_states: torch.Tensor,
-        sampling_metadata,
-        embedding_bias: Optional[torch.Tensor] = None,
-    ):
-        # Get the hidden states that we use for sampling.
-        hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
-
-        # Get the logits for the next tokens.
-        logits = _get_logits(hidden_states, embedding, embedding_bias, self.vocab_size)
-
-        # Apply logits processors (if any).
-        logits = _patched_apply_logits_processors(logits, sampling_metadata)
-        # Apply presence and frequency penalties.
-        presence_penalties, frequency_penalties, repetition_penalties = _get_penalties(
-            sampling_metadata
-        )
-        assert len(presence_penalties) == logits.shape[0]
-        assert len(frequency_penalties) == logits.shape[0]
-        assert len(repetition_penalties) == logits.shape[0]
-        logits = _apply_penalties(
-            logits,
-            sampling_metadata,
-            presence_penalties,
-            frequency_penalties,
-            repetition_penalties,
-        )
-
-        # Apply temperature scaling.
-        temperatures = _get_temperatures(sampling_metadata)
-        assert len(temperatures) == logits.shape[0]
-        if any(t != 1.0 for t in temperatures):
-            t = torch.tensor(temperatures, dtype=logits.dtype, device=logits.device)
-            # Use in-place division to avoid creating a new tensor.
-            logits.div_(t.unsqueeze(dim=1))
-
-        # Apply top-p and top-k truncation.
-        top_ps, top_ks, min_ps = _get_top_p_top_k_min_p(
-            sampling_metadata, self.vocab_size
-        )
-        assert len(top_ps) == len(top_ks) == logits.shape[0]
-        do_top_p = any(p < 1.0 - _SAMPLING_EPS for p in top_ps)
-        do_top_k = any(k != self.vocab_size for k in top_ks)
-        if do_top_p or do_top_k:
-            logits = _apply_top_p_top_k(logits, top_ps, top_ks)
-
-        do_min_p = any(mp > _SAMPLING_EPS for mp in min_ps)
-        if do_min_p:
-            logits = _apply_min_p(logits, min_ps)
-
-        # We use float32 for probabilities and log probabilities.
-        # Compute the probabilities.
-        probs = torch.softmax(logits, dim=-1, dtype=torch.float)
-        # Compute the log probabilities.
-        # Use log_softmax to ensure numerical stability.
-        logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
-
-        # Sample the next tokens.
-        sample_results = _sample(probs, logprobs, sampling_metadata)
-        # Get the logprobs query results.
-        prompt_logprobs, sample_logprobs = _get_logprobs(
-            logprobs, sampling_metadata, sample_results
-        )
-        return _build_sampler_output(
-            sample_results, sampling_metadata, prompt_logprobs, sample_logprobs
-        )
-
-
 class JSONLogitsProcessor:
     def __init__(self, schema, llm):
         """Compile the FSM that drives the JSON-guided generation.
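For context (not part of the commit): the guided-generation machinery that remains after this change, `_patched_apply_logits_processors` together with `JSONLogitsProcessor` and `RegexFSM`, works by masking the logits so that only tokens the FSM can accept from its current state stay sampleable. The snippet below is a minimal illustrative sketch of that masking step in isolation; `allowed_token_ids` and `mask_logits_for_fsm` are hypothetical names introduced here, not the actual outlines or vLLM API.

import math
import torch

def mask_logits_for_fsm(logits: torch.Tensor, allowed_token_ids: list) -> torch.Tensor:
    # Illustrative sketch: `logits` is the 1-D logits vector for one sequence and
    # `allowed_token_ids` is the (hypothetical) set of vocabulary ids the FSM allows
    # in its current state. Disallowed tokens are pushed to -inf so they get zero
    # probability after the softmax, while allowed tokens keep their original scores.
    mask = torch.full_like(logits, -math.inf)
    mask[allowed_token_ids] = 0.0
    return logits + mask

Adding the mask, rather than overwriting the logits, preserves the relative scores of the allowed tokens, so temperature scaling, top-p/top-k truncation, and the other steps listed in the removed PatchedSampler docstring still operate normally on the surviving candidates.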