Commit 3c98b43

Revert "[Metrics] Hide deprecated metrics with gpu_ prefix (#24245)"

The hiding change is intended for 0.11.0. This reverts commit 2942970.
1 parent c10101a commit 3c98b43

2 files changed: +46 −63 lines changed
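
For context, the change being reverted registered each deprecated `gpu_`-prefixed metric only when hidden metrics were explicitly re-enabled (via `--show-hidden-metrics-for-version`). The sketch below shows that gating pattern with plain `prometheus_client`; vLLM wraps its own gauge and counter classes, and `record_usage` is a hypothetical helper for illustration:

```python
# Minimal sketch of the gating pattern the reverted commit introduced.
# Plain prometheus_client is used here; vLLM wraps its own metric classes.
from prometheus_client import Gauge

# In vLLM this flag is derived from --show-hidden-metrics-for-version;
# here it is a hand-set boolean for illustration.
show_hidden_metrics = False

# The canonical metric is always registered.
kv_cache_usage = Gauge(
    "vllm:kv_cache_usage_perc",
    "KV-cache usage. 1 means 100 percent usage.")

# The deprecated alias is registered only when hidden metrics are re-enabled.
gpu_cache_usage = None
if show_hidden_metrics:
    gpu_cache_usage = Gauge(
        "vllm:gpu_cache_usage_perc",
        "DEPRECATED: Use vllm:kv_cache_usage_perc instead.")

def record_usage(usage: float) -> None:
    """Hypothetical helper: keep the deprecated alias in sync while it exists."""
    kv_cache_usage.set(usage)
    if gpu_cache_usage is not None:
        gpu_cache_usage.set(usage)
```

Reverting restores unconditional registration, so 0.10.x keeps exporting both the old and new metric names.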

tests/entrypoints/openai/test_metrics.py

Lines changed: 6 additions & 15 deletions
```diff
@@ -232,9 +232,6 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
     "vllm:gpu_prefix_cache_hits",
-    "vllm:kv_cache_usage_perc",
-    "vllm:prefix_cache_queries",
-    "vllm:prefix_cache_hits",
     "vllm:num_preemptions_total",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
@@ -280,9 +277,6 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 ]
 
 HIDDEN_DEPRECATED_METRICS: list[str] = [
-    "vllm:gpu_cache_usage_perc",
-    "vllm:gpu_prefix_cache_queries",
-    "vllm:gpu_prefix_cache_hits",
     "vllm:time_per_output_token_seconds_sum",
     "vllm:time_per_output_token_seconds_bucket",
     "vllm:time_per_output_token_seconds_count",
@@ -313,7 +307,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
                                    client: openai.AsyncClient, use_v1: bool):
 
     running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server, use_v1))
+        _get_running_metrics_from_api(server))
 
     # Expect no running requests or kvcache usage
     assert running_requests == 0
@@ -336,7 +330,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
 
     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server, use_v1))
+        _get_running_metrics_from_api(server))
 
     # Expect running requests and kvcache usage
     assert running_requests > 0
@@ -355,7 +349,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
 
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1))
+        _get_running_metrics_from_api(server))
 
     assert running_requests_after == 0,\
         (f"Expected 0 running requests after abort, got "
@@ -368,7 +362,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
         f"{kv_cache_usage_after}")
 
 
-def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Return (running_count, waiting_count, kv_cache_usage)"""
 
    response = requests.get(server.url_for("metrics"))
@@ -377,9 +371,6 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None
 
-    kv_cache_usage_metric = ("vllm:kv_cache_usage_perc"
-                             if use_v1 else "vllm:gpu_cache_usage_perc")
-
     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
             for sample in family.samples:
@@ -391,9 +382,9 @@
                 if sample.name == "vllm:num_requests_waiting":
                     waiting_requests = sample.value
                     break
-        elif family.name == kv_cache_usage_metric:
+        elif family.name == "vllm:gpu_cache_usage_perc":
             for sample in family.samples:
-                if sample.name == kv_cache_usage_metric:
+                if sample.name == "vllm:gpu_cache_usage_perc":
                     kv_cache_usage = sample.value
                     break
 
```
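
The simplified helper above always reads `vllm:gpu_cache_usage_perc` instead of switching on `use_v1`. For reference, here is a standalone version of the same scrape-and-parse pattern; the URL is a placeholder, since the test obtains it from `server.url_for("metrics")`:

```python
# Standalone sketch of the scrape-and-parse pattern used by
# _get_running_metrics_from_api; the default URL is a placeholder.
import requests
from prometheus_client.parser import text_string_to_metric_families

def get_kv_cache_usage(metrics_url: str = "http://localhost:8000/metrics"):
    """Return the vllm:gpu_cache_usage_perc gauge value, or None if absent."""
    response = requests.get(metrics_url)
    response.raise_for_status()
    for family in text_string_to_metric_families(response.text):
        if family.name == "vllm:gpu_cache_usage_perc":
            for sample in family.samples:
                if sample.name == "vllm:gpu_cache_usage_perc":
                    return sample.value
    return None
```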

vllm/v1/metrics/loggers.py

Lines changed: 40 additions & 48 deletions
```diff
@@ -208,46 +208,40 @@ def __init__(self,
         #
         # GPU cache
         #
-        # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
-        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
-        # TODO: remove in 0.12.0
-        if self.show_hidden_metrics:
-            gauge_gpu_cache_usage = self._gauge_cls(
-                name="vllm:gpu_cache_usage_perc",
-                documentation=(
-                    "GPU KV-cache usage. 1 means 100 percent usage."
-                    "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
-                multiprocess_mode="mostrecent",
-                labelnames=labelnames)
-            self.gauge_gpu_cache_usage = make_per_engine(
-                gauge_gpu_cache_usage, engine_indexes, model_name)
-
-        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
-        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
-        # TODO: remove in 0.12.0
-        if self.show_hidden_metrics:
-            counter_gpu_prefix_cache_queries = self._counter_cls(
-                name="vllm:gpu_prefix_cache_queries",
-                documentation=(
-                    "GPU prefix cache queries, in terms of number of queried"
-                    "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
-                ),
-                labelnames=labelnames)
-            self.counter_gpu_prefix_cache_queries = make_per_engine(
-                counter_gpu_prefix_cache_queries, engine_indexes, model_name)
-
-        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
-        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
-        # TODO: remove in 0.12.0
-        if self.show_hidden_metrics:
-            counter_gpu_prefix_cache_hits = self._counter_cls(
-                name="vllm:gpu_prefix_cache_hits",
-                documentation=(
-                    "GPU prefix cache hits, in terms of number of cached "
-                    "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
-                labelnames=labelnames)
-            self.counter_gpu_prefix_cache_hits = make_per_engine(
-                counter_gpu_prefix_cache_hits, engine_indexes, model_name)
+        # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
+        gauge_gpu_cache_usage = self._gauge_cls(
+            name="vllm:gpu_cache_usage_perc",
+            documentation=(
+                "GPU KV-cache usage. 1 means 100 percent usage."
+                "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
+            multiprocess_mode="mostrecent",
+            labelnames=labelnames)
+        self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage,
+                                                     engine_indexes,
+                                                     model_name)
+
+        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
+        counter_gpu_prefix_cache_queries = self._counter_cls(
+            name="vllm:gpu_prefix_cache_queries",
+            documentation=(
+                "GPU prefix cache queries, in terms of number of queried"
+                "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."),
+            labelnames=labelnames)
+        self.counter_gpu_prefix_cache_queries = make_per_engine(
+            counter_gpu_prefix_cache_queries, engine_indexes, model_name)
+
+        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
+        counter_gpu_prefix_cache_hits = self._counter_cls(
+            name="vllm:gpu_prefix_cache_hits",
+            documentation=(
+                "GPU prefix cache hits, in terms of number of cached "
+                "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
+            labelnames=labelnames)
+        self.counter_gpu_prefix_cache_hits = make_per_engine(
+            counter_gpu_prefix_cache_hits, engine_indexes, model_name)
 
         gauge_kv_cache_usage = self._gauge_cls(
             name="vllm:kv_cache_usage_perc",
@@ -521,17 +515,15 @@ def record(self,
         self.gauge_scheduler_waiting[engine_idx].set(
             scheduler_stats.num_waiting_reqs)
 
-        if self.show_hidden_metrics:
-            self.gauge_gpu_cache_usage[engine_idx].set(
-                scheduler_stats.kv_cache_usage)
+        self.gauge_gpu_cache_usage[engine_idx].set(
+            scheduler_stats.kv_cache_usage)
         self.gauge_kv_cache_usage[engine_idx].set(
             scheduler_stats.kv_cache_usage)
 
-        if self.show_hidden_metrics:
-            self.counter_gpu_prefix_cache_queries[engine_idx].inc(
-                scheduler_stats.prefix_cache_stats.queries)
-            self.counter_gpu_prefix_cache_hits[engine_idx].inc(
-                scheduler_stats.prefix_cache_stats.hits)
+        self.counter_gpu_prefix_cache_queries[engine_idx].inc(
+            scheduler_stats.prefix_cache_stats.queries)
+        self.counter_gpu_prefix_cache_hits[engine_idx].inc(
+            scheduler_stats.prefix_cache_stats.hits)
 
         self.counter_prefix_cache_queries[engine_idx].inc(
             scheduler_stats.prefix_cache_stats.queries)
```
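
`make_per_engine` fans a single metric out into one bound child per engine index, which is why `record()` can index `self.gauge_gpu_cache_usage[engine_idx]`. A rough sketch of that idea follows; the helper body and label names are assumptions, not vLLM's actual implementation:

```python
# Rough sketch of the per-engine fan-out behind make_per_engine.
# The helper body and label names are assumptions for illustration.
from prometheus_client import Gauge

def make_per_engine(metric: Gauge, engine_indexes: list[int],
                    model_name: str) -> dict[int, Gauge]:
    """Bind one labeled child of `metric` per engine index."""
    return {
        idx: metric.labels(model_name=model_name, engine=str(idx))
        for idx in engine_indexes
    }

gauge = Gauge("vllm:kv_cache_usage_perc",
              "KV-cache usage. 1 means 100 percent usage.",
              labelnames=["model_name", "engine"])
per_engine = make_per_engine(gauge, [0, 1], "example/model")
per_engine[0].set(0.42)  # mirrors record() indexing by engine_idx
```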
