@@ -208,46 +208,40 @@ def __init__(self,
208
208
#
209
209
# GPU cache
210
210
#
211
- # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
212
- # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
213
- # TODO: remove in 0.12.0
214
- if self .show_hidden_metrics :
215
- gauge_gpu_cache_usage = self ._gauge_cls (
216
- name = "vllm:gpu_cache_usage_perc" ,
217
- documentation = (
218
- "GPU KV-cache usage. 1 means 100 percent usage."
219
- "DEPRECATED: Use vllm:kv_cache_usage_perc instead." ),
220
- multiprocess_mode = "mostrecent" ,
221
- labelnames = labelnames )
222
- self .gauge_gpu_cache_usage = make_per_engine (
223
- gauge_gpu_cache_usage , engine_indexes , model_name )
224
-
225
- # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
226
- # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
227
- # TODO: remove in 0.12.0
228
- if self .show_hidden_metrics :
229
- counter_gpu_prefix_cache_queries = self ._counter_cls (
230
- name = "vllm:gpu_prefix_cache_queries" ,
231
- documentation = (
232
- "GPU prefix cache queries, in terms of number of queried"
233
- "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
234
- ),
235
- labelnames = labelnames )
236
- self .counter_gpu_prefix_cache_queries = make_per_engine (
237
- counter_gpu_prefix_cache_queries , engine_indexes , model_name )
238
-
239
- # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
240
- # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
241
- # TODO: remove in 0.12.0
242
- if self .show_hidden_metrics :
243
- counter_gpu_prefix_cache_hits = self ._counter_cls (
244
- name = "vllm:gpu_prefix_cache_hits" ,
245
- documentation = (
246
- "GPU prefix cache hits, in terms of number of cached "
247
- "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead." ),
248
- labelnames = labelnames )
249
- self .counter_gpu_prefix_cache_hits = make_per_engine (
250
- counter_gpu_prefix_cache_hits , engine_indexes , model_name )
211
+ # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
212
+ # TODO: in 0.10, only enable if show_hidden_metrics=True
213
+ gauge_gpu_cache_usage = self ._gauge_cls (
214
+ name = "vllm:gpu_cache_usage_perc" ,
215
+ documentation = (
216
+ "GPU KV-cache usage. 1 means 100 percent usage."
217
+ "DEPRECATED: Use vllm:kv_cache_usage_perc instead." ),
218
+ multiprocess_mode = "mostrecent" ,
219
+ labelnames = labelnames )
220
+ self .gauge_gpu_cache_usage = make_per_engine (gauge_gpu_cache_usage ,
221
+ engine_indexes ,
222
+ model_name )
223
+
224
+ # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
225
+ # TODO: in 0.10, only enable if show_hidden_metrics=True
226
+ counter_gpu_prefix_cache_queries = self ._counter_cls (
227
+ name = "vllm:gpu_prefix_cache_queries" ,
228
+ documentation = (
229
+ "GPU prefix cache queries, in terms of number of queried"
230
+ "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead." ),
231
+ labelnames = labelnames )
232
+ self .counter_gpu_prefix_cache_queries = make_per_engine (
233
+ counter_gpu_prefix_cache_queries , engine_indexes , model_name )
234
+
235
+ # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
236
+ # TODO: in 0.10, only enable if show_hidden_metrics=True
237
+ counter_gpu_prefix_cache_hits = self ._counter_cls (
238
+ name = "vllm:gpu_prefix_cache_hits" ,
239
+ documentation = (
240
+ "GPU prefix cache hits, in terms of number of cached "
241
+ "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead." ),
242
+ labelnames = labelnames )
243
+ self .counter_gpu_prefix_cache_hits = make_per_engine (
244
+ counter_gpu_prefix_cache_hits , engine_indexes , model_name )
251
245
252
246
gauge_kv_cache_usage = self ._gauge_cls (
253
247
name = "vllm:kv_cache_usage_perc" ,
@@ -521,17 +515,15 @@ def record(self,
521
515
self .gauge_scheduler_waiting [engine_idx ].set (
522
516
scheduler_stats .num_waiting_reqs )
523
517
524
- if self .show_hidden_metrics :
525
- self .gauge_gpu_cache_usage [engine_idx ].set (
526
- scheduler_stats .kv_cache_usage )
518
+ self .gauge_gpu_cache_usage [engine_idx ].set (
519
+ scheduler_stats .kv_cache_usage )
527
520
self .gauge_kv_cache_usage [engine_idx ].set (
528
521
scheduler_stats .kv_cache_usage )
529
522
530
- if self .show_hidden_metrics :
531
- self .counter_gpu_prefix_cache_queries [engine_idx ].inc (
532
- scheduler_stats .prefix_cache_stats .queries )
533
- self .counter_gpu_prefix_cache_hits [engine_idx ].inc (
534
- scheduler_stats .prefix_cache_stats .hits )
523
+ self .counter_gpu_prefix_cache_queries [engine_idx ].inc (
524
+ scheduler_stats .prefix_cache_stats .queries )
525
+ self .counter_gpu_prefix_cache_hits [engine_idx ].inc (
526
+ scheduler_stats .prefix_cache_stats .hits )
535
527
536
528
self .counter_prefix_cache_queries [engine_idx ].inc (
537
529
scheduler_stats .prefix_cache_stats .queries )
0 commit comments