@@ -177,7 +177,6 @@ spec:
177
177
terminationGracePeriodSeconds : 130
178
178
nodeSelector :
179
179
cloud.google.com/gke-accelerator : " nvidia-h100-80gb"
180
-
181
180
volumes :
182
181
- name : data
183
182
emptyDir : {}
@@ -250,40 +249,133 @@ spec:
250
249
spec :
251
250
terminationGracePeriodSeconds : 130
252
251
containers :
253
- - name : epp
254
- image : us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
255
- imagePullPolicy : Always
256
- args :
257
- - -poolName
258
- - " vllm-llama3-8b-instruct-new"
259
- - " -poolNamespace"
260
- - " default"
261
- - -v
262
- - " 4"
263
- - --zap-encoder
264
- - " json"
265
- - -grpcPort
266
- - " 9002"
267
- - -grpcHealthPort
268
- - " 9003"
269
- ports :
270
- - containerPort : 9002
271
- - containerPort : 9003
272
- - name : metrics
273
- containerPort : 9090
274
- livenessProbe :
275
- grpc :
276
- port : 9003
277
- service : inference-extension
278
- initialDelaySeconds : 5
279
- periodSeconds : 10
280
- readinessProbe :
281
- grpc :
282
- port : 9003
283
- service : inference-extension
284
- initialDelaySeconds : 5
285
- periodSeconds : 10
286
- EOF
252
+ - name : epp
253
+ image : us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
254
+ imagePullPolicy : Always
255
+ args :
256
+ - -poolName
257
+ - " vllm-llama3-8b-instruct-new"
258
+ - -poolNamespace
259
+ - " default"
260
+ - -v
261
+ - " 4"
262
+ - --zap-encoder
263
+ - " json"
264
+ - -grpcPort
265
+ - " 9002"
266
+ - -grpcHealthPort
267
+ - " 9003"
268
+ - -configFile
269
+ - " /config/default-plugins.yaml"
270
+ ports :
271
+ - containerPort : 9002
272
+ name : grpc
273
+ - containerPort : 9003
274
+ name : grpc-health
275
+ - containerPort : 9090
276
+ name : metrics
277
+ livenessProbe :
278
+ grpc :
279
+ port : 9003
280
+ service : inference-extension
281
+ initialDelaySeconds : 5
282
+ periodSeconds : 10
283
+ readinessProbe :
284
+ grpc :
285
+ port : 9003
286
+ service : inference-extension
287
+ initialDelaySeconds : 5
288
+ periodSeconds : 10
289
+ volumeMounts :
290
+ - name : plugins-config-volume
291
+ mountPath : /config
292
+ volumes :
293
+ - name : plugins-config-volume
294
+ configMap :
295
+ name : plugins-config
296
+ ---
297
+ apiVersion : v1
298
+ kind : ConfigMap
299
+ metadata :
300
+ name : plugins-config
301
+ namespace : default
302
+ data :
303
+ default-plugins.yaml : |
304
+ apiVersion: inference.networking.x-k8s.io/v1alpha1
305
+ kind: EndpointPickerConfig
306
+ plugins:
307
+ - type: low-queue-filter
308
+ parameters:
309
+ threshold: 128
310
+ - type: lora-affinity-filter
311
+ parameters:
312
+ threshold: 0.999
313
+ - type: least-queue-filter
314
+ - type: least-kv-cache-filter
315
+ - type: decision-tree-filter
316
+ name: low-latency-filter
317
+ parameters:
318
+ current:
319
+ pluginRef: low-queue-filter
320
+ nextOnSuccess:
321
+ decisionTree:
322
+ current:
323
+ pluginRef: lora-affinity-filter
324
+ nextOnSuccessOrFailure:
325
+ decisionTree:
326
+ current:
327
+ pluginRef: least-queue-filter
328
+ nextOnSuccessOrFailure:
329
+ decisionTree:
330
+ current:
331
+ pluginRef: least-kv-cache-filter
332
+ nextOnFailure:
333
+ decisionTree:
334
+ current:
335
+ pluginRef: least-queue-filter
336
+ nextOnSuccessOrFailure:
337
+ decisionTree:
338
+ current:
339
+ pluginRef: lora-affinity-filter
340
+ nextOnSuccessOrFailure:
341
+ decisionTree:
342
+ current:
343
+ pluginRef: least-kv-cache-filter
344
+ - type: random-picker
345
+ parameters:
346
+ maxNumOfEndpoints: 1
347
+ - type: single-profile-handler
348
+ schedulingProfiles:
349
+ - name: default
350
+ plugins:
351
+ - pluginRef: low-latency-filter
352
+ - pluginRef: random-picker
353
+ plugins-v2.yaml : |
354
+ apiVersion: inference.networking.x-k8s.io/v1alpha1
355
+ kind: EndpointPickerConfig
356
+ plugins:
357
+ - type: queue-scorer
358
+ - type: kv-cache-scorer
359
+ - type: prefix-cache-scorer
360
+ parameters:
361
+ hashBlockSize: 64
362
+ maxPrefixBlocksToMatch: 256
363
+ lruCapacityPerServer: 31250
364
+ - type: max-score-picker
365
+ parameters:
366
+ maxNumOfEndpoints: 1
367
+ - type: single-profile-handler
368
+ schedulingProfiles:
369
+ - name: default
370
+ plugins:
371
+ - pluginRef: queue-scorer
372
+ weight: 1
373
+ - pluginRef: kv-cache-scorer
374
+ weight: 1
375
+ - pluginRef: prefix-cache-scorer
376
+ weight: 1
377
+ - pluginRef: max-score-picker
378
+ EOF
287
379
```
288
380
289
381
### Direct traffic to the new inference pool
0 commit comments