Skip to content

Commit 73fd266

Browse files
committed
update more artifacts for release v0.5.0 rc1
Signed-off-by: Daneyon Hansen <[email protected]>
Signed-off-by: Nir Rozenbaum <[email protected]>
1 parent 392a8cc commit 73fd266

File tree

4 files changed
+138 -44 lines changed

config/manifests/inferencepool-resources.yaml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
# Note: If you change this file, please also change the file used for e2e tests!
2-
#
3-
# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml
1+
# Note: If you change this file, please also change:
2+
# - ./test/testdata/inferencepool-e2e.yaml
3+
# - ./conformance/resources/manifests/manifests.yaml
4+
# - ./site-src/guides/inferencepool-rollout.md
5+
---
46
apiVersion: inference.networking.x-k8s.io/v1alpha2
57
kind: InferencePool
68
metadata:

conformance/resources/manifests/manifests.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -196,8 +196,8 @@ spec:
196196
terminationGracePeriodSeconds: 130
197197
containers:
198198
- name: epp
199-
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
200-
imagePullPolicy: Always
199+
image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.0-rc.1
200+
imagePullPolicy: IfNotPresent
201201
args:
202202
- -poolName
203203
- "primary-inference-pool"
@@ -293,8 +293,8 @@ spec:
293293
terminationGracePeriodSeconds: 130
294294
containers:
295295
- name: epp
296-
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
297-
imagePullPolicy: Always
296+
image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.0-rc.1
297+
imagePullPolicy: IfNotPresent
298298
args:
299299
- -poolName
300300
- "secondary-inference-pool"
@@ -342,7 +342,7 @@ apiVersion: v1
342342
kind: ConfigMap
343343
metadata:
344344
name: plugins-config
345-
namespace: default
345+
namespace: gateway-conformance-app-backend
346346
data:
347347
conformance-plugins.yaml: |
348348
apiVersion: inference.networking.x-k8s.io/v1alpha1

site-src/guides/inferencepool-rollout.md

Lines changed: 127 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,6 @@ spec:
177177
terminationGracePeriodSeconds: 130
178178
nodeSelector:
179179
cloud.google.com/gke-accelerator: "nvidia-h100-80gb"
180-
181180
volumes:
182181
- name: data
183182
emptyDir: {}
@@ -250,40 +249,133 @@ spec:
250249
spec:
251250
terminationGracePeriodSeconds: 130
252251
containers:
253-
- name: epp
254-
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
255-
imagePullPolicy: Always
256-
args:
257-
- -poolName
258-
- "vllm-llama3-8b-instruct-new"
259-
- "-poolNamespace"
260-
- "default"
261-
- -v
262-
- "4"
263-
- --zap-encoder
264-
- "json"
265-
- -grpcPort
266-
- "9002"
267-
- -grpcHealthPort
268-
- "9003"
269-
ports:
270-
- containerPort: 9002
271-
- containerPort: 9003
272-
- name: metrics
273-
containerPort: 9090
274-
livenessProbe:
275-
grpc:
276-
port: 9003
277-
service: inference-extension
278-
initialDelaySeconds: 5
279-
periodSeconds: 10
280-
readinessProbe:
281-
grpc:
282-
port: 9003
283-
service: inference-extension
284-
initialDelaySeconds: 5
285-
periodSeconds: 10
286-
EOF
252+
- name: epp
253+
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
254+
imagePullPolicy: Always
255+
args:
256+
- -poolName
257+
- "vllm-llama3-8b-instruct-new"
258+
- -poolNamespace
259+
- "default"
260+
- -v
261+
- "4"
262+
- --zap-encoder
263+
- "json"
264+
- -grpcPort
265+
- "9002"
266+
- -grpcHealthPort
267+
- "9003"
268+
- -configFile
269+
- "/config/default-plugins.yaml"
270+
ports:
271+
- containerPort: 9002
272+
name: grpc
273+
- containerPort: 9003
274+
name: grpc-health
275+
- containerPort: 9090
276+
name: metrics
277+
livenessProbe:
278+
grpc:
279+
port: 9003
280+
service: inference-extension
281+
initialDelaySeconds: 5
282+
periodSeconds: 10
283+
readinessProbe:
284+
grpc:
285+
port: 9003
286+
service: inference-extension
287+
initialDelaySeconds: 5
288+
periodSeconds: 10
289+
volumeMounts:
290+
- name: plugins-config-volume
291+
mountPath: /config
292+
volumes:
293+
- name: plugins-config-volume
294+
configMap:
295+
name: plugins-config
296+
---
297+
apiVersion: v1
298+
kind: ConfigMap
299+
metadata:
300+
name: plugins-config
301+
namespace: default
302+
data:
303+
default-plugins.yaml: |
304+
apiVersion: inference.networking.x-k8s.io/v1alpha1
305+
kind: EndpointPickerConfig
306+
plugins:
307+
- type: low-queue-filter
308+
parameters:
309+
threshold: 128
310+
- type: lora-affinity-filter
311+
parameters:
312+
threshold: 0.999
313+
- type: least-queue-filter
314+
- type: least-kv-cache-filter
315+
- type: decision-tree-filter
316+
name: low-latency-filter
317+
parameters:
318+
current:
319+
pluginRef: low-queue-filter
320+
nextOnSuccess:
321+
decisionTree:
322+
current:
323+
pluginRef: lora-affinity-filter
324+
nextOnSuccessOrFailure:
325+
decisionTree:
326+
current:
327+
pluginRef: least-queue-filter
328+
nextOnSuccessOrFailure:
329+
decisionTree:
330+
current:
331+
pluginRef: least-kv-cache-filter
332+
nextOnFailure:
333+
decisionTree:
334+
current:
335+
pluginRef: least-queue-filter
336+
nextOnSuccessOrFailure:
337+
decisionTree:
338+
current:
339+
pluginRef: lora-affinity-filter
340+
nextOnSuccessOrFailure:
341+
decisionTree:
342+
current:
343+
pluginRef: least-kv-cache-filter
344+
- type: random-picker
345+
parameters:
346+
maxNumOfEndpoints: 1
347+
- type: single-profile-handler
348+
schedulingProfiles:
349+
- name: default
350+
plugins:
351+
- pluginRef: low-latency-filter
352+
- pluginRef: random-picker
353+
plugins-v2.yaml: |
354+
apiVersion: inference.networking.x-k8s.io/v1alpha1
355+
kind: EndpointPickerConfig
356+
plugins:
357+
- type: queue-scorer
358+
- type: kv-cache-scorer
359+
- type: prefix-cache-scorer
360+
parameters:
361+
hashBlockSize: 64
362+
maxPrefixBlocksToMatch: 256
363+
lruCapacityPerServer: 31250
364+
- type: max-score-picker
365+
parameters:
366+
maxNumOfEndpoints: 1
367+
- type: single-profile-handler
368+
schedulingProfiles:
369+
- name: default
370+
plugins:
371+
- pluginRef: queue-scorer
372+
weight: 1
373+
- pluginRef: kv-cache-scorer
374+
weight: 1
375+
- pluginRef: prefix-cache-scorer
376+
weight: 1
377+
- pluginRef: max-score-picker
378+
EOF
287379
```
288380

289381
### Direct traffic to the new inference pool

version/version.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,5 @@ package version
1818

1919
const (
2020
// BundleVersion is the value used for labeling the version of the gateway-api-inference-extension.
21-
BundleVersion = "v0.4.0-dev"
21+
BundleVersion = "v0.5.0-rc.1"
2222
)

0 commit comments

Comments
 (0)