Skip to content

Commit fc98576

Browse files
committed
[fix]: small fixes for development YAMLs
Signed-off-by: Kfir Toledo <kfir.toledo@ibm.com>
1 parent a80c163 commit fc98576

17 files changed

+234
-95
lines changed

DEVELOPMENT.md

+28-12
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,13 @@ Export the name of the `Secret` to the environment:
178178
export REGISTRY_SECRET=anna-pull-secret
179179
```
180180

181+
You can optionally set a custom EPP image (otherwise, the default will be used):
182+
183+
```console
184+
export EPP_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
185+
export EPP_TAG="<YOUR_TAG>"
186+
```
187+
181188
Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy:
182189

183190
- `vllm-sim`: Lightweight simulator for simple environments
@@ -197,18 +204,14 @@ export VLLM_SIM_TAG="<YOUR_TAG>"
197204
```
198205

199206
For vllm and vllm-p2p:
200-
207+
- Set the vLLM image:
201208
```console
202209
export VLLM_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
203210
export VLLM_TAG="<YOUR_TAG>"
204211
```
205-
206-
The same thing will need to be done for the EPP:
207-
208-
```console
209-
export EPP_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
210-
export EPP_TAG="<YOUR_TAG>"
211-
```
212+
- Set the Hugging Face token variable:
213+
export HF_TOKEN="<HF_TOKEN>"
214+
**Warning**: For vllm mode, the default images use llama3-8b and vllm-mistral. Make sure you have permission to access these models in their respective repositories.
212215

213216
Once all this is set up, you can deploy the environment:
214217

@@ -224,12 +227,25 @@ kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80
224227
```
225228

226229
And making requests with `curl`:
230+
- vllm-sim
227231

228-
```console
229-
curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
230-
-d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq
231-
```
232+
```console
233+
curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
234+
-d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq
235+
```
236+
237+
- vllm
238+
239+
```console
240+
curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
241+
-d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq
242+
```
232243

244+
- vllm-p2p
245+
```console
246+
curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
247+
-d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","prompt":"hi","max_tokens":10,"temperature":0}' | jq
248+
```
233249
#### Development Cycle
234250

235251
> **WARNING**: This is a very manual process at the moment. We expect to make

deploy/components/inference-gateway/deployments.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,11 @@ spec:
4848
service: inference-extension
4949
initialDelaySeconds: 5
5050
periodSeconds: 10
51+
env:
52+
- name: KVCACHE_INDEXER_REDIS_ADDR
53+
value: ${REDIS_HOST}:${REDIS_PORT}
54+
- name: HF_TOKEN
55+
valueFrom:
56+
secretKeyRef:
57+
name: ${HF_SECRET_NAME}
58+
key: ${HF_SECRET_KEY}

deploy/components/inference-gateway/inference-models.yaml

+31-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,37 @@ spec:
66
modelName: food-review
77
criticality: Critical
88
poolRef:
9-
name: vllm-llama3-8b-instruct
9+
name: ${POOL_NAME}
1010
targetModels:
1111
- name: food-review
1212
weight: 100
13+
---
14+
apiVersion: inference.networking.x-k8s.io/v1alpha2
15+
kind: InferenceModel
16+
metadata:
17+
name: base-model
18+
spec:
19+
modelName: meta-llama/Llama-3.1-8B-Instruct
20+
criticality: Critical
21+
poolRef:
22+
name: ${POOL_NAME}
23+
---
24+
apiVersion: inference.networking.x-k8s.io/v1alpha2
25+
kind: InferenceModel
26+
metadata:
27+
name: base-model-cpu
28+
spec:
29+
modelName: Qwen/Qwen2.5-1.5B-Instruct
30+
criticality: Critical
31+
poolRef:
32+
name: ${POOL_NAME}
33+
---
34+
apiVersion: inference.networking.x-k8s.io/v1alpha2
35+
kind: InferenceModel
36+
metadata:
37+
name: mistarli
38+
spec:
39+
modelName: mistralai/Mistral-7B-Instruct-v0.2
40+
criticality: Critical
41+
poolRef:
42+
name: ${POOL_NAME}
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
apiVersion: inference.networking.x-k8s.io/v1alpha2
22
kind: InferencePool
33
metadata:
4-
name: vllm-llama3-8b-instruct
4+
name: ${POOL_NAME}
55
spec:
66
targetPortNumber: 8000
77
selector:
8-
app: vllm-llama3-8b-instruct
8+
app: ${POOL_NAME}
99
extensionRef:
1010
name: endpoint-picker

deploy/components/vllm-p2p/deployments/redis-deployment.yaml

+31-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
apiVersion: apps/v1
22
kind: Deployment
33
metadata:
4-
name: ${REDIS_NAME}
4+
name: ${REDIS_SVC_NAME}
55
labels:
66
app.kubernetes.io/name: redis
77
app.kubernetes.io/component: redis-lookup-server
@@ -20,8 +20,36 @@ spec:
2020
containers:
2121
- name: lookup-server
2222
image: ${REDIS_IMAGE}:${REDIS_TAG}
23-
imagePullPolicy: Always
23+
imagePullPolicy: IfNotPresent
2424
command:
2525
- redis-server
2626
ports:
27-
- containerPort: ${REDIS_TARGET_PORT}
27+
- name: redis-port
28+
containerPort: ${REDIS_TARGET_PORT}
29+
protocol: TCP
30+
resources:
31+
limits:
32+
cpu: "4"
33+
memory: 10G
34+
requests:
35+
cpu: "4"
36+
memory: 8G
37+
terminationMessagePath: /dev/termination-log
38+
terminationMessagePolicy: File
39+
restartPolicy: Always
40+
terminationGracePeriodSeconds: 30
41+
dnsPolicy: ClusterFirst
42+
securityContext: {}
43+
schedulerName: default-scheduler
44+
strategy:
45+
type: RollingUpdate
46+
rollingUpdate:
47+
maxUnavailable: 25%
48+
maxSurge: 25%
49+
revisionHistoryLimit: 10
50+
progressDeadlineSeconds: 600
51+
# securityContext:
52+
# allowPrivilegeEscalation: false
53+
# capabilities:
54+
# drop:
55+
# - ALL

deploy/components/vllm-p2p/deployments/vllm-deployment.yaml

+83-21
Original file line numberDiff line numberDiff line change
@@ -13,49 +13,111 @@ spec:
1313
app.kubernetes.io/name: vllm
1414
app.kubernetes.io/component: vllm
1515
app.kubernetes.io/model: ${MODEL_LABEL}
16+
app: ${POOL_NAME}
1617
template:
1718
metadata:
1819
labels:
1920
app.kubernetes.io/name: vllm
2021
app.kubernetes.io/component: vllm
2122
app.kubernetes.io/model: ${MODEL_LABEL}
23+
app: ${POOL_NAME}
2224
spec:
25+
# securityContext:
26+
# runAsUser: ${PROXY_UID}
27+
# runAsNonRoot: true
28+
# seccompProfile:
29+
# type: RuntimeDefault
2330
containers:
2431
- name: vllm
2532
image: ${VLLM_IMAGE}:${VLLM_TAG}
26-
imagePullPolicy: Always
33+
imagePullPolicy: IfNotPresent
2734
command:
2835
- /bin/sh
2936
- "-c"
3037
args:
3138
- |
32-
export LMCACHE_DISTRIBUTED_URL=${POD_IP}:80 &&
33-
vllm serve ${MODEL_NAME}
34-
--host 0.0.0.0
35-
--port 8000
36-
--enable-chunked-prefill false
37-
--max-model-len ${MAX_MODEL_LEN}
38-
--kv-transfer-config
39-
'{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}'
39+
export LMCACHE_DISTRIBUTED_URL=$${${POD_IP}}:80 && \
40+
vllm serve ${MODEL_NAME} \
41+
--host 0.0.0.0 \
42+
--port 8000 \
43+
--enable-chunked-prefill false \
44+
--max-model-len ${MAX_MODEL_LEN} \
45+
--kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}'
4046
ports:
4147
- name: http
4248
containerPort: 8000
43-
- name: lmcache-dist
49+
protocol: TCP
50+
- name: lmcache-dist # Assuming port 80 is used for LMCACHE_DISTRIBUTED_URL
4451
containerPort: 80
52+
protocol: TCP
53+
livenessProbe:
54+
failureThreshold: 3
55+
httpGet:
56+
path: /health
57+
port: 8000
58+
scheme: HTTP
59+
initialDelaySeconds: 15
60+
periodSeconds: 10
61+
successThreshold: 1
62+
timeoutSeconds: 1
63+
startupProbe:
64+
failureThreshold: 60
65+
httpGet:
66+
path: /health
67+
port: 8000
68+
scheme: HTTP
69+
initialDelaySeconds: 15
70+
periodSeconds: 10
71+
successThreshold: 1
72+
timeoutSeconds: 1
4573
env:
74+
- name: HF_HOME
75+
value: /data
76+
- name: POD_IP
77+
valueFrom:
78+
fieldRef:
79+
apiVersion: v1
80+
fieldPath: status.podIP
4681
- name: HF_TOKEN
4782
valueFrom:
4883
secretKeyRef:
4984
name: ${HF_SECRET_NAME}
5085
key: ${HF_SECRET_KEY}
51-
- name: POD_IP
52-
valueFrom:
53-
fieldRef:
54-
fieldPath: status.podIP
55-
volumeMounts:
56-
- name: model-storage
57-
mountPath: ${VOLUME_MOUNT_PATH}
58-
volumes:
59-
- name: model-storage
60-
persistentVolumeClaim:
61-
claimName: ${PVC_NAME}
86+
- name: LMCACHE_LOOKUP_URL
87+
value: ${REDIS_HOST}:${REDIS_PORT}
88+
- name: LMCACHE_ENABLE_DEBUG
89+
value: "True"
90+
- name: LMCACHE_ENABLE_P2P
91+
value: "True"
92+
- name: LMCACHE_LOCAL_CPU
93+
value: "True"
94+
- name: LMCACHE_MAX_LOCAL_CPU_SIZE
95+
value: "20"
96+
- name: LMCACHE_USE_EXPERIMENTAL
97+
value: "True"
98+
- name: VLLM_RPC_TIMEOUT
99+
value: "1000000"
100+
resources:
101+
limits:
102+
nvidia.com/gpu: "1"
103+
requests:
104+
cpu: "10"
105+
memory: 40Gi
106+
nvidia.com/gpu: "1"
107+
terminationMessagePath: /dev/termination-log
108+
terminationMessagePolicy: File
109+
securityContext:
110+
runAsNonRoot: false
111+
restartPolicy: Always
112+
terminationGracePeriodSeconds: 30
113+
dnsPolicy: ClusterFirst
114+
securityContext: {}
115+
schedulerName: default-scheduler
116+
strategy:
117+
type: RollingUpdate
118+
rollingUpdate:
119+
maxUnavailable: 0
120+
maxSurge: "100%"
121+
revisionHistoryLimit: 10
122+
progressDeadlineSeconds: 1200
123+

deploy/components/vllm-p2p/kustomization.yaml

-12
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ resources:
77
- deployments/vllm-deployment.yaml
88
- deployments/redis-deployment.yaml
99
- service/redis-service.yaml
10-
- pvc/volume.yaml
1110
- deployments/secret.yaml
1211

1312
images:
@@ -17,14 +16,3 @@ images:
1716
- name: redis
1817
newName: ${REDIS_IMAGE}
1918
newTag: ${REDIS_TAG}
20-
21-
configMapGenerator:
22-
- name: model-config
23-
literals:
24-
- MODEL_NAME=${MODEL_NAME}
25-
- MODEL_LABEL=${MODEL_LABEL}
26-
- POOL_LABEL=${POOL_LABEL}
27-
- REDIS_ENABLED=${REDIS_ENABLED}
28-
29-
generatorOptions:
30-
disableNameSuffixHash: true

deploy/components/vllm-p2p/pvc/volume.yaml

-18
This file was deleted.

deploy/components/vllm-p2p/service/redis-service.yaml

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
apiVersion: v1
22
kind: Service
33
metadata:
4-
name: ${REDIS_NAME}
5-
namespace: ${NAMESPACE}
4+
name: ${REDIS_SVC_NAME}
65
labels:
76
app.kubernetes.io/name: redis
87
app.kubernetes.io/component: redis-lookup-server

deploy/components/vllm/configmap.yaml

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: vllm-llama3-8b-instruct-adapters
5+
data:
6+
configmap.yaml: |
7+
vLLMLoRAConfig:
8+
name: vllm-llama3-8b-instruct-adapters
9+
port: 8000
10+
defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct
11+
ensureExist:
12+
models:
13+
- id: food-review-1
14+
source: Kawon/llama3.1-food-finetune_v14_r8

0 commit comments

Comments
 (0)