small fixes for pr

kfirtoledo · kfirtoledo · commit 1b6ab70b0ab9 · 2025-04-25T15:02:51.000+03:00
Signed-off-by: Kfir Toledo <kfir.toledo@ibm.com>
diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml
@@ -48,3 +48,6 @@ spec:
             service: inference-extension
           initialDelaySeconds: 5
           periodSeconds: 10
+        env:
+          - name: KVCACHE_INDEXER_REDIS_ADDR  # Redis address in host:port form
+            value: "${REDIS_HOST}:${REDIS_PORT}"  # quoted so the rendered value is always a string, even if a placeholder expands to empty
diff --git a/deploy/components/inference-gateway/inference-models.yaml b/deploy/components/inference-gateway/inference-models.yaml
@@ -6,7 +6,37 @@ spec:
   modelName: food-review
   criticality: Critical
   poolRef:
-    name: vllm-llama3-8b-instruct
+    name: ${POOL_NAME}  # pool reference is templated instead of hard-coded
   targetModels:
   - name: food-review
     weight: 100
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: base-model
+spec:
+  modelName: meta-llama/Llama-3.1-8B-Instruct
+  criticality: Critical
+  poolRef:
+    name: ${POOL_NAME}
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: base-model-cpu
+spec:
+  modelName: Qwen/Qwen2.5-1.5B-Instruct
+  criticality: Critical
+  poolRef:
+    name: ${POOL_NAME}
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: mistral  # resource name for the Mistral model entry
+spec:
+  modelName: mistralai/Mistral-7B-Instruct-v0.2
+  criticality: Critical
+  poolRef:
+    name: ${POOL_NAME}
diff --git a/deploy/components/inference-gateway/inference-pools.yaml b/deploy/components/inference-gateway/inference-pools.yaml
@@ -1,10 +1,10 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
-  name: vllm-llama3-8b-instruct
+  name: ${POOL_NAME}  # pool name comes from the POOL_NAME placeholder
 spec:
   targetPortNumber: 8000
   selector:
-    app: vllm-llama3-8b-instruct
+    app: ${POOL_NAME}  # selects pods whose app label equals the pool name
   extensionRef:
     name: endpoint-picker
diff --git a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml b/deploy/components/vllm-p2p/deployments/redis-deployment.yaml
@@ -1,7 +1,7 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: ${REDIS_NAME}
+  name: ${REDIS_SVC_NAME}  # placeholder renamed from REDIS_NAME
   labels:
     app.kubernetes.io/name: redis
     app.kubernetes.io/component: redis-lookup-server
@@ -20,8 +20,36 @@ spec:
       containers:
         - name: lookup-server
           image: ${REDIS_IMAGE}:${REDIS_TAG}
-          imagePullPolicy: Always
+          imagePullPolicy: IfNotPresent  # pull only when the image is not already on the node
           command:
             - redis-server
           ports:
-            - containerPort: ${REDIS_TARGET_PORT}
+            - name: redis-port
+              containerPort: ${REDIS_TARGET_PORT}
+              protocol: TCP
+          resources:
+            limits:
+              cpu: "4"
+              memory: 10G  # decimal gigabytes (G), not Gi
+            requests:
+              cpu: "4"
+              memory: 8G  # decimal gigabytes (G), not Gi
+          terminationMessagePath: /dev/termination-log
+          terminationMessagePolicy: File
+          # securityContext:
+          #   allowPrivilegeEscalation: false
+          #   capabilities:
+          #     drop:
+          #       - ALL
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 30
+      dnsPolicy: ClusterFirst
+      securityContext: {}
+      schedulerName: default-scheduler
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 25%
+      maxSurge: 25%
+  revisionHistoryLimit: 10
+  progressDeadlineSeconds: 600
diff --git a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml b/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml
@@ -13,17 +13,24 @@ spec:
       app.kubernetes.io/name: vllm
       app.kubernetes.io/component: vllm
       app.kubernetes.io/model: ${MODEL_LABEL}
+      app: ${POOL_NAME}  # selector also matches on the pool-name label
   template:
     metadata:
       labels:
         app.kubernetes.io/name: vllm
         app.kubernetes.io/component: vllm
         app.kubernetes.io/model: ${MODEL_LABEL}
+        app: ${POOL_NAME}  # pod label carrying the pool name
     spec:
+      # securityContext:
+      #   runAsUser: ${PROXY_UID}
+      #   runAsNonRoot: true
+      #   seccompProfile:
+      #     type: RuntimeDefault
       containers:
         - name: vllm
           image: ${VLLM_IMAGE}:${VLLM_TAG}
-          imagePullPolicy: Always
+          imagePullPolicy: IfNotPresent  # pull only when the image is not already on the node
           command:
             - /bin/sh
             - "-c"
@@ -40,22 +47,78 @@ spec:
           ports:
             - name: http
               containerPort: 8000
-            - name: lmcache-dist
+              protocol: TCP
+            - name: lmcache-dist # Assuming port 80 is used for LMCACHE_DISTRIBUTED_URL
               containerPort: 80
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 3
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            initialDelaySeconds: 15
+            periodSeconds: 10
+            successThreshold: 1
+            timeoutSeconds: 1
+          startupProbe:
+            failureThreshold: 60  # with periodSeconds 10, tolerates roughly 10 minutes of failed checks during startup
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            initialDelaySeconds: 15
+            periodSeconds: 10
+            successThreshold: 1
+            timeoutSeconds: 1
           env:
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: ${HF_SECRET_NAME}
-                  key: ${HF_SECRET_KEY}
+            - name: HF_HOME
+              value: /data  # NOTE(review): the model-storage volume mount is removed by this patch; confirm /data is still backed by storage
             - name: POD_IP
               valueFrom:
                 fieldRef:
+                  apiVersion: v1
                   fieldPath: status.podIP
-          volumeMounts:
-            - name: model-storage
-              mountPath: ${VOLUME_MOUNT_PATH}
-      volumes:
-        - name: model-storage
-          persistentVolumeClaim:
-            claimName: ${PVC_NAME}
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: vllm-p2p-secrets  # hard-coded; previously ${HF_SECRET_NAME}
+                  key: hf_token_mistral7b  # hard-coded; previously ${HF_SECRET_KEY}
+            - name: LMCACHE_LOOKUP_URL
+              value: vllm-p2p-lookup-server-service.kvcache-manager.svc.cluster.local:8100  # NOTE(review): hard-coded service name and namespace; confirm they match the deployed lookup server
+            - name: LMCACHE_ENABLE_DEBUG
+              value: "True"
+            - name: LMCACHE_ENABLE_P2P
+              value: "True"
+            - name: LMCACHE_LOCAL_CPU
+              value: "True"
+            - name: LMCACHE_MAX_LOCAL_CPU_SIZE
+              value: "20"
+            - name: LMCACHE_USE_EXPERIMENTAL
+              value: "True"
+            - name: VLLM_RPC_TIMEOUT
+              value: "1000000"
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              cpu: "10"
+              memory: 40Gi
+              nvidia.com/gpu: "1"
+          terminationMessagePath: /dev/termination-log
+          terminationMessagePolicy: File
+          securityContext:
+            runAsNonRoot: false  # a non-root user is not enforced for this container
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 30
+      dnsPolicy: ClusterFirst
+      securityContext: {}
+      schedulerName: default-scheduler
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 0  # no existing pod may become unavailable during a rollout
+      maxSurge: "100%"  # up to a full extra set of pods may be created during a rollout
+  revisionHistoryLimit: 10
+  progressDeadlineSeconds: 1200
+
diff --git a/deploy/components/vllm-p2p/kustomization.yaml b/deploy/components/vllm-p2p/kustomization.yaml
@@ -7,7 +7,6 @@ resources:
   - deployments/vllm-deployment.yaml
   - deployments/redis-deployment.yaml
   - service/redis-service.yaml
-  - pvc/volume.yaml
   - deployments/secret.yaml
 
 images:
@@ -23,8 +22,8 @@ configMapGenerator:
     literals:
       - MODEL_NAME=${MODEL_NAME}
       - MODEL_LABEL=${MODEL_LABEL}
-      - POOL_LABEL=${POOL_LABEL}
-      - REDIS_ENABLED=${REDIS_ENABLED}
+      - POOL_NAME=${POOL_NAME}  # replaces the former POOL_LABEL literal
+      - REDIS_ENABLED="true"  # now a fixed literal rather than ${REDIS_ENABLED}
 
 generatorOptions:
   disableNameSuffixHash: true
diff --git a/deploy/components/vllm-p2p/pvc/volume.yaml b/deploy/components/vllm-p2p/pvc/volume.yaml
diff --git a/deploy/components/vllm-p2p/service/redis-service.yaml b/deploy/components/vllm-p2p/service/redis-service.yaml
@@ -1,8 +1,7 @@
 apiVersion: v1
 kind: Service
 metadata:
-  name: ${REDIS_NAME}
-  namespace: ${NAMESPACE}
+  name: ${REDIS_SVC_NAME}  # placeholder renamed from REDIS_NAME; no explicit namespace is set here any more
   labels:
     app.kubernetes.io/name: redis
     app.kubernetes.io/component: redis-lookup-server
diff --git a/deploy/components/vllm/deployments.yaml b/deploy/components/vllm/deployments.yaml
@@ -3,14 +3,14 @@ kind: Deployment
 metadata:
   name: ${VLLM_DEPLOYMENT_NAME}
 spec:
-  replicas: 3
+  replicas: ${VLLM_REPLICA_COUNT}  # placeholder; must resolve to an integer before the manifest is applied
   selector:
     matchLabels:
-      app: vllm-llama3-8b-instruct
+      app: ${POOL_NAME}  # selector label is templated instead of hard-coded
   template:
     metadata:
       labels:
-        app: vllm-llama3-8b-instruct
+        app: ${POOL_NAME}  # must stay identical to the selector label above
     spec:
       securityContext:
         runAsUser: ${PROXY_UID}
diff --git a/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml b/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml
@@ -3,20 +3,20 @@ kind: GatewayParameters
 metadata:
   name: custom-gw-params
 spec:
-  kube: 
+  kube:
     envoyContainer:
       securityContext:
         allowPrivilegeEscalation: false
         readOnlyRootFilesystem: true
         runAsNonRoot: true
         runAsUser: "${PROXY_UID}"
     service:
-      type: NodePort
+      type: LoadBalancer  # Service type switched from NodePort
       extraLabels:
         gateway: custom
     podTemplate:
       extraLabels:
         gateway: custom
-      securityContext: 
+      securityContext:
         seccompProfile:
           type: RuntimeDefault
diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml
@@ -6,6 +6,6 @@ resources:
 
 images:
 - name: quay.io/vllm-d/vllm-d-dev:0.0.2
-  newName: ${VLLM_P2P_IMAGE}
-  newTag: ${VLLM_P2P_TAG}
+  newName: ${VLLM_IMAGE}  # placeholder renamed from VLLM_P2P_IMAGE
+  newTag: ${VLLM_TAG}  # placeholder renamed from VLLM_P2P_TAG
 
diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh