Skip to content

Commit fc98576

Browse files
committed
[fix]: small fixes for development YAMLs
Signed-off-by: Kfir Toledo <kfir.toledo@ibm.com>
1 parent a80c163 commit fc98576

17 files changed

+234
-95
lines changed

DEVELOPMENT.md

+28-12
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,13 @@ Export the name of the `Secret` to the environment:
178178
export REGISTRY_SECRET=anna-pull-secret
179179
```
180180

181+
You can optionally set a custom EPP image (otherwise, the default will be used):
182+
183+
```console
184+
export EPP_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
185+
export EPP_TAG="<YOUR_TAG>"
186+
```
187+
181188
Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy:
182189

183190
- `vllm-sim`: Lightweight simulator for simple environments
@@ -197,18 +204,14 @@ export VLLM_SIM_TAG="<YOUR_TAG>"
197204
```
198205

199206
For vllm and vllm-p2p:
200-
207+
- Set the vLLM image:
201208
```console
202209
export VLLM_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
203210
export VLLM_TAG="<YOUR_TAG>"
204211
```
205-
206-
The same thing will need to be done for the EPP:
207-
208-
```console
209-
export EPP_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
210-
export EPP_TAG="<YOUR_TAG>"
211-
```
212+
- Set the Hugging Face token variable:
213+
export HF_TOKEN="<HF_TOKEN>"
214+
**Warning**: For vllm mode, the default images use llama3-8b and vllm-mistral. Make sure you have permission to access these models in their respective repositories.
212215

213216
Once all this is set up, you can deploy the environment:
214217

@@ -224,12 +227,25 @@ kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80
224227
```
225228

226229
And making requests with `curl`:
230+
- vllm-sim
227231

228-
```console
229-
curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
230-
-d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq
231-
```
232+
```console
233+
curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
234+
-d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq
235+
```
236+
237+
- vllm
238+
239+
```console
240+
curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
241+
-d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq
242+
```
232243

244+
- vllm-p2p
245+
```console
246+
curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
247+
-d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","prompt":"hi","max_tokens":10,"temperature":0}' | jq
248+
```
233249
#### Development Cycle
234250

235251
> **WARNING**: This is a very manual process at the moment. We expect to make

deploy/components/inference-gateway/deployments.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,11 @@ spec:
4848
service: inference-extension
4949
initialDelaySeconds: 5
5050
periodSeconds: 10
51+
env:
52+
- name: KVCACHE_INDEXER_REDIS_ADDR
53+
value: ${REDIS_HOST}:${REDIS_PORT}
54+
- name: HF_TOKEN
55+
valueFrom:
56+
secretKeyRef:
57+
name: ${HF_SECRET_NAME}
58+
key: ${HF_SECRET_KEY}

deploy/components/inference-gateway/inference-models.yaml

+31-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,37 @@ spec:
66
modelName: food-review
77
criticality: Critical
88
poolRef:
9-
name: vllm-llama3-8b-instruct
9+
name: ${POOL_NAME}
1010
targetModels:
1111
- name: food-review
1212
weight: 100
13+
---
14+
apiVersion: inference.networking.x-k8s.io/v1alpha2
15+
kind: InferenceModel
16+
metadata:
17+
name: base-model
18+
spec:
19+
modelName: meta-llama/Llama-3.1-8B-Instruct
20+
criticality: Critical
21+
poolRef:
22+
name: ${POOL_NAME}
23+
---
24+
apiVersion: inference.networking.x-k8s.io/v1alpha2
25+
kind: InferenceModel
26+
metadata:
27+
name: base-model-cpu
28+
spec:
29+
modelName: Qwen/Qwen2.5-1.5B-Instruct
30+
criticality: Critical
31+
poolRef:
32+
name: ${POOL_NAME}
33+
---
34+
apiVersion: inference.networking.x-k8s.io/v1alpha2
35+
kind: InferenceModel
36+
metadata:
37+
name: mistarli
38+
spec:
39+
modelName: mistralai/Mistral-7B-Instruct-v0.2
40+
criticality: Critical
41+
poolRef:
42+
name: ${POOL_NAME}
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
apiVersion: inference.networking.x-k8s.io/v1alpha2
22
kind: InferencePool
33
metadata:
4-
name: vllm-llama3-8b-instruct
4+
name: ${POOL_NAME}
55
spec:
66
targetPortNumber: 8000
77
selector:
8-
app: vllm-llama3-8b-instruct
8+
app: ${POOL_NAME}
99
extensionRef:
1010
name: endpoint-picker

deploy/components/vllm-p2p/deployments/redis-deployment.yaml

+31-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
apiVersion: apps/v1
22
kind: Deployment
33
metadata:
4-
name: ${REDIS_NAME}
4+
name: ${REDIS_SVC_NAME}
55
labels:
66
app.kubernetes.io/name: redis
77
app.kubernetes.io/component: redis-lookup-server
@@ -20,8 +20,36 @@ spec:
2020
containers:
2121
- name: lookup-server
2222
image: ${REDIS_IMAGE}:${REDIS_TAG}
23-
imagePullPolicy: Always
23+
imagePullPolicy: IfNotPresent
2424
command:
2525
- redis-server
2626
ports:
27-
- containerPort: ${REDIS_TARGET_PORT}
27+
- name: redis-port
28+
containerPort: ${REDIS_TARGET_PORT}
29+
protocol: TCP
30+
resources:
31+
limits:
32+
cpu: "4"
33+
memory: 10G
34+
requests:
35+
cpu: "4"
36+
memory: 8G
37+
terminationMessagePath: /dev/termination-log
38+
terminationMessagePolicy: File
39+
restartPolicy: Always
40+
terminationGracePeriodSeconds: 30
41+
dnsPolicy: ClusterFirst
42+
securityContext: {}
43+
schedulerName: default-scheduler
44+
strategy:
45+
type: RollingUpdate
46+
rollingUpdate:
47+
maxUnavailable: 25%
48+
maxSurge: 25%
49+
revisionHistoryLimit: 10
50+
progressDeadlineSeconds: 600
51+
# securityContext:
52+
# allowPrivilegeEscalation: false
53+
# capabilities:
54+
# drop:
55+
# - ALL

deploy/components/vllm-p2p/deployments/vllm-deployment.yaml

+83-21
Original file line numberDiff line numberDiff line change
@@ -13,49 +13,111 @@ spec:
1313
app.kubernetes.io/name: vllm
1414
app.kubernetes.io/component: vllm
1515
app.kubernetes.io/model: ${MODEL_LABEL}
16+
app: ${POOL_NAME}
1617
template:
1718
metadata:
1819
labels:
1920
app.kubernetes.io/name: vllm
2021
app.kubernetes.io/component: vllm
2122
app.kubernetes.io/model: ${MODEL_LABEL}
23+
app: ${POOL_NAME}
2224
spec:
25+
# securityContext:
26+
# runAsUser: ${PROXY_UID}
27+
# runAsNonRoot: true
28+
# seccompProfile:
29+
# type: RuntimeDefault
2330
containers:
2431
- name: vllm
2532
image: ${VLLM_IMAGE}:${VLLM_TAG}
26-
imagePullPolicy: Always
33+
imagePullPolicy: IfNotPresent
2734
command:
2835
- /bin/sh
2936
- "-c"
3037
args:
3138
- |
32-
export LMCACHE_DISTRIBUTED_URL=${POD_IP}:80 &&
33-
vllm serve ${MODEL_NAME}
34-
--host 0.0.0.0
35-
--port 8000
36-
--enable-chunked-prefill false
37-
--max-model-len ${MAX_MODEL_LEN}
38-
--kv-transfer-config
39-
'{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}'
39+
export LMCACHE_DISTRIBUTED_URL=$${${POD_IP}}:80 && \
40+
vllm serve ${MODEL_NAME} \
41+
--host 0.0.0.0 \
42+
--port 8000 \
43+
--enable-chunked-prefill false \
44+
--max-model-len ${MAX_MODEL_LEN} \
45+
--kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}'
4046
ports:
4147
- name: http
4248
containerPort: 8000
43-
- name: lmcache-dist
49+
protocol: TCP
50+
- name: lmcache-dist # Assuming port 80 is used for LMCACHE_DISTRIBUTED_URL
4451
containerPort: 80
52+
protocol: TCP
53+
livenessProbe:
54+
failureThreshold: 3
55+
httpGet:
56+
path: /health
57+
port: 8000
58+
scheme: HTTP
59+
initialDelaySeconds: 15
60+
periodSeconds: 10
61+
successThreshold: 1
62+
timeoutSeconds: 1
63+
startupProbe:
64+
failureThreshold: 60
65+
httpGet:
66+
path: /health
67+
port: 8000
68+
scheme: HTTP
69+
initialDelaySeconds: 15
70+
periodSeconds: 10
71+
successThreshold: 1
72+
timeoutSeconds: 1
4573
env:
74+
- name: HF_HOME
75+
value: /data
76+
- name: POD_IP
77+
valueFrom:
78+
fieldRef:
79+
apiVersion: v1
80+
fieldPath: status.podIP
4681
- name: HF_TOKEN
4782
valueFrom:
4883
secretKeyRef:
4984
name: ${HF_SECRET_NAME}
5085
key: ${HF_SECRET_KEY}
51-
- name: POD_IP
52-
valueFrom:
53-
fieldRef:
54-
fieldPath: status.podIP
55-
volumeMounts:
56-
- name: model-storage
57-
mountPath: ${VOLUME_MOUNT_PATH}
58-
volumes:
59-
- name: model-storage
60-
persistentVolumeClaim:
61-
claimName: ${PVC_NAME}
86+
- name: LMCACHE_LOOKUP_URL
87+
value: ${REDIS_HOST}:${REDIS_PORT}
88+
- name: LMCACHE_ENABLE_DEBUG
89+
value: "True"
90+
- name: LMCACHE_ENABLE_P2P
91+
value: "True"
92+
- name: LMCACHE_LOCAL_CPU
93+
value: "True"
94+
- name: LMCACHE_MAX_LOCAL_CPU_SIZE
95+
value: "20"
96+
- name: LMCACHE_USE_EXPERIMENTAL
97+
value: "True"
98+
- name: VLLM_RPC_TIMEOUT
99+
value: "1000000"
100+
resources:
101+
limits:
102+
nvidia.com/gpu: "1"
103+
requests:
104+
cpu: "10"
105+
memory: 40Gi
106+
nvidia.com/gpu: "1"
107+
terminationMessagePath: /dev/termination-log
108+
terminationMessagePolicy: File
109+
securityContext:
110+
runAsNonRoot: false
111+
restartPolicy: Always
112+
terminationGracePeriodSeconds: 30
113+
dnsPolicy: ClusterFirst
114+
securityContext: {}
115+
schedulerName: default-scheduler
116+
strategy:
117+
type: RollingUpdate
118+
rollingUpdate:
119+
maxUnavailable: 0
120+
maxSurge: "100%"
121+
revisionHistoryLimit: 10
122+
progressDeadlineSeconds: 1200
123+

deploy/components/vllm-p2p/kustomization.yaml

-12
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ resources:
77
- deployments/vllm-deployment.yaml
88
- deployments/redis-deployment.yaml
99
- service/redis-service.yaml
10-
- pvc/volume.yaml
1110
- deployments/secret.yaml
1211

1312
images:
@@ -17,14 +16,3 @@ images:
1716
- name: redis
1817
newName: ${REDIS_IMAGE}
1918
newTag: ${REDIS_TAG}
20-
21-
configMapGenerator:
22-
- name: model-config
23-
literals:
24-
- MODEL_NAME=${MODEL_NAME}
25-
- MODEL_LABEL=${MODEL_LABEL}
26-
- POOL_LABEL=${POOL_LABEL}
27-
- REDIS_ENABLED=${REDIS_ENABLED}
28-
29-
generatorOptions:
30-
disableNameSuffixHash: true

deploy/components/vllm-p2p/pvc/volume.yaml

-18
This file was deleted.

deploy/components/vllm-p2p/service/redis-service.yaml

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
apiVersion: v1
22
kind: Service
33
metadata:
4-
name: ${REDIS_NAME}
5-
namespace: ${NAMESPACE}
4+
name: ${REDIS_SVC_NAME}
65
labels:
76
app.kubernetes.io/name: redis
87
app.kubernetes.io/component: redis-lookup-server

deploy/components/vllm/configmap.yaml

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: vllm-llama3-8b-instruct-adapters
5+
data:
6+
configmap.yaml: |
7+
vLLMLoRAConfig:
8+
name: vllm-llama3-8b-instruct-adapters
9+
port: 8000
10+
defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct
11+
ensureExist:
12+
models:
13+
- id: food-review-1
14+
source: Kawon/llama3.1-food-finetune_v14_r8

0 commit comments

Comments
 (0)