From 82095c76d626d9f6741b415e4657017ef747cb83 Mon Sep 17 00:00:00 2001 From: xiaotia3 Date: Thu, 3 Apr 2025 15:50:50 +0800 Subject: [PATCH 1/2] Update TGI image versions Signed-off-by: xiaotia3 --- AgentQnA/docker_compose/amd/gpu/rocm/README.md | 4 ++-- AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml | 2 +- AudioQnA/docker_compose/amd/gpu/rocm/README.md | 4 ++-- AudioQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml | 2 +- AudioQnA/tests/test_compose_tgi_on_gaudi.sh | 2 +- AudioQnA/tests/test_compose_tgi_on_xeon.sh | 2 +- ChatQnA/docker_compose/intel/cpu/xeon/README.md | 4 ++-- ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen_tgi.yaml | 2 +- ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml | 2 +- ChatQnA/kubernetes/gmc/README.md | 2 +- ChatQnA/tests/test_compose_faqgen_on_rocm.sh | 4 ++-- ChatQnA/tests/test_compose_faqgen_tgi_on_gaudi.sh | 2 +- ChatQnA/tests/test_compose_faqgen_tgi_on_xeon.sh | 2 +- ChatQnA/tests/test_compose_tgi_on_xeon.sh | 2 +- CodeGen/docker_compose/amd/gpu/rocm/README.md | 4 ++-- CodeGen/docker_compose/intel/cpu/xeon/compose.yaml | 2 +- CodeGen/tests/test_compose_on_rocm.sh | 2 +- CodeGen/tests/test_compose_on_xeon.sh | 2 +- CodeTrans/docker_compose/amd/gpu/rocm/README.md | 4 ++-- CodeTrans/docker_compose/intel/cpu/xeon/README.md | 4 ++-- CodeTrans/docker_compose/intel/cpu/xeon/compose_tgi.yaml | 2 +- CodeTrans/docker_compose/intel/hpu/gaudi/README.md | 4 ++-- CodeTrans/docker_compose/intel/hpu/gaudi/compose_tgi.yaml | 2 +- CodeTrans/tests/test_compose_tgi_on_gaudi.sh | 2 +- CodeTrans/tests/test_compose_tgi_on_xeon.sh | 2 +- DBQnA/docker_compose/intel/cpu/xeon/compose.yaml | 2 +- DBQnA/tests/test_compose_on_xeon.sh | 2 +- DocSum/docker_compose/intel/cpu/xeon/compose_tgi.yaml | 2 +- DocSum/kubernetes/gmc/README.md | 2 +- DocSum/tests/test_compose_tgi_on_xeon.sh | 2 +- ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml | 4 ++-- ProductivitySuite/tests/test_compose_on_xeon.sh | 2 +- 
SearchQnA/docker_compose/amd/gpu/rocm/README.md | 4 ++-- SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml | 2 +- SearchQnA/tests/test_compose_on_xeon.sh | 2 +- Translation/docker_compose/amd/gpu/rocm/README.md | 4 ++-- Translation/docker_compose/intel/cpu/xeon/compose.yaml | 2 +- Translation/tests/test_compose_on_xeon.sh | 2 +- VisualQnA/docker_compose/intel/cpu/xeon/README.md | 4 ++-- VisualQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml | 2 +- VisualQnA/tests/test_compose_tgi_on_gaudi.sh | 2 +- VisualQnA/tests/test_compose_tgi_on_xeon.sh | 2 +- 42 files changed, 54 insertions(+), 54 deletions(-) diff --git a/AgentQnA/docker_compose/amd/gpu/rocm/README.md b/AgentQnA/docker_compose/amd/gpu/rocm/README.md index 3849cd828d..e32e6578d0 100644 --- a/AgentQnA/docker_compose/amd/gpu/rocm/README.md +++ b/AgentQnA/docker_compose/amd/gpu/rocm/README.md @@ -64,7 +64,7 @@ We remind you that when using a specific version of the code, you need to use th - #### Optional. Pull TGI Docker Image (Do this if you want to use TGI) ```bash - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm ``` - #### Build Docker Images @@ -110,7 +110,7 @@ We remind you that when using a specific version of the code, you need to use th ##### TGI-based application: - - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + - ghcr.io/huggingface/text-generation-inference:2.4.1-rocm - opea/agent:latest - redis/redis-stack:7.2.0-v9 - ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 diff --git a/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml b/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml index 31da7be7b0..ed008a03a1 100644 --- a/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml +++ b/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml @@ -2,7 +2,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:3.0.0-rocm + image: 
ghcr.io/huggingface/text-generation-inference:2.4.1-rocm container_name: tgi-service ports: - "${TGI_SERVICE_PORT-8085}:80" diff --git a/AudioQnA/docker_compose/amd/gpu/rocm/README.md b/AudioQnA/docker_compose/amd/gpu/rocm/README.md index 824ddf1d3a..96bca60e3a 100644 --- a/AudioQnA/docker_compose/amd/gpu/rocm/README.md +++ b/AudioQnA/docker_compose/amd/gpu/rocm/README.md @@ -68,7 +68,7 @@ We remind you that when using a specific version of the code, you need to use th - #### Optional. Pull TGI Docker Image (Do this if you want to use TGI) ```bash - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm ``` - #### Build Docker Images @@ -94,7 +94,7 @@ We remind you that when using a specific version of the code, you need to use th ##### TGI-based application: - - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + - ghcr.io/huggingface/text-generation-inference:2.4.1-rocm - opea/whisper:latest - opea/speecht5:latest - opea/audioqna:latest diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml index d421f488fd..5fa82b3d8f 100644 --- a/AudioQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml +++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml @@ -25,7 +25,7 @@ services: https_proxy: ${https_proxy} restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: tgi-service ports: - ${LLM_SERVER_PORT:-3006}:80 diff --git a/AudioQnA/tests/test_compose_tgi_on_gaudi.sh b/AudioQnA/tests/test_compose_tgi_on_gaudi.sh index 5a046adfdb..1c599c88f1 100644 --- a/AudioQnA/tests/test_compose_tgi_on_gaudi.sh +++ b/AudioQnA/tests/test_compose_tgi_on_gaudi.sh @@ -35,7 +35,7 @@ function build_docker_images() { service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi" docker 
compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 + docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1 docker images && sleep 1s } diff --git a/AudioQnA/tests/test_compose_tgi_on_xeon.sh b/AudioQnA/tests/test_compose_tgi_on_xeon.sh index d735c87b94..60210ee7e1 100644 --- a/AudioQnA/tests/test_compose_tgi_on_xeon.sh +++ b/AudioQnA/tests/test_compose_tgi_on_xeon.sh @@ -35,7 +35,7 @@ function build_docker_images() { service_list="audioqna audioqna-ui whisper speecht5" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker images && sleep 1s } diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md index 4b61c091df..16e8248aed 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md @@ -225,7 +225,7 @@ For users in China who are unable to download models directly from Huggingface, # Start vLLM LLM Service docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80 # Start TGI LLM Service - docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name + docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu --model-id $model_name ``` 2. 
Offline @@ -242,7 +242,7 @@ For users in China who are unable to download models directly from Huggingface, # Start vLLM LLM Service docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80 # Start TGI LLM Service - docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data + docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu --model-id /data ``` ### Setup Environment Variables diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen_tgi.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen_tgi.yaml index 7e34ff041f..10d7269c5f 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen_tgi.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen_tgi.yaml @@ -75,7 +75,7 @@ services: HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: tgi-server ports: - ${LLM_ENDPOINT_PORT:-9009}:80 diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml index ae8cf7ddb1..23a472da87 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml @@ -75,7 +75,7 @@ services: HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: tgi-service ports: - "9009:80" diff --git a/ChatQnA/kubernetes/gmc/README.md 
b/ChatQnA/kubernetes/gmc/README.md index 5775d14b88..29e69d6b21 100644 --- a/ChatQnA/kubernetes/gmc/README.md +++ b/ChatQnA/kubernetes/gmc/README.md @@ -18,7 +18,7 @@ The ChatQnA uses the below prebuilt images if you choose a Xeon deployment - tei_embedding_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 - retriever: opea/retriever:latest - tei_xeon_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 -- tgi-service: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu +- tgi-service: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu - chaqna-xeon-backend-server: opea/chatqna:latest Should you desire to use the Gaudi accelerator, two alternate images are used for the embedding and llm services. diff --git a/ChatQnA/tests/test_compose_faqgen_on_rocm.sh b/ChatQnA/tests/test_compose_faqgen_on_rocm.sh index cdfc79c5e7..a7738e953a 100644 --- a/ChatQnA/tests/test_compose_faqgen_on_rocm.sh +++ b/ChatQnA/tests/test_compose_faqgen_on_rocm.sh @@ -16,7 +16,7 @@ LOG_PATH="$WORKPATH/tests" ip_address=$(hostname -I | awk '{print $1}') export HOST_IP=${ip_address} -export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm" +export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.4.1-rocm" export CHATQNA_EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export CHATQNA_RERANK_MODEL_ID="BAAI/bge-reranker-base" export CHATQNA_LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" @@ -69,7 +69,7 @@ function build_docker_images() { service_list="chatqna chatqna-ui dataprep retriever llm-faqgen nginx" docker compose -f build.yaml build ${service_list} --no-cache > "${LOG_PATH}"/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s diff --git a/ChatQnA/tests/test_compose_faqgen_tgi_on_gaudi.sh 
b/ChatQnA/tests/test_compose_faqgen_tgi_on_gaudi.sh index 3f2e7ae3a0..ac1a1ef154 100644 --- a/ChatQnA/tests/test_compose_faqgen_tgi_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_faqgen_tgi_on_gaudi.sh @@ -34,7 +34,7 @@ function build_docker_images() { service_list="chatqna chatqna-ui dataprep retriever llm-faqgen nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 + docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 docker pull ghcr.io/huggingface/tei-gaudi:1.5.0 docker images && sleep 1s diff --git a/ChatQnA/tests/test_compose_faqgen_tgi_on_xeon.sh b/ChatQnA/tests/test_compose_faqgen_tgi_on_xeon.sh index 14bdf0eed1..dfb303c89e 100644 --- a/ChatQnA/tests/test_compose_faqgen_tgi_on_xeon.sh +++ b/ChatQnA/tests/test_compose_faqgen_tgi_on_xeon.sh @@ -38,7 +38,7 @@ function build_docker_images() { service_list="chatqna chatqna-ui dataprep retriever llm-faqgen nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 docker images && sleep 1s } diff --git a/ChatQnA/tests/test_compose_tgi_on_xeon.sh b/ChatQnA/tests/test_compose_tgi_on_xeon.sh index 9887a033ad..0a3a931ffa 100644 --- a/ChatQnA/tests/test_compose_tgi_on_xeon.sh +++ b/ChatQnA/tests/test_compose_tgi_on_xeon.sh @@ -35,7 +35,7 @@ function build_docker_images() { service_list="chatqna chatqna-ui dataprep retriever nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker pull 
ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 docker images && sleep 1s diff --git a/CodeGen/docker_compose/amd/gpu/rocm/README.md b/CodeGen/docker_compose/amd/gpu/rocm/README.md index c1a6a77172..80e72bfd49 100644 --- a/CodeGen/docker_compose/amd/gpu/rocm/README.md +++ b/CodeGen/docker_compose/amd/gpu/rocm/README.md @@ -71,7 +71,7 @@ - #### Optional. Pull TGI Docker Image (Do this if you want to use TGI) ```bash - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm ``` - #### Build Docker Images @@ -97,7 +97,7 @@ ##### TGI-based application: - - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + - ghcr.io/huggingface/text-generation-inference:2.4.1-rocm - opea/llm-textgen:latest - opea/codegen:latest - opea/codegen-ui:latest diff --git a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml index 5567d9e368..f080cd5c21 100644 --- a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml +++ b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: tgi-server profiles: - codegen-xeon-tgi diff --git a/CodeGen/tests/test_compose_on_rocm.sh b/CodeGen/tests/test_compose_on_rocm.sh index 361dc613e5..a9793ac80c 100644 --- a/CodeGen/tests/test_compose_on_rocm.sh +++ b/CodeGen/tests/test_compose_on_rocm.sh @@ -34,7 +34,7 @@ function build_docker_images() { service_list="codegen codegen-ui llm-textgen" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm docker images && sleep 1s } diff --git a/CodeGen/tests/test_compose_on_xeon.sh 
b/CodeGen/tests/test_compose_on_xeon.sh index 6fc25963ac..7d1f7fe3ec 100644 --- a/CodeGen/tests/test_compose_on_xeon.sh +++ b/CodeGen/tests/test_compose_on_xeon.sh @@ -41,7 +41,7 @@ function build_docker_images() { service_list="codegen codegen-ui llm-textgen vllm" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker images && sleep 1s } diff --git a/CodeTrans/docker_compose/amd/gpu/rocm/README.md b/CodeTrans/docker_compose/amd/gpu/rocm/README.md index 17b67349eb..0cb60956cc 100644 --- a/CodeTrans/docker_compose/amd/gpu/rocm/README.md +++ b/CodeTrans/docker_compose/amd/gpu/rocm/README.md @@ -71,7 +71,7 @@ - #### Optional. Pull TGI Docker Image (Do this if you want to use TGI) ```bash - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm ``` - #### Build Docker Images @@ -98,7 +98,7 @@ ##### TGI-based application: - - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + - ghcr.io/huggingface/text-generation-inference:2.4.1-rocm - opea/llm-textgen:latest - opea/codetrans:latest - opea/codetrans-ui:latest diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/README.md b/CodeTrans/docker_compose/intel/cpu/xeon/README.md index 3d250c7036..136f8295e9 100755 --- a/CodeTrans/docker_compose/intel/cpu/xeon/README.md +++ b/CodeTrans/docker_compose/intel/cpu/xeon/README.md @@ -76,7 +76,7 @@ For users in China who are unable to download models directly from Huggingface, # Start vLLM LLM Service docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80 # Start TGI LLM Service - docker run -p 8008:80 -v 
./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name + docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu --model-id $model_name ``` 2. Offline @@ -93,7 +93,7 @@ For users in China who are unable to download models directly from Huggingface, # Start vLLM LLM Service docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80 # Start TGI LLM Service - docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data + docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu --model-id /data ``` ### Setup Environment Variables diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/compose_tgi.yaml b/CodeTrans/docker_compose/intel/cpu/xeon/compose_tgi.yaml index 77c668241c..27b726f8cc 100644 --- a/CodeTrans/docker_compose/intel/cpu/xeon/compose_tgi.yaml +++ b/CodeTrans/docker_compose/intel/cpu/xeon/compose_tgi.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: codetrans-xeon-tgi-service ports: - "8008:80" diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/README.md b/CodeTrans/docker_compose/intel/hpu/gaudi/README.md index d07326598f..e3949480a7 100755 --- a/CodeTrans/docker_compose/intel/hpu/gaudi/README.md +++ b/CodeTrans/docker_compose/intel/hpu/gaudi/README.md @@ -68,7 +68,7 @@ For users in China who are unable 
to download models directly from Huggingface, # Start vLLM LLM Service docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80 # Start TGI LLM Service - docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name + docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu --model-id $model_name ``` 2. Offline @@ -85,7 +85,7 @@ For users in China who are unable to download models directly from Huggingface, # Start vLLM LLM Service docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80 # Start TGI LLM Service - docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data + docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu --model-id /data ``` ### Setup Environment Variables diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/compose_tgi.yaml b/CodeTrans/docker_compose/intel/hpu/gaudi/compose_tgi.yaml index 9bcc01f318..023eed2adf 100644 --- a/CodeTrans/docker_compose/intel/hpu/gaudi/compose_tgi.yaml +++ b/CodeTrans/docker_compose/intel/hpu/gaudi/compose_tgi.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.6 + image: ghcr.io/huggingface/tgi-gaudi:2.3.1 container_name: codetrans-gaudi-tgi-service ports: - "8008:80" 
diff --git a/CodeTrans/tests/test_compose_tgi_on_gaudi.sh b/CodeTrans/tests/test_compose_tgi_on_gaudi.sh index 1c0404d397..3c961204f6 100644 --- a/CodeTrans/tests/test_compose_tgi_on_gaudi.sh +++ b/CodeTrans/tests/test_compose_tgi_on_gaudi.sh @@ -35,7 +35,7 @@ function build_docker_images() { service_list="codetrans codetrans-ui llm-textgen nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 + docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1 docker images && sleep 1s } diff --git a/CodeTrans/tests/test_compose_tgi_on_xeon.sh b/CodeTrans/tests/test_compose_tgi_on_xeon.sh index 95154c7c9d..7c3771b250 100644 --- a/CodeTrans/tests/test_compose_tgi_on_xeon.sh +++ b/CodeTrans/tests/test_compose_tgi_on_xeon.sh @@ -35,7 +35,7 @@ function build_docker_images() { service_list="codetrans codetrans-ui llm-textgen nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker images && sleep 1s } diff --git a/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml b/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml index b96a71d01d..ebfe1f8dec 100644 --- a/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/DBQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: tgi-service ports: - "8008:80" diff --git a/DBQnA/tests/test_compose_on_xeon.sh b/DBQnA/tests/test_compose_on_xeon.sh index da9fa1b71a..3122ec417d 100755 --- a/DBQnA/tests/test_compose_on_xeon.sh +++ b/DBQnA/tests/test_compose_on_xeon.sh @@ -24,7 +24,7 @@ function build_docker_images() { echo "Build all the images with --no-cache, check 
docker_image_build.log for details..." docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker images && sleep 1s } diff --git a/DocSum/docker_compose/intel/cpu/xeon/compose_tgi.yaml b/DocSum/docker_compose/intel/cpu/xeon/compose_tgi.yaml index 4b0362bd09..8ab5652b9e 100644 --- a/DocSum/docker_compose/intel/cpu/xeon/compose_tgi.yaml +++ b/DocSum/docker_compose/intel/cpu/xeon/compose_tgi.yaml @@ -3,7 +3,7 @@ services: tgi-server: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: docsum-xeon-tgi-server ports: - ${LLM_ENDPOINT_PORT:-8008}:80 diff --git a/DocSum/kubernetes/gmc/README.md b/DocSum/kubernetes/gmc/README.md index aaab01a8c8..e6175f1587 100644 --- a/DocSum/kubernetes/gmc/README.md +++ b/DocSum/kubernetes/gmc/README.md @@ -8,7 +8,7 @@ Install GMC in your Kubernetes cluster, if you have not already done so, by foll The DocSum application is defined as a Custom Resource (CR) file that the above GMC operator acts upon. It first checks if the microservices listed in the CR yaml file are running, if not it starts them and then proceeds to connect them. When the DocSum RAG pipeline is ready, the service endpoint details are returned, letting you use the application. Should you use "kubectl get pods" commands you will see all the component microservices, in particular embedding, retriever, rerank, and llm. The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image `llm-docsum-tgi:latest` which internally leverages the -the image `ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu`. The service is called tgi-svc. Meanwhile, the Gaudi version launches the +the image `ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu`. 
The service is called tgi-svc. Meanwhile, the Gaudi version launches the service tgi-gaudi-svc, which uses the image `ghcr.io/huggingface/tgi-gaudi:2.3.1`. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use `Intel/neural-chat-7b-v3-3`. [NOTE] diff --git a/DocSum/tests/test_compose_tgi_on_xeon.sh b/DocSum/tests/test_compose_tgi_on_xeon.sh index 3d7b3f1b22..58401082c5 100644 --- a/DocSum/tests/test_compose_tgi_on_xeon.sh +++ b/DocSum/tests/test_compose_tgi_on_xeon.sh @@ -50,7 +50,7 @@ function build_docker_images() { service_list="docsum docsum-gradio-ui whisper llm-docsum" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:1.4 + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker images && sleep 1s } diff --git a/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml b/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml index 606456f84f..b9bf1bd91f 100644 --- a/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml +++ b/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml @@ -131,7 +131,7 @@ services: LOGFLAG: ${LOGFLAG} restart: unless-stopped tgi_service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: tgi-service ports: - "9009:80" @@ -204,7 +204,7 @@ services: ipc: host restart: always tgi_service_codegen: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: tgi_service_codegen ports: - "8028:80" diff --git a/ProductivitySuite/tests/test_compose_on_xeon.sh b/ProductivitySuite/tests/test_compose_on_xeon.sh index 95abf05e1e..04c685726a 100755 --- a/ProductivitySuite/tests/test_compose_on_xeon.sh +++ 
b/ProductivitySuite/tests/test_compose_on_xeon.sh @@ -23,7 +23,7 @@ function build_docker_images() { docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker images && sleep 1s } diff --git a/SearchQnA/docker_compose/amd/gpu/rocm/README.md b/SearchQnA/docker_compose/amd/gpu/rocm/README.md index 72cff55a39..7ab2999559 100644 --- a/SearchQnA/docker_compose/amd/gpu/rocm/README.md +++ b/SearchQnA/docker_compose/amd/gpu/rocm/README.md @@ -71,7 +71,7 @@ - #### Optional. Pull TGI Docker Image (Do this if you want to use TGI) ```bash - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm ``` - #### Build Docker Images @@ -99,7 +99,7 @@ ##### TGI-based application: - - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + - ghcr.io/huggingface/text-generation-inference:2.4.1-rocm - opea/llm-textgen:latest - opea/reranking:latest - opea/searchqna:latest diff --git a/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml b/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml index 29b5229b83..152d7cbc75 100644 --- a/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -91,7 +91,7 @@ services: LOGFLAG: ${LOGFLAG} restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: tgi-service ports: - "3006:80" diff --git a/SearchQnA/tests/test_compose_on_xeon.sh b/SearchQnA/tests/test_compose_on_xeon.sh index aa8c3aa6e7..49c00acd83 100644 --- a/SearchQnA/tests/test_compose_on_xeon.sh +++ b/SearchQnA/tests/test_compose_on_xeon.sh @@ -36,7 +36,7 @@ function 
build_docker_images() { docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker images && sleep 1s } diff --git a/Translation/docker_compose/amd/gpu/rocm/README.md b/Translation/docker_compose/amd/gpu/rocm/README.md index b2a56bf1d0..a7e488fd6b 100644 --- a/Translation/docker_compose/amd/gpu/rocm/README.md +++ b/Translation/docker_compose/amd/gpu/rocm/README.md @@ -71,7 +71,7 @@ - #### Optional. Pull TGI Docker Image (Do this if you want to use TGI) ```bash - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm ``` - #### Build Docker Images @@ -98,7 +98,7 @@ ##### TGI-based application: - - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + - ghcr.io/huggingface/text-generation-inference:2.4.1-rocm - opea/llm-textgen:latest - opea/nginx:latest - opea/translation:latest diff --git a/Translation/docker_compose/intel/cpu/xeon/compose.yaml b/Translation/docker_compose/intel/cpu/xeon/compose.yaml index 4b77d84484..aeb94f8fdd 100644 --- a/Translation/docker_compose/intel/cpu/xeon/compose.yaml +++ b/Translation/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: tgi-service ports: - "8008:80" diff --git a/Translation/tests/test_compose_on_xeon.sh b/Translation/tests/test_compose_on_xeon.sh index 9e2ac58cb7..d358f2f823 100644 --- a/Translation/tests/test_compose_on_xeon.sh +++ b/Translation/tests/test_compose_on_xeon.sh @@ -35,7 +35,7 @@ function build_docker_images() { service_list="translation translation-ui llm-textgen nginx" 
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker images && sleep 1s } diff --git a/VisualQnA/docker_compose/intel/cpu/xeon/README.md b/VisualQnA/docker_compose/intel/cpu/xeon/README.md index cfbc3ab1c1..35524d99ed 100644 --- a/VisualQnA/docker_compose/intel/cpu/xeon/README.md +++ b/VisualQnA/docker_compose/intel/cpu/xeon/README.md @@ -48,13 +48,13 @@ docker build --no-cache -t opea/visualqna-ui:latest --build-arg https_proxy=$htt # vLLM docker pull opea/vllm:latest # TGI (Optional) -docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu +docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu ``` Then run the command `docker images`, you will have the following Docker Images: 1. `opea/vllm:latest` -2. `ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu` (Optional) +2. `ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu` (Optional) 3. `opea/lvm:latest` 4. `opea/visualqna:latest` 5. 
`opea/visualqna-ui:latest` diff --git a/VisualQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml b/VisualQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml index b595bdcba7..5bacf1108d 100644 --- a/VisualQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml +++ b/VisualQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml @@ -3,7 +3,7 @@ services: llava-tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: tgi-llava-xeon-server ports: - "8399:80" diff --git a/VisualQnA/tests/test_compose_tgi_on_gaudi.sh b/VisualQnA/tests/test_compose_tgi_on_gaudi.sh index 913d6ed527..5d723fe3a5 100644 --- a/VisualQnA/tests/test_compose_tgi_on_gaudi.sh +++ b/VisualQnA/tests/test_compose_tgi_on_gaudi.sh @@ -34,7 +34,7 @@ function build_docker_images() { echo "Build all the images with --no-cache, check docker_image_build.log for details..." docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 + docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1 docker images && sleep 1s } diff --git a/VisualQnA/tests/test_compose_tgi_on_xeon.sh b/VisualQnA/tests/test_compose_tgi_on_xeon.sh index d6311719d0..b348daab9e 100644 --- a/VisualQnA/tests/test_compose_tgi_on_xeon.sh +++ b/VisualQnA/tests/test_compose_tgi_on_xeon.sh @@ -34,7 +34,7 @@ function build_docker_images() { echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker images && sleep 1s } From 9f0f48b78988ddd1fa9cb1d9f3da4d5054b463e7 Mon Sep 17 00:00:00 2001 From: xiaotia3 Date: Wed, 16 Apr 2025 16:37:29 +0800 Subject: [PATCH 2/2] Update TGI image versions Signed-off-by: xiaotia3 --- AvatarChatbot/docker_compose/amd/gpu/rocm/compose.yaml | 2 +- AvatarChatbot/docker_compose/intel/cpu/xeon/README.md | 2 +- AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml | 2 +- AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md | 2 +- AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml | 2 +- AvatarChatbot/tests/test_compose_on_gaudi.sh | 2 +- AvatarChatbot/tests/test_compose_on_rocm.sh | 2 +- AvatarChatbot/tests/test_compose_on_xeon.sh | 2 +- ChatQnA/docker_compose/amd/gpu/rocm/README.md | 6 +++--- ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml | 2 +- ChatQnA/docker_compose/amd/gpu/rocm/compose_faqgen.yaml | 2 +- ChatQnA/tests/test_compose_on_rocm.sh | 2 +- CodeGen/docker_compose/intel/cpu/xeon/README.md | 4 ++-- CodeGen/docker_compose/intel/hpu/gaudi/README.md | 4 ++-- DocSum/docker_compose/amd/gpu/rocm/README.md | 4 ++-- DocSum/tests/test_compose_on_rocm.sh | 2 +- DocSum/tests/test_compose_vllm_on_rocm.sh | 2 +- MultimodalQnA/docker_compose/amd/gpu/rocm/README.md | 2 +- VisualQnA/docker_compose/amd/gpu/rocm/README.md | 2 +- 19 files changed, 24 insertions(+), 24 deletions(-) diff --git a/AvatarChatbot/docker_compose/amd/gpu/rocm/compose.yaml b/AvatarChatbot/docker_compose/amd/gpu/rocm/compose.yaml index 3bee9ab662..7813b66d3b 100644 --- a/AvatarChatbot/docker_compose/amd/gpu/rocm/compose.yaml +++ b/AvatarChatbot/docker_compose/amd/gpu/rocm/compose.yaml @@ -42,7 +42,7 @@ services: environment: TTS_ENDPOINT: ${TTS_ENDPOINT} tgi-service: - image: 
ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm container_name: tgi-service ports: - "${TGI_SERVICE_PORT:-3006}:80" diff --git a/AvatarChatbot/docker_compose/intel/cpu/xeon/README.md b/AvatarChatbot/docker_compose/intel/cpu/xeon/README.md index bf686ce99e..67d9d0b456 100644 --- a/AvatarChatbot/docker_compose/intel/cpu/xeon/README.md +++ b/AvatarChatbot/docker_compose/intel/cpu/xeon/README.md @@ -19,7 +19,7 @@ docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build ### 3. Build LLM Image -Intel Xeon optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu (https://github.com/huggingface/text-generation-inference) +Intel Xeon optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu (https://github.com/huggingface/text-generation-inference) ### 4. Build TTS Image diff --git a/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml b/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml index f33449d020..c9748421a0 100644 --- a/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml +++ b/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml @@ -26,7 +26,7 @@ services: https_proxy: ${https_proxy} restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu container_name: tgi-service ports: - "3006:80" diff --git a/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md b/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md index 994d400ce4..ecd4bb5ec8 100644 --- a/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md +++ b/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md @@ -19,7 +19,7 @@ docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy - ### 3. 
Build LLM Image -Intel Gaudi optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/tgi-gaudi:2.0.6 (https://github.com/huggingface/tgi-gaudi) +Intel Gaudi optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/tgi-gaudi:2.3.1 (https://github.com/huggingface/tgi-gaudi) ### 4. Build TTS Image diff --git a/AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml b/AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml index aba9bb910c..4123034856 100644 --- a/AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml @@ -38,7 +38,7 @@ services: - SYS_NICE restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.6 + image: ghcr.io/huggingface/tgi-gaudi:2.3.1 container_name: tgi-gaudi-server ports: - "3006:80" diff --git a/AvatarChatbot/tests/test_compose_on_gaudi.sh b/AvatarChatbot/tests/test_compose_on_gaudi.sh index 6c167c4467..0fa2afc301 100755 --- a/AvatarChatbot/tests/test_compose_on_gaudi.sh +++ b/AvatarChatbot/tests/test_compose_on_gaudi.sh @@ -42,7 +42,7 @@ function build_docker_images() { service_list="avatarchatbot whisper-gaudi speecht5-gaudi wav2lip-gaudi animation" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 + docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1 docker images && sleep 1s } diff --git a/AvatarChatbot/tests/test_compose_on_rocm.sh b/AvatarChatbot/tests/test_compose_on_rocm.sh index 14cc33a891..73611115d4 100644 --- a/AvatarChatbot/tests/test_compose_on_rocm.sh +++ b/AvatarChatbot/tests/test_compose_on_rocm.sh @@ -30,7 +30,7 @@ function build_docker_images() { service_list="avatarchatbot whisper asr llm-textgen speecht5 tts wav2lip animation" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull 
ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm docker images && sleep 3s } diff --git a/AvatarChatbot/tests/test_compose_on_xeon.sh b/AvatarChatbot/tests/test_compose_on_xeon.sh index ed7bc15699..166d2196bf 100755 --- a/AvatarChatbot/tests/test_compose_on_xeon.sh +++ b/AvatarChatbot/tests/test_compose_on_xeon.sh @@ -42,7 +42,7 @@ function build_docker_images() { service_list="avatarchatbot whisper speecht5 wav2lip animation" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu docker images && sleep 1s } diff --git a/ChatQnA/docker_compose/amd/gpu/rocm/README.md b/ChatQnA/docker_compose/amd/gpu/rocm/README.md index 0edcf44141..54f802c31d 100644 --- a/ChatQnA/docker_compose/amd/gpu/rocm/README.md +++ b/ChatQnA/docker_compose/amd/gpu/rocm/README.md @@ -90,7 +90,7 @@ - #### Optional. 
Pull TGI Docker Image (Do this if you want to use TGI) ```bash - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm ``` - #### Build Docker Images @@ -136,7 +136,7 @@ - opea/dataprep:latest - ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - opea/retriever:latest - - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + - ghcr.io/huggingface/text-generation-inference:2.4.1-rocm - opea/chatqna:latest - opea/chatqna-ui:latest - opea/nginx:latest @@ -147,7 +147,7 @@ - opea/dataprep:latest - ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - opea/retriever:latest - - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + - ghcr.io/huggingface/text-generation-inference:2.4.1-rocm - opea/llm-faqgen:latest - opea/chatqna:latest - opea/chatqna-ui:latest diff --git a/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml b/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml index 2a203ddf1e..ec6fa3fa82 100644 --- a/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml +++ b/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml @@ -79,7 +79,7 @@ services: command: --model-id ${CHATQNA_RERANK_MODEL_ID} --auto-truncate chatqna-tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm container_name: chatqna-tgi-service ports: - "${CHATQNA_TGI_SERVICE_PORT}:80" diff --git a/ChatQnA/docker_compose/amd/gpu/rocm/compose_faqgen.yaml b/ChatQnA/docker_compose/amd/gpu/rocm/compose_faqgen.yaml index ae726f1208..7528880909 100644 --- a/ChatQnA/docker_compose/amd/gpu/rocm/compose_faqgen.yaml +++ b/ChatQnA/docker_compose/amd/gpu/rocm/compose_faqgen.yaml @@ -79,7 +79,7 @@ services: command: --model-id ${CHATQNA_RERANK_MODEL_ID} --auto-truncate chatqna-tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm container_name: 
chatqna-tgi-service ports: - "${CHATQNA_TGI_SERVICE_PORT}:80" diff --git a/ChatQnA/tests/test_compose_on_rocm.sh b/ChatQnA/tests/test_compose_on_rocm.sh index b370bc0c65..261bb54405 100644 --- a/ChatQnA/tests/test_compose_on_rocm.sh +++ b/ChatQnA/tests/test_compose_on_rocm.sh @@ -67,7 +67,7 @@ function build_docker_images() { service_list="chatqna chatqna-ui dataprep retriever nginx" docker compose -f build.yaml build ${service_list} --no-cache > "${LOG_PATH}"/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s diff --git a/CodeGen/docker_compose/intel/cpu/xeon/README.md b/CodeGen/docker_compose/intel/cpu/xeon/README.md index a6b5abcba9..5c5826fbfb 100644 --- a/CodeGen/docker_compose/intel/cpu/xeon/README.md +++ b/CodeGen/docker_compose/intel/cpu/xeon/README.md @@ -127,7 +127,7 @@ docker compose --profile codegen-xeon-tgi up -d Then run the command `docker images`, you will have the following Docker images: - `ghcr.io/huggingface/text-embeddings-inference:cpu-1.5` -- `ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu` +- `ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu` - `opea/codegen-gradio-ui` - `opea/codegen` - `opea/dataprep` @@ -145,7 +145,7 @@ docker compose --profile codegen-xeon-vllm up -d Then run the command `docker images`, you will have the following Docker images: - `ghcr.io/huggingface/text-embeddings-inference:cpu-1.5` -- `ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu` +- `ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu` - `opea/codegen-gradio-ui` - `opea/codegen` - `opea/dataprep` diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/README.md b/CodeGen/docker_compose/intel/hpu/gaudi/README.md index 183f853115..50e40d1ecc 100644 --- a/CodeGen/docker_compose/intel/hpu/gaudi/README.md +++ 
b/CodeGen/docker_compose/intel/hpu/gaudi/README.md @@ -120,7 +120,7 @@ docker compose --profile codegen-gaudi-tgi up -d Then run the command `docker images`, you will have the following Docker images: - `ghcr.io/huggingface/text-embeddings-inference:cpu-1.5` -- `ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu` +- `ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu` - `opea/codegen-gradio-ui` - `opea/codegen` - `opea/dataprep` @@ -138,7 +138,7 @@ docker compose --profile codegen-gaudi-vllm up -d Then run the command `docker images`, you will have the following Docker images: - `ghcr.io/huggingface/text-embeddings-inference:cpu-1.5` -- `ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu` +- `ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu` - `opea/codegen-gradio-ui` - `opea/codegen` - `opea/dataprep` diff --git a/DocSum/docker_compose/amd/gpu/rocm/README.md b/DocSum/docker_compose/amd/gpu/rocm/README.md index 79f88d0dd0..b432bd5a08 100644 --- a/DocSum/docker_compose/amd/gpu/rocm/README.md +++ b/DocSum/docker_compose/amd/gpu/rocm/README.md @@ -71,7 +71,7 @@ - #### Optional. 
Pull TGI Docker Image (Do this if you want to use TGI) ```bash - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm ``` - #### Build Docker Images @@ -98,7 +98,7 @@ ##### TGI-based application: - - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + - ghcr.io/huggingface/text-generation-inference:2.4.1-rocm - opea/llm-docsum:latest - opea/whisper:latest - opea/docsum:latest diff --git a/DocSum/tests/test_compose_on_rocm.sh b/DocSum/tests/test_compose_on_rocm.sh index c0d4c22d0a..f322dc547c 100644 --- a/DocSum/tests/test_compose_on_rocm.sh +++ b/DocSum/tests/test_compose_on_rocm.sh @@ -43,7 +43,7 @@ function build_docker_images() { service_list="docsum docsum-gradio-ui whisper llm-docsum" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm docker images && sleep 3s } diff --git a/DocSum/tests/test_compose_vllm_on_rocm.sh b/DocSum/tests/test_compose_vllm_on_rocm.sh index 40d10b039a..b8e3064557 100644 --- a/DocSum/tests/test_compose_vllm_on_rocm.sh +++ b/DocSum/tests/test_compose_vllm_on_rocm.sh @@ -44,7 +44,7 @@ function build_docker_images() { service_list="docsum docsum-gradio-ui whisper llm-docsum vllm-rocm" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm docker images && sleep 3s } diff --git a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md index 14e66d989a..8489ab366e 100644 --- a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md +++ b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md @@ -77,7 +77,7 @@ After launching your 
instance, you can connect to it using SSH (for Linux instan - #### Optional. Pull TGI Docker Image (Do this if you want to use TGI) ```bash - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm ``` - #### Build Docker Images diff --git a/VisualQnA/docker_compose/amd/gpu/rocm/README.md b/VisualQnA/docker_compose/amd/gpu/rocm/README.md index 1647b16b2a..9a582c9dbf 100644 --- a/VisualQnA/docker_compose/amd/gpu/rocm/README.md +++ b/VisualQnA/docker_compose/amd/gpu/rocm/README.md @@ -71,7 +71,7 @@ - #### Optional. Pull TGI Docker Image (Do this if you want to use TGI) ```bash - docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm + docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm ``` - #### Build Docker Images