From 935550fa1078f2492573185513de5b0c7ee37f14 Mon Sep 17 00:00:00 2001
From: TolyaTalamanov
Date: Thu, 16 Jan 2025 13:33:36 +0000
Subject: [PATCH] Not copy kvcache after last token generated

---
 src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index 2e987036483e34..647a4b9f53142b 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -229,6 +229,10 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
     m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at("logits"));
     kvcache_desc.num_stored_tokens += 1;
 
+    if (kvcache_desc.num_stored_tokens == kvcache_desc.total_size) {
+        return;
+    }
+
     LOG_DEBUG("Write KV-cache for the new token to the correct input position for next iteration.");
     const std::size_t kStartOutputKVCacheLayers = 1u;
     const auto& kvcache_compiled = m_kvcache_request->get_compiled_model();
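
Note: below is a minimal standalone C++ sketch (not the plugin code itself) illustrating the optimization this patch makes. Once num_stored_tokens reaches total_size, the KV-cache is full and generation cannot continue, so copying the newly produced K/V tensors back into the cache inputs for a next iteration that will never run is wasted work. The names KVCacheDesc, write_back_kv, and generate_step are hypothetical simplifications; only num_stored_tokens and total_size come from the patch.

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical, simplified stand-in for the plugin's KV-cache descriptor.
struct KVCacheDesc {
    std::size_t num_stored_tokens = 0;  // tokens currently held in the cache
    std::size_t total_size = 0;         // maximum context length of the cache
};

// Stand-in for the per-layer tensor copy the patch skips: write the new
// token's K/V values into the cache slot used by the next iteration.
void write_back_kv(std::vector<float>& cache, std::size_t pos, float k, float v) {
    cache[2 * pos] = k;
    cache[2 * pos + 1] = v;
}

void generate_step(KVCacheDesc& desc, std::vector<float>& cache, float k, float v) {
    // ... decoder inference ran, logits for the new token were read ...
    desc.num_stored_tokens += 1;

    // The patch's early return: the cache is now full, so no next iteration
    // can consume the copied K/V; skip the (otherwise per-layer) copy.
    if (desc.num_stored_tokens == desc.total_size) {
        return;
    }

    write_back_kv(cache, desc.num_stored_tokens - 1, k, v);
}

int main() {
    KVCacheDesc desc{/*num_stored_tokens=*/2, /*total_size=*/4};
    std::vector<float> cache(2 * desc.total_size, 0.0f);

    generate_step(desc, cache, 0.1f, 0.2f);  // stores token 3, K/V copied
    generate_step(desc, cache, 0.3f, 0.4f);  // stores token 4, copy skipped

    std::cout << "stored " << desc.num_stored_tokens << "/" << desc.total_size
              << " tokens\n";
}

Design note: the early return sits after the counter increment, so the final token's logits are still produced and exposed through m_logits; only the write-back of KV state for the nonexistent next step is skipped.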