Commit e16bc9b
Added possibility to pass PAD_TOKEN_ID
AsyaPronina committed Jan 15, 2025
1 parent 724cf80 commit e16bc9b
Showing 5 changed files with 17 additions and 3 deletions.
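
For orientation, a rough sketch of how a caller might pass the new option when compiling an LLM for NPU. The NPU_USE_NPUW and NPUW_LLM flags are existing NPUW entry points; the model path and the pad token id value 2 are illustrative assumptions (the id is tokenizer-specific), not taken from this commit.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // hypothetical model path

    // The pad token id is read at compile time together with the other
    // NPUW_LLM_* options registered in this commit.
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       {{"NPU_USE_NPUW", "YES"},
                                        {"NPUW_LLM", "YES"},
                                        {"NPUW_LLM_PAD_TOKEN_ID", static_cast<int64_t>(2)}});
    return 0;
}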
@@ -70,6 +70,7 @@ DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
 DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
 DEFINE_OPT(NPUW_LLM_BATCH_DIM, uint32_t, 0, npuw::llm::batch_dim, CompileTime);
 DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, CompileTime);
+DEFINE_OPT(NPUW_LLM_PAD_TOKEN_ID, int64_t, 0, npuw::llm::pad_token_id, CompileTime);
 DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
 DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
 DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, false, npuw::llm::optimize_v_tensors, CompileTime);
@@ -407,6 +407,16 @@ static constexpr ov::Property<uint32_t> batch_dim{"NPUW_LLM_BATCH_DIM"};
  */
 static constexpr ov::Property<uint32_t> seq_len_dim{"NPUW_LLM_SEQ_LEN_DIM"};
 
+/**
+ * @brief
+ * TODO: Check that it is indeed needed, or remove.
+ * Type: int64_t.
+ * Pad token id, required to fill the input of the prefill model until the
+ * useful tokens are met.
+ * Default value: 0.
+ */
+static constexpr ov::Property<int64_t> pad_token_id{"NPUW_LLM_PAD_TOKEN_ID"};
+
 /**
  * @brief
  * Type: uint32_t.
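
Note that an ov::Property<int64_t> such as pad_token_id above also acts as a typed key/value factory. A small sketch, with namespace qualification omitted and assumed from the surrounding header:

// pad_token_id(2) yields {"NPUW_LLM_PAD_TOKEN_ID", ov::Any(int64_t{2})}.
ov::AnyMap cfg{pad_token_id(2)};

// The same typed key reads the value back.
int64_t id = cfg.at(pad_token_id.name()).as<int64_t>();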
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -59,6 +59,7 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
     desc.add<NPUW_LLM>();
     desc.add<NPUW_LLM_BATCH_DIM>();
     desc.add<NPUW_LLM_SEQ_LEN_DIM>();
+    desc.add<NPUW_LLM_PAD_TOKEN_ID>();
     desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
     desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
     desc.add<NPUW_LLM_OPTIMIZE_V_TENSORS>();
5 changes: 3 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -454,7 +454,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     m_cfg.update(any_copy(npuw_llm_props));
 
     LOG_DEBUG("1. Creating kvcache model as clone of passed one.");
-    auto kvcache_model = model->clone();
+    auto kvcache_model = model;
     LOG_DEBUG("2. Transform kvcache model from stateful to stateless.");
     ov::pass::StatefulToStateless().run_on_model(kvcache_model);
     LOG_DEBUG("3. Creating prefill model as clone of transformed kvcache one.");
@@ -467,7 +467,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     const uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
     const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);
 
-    m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim};
+    m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim, false};
     LOG_DEBUG("4. Make prefill model with static shapes");
     reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
     LOG_DEBUG("5. Make kvcache model with static shapes");
@@ -722,6 +722,7 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
     m_prop_to_opt.insert({BIND(npuw::llm::enabled, NPUW_LLM, get),
                           BIND(npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM, get),
                           BIND(npuw::llm::seq_len_dim, NPUW_LLM_SEQ_LEN_DIM, get),
+                          BIND(npuw::llm::pad_token_id, NPUW_LLM_PAD_TOKEN_ID, get),
                           BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get),
                           BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get),
                           BIND(npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS, get),
3 changes: 2 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -124,7 +124,8 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
 }
 
 void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
-    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0);
+    const auto pad_token_id = m_npuw_llm_compiled_model->m_cfg.get<::intel_npu::NPUW_LLM_PAD_TOKEN_ID>();
+    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), pad_token_id);
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
    fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
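
fill_tensor is likewise defined outside this hunk. A plausible minimal form, assuming it simply floods a static-shape tensor with one value:

#include <algorithm>

#include "openvino/runtime/itensor.hpp"
#include "openvino/runtime/so_ptr.hpp"

// Hypothetical sketch of the fill_tensor helper used above: write
// `fill_val` into every element of the tensor.
template <typename T>
void fill_tensor(ov::SoPtr<ov::ITensor> tensor, T fill_val) {
    T* data = tensor->data<T>();
    std::fill_n(data, tensor->get_size(), fill_val);
}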
