From 926a9cfdbb24d41db96bbdb944d8768772286298 Mon Sep 17 00:00:00 2001
From: sbalandi
Date: Tue, 14 Jan 2025 14:39:04 +0000
Subject: [PATCH] ci fix

---
 .github/workflows/causal_lm_cpp.yml           | 36 +++++++--------
 .../openvino/genai/generation_config.hpp      |  2 +
 src/cpp/src/generation_config.cpp             |  1 +
 src/cpp/src/icontinuous_batching.cpp          |  5 ++-
 src/cpp/src/llm_pipeline_stateful.cpp         |  4 +-
 src/cpp/src/llm_pipeline_static.cpp           |  4 +-
 .../src/visual_language/inputs_embedder.cpp   | 44 ++++++++++---------
 .../src/visual_language/inputs_embedder.hpp   |  2 +-
 src/cpp/src/visual_language/pipeline.cpp      |  2 +-
 src/python/py_generation_config.cpp           |  1 +
 tests/python_tests/common.py                  |  2 +-
 11 files changed, 56 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 99104056df..22bf003f9b 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -121,8 +121,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           prompt = 'Why is the Sun yellow?'
           if tokenizer.chat_template:
-              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -140,8 +140,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           prompt = '69'
           if tokenizer.chat_template:
-              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -160,7 +160,7 @@ jobs:
           prompt = 'Hi'
           if tokenizer.chat_template:
               prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -178,8 +178,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           prompt = 'return 0'
           if tokenizer.chat_template:
-              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -197,8 +197,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           prompt = '你好! 你好嗎?'
           if tokenizer.chat_template:
-              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref.replace('�', ''))
@@ -221,8 +221,8 @@ jobs:
           ]
           for prompt in prompts:
               if tokenizer.chat_template:
-                  prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-              tokenized = tokenizer(prompt, return_tensors='pt')
+                  prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+              tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
               for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
                   ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
                   idx = predictions.find(ref.replace('�', ''))
@@ -272,9 +272,9 @@ jobs:
           echo predictions = open('cpp.txt', 'r').read() >> ref.py
           echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py
           echo prompt = '69'
-          echo if tokenizer.chat_template:
-          echo     prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          echo tokenized = tokenizer(prompt, return_tensors='pt') >> ref.py
+          echo if tokenizer.chat_template:
+          echo     prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          echo tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) >> ref.py
           echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py
           echo     ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
           echo     idx = predictions.find(ref) >> ref.py
@@ -581,8 +581,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
           prompt = 'Alan Turing was a'
           if tokenizer.chat_template:
-              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
               ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -639,8 +639,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
           prompt = 'Alan Turing was a'
           if tokenizer.chat_template:
-              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
               ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index 3a75fc02ea..cd372e635d 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -128,6 +128,8 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
 
     std::optional<AdapterConfig> adapters;
 
+    bool apply_chat_template = true;
+
     /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0.
      * Otherwise verifies eos_token_id == tokenizer_eos_token_id.
      */
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 67682be787..dba5aaa5bd 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -125,6 +125,7 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& properties) {
     read_anymap_param(properties, "logprobs", logprobs);
     read_anymap_param(properties, "num_return_sequences", num_return_sequences);
     read_anymap_param(properties, "adapters", adapters);
+    read_anymap_param(properties, "apply_chat_template", apply_chat_template);
 
     // penalties
     read_anymap_param(properties, "frequency_penalty", frequency_penalty);
diff --git a/src/cpp/src/icontinuous_batching.cpp b/src/cpp/src/icontinuous_batching.cpp
index d4fa7c3e5d..6b748d6665 100644
--- a/src/cpp/src/icontinuous_batching.cpp
+++ b/src/cpp/src/icontinuous_batching.cpp
@@ -53,10 +53,11 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     } else {
         input_ids.reserve(prompts.size());
         timer.start();
-        for (const std::string& prompt : prompts) {
+        for (size_t i = 0; i < prompts.size(); i++) {
+            const std::string& prompt = prompts.at(i);
             const auto encode_start = std::chrono::steady_clock::now();
             ov::Tensor encoded_inputs;
-            if (!m_tokenizer.get_chat_template().empty()) {
+            if (sampling_params.at(i).apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
                 ChatHistory history({{{"role", "user"}, {"content", prompt}}});
                 constexpr bool add_generation_prompt = true;
                 auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp
index 18e9d30ebc..14a3f1b651 100644
--- a/src/cpp/src/llm_pipeline_stateful.cpp
+++ b/src/cpp/src/llm_pipeline_stateful.cpp
@@ -90,7 +90,7 @@ DecodedResults StatefulLLMPipeline::generate(
         OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
         std::vector<std::string> templated_input_vector;
         for (auto& input : *input_vector) {
-            if (!m_tokenizer.get_chat_template().empty()) {
+            if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
                 ChatHistory history({{{"role", "user"}, {"content", input}}});
                 constexpr bool add_generation_prompt = true;
                 auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
@@ -170,7 +170,7 @@ DecodedResults StatefulLLMPipeline::generate(
         // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
     } else {
         std::string& prompt = *input_prompt;
-        if (!m_tokenizer.get_chat_template().empty()) {
+        if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
            ChatHistory history({{{"role", "user"}, {"content", prompt}}});
            constexpr bool add_generation_prompt = true;
            auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index be3d7552c8..1f4e5ffcec 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -805,7 +805,7 @@ DecodedResults StatefulLLMPipeline::generate(
         // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF
         tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
     } else {
-        if (!m_tokenizer.get_chat_template().empty()) {
+        if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
            ChatHistory history({{{"role", "user"}, {"content", prompt}}});
            constexpr bool add_generation_prompt = true;
            auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
@@ -1281,7 +1281,7 @@ DecodedResults StatelessLLMPipeline::generate(
         // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF
         tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
     } else {
-        if (!m_tokenizer.get_chat_template().empty()) {
+        if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
            ChatHistory history({{{"role", "user"}, {"content", prompt}}});
            constexpr bool add_generation_prompt = true;
            auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index 616a12bb6f..e3ce5ab067 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -50,7 +50,7 @@ class InputsEmbedder::IInputsEmbedder {
     ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
 
 public:
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) = 0;
 
     virtual std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size) {
         ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }};
@@ -155,7 +155,7 @@ class InputsEmbedder::IInputsEmbedder {
         ),
         m_tokenizer(tokenizer) { }
 
-    ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}) {
+    ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}, bool apply_chat_template = true) {
         ov::Tensor encoded_input_ids;
         if (m_is_chat_conversation) {
             // KV cache in model already contains prompts and answers from previous iterations.
@@ -223,15 +223,19 @@ class InputsEmbedder::IInputsEmbedder {
                 m_tokenized_history.clear();
                 std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));
         } else {
-            std::string templated_prompt;
+            std::string templated_prompt = prompt;
             ChatHistory history({{{"role", "user"}, {"content", prompt}}});
             constexpr bool add_generation_prompt = true;
-            if (!m_tokenizer.get_chat_template().empty()) {
-                templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
-            } else {
-                // Use fallback chat template if it was not found in tokenizer_config.json
-                templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback);
+            if (apply_chat_template) {
+                if (!m_tokenizer.get_chat_template().empty()) {
+                    templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+                    std::cout << " m_tokenizer 1 " << std::endl;
+                } else {
+                    // Use fallback chat template if it was not found in tokenizer_config.json
+                    templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback);
+                    std::cout << " m_tokenizer 2 " << std::endl;
+                }
             }
 
             auto start_tokenizer_time = std::chrono::steady_clock::now();
@@ -331,7 +335,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
         m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
     }
 
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override {
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override {
        std::string images_prompt;
        std::vector<ov::Tensor> embeds;
 
@@ -366,7 +370,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
         }
         images_prompt += prompt;
 
-        ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics);
+        ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics, {}, apply_chat_template);
 
         ov::Tensor inputs_embeds = m_embedding.infer(encoded_input);
         OPENVINO_ASSERT(
@@ -629,7 +633,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder {
         const ov::AnyMap device_config) :
         IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { }
 
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override {
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override {
         std::string image_token = m_vlm_config.im_start;
         // Adapted from llava-1.5-7b-hf chat_template.json
         std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}";
@@ -647,7 +651,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder {
         }
         formatted_prompt += prompt;
 
-        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback);
+        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template);
         ov::Tensor text_embeds = m_embedding.infer(input_ids);
 
         if (images.empty()) {
@@ -742,7 +746,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA {
         const ov::AnyMap device_config) :
         InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { }
 
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override {
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override {
         std::string image_token = m_vlm_config.im_start;
         // Adapted from llava-1.5-7b-hf chat_template.json
         std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}";
@@ -774,7 +778,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA {
         }
         formatted_prompt += prompt;
 
-        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback);
+        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template);
         ov::Tensor text_embeds = m_embedding.infer(input_ids);
 
         if (images.empty()) {
@@ -1069,7 +1073,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder {
         const ov::AnyMap device_config) :
         IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { }
 
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override {
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override {
         std::string image_start_token = m_vlm_config.image_start_token;
         std::string image_context_token = m_vlm_config.image_context_token;
         std::string image_end_token = m_vlm_config.image_end_token;
@@ -1097,7 +1101,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder {
         }
         formatted_prompt += prompt;
 
-        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics);
+        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, {}, apply_chat_template);
         ov::Tensor text_embeds = m_embedding.infer(input_ids);
 
         if (images.empty()) {
@@ -1214,7 +1218,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder {
         ).create_infer_request();
     }
 
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override {
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override {
         std::string formatted_prompt;
 
         std::vector<ov::Tensor> single_images = to_single_image_tensors(images);
@@ -1246,7 +1250,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder {
         // Adapted from Qwen/Qwen2-7B-Instruct
         std::string chat_template_fallback = "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}";
 
-        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback);
+        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template);
         ov::Tensor text_embeds = m_embedding.infer(input_ids);
 
         if (images.empty()) {
@@ -1616,8 +1620,8 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config,
     }
 }
 
-ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
-    return m_impl->get_inputs_embeds(prompt, images, metrics);
+ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) {
+    return m_impl->get_inputs_embeds(prompt, images, metrics, apply_chat_template);
 }
 
 std::pair<ov::Tensor, std::optional<int64_t>> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) {
diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp
index 223d090b22..b21d0198cc 100644
--- a/src/cpp/src/visual_language/inputs_embedder.hpp
+++ b/src/cpp/src/visual_language/inputs_embedder.hpp
@@ -32,7 +32,7 @@ class InputsEmbedder {
                    const ov::AnyMap device_config);
 
     // compute input embedding for prompt and multiple images
-    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics);
+    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template);
 
     // compute position ids for language model input
     std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size);
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 95e3064548..415e7fcae9 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -166,7 +166,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         generation_config.validate();
 
         auto start_get_inputs_embeds = std::chrono::steady_clock::now();
-        ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics);
+        ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics, generation_config.apply_chat_template);
         auto end_get_inputs_embeds = std::chrono::steady_clock::now();
 
         auto to_remove_from_hist = m_inputs_embedder->get_num_tokens_to_remove_from_hist();
diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp
index e2a6d7062c..a7d7789a55 100644
--- a/src/python/py_generation_config.cpp
+++ b/src/python/py_generation_config.cpp
@@ -115,6 +115,7 @@ void init_generation_config(py::module_& m) {
         .def_readwrite("include_stop_str_in_output", &GenerationConfig::include_stop_str_in_output)
         .def_readwrite("stop_token_ids", &GenerationConfig::stop_token_ids)
         .def_readwrite("adapters", &GenerationConfig::adapters)
+        .def_readwrite("apply_chat_template", &GenerationConfig::apply_chat_template)
         .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id"))
         .def("is_beam_search", &GenerationConfig::is_beam_search)
         .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding)
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index ed6263a284..8f2d2b05b9 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -274,7 +274,7 @@ def run_hugging_face(
     else:
         processed_prompts = prompts
     # process all prompts as a single batch as we have a single generation config for all prompts
-    inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left')
+    inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=False, padding_side='left')
     input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
     hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs)
     hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
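
For reference, the new apply_chat_template field added to GenerationConfig above is exposed to Python through the def_readwrite binding, so callers can switch chat-template application off per generation call. A minimal usage sketch, assuming an OpenVINO-exported chat model in a local directory (the directory name and prompt below are placeholders, not part of this patch):

    import openvino_genai as ov_genai

    # Hypothetical local directory with an OpenVINO-exported chat model; adjust to your setup.
    pipe = ov_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0-ov", "CPU")

    config = ov_genai.GenerationConfig()
    config.max_new_tokens = 20
    # Flag introduced by this patch: when False, the tokenizer's chat template is not applied,
    # so the prompt is tokenized exactly as passed in.
    config.apply_chat_template = False

    print(pipe.generate("Why is the Sun yellow?", config))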