diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 2e0afaa882..9cc60648ad 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -120,7 +120,10 @@ jobs:
           with open('pred.txt', 'r') as file:
               predictions = file.read()
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
-          tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt')
+          prompt = 'Why is the Sun yellow?'
+          if tokenizer.chat_template:
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -136,7 +139,10 @@ jobs:
           with open('pred.txt', 'r') as file:
               predictions = file.read()
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
-          tokenized = tokenizer('69', return_tensors='pt')
+          prompt = '69'
+          if tokenizer.chat_template:
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -152,7 +158,10 @@ jobs:
           with open('pred.txt', 'r') as file:
               predictions = file.read()
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
-          tokenized = tokenizer('Hi', return_tensors='pt')
+          prompt = 'Hi'
+          if tokenizer.chat_template:
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -168,7 +177,10 @@ jobs:
           with open('pred.txt', 'r') as file:
               predictions = file.read()
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
-          tokenized = tokenizer('return 0', return_tensors='pt')
+          prompt = 'return 0'
+          if tokenizer.chat_template:
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -184,7 +196,10 @@ jobs:
           with open('pred.txt', 'r', errors='ignore') as file:
               predictions = file.read()
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
-          tokenized = tokenizer('你好! 你好嗎?', return_tensors='pt')
+          prompt = '你好! 你好嗎?'
+          if tokenizer.chat_template:
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref.replace('�', ''))
@@ -206,7 +221,9 @@ jobs:
               '你好! 你好嗎?'
           ]
           for prompt in prompts:
-              tokenized = tokenizer(prompt, return_tensors='pt')
+              if tokenizer.chat_template:
+                  prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+              tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
               for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
                   ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
                   idx = predictions.find(ref.replace('�', ''))
@@ -255,7 +272,10 @@ jobs:
        echo import transformers > ref.py
        echo predictions = open('cpp.txt', 'r').read() >> ref.py
        echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py
-        echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py
+        echo prompt = '69' >> ref.py
+        echo if tokenizer.chat_template: >> ref.py
+        echo     prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) >> ref.py
+        echo tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) >> ref.py
        echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py
        echo     ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
        echo     idx = predictions.find(ref) >> ref.py
@@ -562,7 +582,10 @@ jobs:
           with open('pred_greedy.txt', 'r') as file:
               predictions = file.read()
           tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
-          tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
+          prompt = 'Alan Turing was a'
+          if tokenizer.chat_template:
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
               ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -617,7 +640,10 @@ jobs:
           with open('pred_greedy.txt', 'r') as file:
               predictions = file.read()
           tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
-          tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
+          prompt = 'Alan Turing was a'
+          if tokenizer.chat_template:
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
               ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
diff --git a/README.md b/README.md
index cea1e358bc..221a81c6c3 100644
--- a/README.md
+++ b/README.md
@@ -133,7 +133,6 @@ from PIL import Image

 # Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
 pipe = openvino_genai.VLMPipeline("./InternVL2-1B", "CPU")
-pipe.start_chat()

 image = Image.open("dog.jpg")
 image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
diff --git a/samples/cpp/text_generation/README.md b/samples/cpp/text_generation/README.md
index d9e5bd8d22..c05fa25f9c 100644
--- a/samples/cpp/text_generation/README.md
+++ b/samples/cpp/text_generation/README.md
@@ -62,7 +62,7 @@ Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat
 ./chat_sample
 ```
 #### Missing chat template
-If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model.
+If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work around this, manually add a chat template to the tokenizer_config.json of your model, or update it by calling `pipe.get_tokenizer().set_chat_template(new_chat_template)`.
 The following template can be used as a default, but it may not work properly with every model:
 ```
 "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}",
diff --git a/samples/python/text_generation/README.md b/samples/python/text_generation/README.md
index 9940904cfb..db2f6b0d5f 100644
--- a/samples/python/text_generation/README.md
+++ b/samples/python/text_generation/README.md
@@ -62,7 +62,7 @@ Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat
 python chat_sample.py model_dir
 ```
 #### Missing chat template
-If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model.
+If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work around this, manually add a chat template to the tokenizer_config.json of your model, or update it by calling `pipe.get_tokenizer().set_chat_template(new_chat_template)`.
 The following template can be used as a default, but it may not work properly with every model:
 ```
 "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}",
diff --git a/src/README.md b/src/README.md
index af4953f98a..c2ed8c2a60 100644
--- a/src/README.md
+++ b/src/README.md
@@ -73,6 +73,8 @@ output:
 'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. The arrangement of carbon atoms in'
 ```
+>**Note**: A chat_template from tokenizer_config.json or from the tokenizer/detokenizer model is applied to the prompt automatically at the generation stage. To disable this behavior, call `pipe.get_tokenizer().set_chat_template("")`.
+
 A simple chat in Python:
 ```python
 import openvino_genai as ov_genai
diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index 3a75fc02ea..cd372e635d 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -128,6 +128,8 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     std::optional<AdapterConfig> adapters;
+    bool apply_chat_template = true;
+
     /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0.
      * Otherwise verifies eos_token_id == tokenizer_eos_token_id.
      */
diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp
index 31b1ac1675..26232574dc 100644
--- a/src/cpp/include/openvino/genai/llm_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -177,6 +177,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     * @param generation_config optional GenerationConfig
     * @param streamer optional streamer
     * @return DecodedResults decoded resulting text
+    * The chat_template is applied to the prompt; run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it.
+    * To disable it in non-chat mode, set an empty custom_chat_template ("") or set generation_config.apply_chat_template to false.
     */
     DecodedResults generate(
         StringInputs inputs,
@@ -191,6 +193,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     * @param inputs input prompt or a vector of prompts
     * @param properties properties
     * @return DecodedResults decoded resulting text
+    * The chat_template is applied to the prompt; run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it.
+    * To disable it in non-chat mode, set an empty custom_chat_template ("") or set generation_config.apply_chat_template to false.
     */
    template <typename... Properties>
    util::EnableIfAllStringAny<DecodedResults, Properties...> generate(
diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 0a54d1da2a..bde4eb3fe1 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -221,6 +221,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     /// @param chat_template The new template to override with.
     void set_chat_template(const std::string& chat_template);
+    // Returns the current chat template, e.g. to check whether it is set or empty.
+    std::string get_chat_template() const;
+
     // information about <bos>, <eos> tokens should be public,
     // they are used at least in StreamerBase descendants
     int64_t get_bos_token_id() const;
diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp
index 8c3d380b0f..b6b1d5c7f6 100644
--- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp
+++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp
@@ -98,6 +98,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// @param generation_config A config to follow for text generation.
     /// @param streamer A streamer to acquire intermediate result.
     /// @return A string generated by a model.
+    /// The chat_template is applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it.
+    /// To disable it in non-chat mode, set an empty custom_chat_template ("") or set generation_config.apply_chat_template to false.
     VLMDecodedResults generate(
         const std::string& prompt,
         const std::vector<ov::Tensor>& rgbs,
@@ -111,6 +113,8 @@
     /// @param generation_config A config to follow for text generation.
     /// @param streamer A streamer to acquire intermediate result.
     /// @return A string generated by a model.
+    /// The chat_template is applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it.
+    /// To disable it in non-chat mode, set an empty custom_chat_template ("") or set generation_config.apply_chat_template to false.
     VLMDecodedResults generate(
         const std::string& prompt,
         const ov::Tensor& rgb,
@@ -124,6 +128,8 @@
     /// for its members, StreamerVariant a single image or multiple
     /// images.
     /// @return A string generated by a model.
+    /// The chat_template is applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it.
+    /// To disable it in non-chat mode, set an empty custom_chat_template ("") or set generation_config.apply_chat_template to false.
     VLMDecodedResults generate(
         const std::string& prompt,
         const ov::AnyMap& config_map
@@ -137,6 +143,8 @@
     /// @param ...properties ov::Property instances to be combined into
     /// ov::AnyMap.
     /// @return A string generated by a model.
+    /// The chat_template is applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it.
+    /// To disable it in non-chat mode, set an empty custom_chat_template ("") or set generation_config.apply_chat_template to false.
template util::EnableIfAllStringAny generate( const std::string& prompt, diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 67682be787..dba5aaa5bd 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -125,6 +125,7 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& properties) { read_anymap_param(properties, "logprobs", logprobs); read_anymap_param(properties, "num_return_sequences", num_return_sequences); read_anymap_param(properties, "adapters", adapters); + read_anymap_param(properties, "apply_chat_template", apply_chat_template); // penalties read_anymap_param(properties, "frequency_penalty", frequency_penalty); diff --git a/src/cpp/src/icontinuous_batching.cpp b/src/cpp/src/icontinuous_batching.cpp index 78f8fda8f7..6b748d6665 100644 --- a/src/cpp/src/icontinuous_batching.cpp +++ b/src/cpp/src/icontinuous_batching.cpp @@ -53,9 +53,20 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( } else { input_ids.reserve(prompts.size()); timer.start(); - for (const std::string& prompt : prompts) { + for (size_t i = 0; i < prompts.size(); i++) { + const std::string& prompt = prompts.at(i); const auto encode_start = std::chrono::steady_clock::now(); - input_ids.push_back(m_tokenizer.encode(prompt).input_ids); + ov::Tensor encoded_inputs; + if (sampling_params.at(i).apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + encoded_inputs = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids; + } else { + // in case when chat_template was not found in tokenizer_config.json or set + encoded_inputs = m_tokenizer.encode(prompt).input_ids; + } + input_ids.push_back(encoded_inputs); tokenization_durations.emplace_back(PerfMetrics::get_microsec(std::chrono::steady_clock::now() - encode_start)); } timer.end(); diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp index 2a53154c27..14a3f1b651 100644 --- a/src/cpp/src/llm_pipeline_stateful.cpp +++ b/src/cpp/src/llm_pipeline_stateful.cpp @@ -88,7 +88,19 @@ DecodedResults StatefulLLMPipeline::generate( if (auto input_vector = std::get_if>(&inputs)) { OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts"); - encoded_input = m_tokenizer.encode(*input_vector); + std::vector templated_input_vector; + for (auto& input : *input_vector) { + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", input}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + templated_input_vector.push_back(templated_prompt); + } else { + // in case when chat_template was not found in tokenizer_config.json or set + templated_input_vector.push_back(input); + } + } + encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false)); } else if (auto input_prompt = std::get_if(&inputs)) { std::string& prompt = *input_prompt; @@ -104,7 +116,7 @@ DecodedResults StatefulLLMPipeline::generate( m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; - auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, 
add_generation_prompt); + auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); // Do not add special tokens in chat scenario to be aligned with HF. auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)); auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); @@ -157,7 +169,16 @@ DecodedResults StatefulLLMPipeline::generate( // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied } else { - encoded_input = m_tokenizer.encode(prompt); + std::string& prompt = *input_prompt; + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + encoded_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)); + } else { + // in case when chat_template was not found in tokenizer_config.json or set + encoded_input = m_tokenizer.encode(prompt); + } } } diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index b9a55477cb..1f4e5ffcec 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -805,7 +805,15 @@ DecodedResults StatefulLLMPipeline::generate( // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false)); } else { - tokenized_input = m_tokenizer.encode(prompt); + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + tokenized_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)); + } else { + // in case when chat_template was not found in tokenizer_config.json or set + tokenized_input = m_tokenizer.encode(prompt); + } } auto encode_stop_time = std::chrono::steady_clock::now(); @@ -1273,7 +1281,15 @@ DecodedResults StatelessLLMPipeline::generate( // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false)); } else { - tokenized_input = m_tokenizer.encode(prompt); + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + tokenized_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)); + } else { + // in case when chat_template was not found in tokenizer_config.json or set + tokenized_input = m_tokenizer.encode(prompt); + } } auto encode_stop_time = std::chrono::steady_clock::now(); diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 2f18e87839..441b422584 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -587,6 +587,10 @@ class Tokenizer::TokenizerImpl { void set_chat_template(const std::string& chat_template) { m_chat_template = patch_chat_template(chat_template); } + + std::string 
get_chat_template() { + return m_chat_template; + } }; Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties) { @@ -690,6 +694,10 @@ std::string Tokenizer::apply_chat_template(ChatHistory history, return m_pimpl->apply_chat_template(history, add_generation_prompt, chat_template); } +std::string Tokenizer::get_chat_template() const { + return m_pimpl->get_chat_template(); +} + void Tokenizer::set_chat_template(const std::string& chat_template) { m_pimpl->set_chat_template(chat_template); } diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 4f3812862c..9608bce385 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -50,7 +50,7 @@ class InputsEmbedder::IInputsEmbedder { ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; public: - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) = 0; + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) = 0; virtual std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }}; @@ -155,7 +155,7 @@ class InputsEmbedder::IInputsEmbedder { ), m_tokenizer(tokenizer) { } - ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}) { + ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}, bool apply_chat_template = true) { ov::Tensor encoded_input_ids; if (m_is_chat_conversation) { // KV cache in model already contains prompts and answers from previous iterations. 
@@ -169,9 +169,9 @@ class InputsEmbedder::IInputsEmbedder { m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; std::string new_templated_chat_history; - try { + if (!m_tokenizer.get_chat_template().empty()) { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); - } catch (const std::exception& error) { + } else { // Use fallback chat template if it was not found in tokenizer_config.json new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); } @@ -224,7 +224,21 @@ class InputsEmbedder::IInputsEmbedder { std::copy_n(new_chat_tokens.data(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); } else { auto start_tokenizer_time = std::chrono::steady_clock::now(); - encoded_input_ids = m_tokenizer.encode(prompt).input_ids; + if (apply_chat_template) { + std::string templated_prompt; + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + + if (!m_tokenizer.get_chat_template().empty()) { + templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + } else { + // Use fallback chat template if it was not found in tokenizer_config.json + templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback); + } + encoded_input_ids = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids; + } else { + encoded_input_ids = m_tokenizer.encode(prompt).input_ids; + } auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_tokenized_history.clear(); @@ -320,7 +334,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override { std::string images_prompt; std::vector embeds; @@ -355,7 +369,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { } images_prompt += prompt; - ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics); + ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics, {}, apply_chat_template); ov::Tensor inputs_embeds = m_embedding.infer(encoded_input); OPENVINO_ASSERT( @@ -618,7 +632,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif 
%}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; @@ -636,7 +650,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { } formatted_prompt += prompt; - ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback); + ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template); ov::Tensor text_embeds = m_embedding.infer(input_ids); if (images.empty()) { @@ -731,7 +745,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { const ov::AnyMap device_config) : InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; @@ -763,7 +777,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { } formatted_prompt += prompt; - ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback); + ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template); ov::Tensor text_embeds = m_embedding.infer(input_ids); if (images.empty()) { @@ -1058,7 +1072,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override { std::string image_start_token = m_vlm_config.image_start_token; std::string image_context_token = m_vlm_config.image_context_token; std::string image_end_token = m_vlm_config.image_end_token; @@ -1086,7 +1100,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { } formatted_prompt += prompt; - ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics); + ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, {}, apply_chat_template); ov::Tensor text_embeds = m_embedding.infer(input_ids); if (images.empty()) { @@ -1203,7 +1217,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { ).create_infer_request(); } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override { std::string formatted_prompt; std::vector single_images = to_single_image_tensors(images); @@ -1235,7 +1249,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { // Adapted from 
Qwen/Qwen2-7B-Instruct std::string chat_template_fallback = "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"; - ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback); + ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template); ov::Tensor text_embeds = m_embedding.infer(input_ids); if (images.empty()) { @@ -1605,8 +1619,8 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, } } -ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) { - return m_impl->get_inputs_embeds(prompt, images, metrics); +ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) { + return m_impl->get_inputs_embeds(prompt, images, metrics, apply_chat_template); } std::pair> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 223d090b22..b21d0198cc 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -32,7 +32,7 @@ class InputsEmbedder { const ov::AnyMap device_config); // compute input embedding for prompt and multiple images - ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics); + ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template); // compute position ids for language model input std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 95e3064548..415e7fcae9 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -166,7 +166,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { generation_config.validate(); auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics); + ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics, generation_config.apply_chat_template); auto end_get_inputs_embeds = std::chrono::steady_clock::now(); auto to_remove_from_hist = m_inputs_embedder->get_num_tokens_to_remove_from_hist(); diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index e2a6d7062c..a7d7789a55 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -115,6 +115,7 @@ void init_generation_config(py::module_& m) { .def_readwrite("include_stop_str_in_output", &GenerationConfig::include_stop_str_in_output) .def_readwrite("stop_token_ids", &GenerationConfig::stop_token_ids) .def_readwrite("adapters", &GenerationConfig::adapters) + .def_readwrite("apply_chat_template", &GenerationConfig::apply_chat_template) .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, 
py::arg("tokenizer_eos_token_id")) .def("is_beam_search", &GenerationConfig::is_beam_search) .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding) diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index cbed01e20f..8f2d2b05b9 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -251,6 +251,8 @@ def run_hugging_face( # process prompt by promp as we have multiple generation configs for prompt, generation_config in zip(prompts, generation_configs): hf_generation_config = convert_to_hf(opt_model.generation_config, generation_config) + if hf_tokenizer.chat_template: + prompt = hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) inputs = hf_tokenizer(prompt, return_tensors="pt") input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] prompt_len = 0 if generation_config.echo else input_ids.numel() @@ -265,8 +267,14 @@ def run_hugging_face( generation_result.m_scores = [score for score in generate_outputs.sequences_scores] generation_results.append(generation_result) else: + processed_prompts = [] + if hf_tokenizer.chat_template: + for prompt in prompts: + processed_prompts.append(hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)) + else: + processed_prompts = prompts # process all prompts as a single batch as we have a single generation config for all prompts - inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left') + inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=False, padding_side='left') input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs) hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
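
With this change, string prompts passed to `generate` get the tokenizer's chat_template applied by default. A minimal Python sketch of the resulting user-facing behavior, assuming an exported model in a placeholder directory (outputs depend on the model; this is not part of the patch):

```python
import openvino_genai

# Placeholder path to an exported model directory; adjust to your setup.
pipe = openvino_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0", "CPU")

# Default: the prompt is wrapped as a single user turn via the tokenizer's chat_template.
print(pipe.generate("Why is the Sun yellow?", max_new_tokens=30))

# Opt out per call through the new GenerationConfig field exposed in the Python bindings.
config = openvino_genai.GenerationConfig()
config.apply_chat_template = False
config.max_new_tokens = 30
print(pipe.generate("Why is the Sun yellow?", config))

# Or clear the template entirely, as noted in src/README.md.
pipe.get_tokenizer().set_chat_template("")
print(pipe.generate("Why is the Sun yellow?", max_new_tokens=30))
```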
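The Hugging Face reference pattern repeated across the workflow and test changes above can be summarized in one sketch (model and prompt are examples taken from the workflow; `add_special_tokens=False` is used because the chat template already inserts the special tokens):

```python
import transformers

model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

prompt = 'Why is the Sun yellow?'
if tokenizer.chat_template:
    # Mirror what openvino_genai now does by default: wrap the prompt as one user turn.
    prompt = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )

tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
model = transformers.AutoModelForCausalLM.from_pretrained(model_id)
for beam in model.generate(**tokenized, max_new_tokens=20, do_sample=False):
    # Decode only the newly generated tokens, as the CI checks do.
    print(tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True))
```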