Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically apply chat template in non-chat scenarios #1533

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 35 additions & 9 deletions .github/workflows/causal_lm_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,10 @@ jobs:
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt')
prompt = 'Why is the Sun yellow?'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand All @@ -136,7 +139,10 @@ jobs:
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
tokenized = tokenizer('69', return_tensors='pt')
prompt = '69'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand All @@ -152,7 +158,10 @@ jobs:
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
tokenized = tokenizer('Hi', return_tensors='pt')
prompt = 'Hi'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand All @@ -168,7 +177,10 @@ jobs:
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
tokenized = tokenizer('return 0', return_tensors='pt')
prompt = 'return 0'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand All @@ -184,7 +196,10 @@ jobs:
with open('pred.txt', 'r', errors='ignore') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
tokenized = tokenizer('你好! 你好嗎?', return_tensors='pt')
prompt = '你好! 你好嗎?'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref.replace('�', ''))
Expand All @@ -206,7 +221,9 @@ jobs:
'你好! 你好嗎?'
]
for prompt in prompts:
tokenized = tokenizer(prompt, return_tensors='pt')
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref.replace('�', ''))
Expand Down Expand Up @@ -255,7 +272,10 @@ jobs:
echo import transformers > ref.py
echo predictions = open('cpp.txt', 'r').read() >> ref.py
echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py
echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py
echo prompt = '69'
echo if tokenizer.chat_template:
echo prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
echo tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) >> ref.py
echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py
echo ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
echo idx = predictions.find(ref) >> ref.py
Expand Down Expand Up @@ -562,7 +582,10 @@ jobs:
with open('pred_greedy.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
prompt = 'Alan Turing was a'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand Down Expand Up @@ -617,7 +640,10 @@ jobs:
with open('pred_greedy.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
prompt = 'Alan Turing was a'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ from PIL import Image

# Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
pipe = openvino_genai.VLMPipeline("./InternVL2-1B", "CPU")
pipe.start_chat()

image = Image.open("dog.jpg")
image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
Expand Down
2 changes: 1 addition & 1 deletion samples/cpp/text_generation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat
./chat_sample <MODEL_DIR>
```
#### Missing chat template
If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model.
If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model or update it using call `pipe.get_tokenizer().set_chat_template(new_chat_template)`.
The following template can be used as a default, but it may not work properly with every model:
```
"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}",
Expand Down
2 changes: 1 addition & 1 deletion samples/python/text_generation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat
python chat_sample.py model_dir
```
#### Missing chat template
If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model.
If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model or update it using call `pipe.get_tokenizer().set_chat_template(new_chat_template)`.
The following template can be used as a default, but it may not work properly with every model:
```
"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}",
Expand Down
2 changes: 2 additions & 0 deletions src/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ output:
'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. The arrangement of carbon atoms in'
```
>**Note**: The chat_template from tokenizer_config.json or from tokenizer/detokenizer model will be automatically applied to the prompt at the generation stage. If you want to disable it, you can do it by calling pipe.get_tokenizer().set_chat_template("").
A simple chat in Python:
```python
import openvino_genai as ov_genai
Expand Down
2 changes: 2 additions & 0 deletions src/cpp/include/openvino/genai/generation_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {

std::optional<AdapterConfig> adapters;

bool apply_chat_template = true;

/** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0.
* Otherwise verifies eos_token_id == tokenizer_eos_token_id.
*/
Expand Down
4 changes: 4 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param generation_config optional GenerationConfig
* @param streamer optional streamer
* @return DecodedResults decoded resulting text
* chat_template will be applied to the prompt, run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it.
* To disable it for non-chat mode, please, use custom_chat_template eq "" or set generation_config.apply_chat_template to false.
*/
DecodedResults generate(
StringInputs inputs,
Expand All @@ -191,6 +193,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param inputs input prompt or a vector of prompts
* @param properties properties
* @return DecodedResults decoded resulting text
* chat_template will be applied to the prompt, run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it.
* To disable it for non-chat mode, please, use custom_chat_template eq "" or set generation_config.apply_chat_template to false.
*/
template <typename... Properties>
util::EnableIfAllStringAny<DecodedResults, Properties...> generate(
Expand Down
3 changes: 3 additions & 0 deletions src/cpp/include/openvino/genai/tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
/// @param chat_template The new template to override with.
void set_chat_template(const std::string& chat_template);

// get information about a chat template to check its status, for example whether it is empty
std::string get_chat_template() const;

// information about <bos>, <eos> tokens should be public,
// they are used at least in StreamerBase descendants
int64_t get_bos_token_id() const;
Expand Down
8 changes: 8 additions & 0 deletions src/cpp/include/openvino/genai/visual_language/pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/// @param generation_config A config to follow for text generation.
/// @param streamer A streamer to acquire intermediate result.
/// @return A string generated by a model.
/// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
/// To disable it for non-chat mode, please, use custom_chat_template eq "" or set generation_config.apply_chat_template to false.
VLMDecodedResults generate(
const std::string& prompt,
const std::vector<ov::Tensor>& rgbs,
Expand All @@ -111,6 +113,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/// @param generation_config A config to follow for text generation.
/// @param streamer A streamer to acquire intermediate result.
/// @return A string generated by a model.
/// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
/// To disable it for non-chat mode, please, use custom_chat_template eq "" or set generation_config.apply_chat_template to false.
VLMDecodedResults generate(
const std::string& prompt,
const ov::Tensor& rgb,
Expand All @@ -124,6 +128,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/// for its members, StreamerVariant a single image or multiple
/// images.
/// @return A string generated by a model.
/// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
/// To disable it for non-chat mode, please, use custom_chat_template eq "" or set generation_config.apply_chat_template to false.
VLMDecodedResults generate(
const std::string& prompt,
const ov::AnyMap& config_map
Expand All @@ -137,6 +143,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/// @param ...properties ov::Property instances to be combined into
/// ov::AnyMap.
/// @return A string generated by a model.
/// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
/// To disable it for non-chat mode, please, use custom_chat_template eq "" or set generation_config.apply_chat_template to false.
template <typename... Properties>
util::EnableIfAllStringAny<VLMDecodedResults, Properties...> generate(
const std::string& prompt,
Expand Down
1 change: 1 addition & 0 deletions src/cpp/src/generation_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& properties) {
read_anymap_param(properties, "logprobs", logprobs);
read_anymap_param(properties, "num_return_sequences", num_return_sequences);
read_anymap_param(properties, "adapters", adapters);
read_anymap_param(properties, "apply_chat_template", apply_chat_template);

// penalties
read_anymap_param(properties, "frequency_penalty", frequency_penalty);
Expand Down
15 changes: 13 additions & 2 deletions src/cpp/src/icontinuous_batching.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,20 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
} else {
input_ids.reserve(prompts.size());
timer.start();
for (const std::string& prompt : prompts) {
for (size_t i = 0; i < prompts.size(); i++) {
const std::string& prompt = prompts.at(i);
const auto encode_start = std::chrono::steady_clock::now();
input_ids.push_back(m_tokenizer.encode(prompt).input_ids);
ov::Tensor encoded_inputs;
if (sampling_params.at(i).apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
ChatHistory history({{{"role", "user"}, {"content", prompt}}});
constexpr bool add_generation_prompt = true;
auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like you need to fix our test framework to do the same as chat template is applied only for chat cases

The same for tests in .github folder.

encoded_inputs = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids;
} else {
// in case when chat_template was not found in tokenizer_config.json or set
encoded_inputs = m_tokenizer.encode(prompt).input_ids;
}
input_ids.push_back(encoded_inputs);
tokenization_durations.emplace_back(PerfMetrics::get_microsec(std::chrono::steady_clock::now() - encode_start));
}
timer.end();
Expand Down
Loading
Loading