Skip to content

Commit

Permalink
update test and docs
Browse files Browse the repository at this point in the history
  • Loading branch information
sbalandi committed Jan 13, 2025
1 parent 5a0e10d commit e5fa889
Show file tree
Hide file tree
Showing 13 changed files with 84 additions and 27 deletions.
42 changes: 34 additions & 8 deletions .github/workflows/causal_lm_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,10 @@ jobs:
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt')
prompt = 'Why is the Sun yellow?'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt')
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand All @@ -135,7 +138,10 @@ jobs:
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
tokenized = tokenizer('69', return_tensors='pt')
prompt = '69'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt')
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand All @@ -151,7 +157,10 @@ jobs:
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
tokenized = tokenizer('Hi', return_tensors='pt')
prompt = 'Hi'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt')
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand All @@ -167,7 +176,10 @@ jobs:
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
tokenized = tokenizer('return 0', return_tensors='pt')
prompt = 'return 0'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt')
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand All @@ -183,7 +195,10 @@ jobs:
with open('pred.txt', 'r', errors='ignore') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
tokenized = tokenizer('你好! 你好嗎?', return_tensors='pt')
prompt = '你好! 你好嗎?'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt')
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref.replace('�', ''))
Expand All @@ -205,6 +220,8 @@ jobs:
'你好! 你好嗎?'
]
for prompt in prompts:
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt')
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
Expand Down Expand Up @@ -254,7 +271,10 @@ jobs:
echo import transformers > ref.py
echo predictions = open('cpp.txt', 'r').read() >> ref.py
echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py
echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py
echo prompt = '69'
echo if tokenizer.chat_template:
echo prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
echo tokenized = tokenizer(prompt, return_tensors='pt') >> ref.py
echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py
echo ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
echo idx = predictions.find(ref) >> ref.py
Expand Down Expand Up @@ -559,7 +579,10 @@ jobs:
with open('pred_greedy.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
prompt = 'Alan Turing was a'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt')
for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand Down Expand Up @@ -614,7 +637,10 @@ jobs:
with open('pred_greedy.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
prompt = 'Alan Turing was a'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt')
for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
Expand Down
2 changes: 1 addition & 1 deletion samples/cpp/chat_sample/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ If you encounter the error described in the example when sample is printing outp

#### Missing chat template

If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model.
If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model or update it using call `pipe.get_tokenizer().set_chat_template(new_chat_template)`.
The following template can be used as a default, but it may not work properly with every model:
```
"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}",
Expand Down
2 changes: 1 addition & 1 deletion samples/python/chat_sample/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ If you encounter the error described in the example when sample is printing outp

#### Missing chat template

If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model.
If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model or update it using call `pipe.get_tokenizer().set_chat_template(new_chat_template)`..
The following template can be used as a default, but it may not work properly with every model:
```
"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}",
Expand Down
2 changes: 1 addition & 1 deletion src/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ output:
'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. The arrangement of carbon atoms in'
```
>**Note**: The chat_template from tokenizer_config.json will be automatically applied to the prompt at the generation stage. If you want to disable it, you can do it by calling pipe.get_tokenizer().set_chat_template("").
>**Note**: The chat_template from tokenizer_config.json or from tokenizer/detokenizer model will be automatically applied to the prompt at the generation stage. If you want to disable it, you can do it by calling pipe.get_tokenizer().set_chat_template("").
A simple chat in Python:
```python
Expand Down
4 changes: 4 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param generation_config optional GenerationConfig
* @param streamer optional streamer
* @return DecodedResults decoded resulting text
* chat_template will be applied to the prompt, run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it.
* Use custom_chat_template = "" to disable it for non-chat mode.
*/
DecodedResults generate(
StringInputs inputs,
Expand All @@ -199,6 +201,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param inputs input prompt or a vector of prompts
* @param properties properties
* @return DecodedResults decoded resulting text
* chat_template will be applied to the prompt, run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it.
* Use custom_chat_template = "" to disable it for non-chat mode.
*/
template <typename... Properties>
util::EnableIfAllStringAny<DecodedResults, Properties...> generate(
Expand Down
3 changes: 3 additions & 0 deletions src/cpp/include/openvino/genai/tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
/// @param chat_template The new template to override with.
void set_chat_template(const std::string& chat_template);

// get information about a chat template to check its status, for example whether it is empty
std::string get_chat_template() const;

// information about <bos>, <eos> tokens should be public,
// they are used at least in StreamerBase descendants
int64_t get_bos_token_id() const;
Expand Down
8 changes: 8 additions & 0 deletions src/cpp/include/openvino/genai/visual_language/pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/// @param generation_config A config to follow for text generation.
/// @param streamer A streamer to acquire intermediate result.
/// @return A string generated by a model.
/// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
/// Use custom_chat_template="" to disable it for non-chat mode.
VLMDecodedResults generate(
const std::string& prompt,
const std::vector<ov::Tensor>& rgbs,
Expand All @@ -111,6 +113,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/// @param generation_config A config to follow for text generation.
/// @param streamer A streamer to acquire intermediate result.
/// @return A string generated by a model.
/// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
/// Use custom_chat_template="" to disable it for non-chat mode.
VLMDecodedResults generate(
const std::string& prompt,
const ov::Tensor& rgb,
Expand All @@ -124,6 +128,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/// for its members, StreamerVariant a single image or multiple
/// images.
/// @return A string generated by a model.
/// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
/// Use custom_chat_template="" to disable it for non-chat mode.
VLMDecodedResults generate(
const std::string& prompt,
const ov::AnyMap& config_map
Expand All @@ -137,6 +143,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/// @param ...properties ov::Property instances to be combined into
/// ov::AnyMap.
/// @return A string generated by a model.
/// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
/// Use custom_chat_template="" to disable it for non-chat mode.
template <typename... Properties>
util::EnableIfAllStringAny<VLMDecodedResults, Properties...> generate(
const std::string& prompt,
Expand Down
4 changes: 2 additions & 2 deletions src/cpp/src/icontinuous_batching.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,12 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
timer.start();
for (const std::string& prompt : prompts) {
ov::Tensor encoded_inputs;
try {
if (!m_tokenizer.get_chat_template().empty()) {
ChatHistory history({{{"role", "user"}, {"content", prompt}}});
constexpr bool add_generation_prompt = true;
auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
encoded_inputs = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids;
} catch (const std::exception& error) {
} else {
// in case when chat_template was not found in tokenizer_config.json or set
encoded_inputs = m_tokenizer.encode(prompt).input_ids;
}
Expand Down
10 changes: 5 additions & 5 deletions src/cpp/src/llm_pipeline_stateful.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,12 @@ DecodedResults StatefulLLMPipeline::generate(
OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
std::vector<std::string> templated_input_vector;
for (auto& input : *input_vector) {
try {
if (!m_tokenizer.get_chat_template().empty()) {
ChatHistory history({{{"role", "user"}, {"content", input}}});
constexpr bool add_generation_prompt = true;
auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
templated_input_vector.push_back(templated_prompt);
} catch (const std::exception& error) {
} else {
// in case when chat_template was not found in tokenizer_config.json or set
templated_input_vector.push_back(input);
}
Expand All @@ -116,7 +116,7 @@ DecodedResults StatefulLLMPipeline::generate(

m_history.push_back({{"role", "user"}, {"content", prompt}});
constexpr bool add_generation_prompt = true;
auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
// Do not add special tokens in chat scenario to be aligned with HF.
auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false));
auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
Expand Down Expand Up @@ -170,12 +170,12 @@ DecodedResults StatefulLLMPipeline::generate(
// TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
} else {
std::string& prompt = *input_prompt;
try {
if (!m_tokenizer.get_chat_template().empty()) {
ChatHistory history({{{"role", "user"}, {"content", prompt}}});
constexpr bool add_generation_prompt = true;
auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
encoded_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false));
} catch (const std::exception& error) {
} else {
// in case when chat_template was not found in tokenizer_config.json or set
encoded_input = m_tokenizer.encode(prompt);
}
Expand Down
Loading

0 comments on commit e5fa889

Please sign in to comment.