ci fix
sbalandi committed Jan 17, 2025
1 parent 4c77e51 commit 926a9cf
Showing 11 changed files with 56 additions and 47 deletions.
36 changes: 18 additions & 18 deletions .github/workflows/causal_lm_cpp.yml
@@ -121,8 +121,8 @@ jobs:
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
prompt = 'Why is the Sun yellow?'
if tokenizer.chat_template:
- prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
- tokenized = tokenizer(prompt, return_tensors='pt')
+ prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+ tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
@@ -140,8 +140,8 @@ jobs:
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
prompt = '69'
if tokenizer.chat_template:
- prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
- tokenized = tokenizer(prompt, return_tensors='pt')
+ prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+ tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
@@ -160,7 +160,7 @@ jobs:
prompt = 'Hi'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
- tokenized = tokenizer(prompt, return_tensors='pt')
+ tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
@@ -178,8 +178,8 @@ jobs:
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
prompt = 'return 0'
if tokenizer.chat_template:
- prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
- tokenized = tokenizer(prompt, return_tensors='pt')
+ prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+ tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
@@ -197,8 +197,8 @@ jobs:
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
prompt = '你好! 你好嗎?'
if tokenizer.chat_template:
- prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
- tokenized = tokenizer(prompt, return_tensors='pt')
+ prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+ tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref.replace('�', ''))
@@ -221,8 +221,8 @@ jobs:
]
for prompt in prompts:
if tokenizer.chat_template:
- prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
- tokenized = tokenizer(prompt, return_tensors='pt')
+ prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+ tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref.replace('�', ''))
@@ -272,9 +272,9 @@ jobs:
echo predictions = open('cpp.txt', 'r').read() >> ref.py
echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py
echo prompt = '69'
- echo if tokenizer.chat_template:
- echo prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
- echo tokenized = tokenizer(prompt, return_tensors='pt') >> ref.py
+ echo if tokenizer.chat_template:
+ echo prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+ echo tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) >> ref.py
echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py
echo ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
echo idx = predictions.find(ref) >> ref.py
@@ -581,8 +581,8 @@ jobs:
tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
prompt = 'Alan Turing was a'
if tokenizer.chat_template:
- prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
- tokenized = tokenizer(prompt, return_tensors='pt')
+ prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+ tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
@@ -639,8 +639,8 @@ jobs:
tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
prompt = 'Alan Turing was a'
if tokenizer.chat_template:
- prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
- tokenized = tokenizer(prompt, return_tensors='pt')
+ prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+ tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
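Note: the add_special_tokens=False added to the HF reference scripts above mirrors how the GenAI side encodes a prompt once the chat template has been applied. Below is a minimal C++ sketch of that encoding path, built only from calls visible in this diff; the model path, prompt, and printout are illustrative placeholders, not part of the commit.

#include "openvino/genai/tokenizer.hpp"

#include <iostream>
#include <string>

int main() {
    // Placeholder path to a converted model directory containing the tokenizer.
    ov::genai::Tokenizer tokenizer("TinyLlama-1.1B-Chat-v1.0");

    std::string prompt = "Why is the Sun yellow?";
    if (!tokenizer.get_chat_template().empty()) {
        ov::genai::ChatHistory history({{{"role", "user"}, {"content", prompt}}});
        constexpr bool add_generation_prompt = true;
        prompt = tokenizer.apply_chat_template(history, add_generation_prompt);
    }
    // The template already contains the special tokens, so encoding skips them;
    // this is what add_special_tokens=False in the HF reference now matches.
    ov::genai::TokenizedInputs inputs = tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
    std::cout << "prompt length: " << inputs.input_ids.get_shape().back() << " tokens\n";
    return 0;
}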
2 changes: 2 additions & 0 deletions src/cpp/include/openvino/genai/generation_config.hpp
@@ -128,6 +128,8 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {

std::optional<AdapterConfig> adapters;

+ bool apply_chat_template = true;

/** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0.
* Otherwise verifies eos_token_id == tokenizer_eos_token_id.
*/
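The new apply_chat_template field defaults to true, so existing behaviour is unchanged; setting it to false skips templating even when the tokenizer ships a chat template. A hedged usage sketch follows (not part of this commit; the model path and prompt are placeholders):

#include "openvino/genai/llm_pipeline.hpp"

#include <iostream>
#include <string>

int main() {
    // Placeholder model path and device.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");

    ov::genai::GenerationConfig config = pipe.get_generation_config();
    config.apply_chat_template = false;  // feed the prompt to the tokenizer as-is
    config.max_new_tokens = 20;

    std::string result = pipe.generate("Why is the Sun yellow?", config);
    std::cout << result << '\n';
    return 0;
}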
1 change: 1 addition & 0 deletions src/cpp/src/generation_config.cpp
@@ -125,6 +125,7 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& properties) {
read_anymap_param(properties, "logprobs", logprobs);
read_anymap_param(properties, "num_return_sequences", num_return_sequences);
read_anymap_param(properties, "adapters", adapters);
read_anymap_param(properties, "apply_chat_template", apply_chat_template);

// penalties
read_anymap_param(properties, "frequency_penalty", frequency_penalty);
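Because the value is read with read_anymap_param, the flag can also be supplied through an ov::AnyMap under the "apply_chat_template" key. A small sketch of that path, assuming update_generation_config is callable from user code:

#include "openvino/genai/generation_config.hpp"

#include <iostream>

int main() {
    ov::genai::GenerationConfig config;                        // apply_chat_template defaults to true
    ov::AnyMap properties = {{"apply_chat_template", false}};
    config.update_generation_config(properties);               // the line added above picks up the key
    std::cout << std::boolalpha << config.apply_chat_template << '\n';  // expected: false
    return 0;
}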
5 changes: 3 additions & 2 deletions src/cpp/src/icontinuous_batching.cpp
@@ -53,10 +53,11 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
} else {
input_ids.reserve(prompts.size());
timer.start();
for (const std::string& prompt : prompts) {
for (size_t i = 0; i < prompts.size(); i++) {
const std::string& prompt = prompts.at(i);
const auto encode_start = std::chrono::steady_clock::now();
ov::Tensor encoded_inputs;
- if (!m_tokenizer.get_chat_template().empty()) {
+ if (sampling_params.at(i).apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
ChatHistory history({{{"role", "user"}, {"content", prompt}}});
constexpr bool add_generation_prompt = true;
auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
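In the continuous-batching path each prompt in the batch carries its own GenerationConfig, so the chat template is applied per request. A self-contained restatement of the gating this hunk introduces; the plain-encoding else branch is an assumption, since the hunk is cut off before it:

#include "openvino/genai/generation_config.hpp"
#include "openvino/genai/tokenizer.hpp"

#include <string>
#include <vector>

std::vector<ov::Tensor> encode_prompts(ov::genai::Tokenizer& tokenizer,
                                       const std::vector<std::string>& prompts,
                                       const std::vector<ov::genai::GenerationConfig>& sampling_params) {
    std::vector<ov::Tensor> input_ids;
    input_ids.reserve(prompts.size());
    for (size_t i = 0; i < prompts.size(); i++) {
        const std::string& prompt = prompts.at(i);
        ov::Tensor encoded_inputs;
        if (sampling_params.at(i).apply_chat_template && !tokenizer.get_chat_template().empty()) {
            // Per-request opt-in: template only when this request asks for it
            // and the tokenizer actually provides a chat template.
            ov::genai::ChatHistory history({{{"role", "user"}, {"content", prompt}}});
            constexpr bool add_generation_prompt = true;
            auto templated_prompt = tokenizer.apply_chat_template(history, add_generation_prompt);
            encoded_inputs = tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids;
        } else {
            // Assumed fallback: raw prompt with default special-token handling.
            encoded_inputs = tokenizer.encode(prompt).input_ids;
        }
        input_ids.push_back(encoded_inputs);
    }
    return input_ids;
}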
4 changes: 2 additions & 2 deletions src/cpp/src/llm_pipeline_stateful.cpp
@@ -90,7 +90,7 @@ DecodedResults StatefulLLMPipeline::generate(
OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
std::vector<std::string> templated_input_vector;
for (auto& input : *input_vector) {
- if (!m_tokenizer.get_chat_template().empty()) {
+ if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
ChatHistory history({{{"role", "user"}, {"content", input}}});
constexpr bool add_generation_prompt = true;
auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
@@ -170,7 +170,7 @@ DecodedResults StatefulLLMPipeline::generate(
// TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
} else {
std::string& prompt = *input_prompt;
- if (!m_tokenizer.get_chat_template().empty()) {
+ if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
ChatHistory history({{{"role", "user"}, {"content", prompt}}});
constexpr bool add_generation_prompt = true;
auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
4 changes: 2 additions & 2 deletions src/cpp/src/llm_pipeline_static.cpp
@@ -805,7 +805,7 @@ DecodedResults StatefulLLMPipeline::generate(
// for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF
tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
} else {
- if (!m_tokenizer.get_chat_template().empty()) {
+ if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
ChatHistory history({{{"role", "user"}, {"content", prompt}}});
constexpr bool add_generation_prompt = true;
auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
@@ -1281,7 +1281,7 @@ DecodedResults StatelessLLMPipeline::generate(
// for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF
tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
} else {
- if (!m_tokenizer.get_chat_template().empty()) {
+ if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
ChatHistory history({{{"role", "user"}, {"content", prompt}}});
constexpr bool add_generation_prompt = true;
auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);