-
Notifications
You must be signed in to change notification settings - Fork 200
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Added performance metrics and updated Readme with description how to use them - Added cpp and python sample for benchmarking Sample to calculate and visualize performance metrics. ``` import openvino_genai as ov_genai import tqdm import pandas as pd import matplotlib.pylab as pl pipe = ov_genai.LLMPipeline('TinyLlama-1.1B-Chat-v1.0/') config = ov_genai.GenerationConfig(max_new_tokens=15) metrics_df = pd.DataFrame(columns=['batch_size', 'throughput', 'ttft', 'tpot', 'std_throughput', 'std_ttft', 'std_tpot']) num_iter = 3 for batch_size in tqdm.tqdm([1, 2, 4, 16, 32, 64, 128]): prompts = ["The Sky is blue because"] * batch_size res = pipe.generate(prompts, config) metrics = res.perf_metrics for _ in range(num_iter - 1): res = pipe.generate(prompts, config) metrics += res.perf_metrics metrics_df = metrics_df._append({ 'throughput': metrics.get_throughput().mean, 'ttft': metrics.get_ttft().mean, 'tpot': metrics.get_tpot().mean, 'std_throughput': metrics.get_throughput().std, 'std_ttft': metrics.get_ttft().std, 'std_tpot': metrics.get_tpot().std, 'batch_size': batch_size, }, ignore_index=True) fig, axes = pl.subplots(nrows=3, ncols=1, figsize=(6, 8), sharex=True) axes[0].plot(metrics_df['batch_size'], metrics_df['throughput'], '-o') axes[1].plot(metrics_df['batch_size'], metrics_df['ttft'], '-o', ) axes[2].plot(metrics_df['batch_size'], metrics_df['tpot'], '-o') axes[0].set_ylabel('Throughput'), axes[1].set_ylabel('TTFT'), axes[2].set_ylabel('TPOT') axes[2].set_xlabel('Batch Size') axes[0].grid(True), axes[1].grid(True), axes[2].grid(True) pl.tight_layout() ``` ![image](https://github.com/user-attachments/assets/021a94b4-fc75-4b5f-90e6-60db471a3810) ticket: CVS-132859
- Loading branch information
Showing
16 changed files
with
744 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Copyright (C) 2023-2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
|
||
# Locate the OpenVINO GenAI package; the openvino::genai target linked below
# is expected to come from it.
find_package(OpenVINOGenAI REQUIRED PATHS
    "${CMAKE_BINARY_DIR}"  # Reuse the package from the build.
    ${OpenVINO_DIR}  # GenAI may be installed alongside OpenVINO.
)

# FetchContent_Declare() / FetchContent_MakeAvailable() are provided by the
# FetchContent module. Include it explicitly so this file does not depend on
# a parent scope having already loaded it.
include(FetchContent)

# cxxopts is fetched at configure time from a pinned release archive; the
# SHA256 hash guards against a changed or corrupted download.
FetchContent_Declare(cxxopts
    URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz
    URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08)
FetchContent_MakeAvailable(cxxopts)

# Benchmark sample executable built from a single translation unit.
add_executable(benchmark_genai benchmark_genai.cpp)
target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts)
set_target_properties(benchmark_genai PROPERTIES
    COMPILE_PDB_NAME benchmark_genai
    # Ensure out of box LC_RPATH on macOS with SIP
    INSTALL_RPATH_USE_LINK_PATH ON)

# Installed only when the samples_bin component is requested explicitly
# (EXCLUDE_FROM_ALL keeps it out of the default install).
install(TARGETS benchmark_genai
    RUNTIME DESTINATION samples_bin/
    COMPONENT samples_bin
    EXCLUDE_FROM_ALL)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# LLMs benchmarking sample | ||
|
||
This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. | ||
|
||
## Download and convert the model and tokenizers | ||
|
||
The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. | ||
|
||
It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. | ||
|
||
```sh | ||
pip install --upgrade-strategy eager -r ../../requirements.txt | ||
optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 | ||
``` | ||
|
||
## Usage | ||
|
||
```sh | ||
benchmark_genai [OPTIONS] | ||
``` | ||
|
||
### Options | ||
|
||
- `-m, --model`: Path to the model and tokenizers base directory. | ||
- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. | ||
- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. | ||
- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate. | ||
- `-n, --num_iter` (default: `3`): Number of iterations. | ||
- `-d, --device` (default: `"CPU"`): Device to run the model on. | ||
|
||
### Output: | ||
|
||
``` | ||
benchmark_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10 | ||
``` | ||
|
||
``` | ||
Load time: 3405.69 ms | ||
Generate time: 1430.77 ± 3.04 ms | ||
Tokenization time: 0.51 ± 0.02 ms | ||
Detokenization time: 0.37 ± 0.01 ms | ||
TTFT: 81.60 ± 0.54 ms | ||
TPOT: 71.52 ± 2.72 ms/token | ||
Throughput: 13.98 ± 0.53 tokens/s | ||
``` | ||
|
||
For more information on how performance metrics are calculated, see the [performance-metrics tutorial](../../../src/README.md#performance-metrics). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
// Copyright (C) 2023-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#include "openvino/genai/llm_pipeline.hpp" | ||
#include <cxxopts.hpp> | ||
|
||
int main(int argc, char* argv[]) try { | ||
cxxopts::Options options("benchmark_vanilla_genai", "Help command"); | ||
|
||
options.add_options() | ||
("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value(".")) | ||
("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because")) | ||
("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1))) | ||
("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(3))) | ||
("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value<size_t>()->default_value(std::to_string(20))) | ||
("d,device", "device", cxxopts::value<std::string>()->default_value("CPU")) | ||
("h,help", "Print usage"); | ||
|
||
cxxopts::ParseResult result; | ||
try { | ||
result = options.parse(argc, argv); | ||
} catch (const cxxopts::exceptions::exception& e) { | ||
std::cout << e.what() << "\n\n"; | ||
std::cout << options.help() << std::endl; | ||
return EXIT_FAILURE; | ||
} | ||
|
||
if (result.count("help")) { | ||
std::cout << options.help() << std::endl; | ||
return EXIT_SUCCESS; | ||
} | ||
|
||
std::string prompt = result["prompt"].as<std::string>(); | ||
const std::string model_path = result["model"].as<std::string>(); | ||
std::string device = result["device"].as<std::string>(); | ||
size_t num_warmup = result["num_warmup"].as<size_t>(); | ||
size_t num_iter = result["num_iter"].as<size_t>(); | ||
|
||
ov::genai::GenerationConfig config; | ||
config.max_new_tokens = result["max_new_tokens"].as<size_t>(); | ||
|
||
ov::genai::LLMPipeline pipe(model_path, device); | ||
|
||
for (size_t i = 0; i < num_warmup; i++) | ||
pipe.generate(prompt, config); | ||
|
||
ov::genai::DecodedResults res = pipe.generate(prompt, config); | ||
ov::genai::PerfMetrics metrics = res.perf_metrics; | ||
for (size_t i = 0; i < num_iter - 1; i++) { | ||
res = pipe.generate(prompt, config); | ||
metrics = metrics + res.perf_metrics; | ||
} | ||
|
||
std::cout << std::fixed << std::setprecision(2); | ||
std::cout << "Load time: " << metrics.get_load_time() << " ms" << std::endl; | ||
std::cout << "Generate time: " << metrics.get_generate_duration().mean << " ± " << metrics.get_generate_duration().std << " ms" << std::endl; | ||
std::cout << "Tokenization time: " << metrics.get_tokenization_duration().mean << " ± " << metrics.get_tokenization_duration().std << " ms" << std::endl; | ||
std::cout << "Detokenization time: " << metrics.get_detokenization_duration().mean << " ± " << metrics.get_detokenization_duration().std << " ms" << std::endl; | ||
std::cout << "TTFT: " << metrics.get_ttft().mean << " ± " << metrics.get_ttft().std << " ms" << std::endl; | ||
std::cout << "TPOT: " << metrics.get_tpot().mean << " ± " << metrics.get_tpot().std << " ms/token " << std::endl; | ||
std::cout << "Throughput: " << metrics.get_throughput().mean << " ± " << metrics.get_throughput().std << " tokens/s" << std::endl; | ||
|
||
return 0; | ||
} catch (const std::exception& error) { | ||
std::cerr << error.what() << '\n'; | ||
return EXIT_FAILURE; | ||
} catch (...) { | ||
std::cerr << "Non-exception object thrown\n"; | ||
return EXIT_FAILURE; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# LLMs benchmarking sample | ||
|
||
This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. | ||
|
||
## Download and convert the model and tokenizers | ||
|
||
The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. | ||
|
||
It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. | ||
|
||
```sh | ||
pip install --upgrade-strategy eager -r ../../requirements.txt | ||
optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 | ||
``` | ||
|
||
## Usage | ||
|
||
```sh | ||
python benchmark_vanilla_genai.py [OPTIONS] | ||
``` | ||
|
||
### Options | ||
|
||
- `-m, --model`: Path to the model and tokenizers base directory. | ||
- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. | ||
- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. | ||
- `-n, --num_iter` (default: `3`): Number of iterations. | ||
- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate. | ||
- `-d, --device` (default: `"CPU"`): Device to run the model on. | ||
|
||
### Output: | ||
|
||
``` | ||
python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10 | ||
``` | ||
|
||
``` | ||
Load time: 3405.69 ms | ||
Generate time: 1430.77 ± 3.04 ms | ||
Tokenization time: 0.51 ± 0.02 ms | ||
Detokenization time: 0.37 ± 0.01 ms | ||
TTFT: 81.60 ± 0.54 ms | ||
TPOT: 71.52 ± 2.72 ms | ||
Throughput : 13.98 ± 0.53 tokens/s | ||
``` | ||
|
||
For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# Copyright (C) 2023-2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import argparse | ||
import openvino_genai as ov_genai | ||
|
||
def main():
    """Benchmark an LLM pipeline and print the accumulated performance metrics.

    Parses command-line options, runs the requested number of warm-up
    generations (results discarded), then runs the measured generations and
    prints each metric to stdout, as mean ± std where both are available.
    """
    parser = argparse.ArgumentParser(description="Help command")
    parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
    parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt")
    parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
    # Default of 3 matches the documented default and the C++ sample.
    parser.add_argument("-n", "--num_iter", type=int, default=3, help="Number of iterations")
    parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
    parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

    args = parser.parse_args()

    # Perf metrics are stored in DecodedResults.
    # To get DecodedResults instead of a plain string, the input must be a list.
    prompt = [args.prompt]
    model_path = args.model
    device = args.device
    num_warmup = args.num_warmup
    num_iter = args.num_iter

    config = ov_genai.GenerationConfig()
    config.max_new_tokens = args.max_new_tokens

    pipe = ov_genai.LLMPipeline(model_path, device)

    # Warm-up runs: results are discarded and not included in the metrics.
    for _ in range(num_warmup):
        pipe.generate(prompt, config)

    # The first measured run seeds the accumulator; the remaining
    # num_iter - 1 runs are added to it.
    res = pipe.generate(prompt, config)
    perf_metrics = res.perf_metrics
    for _ in range(num_iter - 1):
        res = pipe.generate(prompt, config)
        perf_metrics += res.perf_metrics

    print(f"Load time: {perf_metrics.get_load_time():.2f} ms")
    print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms")
    print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms")
    print(f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms")
    print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms")
    print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms")
    print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.