1. Add threading to README.md 2. set Torch thread nums when device is cpu
wgzintel committed Sep 27, 2024
1 parent 9c2f800 commit 59eddc3
Showing 2 changed files with 15 additions and 13 deletions.
1 change: 1 addition & 0 deletions llm_bench/python/README.md
```diff
@@ -138,3 +138,4 @@ For example, --load_config config.json as following in OpenVINO 2024.0.0 will re
 > If you encounter any errors, please check **[NOTES.md](./doc/NOTES.md)** which provides solutions to the known errors.
 ### 2. Image generation
 > To configure more parameters for image generation models, reference to **[IMAGE_GEN.md](./doc/IMAGE_GEN.md)**
+### 3. Threading
```
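The README hunk above is truncated to the new section heading; its guidance pairs with the OMP_WAIT_POLICY check added to benchmark.py below. Since the OpenMP runtime reads OMP_WAIT_POLICY when it initializes, benchmark.py can only warn about it at runtime; the value has to be set before the process starts. A minimal launch sketch under that assumption (the -m/-d flags follow llm_bench's usual CLI, and the model path is a placeholder):

```python
import os
import subprocess

# Run benchmark.py with OMP_WAIT_POLICY=PASSIVE so idle OpenMP worker threads
# yield instead of busy-waiting, leaving CPU cores free for OpenVINO inference.
env = dict(os.environ, OMP_WAIT_POLICY='PASSIVE')
subprocess.run(
    ['python', 'benchmark.py', '-m', '<model_path>', '-d', 'cpu'],  # illustrative arguments
    env=env,
    check=True,
)
```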
27 changes: 14 additions & 13 deletions llm_bench/python/benchmark.py
```diff
@@ -337,8 +337,8 @@ def run_text_generation_benchmark(model_path, framework, device, args, num_iters
             prompt_idx_list.append(i)
     if len(input_text_list) == 0:
         raise RuntimeError('==Failure prompts is empty ==')
-    log.info(f"Benchmarking iter nums(exclude warm-up): {num_iters}, "
-             f'prompt nums: {len(text_list)}, prompt idx: {prompt_idx_list}')
+    log.info(f'Benchmarking iter nums(exclude warm-up): {num_iters}, prompt nums: {len(text_list)}, '
+             f"prompt idx: {prompt_idx_list}, num_beams: {args['num_beams']}")

     # if num_iters == 0, just output warm-up data
     text_gen_fn = run_text_generation if not use_genai else run_text_generation_genai
@@ -725,9 +725,6 @@ def get_argprser():
 def main():
     logging_kwargs = {"encoding": "utf-8"} if sys.version_info[1] > 8 else {}
     log.basicConfig(format='[ %(levelname)s ] %(message)s', level=os.environ.get("LOGLEVEL", log.INFO), stream=sys.stdout, **logging_kwargs)
-    env_omp = os.getenv('OMP_WAIT_POLICY')
-    if env_omp is None or env_omp != 'PASSIVE':
-        log.warning('It is recommended to set the environment variable OMP_WAIT_POLICY to PASSIVE, so that OpenVINO inference can use all CPU resources without waiting.')
     args = get_argprser()
     model_path, framework, model_args, model_name = llm_bench_utils.model_utils.analyze_args(args)

@@ -742,14 +739,18 @@ def main():
         out_str += ', openvino runtime version: {}'.format(get_version())
         if model_args['config'].get('PREC_BF16') and model_args['config']['PREC_BF16'] is True:
             log.warning('[Warning] Param bf16/prec_bf16 only work for framework pt. It will be disabled.')
-    original_torch_thread_nums = torch.get_num_threads()
-    num_beams = model_args['num_beams']
-    if num_beams > 1:
-        torch.set_num_threads(int(original_torch_thread_nums / 2))
-    else:
-        torch.set_num_threads(1)
-    log.info(f'The num_beams is {num_beams}, update Torch thread num from {original_torch_thread_nums} to {torch.get_num_threads()}, '
-             f'avoid to use the CPU cores for OpenVINO inference.')
+    if 'cpu' in args.device.lower():
+        env_omp = os.getenv('OMP_WAIT_POLICY')
+        if env_omp is None or env_omp != 'PASSIVE':
+            log.warning(f'It is recommended to set the environment variable OMP_WAIT_POLICY to PASSIVE, '
+                        f'so that OpenVINO inference can use all CPU resources without waiting.')
+        original_torch_thread_nums = torch.get_num_threads()
+        if model_args['num_beams'] > 1:
+            torch.set_num_threads(int(original_torch_thread_nums / 2))
+        else:
+            torch.set_num_threads(1)
+        log.info(f"The num_beams is {model_args['num_beams']}, update Torch thread num from "
+                 f'{original_torch_thread_nums} to {torch.get_num_threads()}, avoid to use the CPU cores for OpenVINO inference.')
     log.info(out_str)
     if args.memory_consumption:
         mem_consumption.start_collect_mem_consumption_thread()
```
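Pulled out of the diff, the CPU branch's thread policy is: with beam search (num_beams > 1) PyTorch keeps half of its default threads, otherwise it is pinned to a single thread so OpenVINO inference gets the remaining cores. A standalone sketch of that policy (the helper name is mine, not part of benchmark.py):

```python
import torch

def split_torch_threads(num_beams: int) -> None:
    # Standalone sketch of the commit's CPU-only thread policy.
    original = torch.get_num_threads()
    # Beam search keeps half the default Torch threads; greedy decoding gets
    # one, so Torch does not compete with OpenVINO inference for CPU cores.
    # (As in the diff, a single-thread default would need a max(1, ...) guard.)
    torch.set_num_threads(original // 2 if num_beams > 1 else 1)
```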
