|
21 | 21 | #ifdef OV_CORE_USE_XBYAK_JIT
|
22 | 22 | # include "openvino/core/parallel.hpp"
|
23 | 23 | # include "openvino/reference/utils/registers_pool.hpp"
|
| 24 | +# include "openvino/util/common_util.hpp" |
24 | 25 | #endif // OV_CORE_USE_XBYAK_JIT
|
25 | 26 |
|
26 | 27 | namespace ov {
|
@@ -822,77 +823,79 @@ void ComputeHash<isa>::fold_to_64(const Vmm& v_dst) {
|
822 | 823 |
|
823 | 824 | size_t compute_hash(const void* src, size_t size) {
|
824 | 825 | #ifdef OV_CORE_USE_XBYAK_JIT
|
825 |
| - if (Generator::mayiuse(avx2)) { |
826 |
| - uint64_t result = 0lu; |
827 |
| - |
828 |
| - // Parallel section |
829 |
| - constexpr uint64_t min_wa_per_thread = 131072lu; // 2^17 |
830 |
| - const uint64_t size_u64 = static_cast<uint64_t>(size); |
831 |
| - if (size_u64 >= min_wa_per_thread * 2lu) { |
832 |
| - static auto first_thr_kernel = Generator::mayiuse(avx512_core) |
833 |
| - ? jit::ComputeHash<avx512_core>::create({jit::FIRST_THREAD}) |
834 |
| - : jit::ComputeHash<avx2>::create({jit::FIRST_THREAD}); |
835 |
| - static auto n_thr_kernel = Generator::mayiuse(avx512_core) |
836 |
| - ? jit::ComputeHash<avx512_core>::create({jit::N_THREAD}) |
837 |
| - : jit::ComputeHash<avx2>::create({jit::N_THREAD}); |
838 |
| - static auto final_fold_kernel = Generator::mayiuse(avx512_core) |
839 |
| - ? jit::ComputeHash<avx512_core>::create({jit::FINAL_FOLD}) |
840 |
| - : jit::ComputeHash<avx2>::create({jit::FINAL_FOLD}); |
841 |
| - |
842 |
| - static const uint64_t max_thr_num = 2lu; |
843 |
| - uint64_t thr_num = std::min(size_u64 / min_wa_per_thread, max_thr_num); |
844 |
| - const uint64_t el_per_thread = |
845 |
| - first_thr_kernel->get_vlen() * ((size_u64 / thr_num) / first_thr_kernel->get_vlen()); |
846 |
| - std::vector<uint8_t> intermediate(thr_num * first_thr_kernel->get_vlen()); |
847 |
| - |
848 |
| - parallel_nt_static(static_cast<int>(thr_num), [&](const int ithr, const int nthr) { |
849 |
| - uint64_t start = el_per_thread * ithr; |
850 |
| - if (start >= size_u64) { |
851 |
| - return; |
852 |
| - } |
853 |
| - uint64_t work_amount = (el_per_thread + start > size_u64) ? size_u64 - start : el_per_thread; |
| 826 | + if (util::may_i_use_dynamic_code()) { |
| 827 | + if (Generator::mayiuse(avx2)) { |
| 828 | + uint64_t result = 0lu; |
| 829 | + |
| 830 | + // Parallel section |
| 831 | + constexpr uint64_t min_wa_per_thread = 131072lu; // 2^17 |
| 832 | + const uint64_t size_u64 = static_cast<uint64_t>(size); |
| 833 | + if (size_u64 >= min_wa_per_thread * 2lu) { |
| 834 | + static auto first_thr_kernel = Generator::mayiuse(avx512_core) |
| 835 | + ? jit::ComputeHash<avx512_core>::create({jit::FIRST_THREAD}) |
| 836 | + : jit::ComputeHash<avx2>::create({jit::FIRST_THREAD}); |
| 837 | + static auto n_thr_kernel = Generator::mayiuse(avx512_core) |
| 838 | + ? jit::ComputeHash<avx512_core>::create({jit::N_THREAD}) |
| 839 | + : jit::ComputeHash<avx2>::create({jit::N_THREAD}); |
| 840 | + static auto final_fold_kernel = Generator::mayiuse(avx512_core) |
| 841 | + ? jit::ComputeHash<avx512_core>::create({jit::FINAL_FOLD}) |
| 842 | + : jit::ComputeHash<avx2>::create({jit::FINAL_FOLD}); |
| 843 | + |
| 844 | + static const uint64_t max_thr_num = 2lu; |
| 845 | + uint64_t thr_num = std::min(size_u64 / min_wa_per_thread, max_thr_num); |
| 846 | + const uint64_t el_per_thread = |
| 847 | + first_thr_kernel->get_vlen() * ((size_u64 / thr_num) / first_thr_kernel->get_vlen()); |
| 848 | + std::vector<uint8_t> intermediate(thr_num * first_thr_kernel->get_vlen()); |
| 849 | + |
| 850 | + parallel_nt_static(static_cast<int>(thr_num), [&](const int ithr, const int nthr) { |
| 851 | + uint64_t start = el_per_thread * ithr; |
| 852 | + if (start >= size_u64) { |
| 853 | + return; |
| 854 | + } |
| 855 | + uint64_t work_amount = (el_per_thread + start > size_u64) ? size_u64 - start : el_per_thread; |
| 856 | + |
| 857 | + jit::ComputeHashCallArgs args; |
| 858 | + |
| 859 | + args.src_ptr = reinterpret_cast<const uint8_t*>(src) + first_thr_kernel->get_vlen() * ithr; |
| 860 | + args.dst_ptr = &(intermediate[first_thr_kernel->get_vlen() * ithr]); |
| 861 | + args.k_ptr = jit::K_PULL; |
| 862 | + args.work_amount = work_amount; |
| 863 | + args.size = size_u64; |
| 864 | + args.threads_num = thr_num; |
| 865 | + |
| 866 | + if (ithr == 0) { |
| 867 | + (*first_thr_kernel)(&args); |
| 868 | + } else { |
| 869 | + (*n_thr_kernel)(&args); |
| 870 | + } |
| 871 | + }); |
854 | 872 |
|
855 | 873 | jit::ComputeHashCallArgs args;
|
| 874 | + args.work_amount = size_u64 - el_per_thread * thr_num; |
| 875 | + args.src_ptr = reinterpret_cast<const uint8_t*>(src) + size_u64 - args.work_amount; |
| 876 | + args.dst_ptr = &result; |
| 877 | + args.k_ptr = jit::K_PULL; |
| 878 | + args.size = size_u64; |
| 879 | + args.intermediate_ptr = intermediate.data(); |
856 | 880 |
|
857 |
| - args.src_ptr = reinterpret_cast<const uint8_t*>(src) + first_thr_kernel->get_vlen() * ithr; |
858 |
| - args.dst_ptr = &(intermediate[first_thr_kernel->get_vlen() * ithr]); |
| 881 | + (*final_fold_kernel)(&args); |
| 882 | + } else { |
| 883 | + static auto single_thr_kernel = Generator::mayiuse(avx512_core) |
| 884 | + ? jit::ComputeHash<avx512_core>::create({jit::SINGLE_THREAD}) |
| 885 | + : jit::ComputeHash<avx2>::create({jit::SINGLE_THREAD}); |
| 886 | + |
| 887 | + jit::ComputeHashCallArgs args; |
| 888 | + args.src_ptr = src; |
| 889 | + args.dst_ptr = &result; |
859 | 890 | args.k_ptr = jit::K_PULL;
|
860 |
| - args.work_amount = work_amount; |
| 891 | + args.work_amount = size_u64; |
861 | 892 | args.size = size_u64;
|
862 |
| - args.threads_num = thr_num; |
863 |
| - |
864 |
| - if (ithr == 0) { |
865 |
| - (*first_thr_kernel)(&args); |
866 |
| - } else { |
867 |
| - (*n_thr_kernel)(&args); |
868 |
| - } |
869 |
| - }); |
870 |
| - |
871 |
| - jit::ComputeHashCallArgs args; |
872 |
| - args.work_amount = size_u64 - el_per_thread * thr_num; |
873 |
| - args.src_ptr = reinterpret_cast<const uint8_t*>(src) + size_u64 - args.work_amount; |
874 |
| - args.dst_ptr = &result; |
875 |
| - args.k_ptr = jit::K_PULL; |
876 |
| - args.size = size_u64; |
877 |
| - args.intermediate_ptr = intermediate.data(); |
878 |
| - |
879 |
| - (*final_fold_kernel)(&args); |
880 |
| - } else { |
881 |
| - static auto single_thr_kernel = Generator::mayiuse(avx512_core) |
882 |
| - ? jit::ComputeHash<avx512_core>::create({jit::SINGLE_THREAD}) |
883 |
| - : jit::ComputeHash<avx2>::create({jit::SINGLE_THREAD}); |
884 |
| - |
885 |
| - jit::ComputeHashCallArgs args; |
886 |
| - args.src_ptr = src; |
887 |
| - args.dst_ptr = &result; |
888 |
| - args.k_ptr = jit::K_PULL; |
889 |
| - args.work_amount = size_u64; |
890 |
| - args.size = size_u64; |
891 |
| - |
892 |
| - (*single_thr_kernel)(&args); |
893 |
| - } |
894 | 893 |
|
895 |
| - return result; |
| 894 | + (*single_thr_kernel)(&args); |
| 895 | + } |
| 896 | + |
| 897 | + return result; |
| 898 | + } |
896 | 899 | }
|
897 | 900 |
|
898 | 901 | #endif // OV_CORE_USE_XBYAK_JIT
|
|
0 commit comments