From 84c10bffef402a2f9912673b1c4f8984498b3629 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Semenov=20Herman=20=28=D0=A1=D0=B5=D0=BC=D0=B5=D0=BD=D0=BE?= =?UTF-8?q?=D0=B2=20=D0=93=D0=B5=D1=80=D0=BC=D0=B0=D0=BD=29?= Date: Wed, 1 Jan 2025 17:24:38 +0300 Subject: [PATCH] performance: use reserve() to reduce the time cost of inserts --- .../source/decoder/zebin_manipulator.cpp | 1 + .../source/ocloc_supported_devices_helper.cpp | 8 ++++++ .../aub_mem_dump_pvc_and_later.inl | 1 + .../aub_mem_dump_xehp_and_later.inl | 1 + .../source/command_container/cmdcontainer.cpp | 1 + .../command_stream/submissions_aggregator.cpp | 1 + .../compiler_interface/oclc_extensions.cpp | 8 ++++++ shared/source/device/device.cpp | 2 ++ .../device_binary_format/elf/elf_decoder.cpp | 6 +++-- .../device_binary_format/elf/elf_rewriter.h | 2 ++ .../gen12lp/gfx_core_helper_gen12lp.cpp | 1 + .../source/helpers/bindless_heaps_helper.cpp | 1 + .../gfx_core_helper_xehp_and_later.inl | 1 + shared/source/helpers/l3_range.h | 25 ++++++++++++++++--- .../gfx_core_helper_xe_hpc_core.cpp | 1 + .../gfx_core_helper_xe_hpg_core.cpp | 1 + 16 files changed, 56 insertions(+), 5 deletions(-) diff --git a/shared/offline_compiler/source/decoder/zebin_manipulator.cpp b/shared/offline_compiler/source/decoder/zebin_manipulator.cpp index ebc2365b91fc9..098b51f8c6910 100644 --- a/shared/offline_compiler/source/decoder/zebin_manipulator.cpp +++ b/shared/offline_compiler/source/decoder/zebin_manipulator.cpp @@ -280,6 +280,7 @@ void ZebinDecoder::dumpSymtab(ElfT &elf, ArrayRef symtab template std::vector ZebinDecoder::dumpElfSections(ElfT &elf) { std::vector sectionInfos; + sectionInfos.reserve(elf.sectionHeaders.size() - 1U); for (size_t secId = 1U; secId < elf.sectionHeaders.size(); secId++) { auto &[header, data] = elf.sectionHeaders[secId]; auto sectionName = elf.getSectionName(static_cast(secId)); diff --git a/shared/offline_compiler/source/ocloc_supported_devices_helper.cpp 
b/shared/offline_compiler/source/ocloc_supported_devices_helper.cpp index f3b0aa16403df..7b090849aa1c5 100644 --- a/shared/offline_compiler/source/ocloc_supported_devices_helper.cpp +++ b/shared/offline_compiler/source/ocloc_supported_devices_helper.cpp @@ -27,13 +27,18 @@ SupportedDevicesHelper::SupportedDevicesData SupportedDevicesHelper::collectSupp SupportedDevicesData data; // Populate IP Versions, Device Infos, Acronyms + data.deviceIpVersions.reserve(enabledDevices.size()); + data.deviceInfos.reserve(enabledDevices.size()); + data.acronyms.reserve(enabledDevices.size()); for (const auto &device : enabledDevices) { data.deviceIpVersions.push_back(device.aotConfig.value); + data.deviceInfos.reserve((*device.deviceIds).size()); for (const auto &deviceId : *device.deviceIds) { data.deviceInfos.push_back({deviceId, device.aotConfig.revision, device.aotConfig.value}); } + data.acronyms.reserve(device.deviceAcronyms.size()); for (const auto &acronym : device.deviceAcronyms) { data.acronyms.push_back({acronym.data(), device.aotConfig.value}); } @@ -44,6 +49,7 @@ SupportedDevicesHelper::SupportedDevicesData SupportedDevicesHelper::collectSupp for (const auto &device : enabledDevices) { groupedDevices[device.family].push_back(device.aotConfig.value); } + data.familyGroups.reserve(groupedDevices.size()); for (const auto &entry : groupedDevices) { data.familyGroups.push_back({productConfigHelper->getAcronymFromAFamily(entry.first).data(), entry.second}); } @@ -271,6 +277,7 @@ SupportedDevicesHelper::SupportedDevicesData SupportedDevicesHelper::mergeOclocD [](const auto &a, const auto &b) { return a.second < b.second; }); // Sort FamilyGroups (alphabetically by group name) + mergedData.familyGroups.reserve(uniqueFamilyGroups.size()); for (const auto &[family, ipVersions] : uniqueFamilyGroups) { mergedData.familyGroups.push_back({family, std::vector(ipVersions.begin(), ipVersions.end())}); } @@ -278,6 +285,7 @@ SupportedDevicesHelper::SupportedDevicesData 
SupportedDevicesHelper::mergeOclocD [](const auto &a, const auto &b) { return a.first < b.first; }); // Sort ReleaseGroups (alphabetically by group name) + mergedData.releaseGroups.reserve(uniqueReleaseGroups.size()); for (const auto &[release, ipVersions] : uniqueReleaseGroups) { mergedData.releaseGroups.push_back({release, std::vector(ipVersions.begin(), ipVersions.end())}); } diff --git a/shared/source/aub_mem_dump/aub_mem_dump_pvc_and_later.inl b/shared/source/aub_mem_dump/aub_mem_dump_pvc_and_later.inl index c2a8d32ac8769..37eb7af6078ce 100644 --- a/shared/source/aub_mem_dump/aub_mem_dump_pvc_and_later.inl +++ b/shared/source/aub_mem_dump/aub_mem_dump_pvc_and_later.inl @@ -252,6 +252,7 @@ static const MMIOList mmioListVECS = { static MMIOList mmioListCCSInstance(uint32_t mmioBase) { MMIOList mmioList; + mmioList.reserve(17); mmioList.push_back(MMIOPair(0x0000ce90, 0x00030003)); // GFX_MULT_CTXT_CTL - enable multi-context with 4CCS mmioList.push_back(MMIOPair(0x0000b170, 0x00030003)); // MULT_CTXT_CTL - enable multi-context with 4CCS mmioList.push_back(MMIOPair(0x00014800, 0xFFFF0001)); // RCU_MODE diff --git a/shared/source/aub_mem_dump/aub_mem_dump_xehp_and_later.inl b/shared/source/aub_mem_dump/aub_mem_dump_xehp_and_later.inl index 9de1b47b33d46..9f0bcf6ca3d2c 100644 --- a/shared/source/aub_mem_dump/aub_mem_dump_xehp_and_later.inl +++ b/shared/source/aub_mem_dump/aub_mem_dump_xehp_and_later.inl @@ -234,6 +234,7 @@ static const MMIOList mmioListVECS = { static MMIOList mmioListCCSInstance(uint32_t mmioBase) { MMIOList mmioList; + mmioList.reserve(17); mmioList.push_back(MMIOPair(0x0000ce90, 0x00030003)); // GFX_MULT_CTXT_CTL - enable multi-context with 4CCS mmioList.push_back(MMIOPair(0x0000b170, 0x00030003)); // MULT_CTXT_CTL - enable multi-context with 4CCS mmioList.push_back(MMIOPair(0x00014800, 0xFFFF0001)); // RCU_MODE diff --git a/shared/source/command_container/cmdcontainer.cpp b/shared/source/command_container/cmdcontainer.cpp index 
3814dad31cb52..85d2d729f4dd0 100644 --- a/shared/source/command_container/cmdcontainer.cpp +++ b/shared/source/command_container/cmdcontainer.cpp @@ -525,6 +525,7 @@ void CommandContainer::fillReusableAllocationLists() { return; } + this->getResidencyContainer().reserve(amountToFill); for (auto i = 0u; i < amountToFill; i++) { auto allocToReuse = obtainNextCommandBufferAllocation(); this->immediateReusableAllocationList->pushTailOne(*allocToReuse); diff --git a/shared/source/command_stream/submissions_aggregator.cpp b/shared/source/command_stream/submissions_aggregator.cpp index f6cbe46ae8cc8..b4fdbf684434f 100644 --- a/shared/source/command_stream/submissions_aggregator.cpp +++ b/shared/source/command_stream/submissions_aggregator.cpp @@ -83,6 +83,7 @@ void NEO::SubmissionAggregator::aggregateCommandBuffers(ResourcePackage &resourc totalUsedSize += nextCommandBufferNewResourcesSize; currentNode->inspectionId = currentInspection; + resourcePackage.reserve(newResources.size()); for (auto &newResource : newResources) { resourcePackage.push_back(newResource); } diff --git a/shared/source/compiler_interface/oclc_extensions.cpp b/shared/source/compiler_interface/oclc_extensions.cpp index f5b5b9c36cdc8..bd54b54d50100 100644 --- a/shared/source/compiler_interface/oclc_extensions.cpp +++ b/shared/source/compiler_interface/oclc_extensions.cpp @@ -26,6 +26,8 @@ void getOpenclCFeaturesList(const HardwareInfo &hwInfo, OpenClCFeaturesContainer openclCFeatures.push_back(openClCFeature); if (hwInfo.capabilityTable.supportsImages) { + openclCFeatures.reserve(3); + strcpy_s(openClCFeature.name, CL_NAME_VERSION_MAX_NAME_SIZE, "__opencl_c_3d_image_writes"); openclCFeatures.push_back(openClCFeature); @@ -37,6 +39,8 @@ void getOpenclCFeaturesList(const HardwareInfo &hwInfo, OpenClCFeaturesContainer } if (hwInfo.capabilityTable.supportsOcl21Features) { + openclCFeatures.reserve(8); + strcpy_s(openClCFeature.name, CL_NAME_VERSION_MAX_NAME_SIZE, "__opencl_c_atomic_order_acq_rel"); 
openclCFeatures.push_back(openClCFeature); @@ -62,6 +66,8 @@ void getOpenclCFeaturesList(const HardwareInfo &hwInfo, OpenClCFeaturesContainer openclCFeatures.push_back(openClCFeature); if (hwInfo.capabilityTable.supportsFloatAtomics) { + openclCFeatures.reserve(8); + strcpy_s(openClCFeature.name, CL_NAME_VERSION_MAX_NAME_SIZE, "__opencl_c_ext_fp32_global_atomic_add"); openclCFeatures.push_back(openClCFeature); @@ -102,6 +108,8 @@ void getOpenclCFeaturesList(const HardwareInfo &hwInfo, OpenClCFeaturesContainer openclCFeatures.push_back(openClCFeature); if (hwInfo.capabilityTable.supportsOcl21Features && hwInfo.capabilityTable.supportsFloatAtomics) { + openclCFeatures.reserve(4); + strcpy_s(openClCFeature.name, CL_NAME_VERSION_MAX_NAME_SIZE, "__opencl_c_ext_fp64_global_atomic_add"); openclCFeatures.push_back(openClCFeature); diff --git a/shared/source/device/device.cpp b/shared/source/device/device.cpp index 7916cc333bcbd..66177b7d24391 100644 --- a/shared/source/device/device.cpp +++ b/shared/source/device/device.cpp @@ -894,6 +894,7 @@ void Device::initializeRayTracing(uint32_t maxBvhLevels) { rtMemoryBackedBuffer = getMemoryManager()->allocateGraphicsMemoryWithProperties(allocProps); } + rtDispatchGlobalsInfos.reserve(maxBvhLevels - rtDispatchGlobalsInfos.size() + 1); while (rtDispatchGlobalsInfos.size() <= maxBvhLevels) { rtDispatchGlobalsInfos.push_back(nullptr); } @@ -1103,6 +1104,7 @@ void Device::allocateRTDispatchGlobals(uint32_t maxBvhLevels) { return; } + dispatchGlobalsInfo->rtStacks.reserve(tileCount); for (unsigned int tile = 0; tile < tileCount; tile++) { DeviceBitfield deviceBitfield = (tileCount == 1) diff --git a/shared/source/device_binary_format/elf/elf_decoder.cpp b/shared/source/device_binary_format/elf/elf_decoder.cpp index 94492c9ce2137..30f3434af4f8e 100644 --- a/shared/source/device_binary_format/elf/elf_decoder.cpp +++ b/shared/source/device_binary_format/elf/elf_decoder.cpp @@ -80,6 +80,7 @@ Elf decodeElf(const ArrayRef binary, std::string 
&outErr } const ElfProgramHeader *programHeader = reinterpret_cast *>(binary.begin() + ret.elfFileHeader->phOff); + ret.programHeaders.reserve(ret.elfFileHeader->phNum); for (decltype(ret.elfFileHeader->phNum) i = 0; i < ret.elfFileHeader->phNum; ++i) { if (programHeader->offset + programHeader->fileSz > binary.size()) { outErrReason = "Out of bounds program header offset/filesz, program header idx : " + std::to_string(i); @@ -91,6 +92,7 @@ Elf decodeElf(const ArrayRef binary, std::string &outErr } const ElfSectionHeader *sectionHeader = reinterpret_cast *>(binary.begin() + ret.elfFileHeader->shOff); + ret.sectionHeaders.reserve(ret.elfFileHeader->shNum); for (decltype(ret.elfFileHeader->shNum) i = 0; i < ret.elfFileHeader->shNum; ++i) { ArrayRef data; if (SHT_NOBITS != sectionHeader->type) { @@ -151,7 +153,7 @@ bool Elf::decodeRelocations(SectionHeaderAndData §ionHeade // there may be multiple rela sections, reserve additional size auto previousEntries = relocations.size(); auto allEntries = previousEntries + numberOfEntries; - relocs.reserve(allEntries); + relocs.reserve(allEntries - previousEntries); for (auto i = previousEntries; i < allEntries; i++) { @@ -186,7 +188,7 @@ bool Elf::decodeRelocations(SectionHeaderAndData §ionHeade // there may be multiple rel sections, reserve additional size auto previousEntries = relocations.size(); auto allEntries = previousEntries + numberOfEntries; - relocs.reserve(allEntries); + relocs.reserve(allEntries - previousEntries); for (auto i = previousEntries; i < allEntries; i++) { int symbolIndex = extractSymbolIndex>(*reloc); diff --git a/shared/source/device_binary_format/elf/elf_rewriter.h b/shared/source/device_binary_format/elf/elf_rewriter.h index 975ca272002e8..dc3e93ec0cdd3 100644 --- a/shared/source/device_binary_format/elf/elf_rewriter.h +++ b/shared/source/device_binary_format/elf/elf_rewriter.h @@ -48,9 +48,11 @@ struct ElfRewriter { ElfRewriter(NEO::Elf::Elf &src) { elfFileHeader = *src.elfFileHeader; + 
this->sectionHeaders.reserve(src.sectionHeaders.size()); for (const auto &sh : src.sectionHeaders) { this->sectionHeaders.push_back(std::make_unique>(src.getName(sh.header->name), *sh.header, std::vector{sh.data.begin(), sh.data.end()})); } + this->programHeaders.reserve(src.programHeaders.size()); for (const auto &ph : src.programHeaders) { this->programHeaders.push_back(std::make_unique>(*ph.header, std::vector{ph.data.begin(), ph.data.end()})); for (const auto &sh : this->sectionHeaders) { diff --git a/shared/source/gen12lp/gfx_core_helper_gen12lp.cpp b/shared/source/gen12lp/gfx_core_helper_gen12lp.cpp index aa32e404d7a3a..634e1040f8f0c 100644 --- a/shared/source/gen12lp/gfx_core_helper_gen12lp.cpp +++ b/shared/source/gen12lp/gfx_core_helper_gen12lp.cpp @@ -101,6 +101,7 @@ const EngineInstancesContainer GfxCoreHelperHw::getGpgpuEngineInstances( engines.push_back({aub_stream::ENGINE_CCS, EngineUsage::regular}); } + engines.reserve(3); engines.push_back({aub_stream::ENGINE_RCS, EngineUsage::regular}); engines.push_back({aub_stream::ENGINE_RCS, EngineUsage::lowPriority}); // low priority engines.push_back({defaultEngine, EngineUsage::internal}); // internal usage diff --git a/shared/source/helpers/bindless_heaps_helper.cpp b/shared/source/helpers/bindless_heaps_helper.cpp index 726b0640b866d..ed10c207b6b7f 100644 --- a/shared/source/helpers/bindless_heaps_helper.cpp +++ b/shared/source/helpers/bindless_heaps_helper.cpp @@ -55,6 +55,7 @@ BindlessHeapsHelper::BindlessHeapsHelper(Device *rootDevice, bool isMultiOsConte rootDeviceIndex(rootDevice->getRootDeviceIndex()), deviceBitfield(rootDevice->getDeviceBitfield()) { + ssHeapsAllocations.reserve(BindlesHeapType::numHeapTypes); for (auto heapType = 0; heapType < BindlesHeapType::numHeapTypes; heapType++) { auto size = MemoryConstants::pageSize64k; diff --git a/shared/source/helpers/gfx_core_helper_xehp_and_later.inl b/shared/source/helpers/gfx_core_helper_xehp_and_later.inl index 4cd3466de279f..522c47fb7c8af 100644 
--- a/shared/source/helpers/gfx_core_helper_xehp_and_later.inl +++ b/shared/source/helpers/gfx_core_helper_xehp_and_later.inl @@ -128,6 +128,7 @@ aub_stream::MMIOList GfxCoreHelperHw::getExtraMmioList(const Hardware uint32_t value = 1; // [0] enable value |= (format << 3); // [3:7] compression_format + mmioList.reserve(3); mmioList.push_back({0x519C, value}); mmioList.push_back({0xB0F0, value}); mmioList.push_back({0xE4C0, value}); diff --git a/shared/source/helpers/l3_range.h b/shared/source/helpers/l3_range.h index 6099a79cefb5a..58b4bc6cd040b 100644 --- a/shared/source/helpers/l3_range.h +++ b/shared/source/helpers/l3_range.h @@ -106,6 +106,12 @@ inline bool operator!=(const L3Range &lhs, const L3Range &rhs) { return (false == (lhs == rhs)); } +template +constexpr bool isCanPreallocStlContainer(const std::vector& v) { return true; } + +template +constexpr bool isCanPreallocStlContainer(const std::unordered_map& v) { return true; } + template inline void coverRangeExactImpl(uint64_t address, uint64_t size, ContainerT &ret, uint64_t policy) { UNRECOVERABLE_IF(false == L3Range::meetsMinimumAlignment(address)); @@ -114,10 +120,23 @@ inline void coverRangeExactImpl(uint64_t address, uint64_t size, ContainerT &ret const uint64_t end = address + size; uint64_t offset = address; + + uint64_t maxRangeSizeBySize; + uint64_t maxRangeSizeByOffset; + uint64_t rangeSize; + + if constexpr (is_can_prealloc_stl_container(ret)) { + maxRangeSizeBySize = Math::prevPowerOfTwo(end - offset); + maxRangeSizeByOffset = offset ? (1ULL << Math::ffs(offset)) : L3Range::maxSingleRange; + rangeSize = std::min(maxRangeSizeBySize, maxRangeSizeByOffset); + rangeSize = std::min(rangeSize, +L3Range::maxSingleRange); + ret.reserve((end - offset) / rangeSize); + } + while (offset < end) { - uint64_t maxRangeSizeBySize = Math::prevPowerOfTwo(end - offset); - uint64_t maxRangeSizeByOffset = offset ? 
(1ULL << Math::ffs(offset)) : L3Range::maxSingleRange; - uint64_t rangeSize = std::min(maxRangeSizeBySize, maxRangeSizeByOffset); + maxRangeSizeBySize = Math::prevPowerOfTwo(end - offset); + maxRangeSizeByOffset = offset ? (1ULL << Math::ffs(offset)) : L3Range::maxSingleRange; + rangeSize = std::min(maxRangeSizeBySize, maxRangeSizeByOffset); rangeSize = std::min(rangeSize, +L3Range::maxSingleRange); ret.push_back(L3Range::fromAddressSizeWithPolicy(offset, rangeSize, policy)); offset += rangeSize; diff --git a/shared/source/xe_hpc_core/gfx_core_helper_xe_hpc_core.cpp b/shared/source/xe_hpc_core/gfx_core_helper_xe_hpc_core.cpp index c049d0ebda040..85e7ea8d98427 100644 --- a/shared/source/xe_hpc_core/gfx_core_helper_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/gfx_core_helper_xe_hpc_core.cpp @@ -50,6 +50,7 @@ const EngineInstancesContainer GfxCoreHelperHw::getGpgpuEngineInstances( EngineInstancesContainer engines; if (hwInfo.featureTable.flags.ftrCCSNode) { + engines.reserve(hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled); for (uint32_t i = 0; i < hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled; i++) { engines.push_back({static_cast(i + aub_stream::ENGINE_CCS), EngineUsage::regular}); if (productHelper.isCooperativeEngineSupported(hwInfo)) { diff --git a/shared/source/xe_hpg_core/gfx_core_helper_xe_hpg_core.cpp b/shared/source/xe_hpg_core/gfx_core_helper_xe_hpg_core.cpp index 09e7e693a5f49..86e9b255e31c2 100644 --- a/shared/source/xe_hpg_core/gfx_core_helper_xe_hpg_core.cpp +++ b/shared/source/xe_hpg_core/gfx_core_helper_xe_hpg_core.cpp @@ -173,6 +173,7 @@ const EngineInstancesContainer GfxCoreHelperHw::getGpgpuEngineInstances( auto ailHelper = rootDeviceEnvironment.getAILConfigurationHelper(); auto forceRcs = ailHelper && ailHelper->forceRcs(); if (hwInfo.featureTable.flags.ftrCCSNode && !forceRcs) { + engines.reserve(hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled); for (uint32_t i = 0; i < hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled; i++) { 
engines.push_back({static_cast(i + aub_stream::ENGINE_CCS), EngineUsage::regular}); }