From e97493b2a1687b66cd283eddec375d459118e6fd Mon Sep 17 00:00:00 2001
From: Exzap <13877693+Exzap@users.noreply.github.com>
Date: Thu, 5 Dec 2024 12:17:18 +0100
Subject: [PATCH] Optimize ChunkedHeap

---
 .../Latte/Renderer/Vulkan/VKRMemoryManager.h |  12 +-
 src/Common/precompiled.h                     |  19 ++
 src/util/ChunkedHeap/ChunkedHeap.h           | 170 +++++++++---------
 3 files changed, 114 insertions(+), 87 deletions(-)
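
Usage sketch (illustrative, not part of this commit): allocateNewChunk() becomes pure
virtual in this patch, so a backend has to create the backing memory for a new chunk and
return its size - a multiple of TMinimumAlignment and below 2GB - or 0 on failure, while
free() now takes the O(1) path through the AllocRange pointer stored in CHAddr::internal.
The class below is a minimal sketch of that contract; the name MallocChunkedHeap, the
malloc() backing store and the 16MiB growth step are assumptions for illustration, and
the common types (uint32, std::vector, cemu_assert_debug) are expected to come in through
the project's precompiled header.

  class MallocChunkedHeap : public ChunkedHeap<>
  {
  public:
      ~MallocChunkedHeap()
      {
          for (void* mem : m_chunkMem)
              ::free(mem); // unqualified free() would resolve to ChunkedHeap::free(CHAddr)
      }

      uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override
      {
          // grow in 16MiB steps, rounded up to the assumed default TMinimumAlignment of 32
          uint32 chunkSize = std::max<uint32>(minimumAllocationSize, 16u * 1024 * 1024);
          chunkSize = (chunkSize + 31u) & ~31u;
          void* mem = ::malloc(chunkSize);
          if (!mem)
              return 0; // returning 0 makes the heap report the allocation as failed
          cemu_assert_debug(chunkIndex == (uint32)m_chunkMem.size());
          m_chunkMem.emplace_back(mem);
          return chunkSize;
      }

  private:
      std::vector<void*> m_chunkMem;
  };

  // MallocChunkedHeap heap;
  // CHAddr addr = heap.alloc(0x1000, 0x100); // 4KiB block, 256-byte aligned
  // if (addr.isValid())
  //     heap.free(addr); // O(1) free via the AllocRange pointer kept in addr.internal
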
diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
index ecf53996..08af5882 100644
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
@@ -44,7 +44,7 @@ struct VkImageMemAllocation
     uint32 getAllocationSize() { return allocationSize; }
 };
 
-class VkTextureChunkedHeap : private ChunkedHeap
+class VkTextureChunkedHeap : private ChunkedHeap<>
 {
 public:
     VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { };
@@ -80,8 +80,8 @@ public:
 
     void getStatistics(uint32& totalHeapSize, uint32& allocatedBytes) const
     {
-        totalHeapSize = numHeapBytes;
-        allocatedBytes = numAllocatedBytes;
+        totalHeapSize = m_numHeapBytes;
+        allocatedBytes = m_numAllocatedBytes;
     }
 
 private:
@@ -92,7 +92,7 @@ public:
     std::vector m_list_chunkInfo;
 };
 
-class VkBufferChunkedHeap : private ChunkedHeap
+class VkBufferChunkedHeap : private ChunkedHeap<>
 {
 public:
     VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { };
@@ -123,8 +123,8 @@ class VkBufferChunkedHeap : private ChunkedHeap
     void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
     {
         numBuffers = m_chunkBuffers.size();
-        totalBufferSize = numHeapBytes;
-        freeBufferSize = numHeapBytes - numAllocatedBytes;
+        totalBufferSize = m_numHeapBytes;
+        freeBufferSize = m_numHeapBytes - m_numAllocatedBytes;
     }
 
     bool RequiresFlush(uint32 index) const
diff --git a/src/Common/precompiled.h b/src/Common/precompiled.h
index d4df4343..3dfeaf74 100644
--- a/src/Common/precompiled.h
+++ b/src/Common/precompiled.h
@@ -274,6 +274,25 @@ inline uint64 _udiv128(uint64 highDividend, uint64 lowDividend, uint64 divisor,
 #define NOEXPORT __attribute__ ((visibility ("hidden")))
 #endif
 
+#if defined(_MSC_VER)
+#define FORCE_INLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__)
+#define FORCE_INLINE inline __attribute__((always_inline))
+#else
+#define FORCE_INLINE inline
+#endif
+
+FORCE_INLINE int BSF(uint32 v) // returns index of first bit set, counting from LSB. If v is 0 the result is undefined
+{
+#if defined(_MSC_VER)
+    return _tzcnt_u32(v); // TZCNT requires BMI1. On CPUs without BMI1 it executes as BSF, which returns the same index for non-zero v
+#elif defined(__GNUC__) || defined(__clang__)
+    return __builtin_ctz(v);
+#else
+    return std::countr_zero(v);
+#endif
+}
+
 // On aarch64 we handle some of the x86 intrinsics by implementing them as wrappers
 #if defined(__aarch64__)
diff --git a/src/util/ChunkedHeap/ChunkedHeap.h b/src/util/ChunkedHeap/ChunkedHeap.h
index abc45429..21a1b868 100644
--- a/src/util/ChunkedHeap/ChunkedHeap.h
+++ b/src/util/ChunkedHeap/ChunkedHeap.h
@@ -1,35 +1,39 @@
 #pragma once
 
+#include <bit>
+
 struct CHAddr
 {
     uint32 offset;
     uint32 chunkIndex;
+    void* internal; // AllocRange
 
-    CHAddr(uint32 _offset, uint32 _chunkIndex) : offset(_offset), chunkIndex(_chunkIndex) {};
+    CHAddr(uint32 _offset, uint32 _chunkIndex, void* internal = nullptr) : offset(_offset), chunkIndex(_chunkIndex), internal(internal) {};
     CHAddr() : offset(0xFFFFFFFF), chunkIndex(0xFFFFFFFF) {};
 
     bool isValid() { return chunkIndex != 0xFFFFFFFF; };
     static CHAddr getInvalid() { return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); };
 };
 
+template<uint32 TMinimumAlignment = 32>
 class ChunkedHeap
 {
-    struct allocRange_t
+    struct AllocRange
     {
-        allocRange_t* nextFree{};
-        allocRange_t* prevFree{};
-        allocRange_t* prevOrdered{};
-        allocRange_t* nextOrdered{};
+        AllocRange* nextFree{};
+        AllocRange* prevFree{};
+        AllocRange* prevOrdered{};
+        AllocRange* nextOrdered{};
         uint32 offset;
         uint32 chunkIndex;
        uint32 size;
         bool isFree;
 
-        allocRange_t(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
+        AllocRange(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
     };
 
-    struct chunk_t
+    struct Chunk
     {
-        std::unordered_map<uint32, allocRange_t*> map_allocatedRange;
+        uint32 size;
     };
 
 public:
@@ -47,45 +51,32 @@ public:
         _free(addr);
     }
 
-    virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
-    {
-        return 0;
-    }
+    virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) = 0;
 
 private:
     unsigned ulog2(uint32 v)
     {
-        static const unsigned MUL_DE_BRUIJN_BIT[] =
-        {
-            0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
-            8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
-        };
-
-        v |= v >> 1;
-        v |= v >> 2;
-        v |= v >> 4;
-        v |= v >> 8;
-        v |= v >> 16;
-
-        return MUL_DE_BRUIJN_BIT[(v * 0x07C4ACDDu) >> 27];
+        cemu_assert_debug(v != 0);
+        return 31 - std::countl_zero(v);
     }
 
-    void trackFreeRange(allocRange_t* range)
+    void trackFreeRange(AllocRange* range)
     {
         // get index of msb
         cemu_assert_debug(range->size != 0); // size of zero is not allowed
         uint32 bucketIndex = ulog2(range->size);
-        range->nextFree = bucketFreeRange[bucketIndex];
-        if (bucketFreeRange[bucketIndex])
-            bucketFreeRange[bucketIndex]->prevFree = range;
+        range->nextFree = m_bucketFreeRange[bucketIndex];
+        if (m_bucketFreeRange[bucketIndex])
+            m_bucketFreeRange[bucketIndex]->prevFree = range;
         range->prevFree = nullptr;
-        bucketFreeRange[bucketIndex] = range;
+        m_bucketFreeRange[bucketIndex] = range;
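+        // mark the bucket as non-empty; _alloc() relies on this mask to skip over empty buckets with BSF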
+        m_bucketUseMask |= (1u << bucketIndex);
     }
 
-    void forgetFreeRange(allocRange_t* range, uint32 bucketIndex)
+    void forgetFreeRange(AllocRange* range, uint32 bucketIndex)
     {
-        allocRange_t* prevRange = range->prevFree;
-        allocRange_t* nextRange = range->nextFree;
+        AllocRange* prevRange = range->prevFree;
+        AllocRange* nextRange = range->nextFree;
         if (prevRange)
         {
             prevRange->nextFree = nextRange;
@@ -94,36 +85,42 @@ private:
         }
         else
         {
-            if (bucketFreeRange[bucketIndex] != range)
-                assert_dbg();
-            bucketFreeRange[bucketIndex] = nextRange;
+            cemu_assert_debug(m_bucketFreeRange[bucketIndex] == range);
+            m_bucketFreeRange[bucketIndex] = nextRange;
             if (nextRange)
                 nextRange->prevFree = nullptr;
+            else
+                m_bucketUseMask &= ~(1u << bucketIndex);
         }
     }
 
     bool allocateChunk(uint32 minimumAllocationSize)
     {
-        uint32 chunkIndex = (uint32)list_chunks.size();
-        list_chunks.emplace_back(new chunk_t());
+        uint32 chunkIndex = (uint32)m_chunks.size();
+        m_chunks.emplace_back();
         uint32 chunkSize = allocateNewChunk(chunkIndex, minimumAllocationSize);
+        cemu_assert_debug((chunkSize%TMinimumAlignment) == 0); // chunk size should be a multiple of the minimum alignment
         if (chunkSize == 0)
             return false;
-        allocRange_t* range = new allocRange_t(0, chunkIndex, chunkSize, true);
+        cemu_assert_debug(chunkSize < 0x80000000u); // chunk size must be below 2GB
+        AllocRange* range = m_allocEntriesPool.allocObj(0, chunkIndex, chunkSize, true);
         trackFreeRange(range);
-        numHeapBytes += chunkSize;
+        m_numHeapBytes += chunkSize;
         return true;
     }
 
-    void _allocFrom(allocRange_t* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
+    void _allocFrom(AllocRange* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
     {
+        cemu_assert_debug(allocSize > 0);
         // remove the range from the chain of free ranges
         forgetFreeRange(range, bucketIndex);
         // split head, allocation and tail into separate ranges
-        if (allocOffset > range->offset)
+        uint32 headBytes = allocOffset - range->offset;
+        if (headBytes > 0)
         {
             // alignment padding -> create free range
-            allocRange_t* head = new allocRange_t(range->offset, range->chunkIndex, allocOffset - range->offset, true);
+            cemu_assert_debug(headBytes >= TMinimumAlignment);
+            AllocRange* head = m_allocEntriesPool.allocObj(range->offset, range->chunkIndex, headBytes, true);
             trackFreeRange(head);
             if (range->prevOrdered)
                 range->prevOrdered->nextOrdered = head;
@@ -131,10 +128,12 @@ private:
             head->nextOrdered = range;
             range->prevOrdered = head;
         }
-        if ((allocOffset + allocSize) < (range->offset + range->size)) // todo - create only if it's more than a couple of bytes?
+        uint32 tailBytes = (range->offset + range->size) - (allocOffset + allocSize);
+        if (tailBytes > 0)
         {
             // tail -> create free range
-            allocRange_t* tail = new allocRange_t((allocOffset + allocSize), range->chunkIndex, (range->offset + range->size) - (allocOffset + allocSize), true);
+            cemu_assert_debug(tailBytes >= TMinimumAlignment);
+            AllocRange* tail = m_allocEntriesPool.allocObj((allocOffset + allocSize), range->chunkIndex, tailBytes, true);
             trackFreeRange(tail);
             if (range->nextOrdered)
                 range->nextOrdered->prevOrdered = tail;
@@ -149,36 +148,51 @@ private:
 
     CHAddr _alloc(uint32 size, uint32 alignment)
     {
+        cemu_assert_debug(size <= (0x7FFFFFFFu-TMinimumAlignment));
+        // make sure size is not zero and align it
+        if(size == 0) [[unlikely]]
+            size = TMinimumAlignment;
+        else
+            size = (size + (TMinimumAlignment - 1)) & ~(TMinimumAlignment - 1);
         // find smallest bucket to scan
         uint32 alignmentM1 = alignment - 1;
         uint32 bucketIndex = ulog2(size);
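+        // bit i of m_bucketUseMask is set while bucket i holds at least one free range; bit 31 is a
+        // permanently set sentinel, so the shifted mask is never zero and BSF() yields the distance
+        // to the next non-empty bucket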
+        // check if the bucket is available
+        if( !(m_bucketUseMask & (1u << bucketIndex)) )
         {
-            allocRange_t* range = bucketFreeRange[bucketIndex];
+            // skip to next non-empty bucket
+            uint32 nextIndex = BSF(m_bucketUseMask>>bucketIndex);
+            bucketIndex += nextIndex;
+        }
+        while (bucketIndex < 31)
+        {
+            AllocRange* range = m_bucketFreeRange[bucketIndex];
             while (range)
             {
                 if (range->size >= size)
                 {
                     // verify if aligned allocation fits
                     uint32 alignedOffset = (range->offset + alignmentM1) & ~alignmentM1;
-                    uint32 alignmentLoss = alignedOffset - range->offset;
-                    if (alignmentLoss < range->size && (range->size - alignmentLoss) >= size)
+                    uint32 endOffset = alignedOffset + size;
+                    if((range->offset+range->size) >= endOffset)
                     {
                         _allocFrom(range, bucketIndex, alignedOffset, size);
-                        list_chunks[range->chunkIndex]->map_allocatedRange.emplace(alignedOffset, range);
-                        numAllocatedBytes += size;
-                        return CHAddr(alignedOffset, range->chunkIndex);
+                        m_numAllocatedBytes += size;
+                        return CHAddr(alignedOffset, range->chunkIndex, range);
                     }
                 }
                 range = range->nextFree;
             }
-            bucketIndex++; // try higher bucket
+            // check next non-empty bucket or skip to end
+            bucketIndex++;
+            uint32 emptyBuckets = BSF(m_bucketUseMask>>bucketIndex);
+            bucketIndex += emptyBuckets;
         }
-        if(allocationLimitReached)
+        if(m_allocationLimitReached)
             return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
         if (!allocateChunk(size))
         {
-            allocationLimitReached = true;
+            m_allocationLimitReached = true;
             return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
         }
         return _alloc(size, alignment);
@@ -186,24 +200,16 @@ private:
 
     void _free(CHAddr addr)
     {
-        auto it = list_chunks[addr.chunkIndex]->map_allocatedRange.find(addr.offset);
-        if (it == list_chunks[addr.chunkIndex]->map_allocatedRange.end())
+        if(!addr.internal)
         {
             cemuLog_log(LogType::Force, "Internal heap error. {:08x} {:08x}", addr.chunkIndex, addr.offset);
-            cemuLog_log(LogType::Force, "Debug info:");
-            for (auto& rangeItr : list_chunks[addr.chunkIndex]->map_allocatedRange)
-            {
-                cemuLog_log(LogType::Force, "{:08x} {:08x}", rangeItr.second->offset, rangeItr.second->size);
-            }
             return;
         }
-
-        allocRange_t* range = it->second;
-        numAllocatedBytes -= it->second->size;
-        list_chunks[range->chunkIndex]->map_allocatedRange.erase(it);
+        AllocRange* range = (AllocRange*)addr.internal;
+        m_numAllocatedBytes -= range->size;
         // try merge left or right
-        allocRange_t* prevRange = range->prevOrdered;
-        allocRange_t* nextRange = range->nextOrdered;
+        AllocRange* prevRange = range->prevOrdered;
+        AllocRange* nextRange = range->nextOrdered;
         if (prevRange && prevRange->isFree)
         {
             if (nextRange && nextRange->isFree)
@@ -216,8 +222,8 @@ private:
                 forgetFreeRange(prevRange, ulog2(prevRange->size));
                 prevRange->size = newSize;
                 trackFreeRange(prevRange);
-                delete range;
-                delete nextRange;
+                m_allocEntriesPool.freeObj(range);
+                m_allocEntriesPool.freeObj(nextRange);
             }
             else
             {
@@ -228,7 +234,7 @@ private:
                 forgetFreeRange(prevRange, ulog2(prevRange->size));
                 prevRange->size = newSize;
                 trackFreeRange(prevRange);
-                delete range;
+                m_allocEntriesPool.freeObj(range);
             }
         }
         else if (nextRange && nextRange->isFree)
@@ -242,7 +248,7 @@ private:
             range->prevOrdered->nextOrdered = nextRange;
             nextRange->prevOrdered = range->prevOrdered;
             trackFreeRange(nextRange);
-            delete range;
+            m_allocEntriesPool.freeObj(range);
         }
         else
         {
@@ -265,7 +271,7 @@ private:
 
         for (uint32 i = 0; i < 32; i++)
         {
-            allocRange_t* ar = bucketFreeRange[i];
+            AllocRange* ar = m_bucketFreeRange[i];
             while (ar)
             {
                 availableRange_t dbgRange;
@@ -278,7 +284,7 @@ private:
                     if (itr.chunkIndex != dbgRange.chunkIndex)
                         continue;
                     if (itr.offset < (dbgRange.offset + dbgRange.size) && (itr.offset + itr.size) > (dbgRange.offset))
-                        assert_dbg();
+                        cemu_assert_error();
                 }
 
                 availRanges.emplace_back(dbgRange);
@@ -290,14 +296,16 @@ private:
     }
 
 private:
-    std::vector<chunk_t*> list_chunks;
-    allocRange_t* bucketFreeRange[32]{};
-    bool allocationLimitReached = false;
+    std::vector<Chunk> m_chunks;
+    uint32 m_bucketUseMask{0x80000000}; // bitmask indicating non-empty buckets. MSB always set to provide an upper bound for the BSF instruction
+    AllocRange* m_bucketFreeRange[32]{}; // we are only using 31 entries since the MSB is reserved (thus chunks equal to or larger than 2^31 are not allowed)
+    bool m_allocationLimitReached = false;
+    MemoryPool<AllocRange> m_allocEntriesPool{64};
 
 public:
     // statistics
-    uint32 numHeapBytes{}; // total size of the heap
-    uint32 numAllocatedBytes{};
+    uint32 m_numHeapBytes{}; // total size of the heap
+    uint32 m_numAllocatedBytes{};
 };
 
 class VGenericHeap
 {
 public:
@@ -633,7 +641,7 @@ public:
 
     uint32 getCurrentBlockOffset() const { return m_currentBlockOffset; }
     uint8* getCurrentBlockPtr() const { return m_currentBlockPtr; }
-    
+
 private:
     void allocateAdditionalChunk()
     {
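
Worked example (illustrative, using the default TMinimumAlignment of 32 assumed above):
alloc(100, 4) rounds the size up to 128 and starts scanning at bucket ulog2(128) = 7. If
the heap currently holds a single free range of 4096 bytes, m_bucketUseMask has bits 12
and 31 set, so BSF(m_bucketUseMask >> 7) = 5 and the scan jumps directly to bucket 12.
_allocFrom() then splits that range into the 128-byte allocation and a 3968-byte tail,
which trackFreeRange() re-files under bucket ulog2(3968) = 11.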