From e97493b2a1687b66cd283eddec375d459118e6fd Mon Sep 17 00:00:00 2001
From: Exzap <13877693+Exzap@users.noreply.github.com>
Date: Thu, 5 Dec 2024 12:17:18 +0100
Subject: [PATCH] Optimize ChunkedHeap

---
 .../Latte/Renderer/Vulkan/VKRMemoryManager.h |  12 +-
 src/Common/precompiled.h                     |  19 ++
 src/util/ChunkedHeap/ChunkedHeap.h           | 170 +++++++++---------
 3 files changed, 114 insertions(+), 87 deletions(-)
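
Usage sketch (illustrative, not part of this commit): allocateNewChunk() becomes pure
virtual in this patch, so a backend has to create the backing memory for a new chunk and
return its size - a multiple of TMinimumAlignment and below 2GB - or 0 on failure, while
free() now takes the O(1) path through the AllocRange pointer stored in CHAddr::internal.
The class below is a minimal sketch of that contract; the name MallocChunkedHeap, the
malloc() backing store and the 16MiB growth step are assumptions for illustration, and
the common types (uint32, std::vector, cemu_assert_debug) are expected to come in through
the project's precompiled header.

  class MallocChunkedHeap : public ChunkedHeap<>
  {
  public:
      ~MallocChunkedHeap()
      {
          for (void* mem : m_chunkMem)
              ::free(mem); // unqualified free() would resolve to ChunkedHeap::free(CHAddr)
      }

      uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override
      {
          // grow in 16MiB steps, rounded up to the assumed default TMinimumAlignment of 32
          uint32 chunkSize = std::max<uint32>(minimumAllocationSize, 16u * 1024 * 1024);
          chunkSize = (chunkSize + 31u) & ~31u;
          void* mem = ::malloc(chunkSize);
          if (!mem)
              return 0; // returning 0 makes the heap report the allocation as failed
          cemu_assert_debug(chunkIndex == (uint32)m_chunkMem.size());
          m_chunkMem.emplace_back(mem);
          return chunkSize;
      }

  private:
      std::vector<void*> m_chunkMem;
  };

  // MallocChunkedHeap heap;
  // CHAddr addr = heap.alloc(0x1000, 0x100); // 4KiB block, 256-byte aligned
  // if (addr.isValid())
  //     heap.free(addr); // O(1) free via the AllocRange pointer kept in addr.internal
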
diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
index ecf53996..08af5882 100644
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
@@ -44,7 +44,7 @@ struct VkImageMemAllocation
     uint32 getAllocationSize() { return allocationSize; }
 };
 
-class VkTextureChunkedHeap : private ChunkedHeap
+class VkTextureChunkedHeap : private ChunkedHeap<>
 {
 public:
     VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { };
@@ -80,8 +80,8 @@ public:
 
     void getStatistics(uint32& totalHeapSize, uint32& allocatedBytes) const
     {
-        totalHeapSize = numHeapBytes;
-        allocatedBytes = numAllocatedBytes;
+        totalHeapSize = m_numHeapBytes;
+        allocatedBytes = m_numAllocatedBytes;
     }
 
 private:
@@ -92,7 +92,7 @@ public:
     std::vector m_list_chunkInfo;
 };
 
-class VkBufferChunkedHeap : private ChunkedHeap
+class VkBufferChunkedHeap : private ChunkedHeap<>
 {
 public:
     VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { };
@@ -123,8 +123,8 @@ class VkBufferChunkedHeap : private ChunkedHeap
     void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
     {
         numBuffers = m_chunkBuffers.size();
-        totalBufferSize = numHeapBytes;
-        freeBufferSize = numHeapBytes - numAllocatedBytes;
+        totalBufferSize = m_numHeapBytes;
+        freeBufferSize = m_numHeapBytes - m_numAllocatedBytes;
     }
 
     bool RequiresFlush(uint32 index) const
diff --git a/src/Common/precompiled.h b/src/Common/precompiled.h
index d4df4343..3dfeaf74 100644
--- a/src/Common/precompiled.h
+++ b/src/Common/precompiled.h
@@ -274,6 +274,25 @@ inline uint64 _udiv128(uint64 highDividend, uint64 lowDividend, uint64 divisor,
 #define NOEXPORT __attribute__ ((visibility ("hidden")))
 #endif
 
+#if defined(_MSC_VER)
+#define FORCE_INLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__)
+#define FORCE_INLINE inline __attribute__((always_inline))
+#else
+#define FORCE_INLINE inline
+#endif
+
+FORCE_INLINE int BSF(uint32 v) // returns index of first bit set, counting from LSB. If v is 0 the result is undefined
+{
+#if defined(_MSC_VER)
+    return _tzcnt_u32(v); // TZCNT requires BMI1. On CPUs without BMI1 it executes as BSF, which returns the same index for non-zero v
+#elif defined(__GNUC__) || defined(__clang__)
+    return __builtin_ctz(v);
+#else
+    return std::countr_zero(v);
+#endif
+}
+
 // On aarch64 we handle some of the x86 intrinsics by implementing them as wrappers
 #if defined(__aarch64__)
diff --git a/src/util/ChunkedHeap/ChunkedHeap.h b/src/util/ChunkedHeap/ChunkedHeap.h
index abc45429..21a1b868 100644
--- a/src/util/ChunkedHeap/ChunkedHeap.h
+++ b/src/util/ChunkedHeap/ChunkedHeap.h
@@ -1,35 +1,39 @@
 #pragma once
 
+#include <bit>
+
 struct CHAddr
 {
     uint32 offset;
     uint32 chunkIndex;
+    void* internal; // AllocRange
 
-    CHAddr(uint32 _offset, uint32 _chunkIndex) : offset(_offset), chunkIndex(_chunkIndex) {};
+    CHAddr(uint32 _offset, uint32 _chunkIndex, void* internal = nullptr) : offset(_offset), chunkIndex(_chunkIndex), internal(internal) {};
     CHAddr() : offset(0xFFFFFFFF), chunkIndex(0xFFFFFFFF) {};
 
     bool isValid() { return chunkIndex != 0xFFFFFFFF; };
     static CHAddr getInvalid() { return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); };
 };
 
+template<uint32 TMinimumAlignment = 32>
 class ChunkedHeap
 {
-    struct allocRange_t
+    struct AllocRange
     {
-        allocRange_t* nextFree{};
-        allocRange_t* prevFree{};
-        allocRange_t* prevOrdered{};
-        allocRange_t* nextOrdered{};
+        AllocRange* nextFree{};
+        AllocRange* prevFree{};
+        AllocRange* prevOrdered{};
+        AllocRange* nextOrdered{};
         uint32 offset;
         uint32 chunkIndex;
        uint32 size;
         bool isFree;
 
-        allocRange_t(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
+        AllocRange(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
     };
 
-    struct chunk_t
+    struct Chunk
     {
-        std::unordered_map<uint32, allocRange_t*> map_allocatedRange;
+        uint32 size;
     };
 
 public:
@@ -47,45 +51,32 @@ public:
         _free(addr);
     }
 
-    virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
-    {
-        return 0;
-    }
+    virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) = 0;
 
 private:
     unsigned ulog2(uint32 v)
     {
-        static const unsigned MUL_DE_BRUIJN_BIT[] =
-        {
-            0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
-            8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
-        };
-
-        v |= v >> 1;
-        v |= v >> 2;
-        v |= v >> 4;
-        v |= v >> 8;
-        v |= v >> 16;
-
-        return MUL_DE_BRUIJN_BIT[(v * 0x07C4ACDDu) >> 27];
+        cemu_assert_debug(v != 0);
+        return 31 - std::countl_zero(v);
     }
 
-    void trackFreeRange(allocRange_t* range)
+    void trackFreeRange(AllocRange* range)
     {
         // get index of msb
         cemu_assert_debug(range->size != 0); // size of zero is not allowed
         uint32 bucketIndex = ulog2(range->size);
-        range->nextFree = bucketFreeRange[bucketIndex];
-        if (bucketFreeRange[bucketIndex])
-            bucketFreeRange[bucketIndex]->prevFree = range;
+        range->nextFree = m_bucketFreeRange[bucketIndex];
+        if (m_bucketFreeRange[bucketIndex])
+            m_bucketFreeRange[bucketIndex]->prevFree = range;
         range->prevFree = nullptr;
-        bucketFreeRange[bucketIndex] = range;
+        m_bucketFreeRange[bucketIndex] = range;
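+        // mark the bucket as non-empty; _alloc() relies on this mask to skip over empty buckets with BSF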
+        m_bucketUseMask |= (1u << bucketIndex);
     }
 
-    void forgetFreeRange(allocRange_t* range, uint32 bucketIndex)
+    void forgetFreeRange(AllocRange* range, uint32 bucketIndex)
     {
-        allocRange_t* prevRange = range->prevFree;
-        allocRange_t* nextRange = range->nextFree;
+        AllocRange* prevRange = range->prevFree;
+        AllocRange* nextRange = range->nextFree;
         if (prevRange)
         {
             prevRange->nextFree = nextRange;
@@ -94,36 +85,42 @@ private:
         }
         else
         {
-            if (bucketFreeRange[bucketIndex] != range)
-                assert_dbg();
-            bucketFreeRange[bucketIndex] = nextRange;
+            cemu_assert_debug(m_bucketFreeRange[bucketIndex] == range);
+            m_bucketFreeRange[bucketIndex] = nextRange;
             if (nextRange)
                 nextRange->prevFree = nullptr;
+            else
+                m_bucketUseMask &= ~(1u << bucketIndex);
         }
     }
 
     bool allocateChunk(uint32 minimumAllocationSize)
     {
-        uint32 chunkIndex = (uint32)list_chunks.size();
-        list_chunks.emplace_back(new chunk_t());
+        uint32 chunkIndex = (uint32)m_chunks.size();
+        m_chunks.emplace_back();
         uint32 chunkSize = allocateNewChunk(chunkIndex, minimumAllocationSize);
+        cemu_assert_debug((chunkSize%TMinimumAlignment) == 0); // chunk size should be a multiple of the minimum alignment
         if (chunkSize == 0)
             return false;
-        allocRange_t* range = new allocRange_t(0, chunkIndex, chunkSize, true);
+        cemu_assert_debug(chunkSize < 0x80000000u); // chunk size must be below 2GB
+        AllocRange* range = m_allocEntriesPool.allocObj(0, chunkIndex, chunkSize, true);
         trackFreeRange(range);
-        numHeapBytes += chunkSize;
+        m_numHeapBytes += chunkSize;
         return true;
     }
 
-    void _allocFrom(allocRange_t* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
+    void _allocFrom(AllocRange* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
     {
+        cemu_assert_debug(allocSize > 0);
         // remove the range from the chain of free ranges
         forgetFreeRange(range, bucketIndex);
         // split head, allocation and tail into separate ranges
-        if (allocOffset > range->offset)
+        uint32 headBytes = allocOffset - range->offset;
+        if (headBytes > 0)
         {
             // alignment padding -> create free range
-            allocRange_t* head = new allocRange_t(range->offset, range->chunkIndex, allocOffset - range->offset, true);
+            cemu_assert_debug(headBytes >= TMinimumAlignment);
+            AllocRange* head = m_allocEntriesPool.allocObj(range->offset, range->chunkIndex, headBytes, true);
             trackFreeRange(head);
             if (range->prevOrdered)
                 range->prevOrdered->nextOrdered = head;
@@ -131,10 +128,12 @@ private:
             head->nextOrdered = range;
             range->prevOrdered = head;
         }
-        if ((allocOffset + allocSize) < (range->offset + range->size)) // todo - create only if it's more than a couple of bytes?
+        uint32 tailBytes = (range->offset + range->size) - (allocOffset + allocSize);
+        if (tailBytes > 0)
         {
             // tail -> create free range
-            allocRange_t* tail = new allocRange_t((allocOffset + allocSize), range->chunkIndex, (range->offset + range->size) - (allocOffset + allocSize), true);
+            cemu_assert_debug(tailBytes >= TMinimumAlignment);
+            AllocRange* tail = m_allocEntriesPool.allocObj((allocOffset + allocSize), range->chunkIndex, tailBytes, true);
             trackFreeRange(tail);
             if (range->nextOrdered)
                 range->nextOrdered->prevOrdered = tail;
@@ -149,36 +148,51 @@ private:
 
     CHAddr _alloc(uint32 size, uint32 alignment)
     {
+        cemu_assert_debug(size <= (0x7FFFFFFFu-TMinimumAlignment));
+        // make sure size is not zero and align it
+        if(size == 0) [[unlikely]]
+            size = TMinimumAlignment;
+        else
+            size = (size + (TMinimumAlignment - 1)) & ~(TMinimumAlignment - 1);
         // find smallest bucket to scan
         uint32 alignmentM1 = alignment - 1;
         uint32 bucketIndex = ulog2(size);
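+        // bit i of m_bucketUseMask is set while bucket i holds at least one free range; bit 31 is a
+        // permanently set sentinel, so the shifted mask is never zero and BSF() yields the distance
+        // to the next non-empty bucket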
+        // check if the bucket is available
+        if( !(m_bucketUseMask & (1u << bucketIndex)) )
         {
-            allocRange_t* range = bucketFreeRange[bucketIndex];
+            // skip to next non-empty bucket
+            uint32 nextIndex = BSF(m_bucketUseMask>>bucketIndex);
+            bucketIndex += nextIndex;
+        }
+        while (bucketIndex < 31)
+        {
+            AllocRange* range = m_bucketFreeRange[bucketIndex];
             while (range)
             {
                 if (range->size >= size)
                 {
                     // verify if aligned allocation fits
                     uint32 alignedOffset = (range->offset + alignmentM1) & ~alignmentM1;
-                    uint32 alignmentLoss = alignedOffset - range->offset;
-                    if (alignmentLoss < range->size && (range->size - alignmentLoss) >= size)
+                    uint32 endOffset = alignedOffset + size;
+                    if((range->offset+range->size) >= endOffset)
                     {
                         _allocFrom(range, bucketIndex, alignedOffset, size);
-                        list_chunks[range->chunkIndex]->map_allocatedRange.emplace(alignedOffset, range);
-                        numAllocatedBytes += size;
-                        return CHAddr(alignedOffset, range->chunkIndex);
+                        m_numAllocatedBytes += size;
+                        return CHAddr(alignedOffset, range->chunkIndex, range);
                     }
                 }
                 range = range->nextFree;
             }
-            bucketIndex++; // try higher bucket
+            // check next non-empty bucket or skip to end
+            bucketIndex++;
+            uint32 emptyBuckets = BSF(m_bucketUseMask>>bucketIndex);
+            bucketIndex += emptyBuckets;
         }
-        if(allocationLimitReached)
+        if(m_allocationLimitReached)
             return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
         if (!allocateChunk(size))
         {
-            allocationLimitReached = true;
+            m_allocationLimitReached = true;
             return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
         }
         return _alloc(size, alignment);
@@ -186,24 +200,16 @@ private:
 
     void _free(CHAddr addr)
     {
-        auto it = list_chunks[addr.chunkIndex]->map_allocatedRange.find(addr.offset);
-        if (it == list_chunks[addr.chunkIndex]->map_allocatedRange.end())
+        if(!addr.internal)
         {
             cemuLog_log(LogType::Force, "Internal heap error. {:08x} {:08x}", addr.chunkIndex, addr.offset);
-            cemuLog_log(LogType::Force, "Debug info:");
-            for (auto& rangeItr : list_chunks[addr.chunkIndex]->map_allocatedRange)
-            {
-                cemuLog_log(LogType::Force, "{:08x} {:08x}", rangeItr.second->offset, rangeItr.second->size);
-            }
             return;
         }
-
-        allocRange_t* range = it->second;
-        numAllocatedBytes -= it->second->size;
-        list_chunks[range->chunkIndex]->map_allocatedRange.erase(it);
+        AllocRange* range = (AllocRange*)addr.internal;
+        m_numAllocatedBytes -= range->size;
         // try merge left or right
-        allocRange_t* prevRange = range->prevOrdered;
-        allocRange_t* nextRange = range->nextOrdered;
+        AllocRange* prevRange = range->prevOrdered;
+        AllocRange* nextRange = range->nextOrdered;
         if (prevRange && prevRange->isFree)
         {
             if (nextRange && nextRange->isFree)
@@ -216,8 +222,8 @@ private:
                 forgetFreeRange(prevRange, ulog2(prevRange->size));
                 prevRange->size = newSize;
                 trackFreeRange(prevRange);
-                delete range;
-                delete nextRange;
+                m_allocEntriesPool.freeObj(range);
+                m_allocEntriesPool.freeObj(nextRange);
             }
             else
             {
@@ -228,7 +234,7 @@ private:
                 forgetFreeRange(prevRange, ulog2(prevRange->size));
                 prevRange->size = newSize;
                 trackFreeRange(prevRange);
-                delete range;
+                m_allocEntriesPool.freeObj(range);
             }
         }
         else if (nextRange && nextRange->isFree)
@@ -242,7 +248,7 @@ private:
             range->prevOrdered->nextOrdered = nextRange;
             nextRange->prevOrdered = range->prevOrdered;
             trackFreeRange(nextRange);
-            delete range;
+            m_allocEntriesPool.freeObj(range);
         }
         else
         {
@@ -265,7 +271,7 @@ private:
 
         for (uint32 i = 0; i < 32; i++)
         {
-            allocRange_t* ar = bucketFreeRange[i];
+            AllocRange* ar = m_bucketFreeRange[i];
             while (ar)
             {
                 availableRange_t dbgRange;
@@ -278,7 +284,7 @@ private:
                     if (itr.chunkIndex != dbgRange.chunkIndex)
                         continue;
                     if (itr.offset < (dbgRange.offset + dbgRange.size) && (itr.offset + itr.size) > (dbgRange.offset))
-                        assert_dbg();
+                        cemu_assert_error();
                 }
 
                 availRanges.emplace_back(dbgRange);
@@ -290,14 +296,16 @@ private:
     }
 
 private:
-    std::vector<chunk_t*> list_chunks;
-    allocRange_t* bucketFreeRange[32]{};
-    bool allocationLimitReached = false;
+    std::vector<Chunk> m_chunks;
+    uint32 m_bucketUseMask{0x80000000}; // bitmask indicating non-empty buckets. MSB always set to provide an upper bound for the BSF instruction
+    AllocRange* m_bucketFreeRange[32]{}; // we are only using 31 entries since the MSB is reserved (thus chunks equal to or larger than 2^31 are not allowed)
+    bool m_allocationLimitReached = false;
+    MemoryPool<AllocRange> m_allocEntriesPool{64};
 
 public:
     // statistics
-    uint32 numHeapBytes{}; // total size of the heap
-    uint32 numAllocatedBytes{};
+    uint32 m_numHeapBytes{}; // total size of the heap
+    uint32 m_numAllocatedBytes{};
 };
 
 class VGenericHeap
 {
 public:
@@ -633,7 +641,7 @@ public:
 
     uint32 getCurrentBlockOffset() const { return m_currentBlockOffset; }
     uint8* getCurrentBlockPtr() const { return m_currentBlockPtr; }
-    
+
 private:
     void allocateAdditionalChunk()
     {
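
Worked example (illustrative, using the default TMinimumAlignment of 32 assumed above):
alloc(100, 4) rounds the size up to 128 and starts scanning at bucket ulog2(128) = 7. If
the heap currently holds a single free range of 4096 bytes, m_bucketUseMask has bits 12
and 31 set, so BSF(m_bucketUseMask >> 7) = 5 and the scan jumps directly to bucket 12.
_allocFrom() then splits that range into the 128-byte allocation and a 3968-byte tail,
which trackFreeRange() re-files under bucket ulog2(3968) = 11.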