Latte/Vulkan: Add multiple entry LRU cache support for indices

Exzap 2024-11-23 18:25:58 +01:00
parent 409f12b13a
commit 13979d490f
13 changed files with 395 additions and 104 deletions

View File

@@ -1,6 +1,7 @@
#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
#include "Common/cpu_features.h"
#if defined(ARCH_X86_64) && defined(__GNUC__)
@@ -9,32 +10,53 @@
struct
{
const void* lastPtr;
uint32 lastCount;
LattePrimitiveMode lastPrimitiveMode;
LatteIndexType lastIndexType;
// output
uint32 indexMin;
uint32 indexMax;
Renderer::INDEX_TYPE renderIndexType;
uint32 outputCount;
uint32 indexBufferOffset;
uint32 indexBufferIndex;
struct CacheEntry
{
// input data
const void* lastPtr;
uint32 lastCount;
LattePrimitiveMode lastPrimitiveMode;
LatteIndexType lastIndexType;
uint64 lastUsed;
// output
uint32 indexMin;
uint32 indexMax;
Renderer::INDEX_TYPE renderIndexType;
uint32 outputCount;
Renderer::IndexAllocation indexAllocation;
};
std::array<CacheEntry, 8> entry;
uint64 currentUsageCounter{0};
}LatteIndexCache{};
void LatteIndices_invalidate(const void* memPtr, uint32 size)
{
if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) )
for(auto& entry : LatteIndexCache.entry)
{
LatteIndexCache.lastPtr = nullptr;
LatteIndexCache.lastCount = 0;
if (entry.lastPtr >= memPtr && (entry.lastPtr < ((uint8*)memPtr + size)) )
{
if(entry.lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
entry.lastPtr = nullptr;
entry.lastCount = 0;
}
}
}
void LatteIndices_invalidateAll()
{
LatteIndexCache.lastPtr = nullptr;
LatteIndexCache.lastCount = 0;
for(auto& entry : LatteIndexCache.entry)
{
if (entry.lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
entry.lastPtr = nullptr;
entry.lastCount = 0;
}
}
uint64 LatteIndices_GetNextUsageIndex()
{
return LatteIndexCache.currentUsageCounter++;
}
uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count)
@@ -532,7 +554,7 @@ void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIn
}
}
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex)
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation)
{
// what this should do:
// [x] use fast SIMD-based index decoding
@@ -542,17 +564,18 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
// [ ] better cache implementation, allow to cache across frames
// reuse from cache if data didn't change
if (LatteIndexCache.lastPtr == indexData &&
LatteIndexCache.lastCount == count &&
LatteIndexCache.lastPrimitiveMode == primitiveMode &&
LatteIndexCache.lastIndexType == indexType)
auto cacheEntry = std::find_if(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [indexData, count, primitiveMode, indexType](const auto& entry)
{
indexMin = LatteIndexCache.indexMin;
indexMax = LatteIndexCache.indexMax;
renderIndexType = LatteIndexCache.renderIndexType;
outputCount = LatteIndexCache.outputCount;
indexBufferOffset = LatteIndexCache.indexBufferOffset;
indexBufferIndex = LatteIndexCache.indexBufferIndex;
return entry.lastPtr == indexData && entry.lastCount == count && entry.lastPrimitiveMode == primitiveMode && entry.lastIndexType == indexType;
});
if (cacheEntry != LatteIndexCache.entry.end())
{
indexMin = cacheEntry->indexMin;
indexMax = cacheEntry->indexMax;
renderIndexType = cacheEntry->renderIndexType;
outputCount = cacheEntry->outputCount;
indexAllocation = cacheEntry->indexAllocation;
cacheEntry->lastUsed = LatteIndices_GetNextUsageIndex();
return;
}
@@ -576,10 +599,12 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
indexMin = 0;
indexMax = std::max(count, 1u)-1;
renderIndexType = Renderer::INDEX_TYPE::NONE;
indexAllocation = {};
return; // no indices
}
// query index buffer from renderer
void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex);
indexAllocation = g_renderer->indexData_reserveIndexMemory(indexOutputSize);
void* indexOutputPtr = indexAllocation.mem;
// decode indices
indexMin = std::numeric_limits<uint32>::max();
@@ -704,16 +729,25 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
// recalculate index range but filter out primitive restart index
LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax);
}
g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize);
g_renderer->indexData_uploadIndexMemory(indexAllocation);
performanceMonitor.cycle[performanceMonitor.cycleIndex].indexDataUploaded += indexOutputSize;
// get least recently used cache entry
auto lruEntry = std::min_element(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [](const auto& a, const auto& b)
{
return a.lastUsed < b.lastUsed;
});
// invalidate previous allocation
if(lruEntry->lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(lruEntry->indexAllocation);
// update cache
LatteIndexCache.lastPtr = indexData;
LatteIndexCache.lastCount = count;
LatteIndexCache.lastPrimitiveMode = primitiveMode;
LatteIndexCache.lastIndexType = indexType;
LatteIndexCache.indexMin = indexMin;
LatteIndexCache.indexMax = indexMax;
LatteIndexCache.renderIndexType = renderIndexType;
LatteIndexCache.outputCount = outputCount;
LatteIndexCache.indexBufferOffset = indexBufferOffset;
LatteIndexCache.indexBufferIndex = indexBufferIndex;
lruEntry->lastPtr = indexData;
lruEntry->lastCount = count;
lruEntry->lastPrimitiveMode = primitiveMode;
lruEntry->lastIndexType = indexType;
lruEntry->indexMin = indexMin;
lruEntry->indexMax = indexMax;
lruEntry->renderIndexType = renderIndexType;
lruEntry->outputCount = outputCount;
lruEntry->indexAllocation = indexAllocation;
lruEntry->lastUsed = LatteIndices_GetNextUsageIndex();
}
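
The cache policy above reduces to a fixed-size array plus a monotonically increasing usage stamp: a hit refreshes the stamp, a miss evicts the entry with the smallest stamp. A minimal self-contained sketch of just that policy follows; only the 8-entry size mirrors the commit, the names and the int payload are illustrative:

#include <algorithm>
#include <array>
#include <cstdint>

struct LruCache
{
	struct Entry
	{
		const void* key{nullptr}; // nullptr marks a free slot
		uint64_t lastUsed{0};
		int payload{0};
	};
	std::array<Entry, 8> entries;
	uint64_t usageCounter{0};

	Entry* Lookup(const void* key)
	{
		auto it = std::find_if(entries.begin(), entries.end(),
			[key](const Entry& e) { return e.key == key; });
		if (it == entries.end())
			return nullptr;
		it->lastUsed = usageCounter++; // refresh the stamp on every hit
		return &*it;
	}

	Entry* Insert(const void* key, int payload)
	{
		// evict the slot with the smallest stamp; free slots (stamp 0) go first
		auto lru = std::min_element(entries.begin(), entries.end(),
			[](const Entry& a, const Entry& b) { return a.lastUsed < b.lastUsed; });
		*lru = Entry{key, usageCounter++, payload};
		return &*lru;
	}
};

Unlike this sketch, the commit also releases the evicted entry's index allocation (indexData_releaseIndexMemory) before the slot is reused.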

View File

@@ -4,4 +4,4 @@
void LatteIndices_invalidate(const void* memPtr, uint32 size);
void LatteIndices_invalidateAll();
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex);
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation);

View File

@@ -107,7 +107,13 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
ImGui::Text("VRAM: %dMB / %dMB", g_state.vramUsage, g_state.vramTotal);
if (config.overlay.debug)
{
// general debug info
ImGui::Text("--- Debug info ---");
ImGui::Text("IndexUploadPerFrame: %dKB", (performanceMonitor.stats.indexDataUploadPerFrame+1023)/1024);
// backend specific info
g_renderer->AppendOverlayDebugInfo();
}
position.y += (ImGui::GetWindowSize().y + 10.0f) * direction;
}
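
The KB figure printed above uses round-up integer division, so any non-zero upload shows as at least 1KB. The idiom checked in isolation:

// (bytes + 1023) / 1024 rounds up to whole KB in unsigned arithmetic
static_assert((0u + 1023u) / 1024u == 0u);
static_assert((1u + 1023u) / 1024u == 1u);
static_assert((1024u + 1023u) / 1024u == 1u);
static_assert((1025u + 1023u) / 1024u == 2u);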

View File

@@ -74,7 +74,6 @@ void LattePerformanceMonitor_frameEnd()
uniformBankDataUploadedPerFrame /= 1024ULL;
uint32 uniformBankCountUploadedPerFrame = (uint32)(uniformBankUploadedCount / (uint64)elapsedFrames);
uint64 indexDataUploadPerFrame = (indexDataUploaded / (uint64)elapsedFrames);
indexDataUploadPerFrame /= 1024ULL;
double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;
@@ -82,7 +81,7 @@ void LattePerformanceMonitor_frameEnd()
uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
uint32 tlps = (uint32)((uint64)threadLeaveCount * 1000ULL / (uint64)totalElapsedTime);
// set stats
performanceMonitor.stats.indexDataUploadPerFrame = indexDataUploadPerFrame;
// next counter cycle
sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;

View File

@@ -131,6 +131,12 @@ typedef struct
LattePerfStatCounter numDrawBarriersPerFrame;
LattePerfStatCounter numBeginRenderpassPerFrame;
}vk;
// calculated stats (per frame)
struct
{
uint32 indexDataUploadPerFrame;
}stats;
}performanceMonitor_t;
extern performanceMonitor_t performanceMonitor;

View File

@@ -11,7 +11,6 @@
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
#include "Cafe/GraphicPack/GraphicPack2.h"
#include "config/ActiveSettings.h"
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h"
#include "gui/guiWrapper.h"
#include "Cafe/OS/libs/erreula/erreula.h"
#include "input/InputManager.h"

View File

@@ -102,16 +102,21 @@ public:
static void SetAttributeArrayState(uint32 index, bool isEnabled, sint32 aluDivisor);
static void SetArrayElementBuffer(GLuint arrayElementBuffer);
// index
void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override
// index (not used by OpenGL renderer yet)
IndexAllocation indexData_reserveIndexMemory(uint32 size) override
{
assert_dbg();
return nullptr;
cemu_assert_unimplemented();
return {};
}
void indexData_uploadIndexMemory(uint32 offset, uint32 size) override
void indexData_releaseIndexMemory(IndexAllocation& allocation) override
{
assert_dbg();
cemu_assert_unimplemented();
}
void indexData_uploadIndexMemory(IndexAllocation& allocation) override
{
cemu_assert_unimplemented();
}
// uniform

View File

@@ -138,8 +138,15 @@ public:
virtual void draw_endSequence() = 0;
// index
virtual void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) = 0;
virtual void indexData_uploadIndexMemory(uint32 offset, uint32 size) = 0;
struct IndexAllocation
{
void* mem; // pointer to index data inside buffer
void* rendererInternal; // for renderer use
};
virtual IndexAllocation indexData_reserveIndexMemory(uint32 size) = 0;
virtual void indexData_releaseIndexMemory(IndexAllocation& allocation) = 0;
virtual void indexData_uploadIndexMemory(IndexAllocation& allocation) = 0;
// occlusion queries
virtual LatteQueryObject* occlusionQuery_create() = 0;
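
Taken together, the new interface replaces the old offset/bufferIndex out-parameters with an allocation handle that the caller keeps for as long as the cached data lives. A hedged sketch of the intended call sequence (byteSize and decodedIndices are placeholders, not names from the commit):

// reserve mapped space inside a backend-owned index buffer
Renderer::IndexAllocation alloc = g_renderer->indexData_reserveIndexMemory(byteSize);
memcpy(alloc.mem, decodedIndices, byteSize);     // write decoded indices through the mapping
g_renderer->indexData_uploadIndexMemory(alloc);  // flush, in case the memory is not host-coherent
// ... draws can reference the allocation while the cache entry lives ...
g_renderer->indexData_releaseIndexMemory(alloc); // on eviction or invalidation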

View File

@@ -23,11 +23,11 @@ void VKRSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeReq
AllocatorBuffer_t newBuffer{};
newBuffer.writeIndex = 0;
newBuffer.basePtr = nullptr;
if (m_bufferType == BUFFER_TYPE::STAGING)
if (m_bufferType == VKR_BUFFER_TYPE::STAGING)
m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
else if (m_bufferType == BUFFER_TYPE::INDEX)
else if (m_bufferType == VKR_BUFFER_TYPE::INDEX)
m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
else if (m_bufferType == BUFFER_TYPE::STRIDE)
else if (m_bufferType == VKR_BUFFER_TYPE::STRIDE)
m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
else
cemu_assert_debug(false);
@@ -53,7 +53,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato
uint32 distanceToSyncPoint;
if (!itr.queue_syncPoints.empty())
{
if(itr.queue_syncPoints.front().offset < itr.writeIndex)
if (itr.queue_syncPoints.front().offset < itr.writeIndex)
distanceToSyncPoint = 0xFFFFFFFF;
else
distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex;
@@ -100,7 +100,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato
void VKRSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation)
{
cemu_assert_debug(m_bufferType == BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent
cemu_assert_debug(m_bufferType == VKR_BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent
// todo - use nonCoherentAtomSize for flush size (instead of hardcoded constant)
VkMappedMemoryRange flushedRange{};
flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
@@ -167,6 +167,70 @@ void VKRSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBuf
}
}
/* VKRSynchronizedHeapAllocator */
VKRSynchronizedHeapAllocator::VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize)
: m_vkrMemMgr(vkMemoryManager), m_chunkedHeap(bufferType, minimumBufferAllocSize) {};
VKRSynchronizedHeapAllocator::AllocatorReservation* VKRSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment)
{
CHAddr addr = m_chunkedHeap.alloc(size, alignment);
m_activeAllocations.emplace_back(addr);
AllocatorReservation* res = m_poolAllocatorReservation.allocObj();
res->bufferIndex = addr.chunkIndex;
res->bufferOffset = addr.offset;
res->size = size;
res->memPtr = m_chunkedHeap.GetChunkPtr(addr.chunkIndex) + addr.offset;
m_chunkedHeap.GetChunkVkMemInfo(addr.chunkIndex, res->vkBuffer, res->vkMem);
return res;
}
void VKRSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation)
{
// put the allocation on a delayed release queue for the current command buffer
uint64 currentCommandBufferId = VulkanRenderer::GetInstance()->GetCurrentCommandBufferId();
auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [&uploadReservation](const TrackedAllocation& allocation) { return allocation.allocation.chunkIndex == uploadReservation->bufferIndex && allocation.allocation.offset == uploadReservation->bufferOffset; });
cemu_assert_debug(it != m_activeAllocations.end());
m_releaseQueue[currentCommandBufferId].emplace_back(it->allocation);
m_activeAllocations.erase(it);
m_poolAllocatorReservation.freeObj(uploadReservation);
}
void VKRSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation)
{
if (m_chunkedHeap.RequiresFlush(uploadReservation->bufferIndex))
{
VkMappedMemoryRange flushedRange{};
flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
flushedRange.memory = uploadReservation->vkMem;
flushedRange.offset = uploadReservation->bufferOffset;
flushedRange.size = uploadReservation->size;
vkFlushMappedMemoryRanges(VulkanRenderer::GetInstance()->GetLogicalDevice(), 1, &flushedRange);
}
}
void VKRSynchronizedHeapAllocator::CleanupBuffer(uint64 latestFinishedCommandBufferId)
{
auto it = m_releaseQueue.begin();
while (it != m_releaseQueue.end())
{
if (it->first <= latestFinishedCommandBufferId)
{
// release allocations
for(auto& addr : it->second)
m_chunkedHeap.free(addr);
it = m_releaseQueue.erase(it);
continue;
}
it++;
}
}
void VKRSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
{
m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize);
}
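
FreeReservation and CleanupBuffer together implement deferred release: a freed block may still be read by in-flight GPU work, so it is queued under the current command buffer id and only returned to the heap once that command buffer has retired. The idea stripped to its essence (Alloc stands in for CHAddr, std::map instead of the unordered_map used above):

#include <cstdint>
#include <map>
#include <vector>

struct Alloc { uint32_t offset; uint32_t size; };

class DeferredFreeQueue
{
	std::map<uint64_t, std::vector<Alloc>> m_releaseQueue; // keyed by command buffer id
public:
	void Free(uint64_t currentCmdBufferId, const Alloc& a)
	{
		// the GPU may still read this memory, so only queue it for release
		m_releaseQueue[currentCmdBufferId].push_back(a);
	}
	void Cleanup(uint64_t latestFinishedCmdBufferId)
	{
		// everything queued under an id the GPU has finished is safe to free now
		for (auto it = m_releaseQueue.begin(); it != m_releaseQueue.end();)
		{
			if (it->first <= latestFinishedCmdBufferId)
				it = m_releaseQueue.erase(it); // the real code returns each CHAddr to the chunked heap here
			else
				++it;
		}
	}
};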
/* VkTextureChunkedHeap */
uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
@@ -175,7 +239,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
m_list_chunkInfo.resize(m_list_chunkInfo.size() + 1);
// pad minimumAllocationSize to 32KB alignment
minimumAllocationSize = (minimumAllocationSize + (32*1024-1)) & ~(32 * 1024 - 1);
minimumAllocationSize = (minimumAllocationSize + (32 * 1024 - 1)) & ~(32 * 1024 - 1);
uint32 allocationSize = 1024 * 1024 * 128;
if (chunkIndex == 0)
@@ -189,8 +253,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
std::vector<uint32> deviceLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
std::vector<uint32> hostLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, 0);
// remove device local memory types from host local vector
auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) ->bool
{
auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) -> bool {
return std::find(deviceLocalMemoryTypeIndices.begin(), deviceLocalMemoryTypeIndices.end(), v) != deviceLocalMemoryTypeIndices.end();
};
hostLocalMemoryTypeIndices.erase(std::remove_if(hostLocalMemoryTypeIndices.begin(), hostLocalMemoryTypeIndices.end(), pred), hostLocalMemoryTypeIndices.end());
@@ -206,7 +269,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
allocInfo.memoryTypeIndex = memType;
VkDeviceMemory imageMemory;
VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory);
VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory);
if (r != VK_SUCCESS)
continue;
m_list_chunkInfo[chunkIndex].mem = imageMemory;
@@ -221,7 +284,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
allocInfo.memoryTypeIndex = memType;
VkDeviceMemory imageMemory;
VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory);
VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory);
if (r != VK_SUCCESS)
continue;
m_list_chunkInfo[chunkIndex].mem = imageMemory;
@@ -238,6 +301,66 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
return 0;
}
/* VkBufferChunkedHeap */
VKRBuffer* VKRBuffer::Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties)
{
auto* memMgr = VulkanRenderer::GetInstance()->GetMemoryManager();
VkBuffer buffer;
VkDeviceMemory bufferMemory;
bool allocSuccess;
if (bufferType == VKR_BUFFER_TYPE::STAGING)
allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, properties, buffer, bufferMemory);
else if (bufferType == VKR_BUFFER_TYPE::INDEX)
allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, properties, buffer, bufferMemory);
else if (bufferType == VKR_BUFFER_TYPE::STRIDE)
allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, properties, buffer, bufferMemory);
else
cemu_assert_debug(false);
if (!allocSuccess)
return nullptr;
VKRBuffer* bufferObj = new VKRBuffer(buffer, bufferMemory);
// if host visible, then map buffer
void* data = nullptr;
if (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
{
vkMapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), bufferMemory, 0, bufferSize, 0, &data);
bufferObj->m_requiresFlush = !HAS_FLAG(properties, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
}
bufferObj->m_mappedMemory = (uint8*)data;
return bufferObj;
}
VKRBuffer::~VKRBuffer()
{
if(m_mappedMemory)
vkUnmapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory);
vkFreeMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory, nullptr);
vkDestroyBuffer(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_buffer, nullptr);
}
VkBufferChunkedHeap::~VkBufferChunkedHeap()
{
for (auto& chunk : m_chunkBuffers)
delete chunk;
}
uint32 VkBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
{
size_t allocationSize = std::max<size_t>(m_minimumBufferAllocationSize, minimumAllocationSize);
VKRBuffer* buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
if(!buffer)
buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
if(!buffer)
VulkanRenderer::GetInstance()->UnrecoverableError("Failed to allocate buffer memory for VkBufferChunkedHeap");
cemu_assert_debug(buffer);
cemu_assert_debug(m_chunkBuffers.size() == chunkIndex);
m_chunkBuffers.emplace_back(buffer);
// todo - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT might be worth it?
return allocationSize;
}
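
VKRBuffer::Create is called here with a preference for coherent host-visible memory; only if that fails does the heap settle for non-coherent memory, in which case RequiresFlush() tells FlushReservation to issue an explicit vkFlushMappedMemoryRanges after each write. The fallback order as a hypothetical stand-alone helper (not part of the commit):

VKRBuffer* CreateMappedBuffer(VKR_BUFFER_TYPE type, size_t size)
{
	// coherent memory first: CPU writes become visible to the GPU without flushes
	if (VKRBuffer* buf = VKRBuffer::Create(type, size,
			VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
		return buf;
	// non-coherent fallback: the buffer reports RequiresFlush() == true and every
	// write must be followed by vkFlushMappedMemoryRanges (see FlushReservation)
	return VKRBuffer::Create(type, size, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
}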
uint32_t VKRMemoryManager::FindMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) const
{
VkPhysicalDeviceMemoryProperties memProperties;
@@ -423,7 +546,7 @@ bool VKRMemoryManager::CreateBufferFromHostMemory(void* hostPointer, VkDeviceSiz
importHostMem.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT;
importHostMem.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
importHostMem.pHostPointer = hostPointer;
// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or
// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or
// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT
// whats the difference ?
@@ -469,7 +592,7 @@ VkImageMemAllocation* VKRMemoryManager::imageMemoryAllocate(VkImage image)
auto it = map_textureHeap.find(typeFilter);
if (it == map_textureHeap.end())
{
texHeap = new VkTextureChunkedHeap(this, typeFilter, m_vkr->GetLogicalDevice());
texHeap = new VkTextureChunkedHeap(this, typeFilter);
map_textureHeap.emplace(typeFilter, texHeap);
}
else

View File

@@ -2,6 +2,36 @@
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
#include "util/ChunkedHeap/ChunkedHeap.h"
#include "util/helpers/MemoryPool.h"
enum class VKR_BUFFER_TYPE
{
STAGING, // staging upload buffer
INDEX, // buffer for index data
STRIDE, // buffer for stride-adjusted vertex data
};
class VKRBuffer
{
public:
static VKRBuffer* Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties);
~VKRBuffer();
VkBuffer GetVkBuffer() const { return m_buffer; }
VkDeviceMemory GetVkBufferMemory() const { return m_bufferMemory; }
uint8* GetPtr() const { return m_mappedMemory; }
bool RequiresFlush() const { return m_requiresFlush; }
private:
VKRBuffer(VkBuffer buffer, VkDeviceMemory bufferMem) : m_buffer(buffer), m_bufferMemory(bufferMem) { };
VkBuffer m_buffer;
VkDeviceMemory m_bufferMemory;
uint8* m_mappedMemory;
bool m_requiresFlush{false};
};
struct VkImageMemAllocation
{
@@ -17,15 +47,13 @@ struct VkImageMemAllocation
class VkTextureChunkedHeap : private ChunkedHeap
{
public:
VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter, VkDevice device) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter), m_device(device) { };
VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { };
struct ChunkInfo
{
VkDeviceMemory mem;
};
uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
CHAddr allocMem(uint32 size, uint32 alignment)
{
if (alignment < 4)
@@ -43,11 +71,6 @@ public:
this->free(addr);
}
void setDevice(VkDevice dev)
{
m_device = dev;
}
VkDeviceMemory getChunkMem(uint32 index)
{
if (index >= m_list_chunkInfo.size())
@@ -61,24 +84,69 @@ public:
allocatedBytes = numAllocatedBytes;
}
VkDevice m_device;
private:
uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
uint32 m_typeFilter{ 0xFFFFFFFF };
class VKRMemoryManager* m_vkrMemoryManager;
std::vector<ChunkInfo> m_list_chunkInfo;
};
class VkBufferChunkedHeap : private ChunkedHeap
{
public:
VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { };
~VkBufferChunkedHeap();
using ChunkedHeap::alloc;
using ChunkedHeap::free;
uint8* GetChunkPtr(uint32 index) const
{
if (index >= m_chunkBuffers.size())
return nullptr;
return m_chunkBuffers[index]->GetPtr();
}
void GetChunkVkMemInfo(uint32 index, VkBuffer& buffer, VkDeviceMemory& mem)
{
if (index >= m_chunkBuffers.size())
{
buffer = VK_NULL_HANDLE;
mem = VK_NULL_HANDLE;
return;
}
buffer = m_chunkBuffers[index]->GetVkBuffer();
mem = m_chunkBuffers[index]->GetVkBufferMemory();
}
void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
{
numBuffers = m_chunkBuffers.size();
totalBufferSize = numHeapBytes;
freeBufferSize = numHeapBytes - numAllocatedBytes;
}
bool RequiresFlush(uint32 index) const
{
if (index >= m_chunkBuffers.size())
return false;
return m_chunkBuffers[index]->RequiresFlush();
}
private:
uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
VKR_BUFFER_TYPE m_bufferType;
std::vector<VKRBuffer*> m_chunkBuffers;
size_t m_minimumBufferAllocationSize;
};
// a circular ring-buffer which tracks and releases memory per command-buffer
class VKRSynchronizedRingAllocator
{
public:
enum class BUFFER_TYPE
{
STAGING, // staging upload buffer
INDEX, // buffer for index data
STRIDE, // buffer for stride-adjusted vertex data
};
VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
VKRSynchronizedRingAllocator(const VKRSynchronizedRingAllocator&) = delete; // disallow copy
struct BufferSyncPoint_t
@@ -126,13 +194,53 @@ private:
const class VulkanRenderer* m_vkr;
const class VKRMemoryManager* m_vkrMemMgr;
const BUFFER_TYPE m_bufferType;
const VKR_BUFFER_TYPE m_bufferType;
const uint32 m_minimumBufferAllocSize;
std::vector<AllocatorBuffer_t> m_buffers;
};
// heap style allocator with released memory being freed after the current command buffer finishes
class VKRSynchronizedHeapAllocator
{
struct TrackedAllocation
{
TrackedAllocation(CHAddr allocation) : allocation(allocation) {};
CHAddr allocation;
};
public:
VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize);
VKRSynchronizedHeapAllocator(const VKRSynchronizedHeapAllocator&) = delete; // disallow copy
struct AllocatorReservation
{
VkBuffer vkBuffer;
VkDeviceMemory vkMem;
uint8* memPtr;
uint32 bufferOffset;
uint32 size;
uint32 bufferIndex;
};
AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment);
void FreeReservation(AllocatorReservation* uploadReservation);
void FlushReservation(AllocatorReservation* uploadReservation);
void CleanupBuffer(uint64 latestFinishedCommandBufferId);
void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const;
private:
const class VKRMemoryManager* m_vkrMemMgr;
VkBufferChunkedHeap m_chunkedHeap;
// allocations
std::vector<TrackedAllocation> m_activeAllocations;
MemoryPool<AllocatorReservation> m_poolAllocatorReservation{32};
// release queue
std::unordered_map<uint64, std::vector<CHAddr>> m_releaseQueue;
};
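
This header shows the commit's central memory-management change: index data moves off the per-frame VKRSynchronizedRingAllocator onto the new VKRSynchronizedHeapAllocator, whose reservations persist until explicitly freed, which is what lets cached index data survive across frames. Contrasted in a short sketch (ringAlloc, heapAlloc and size are placeholders; both AllocateBufferMemory calls appear elsewhere in this diff):

// ring allocator: reservations are recycled implicitly once their command
// buffer completes, so cached data could not outlive the frame
VKRSynchronizedRingAllocator::AllocatorReservation_t ringResv = ringAlloc.AllocateBufferMemory(size, 32);
// heap allocator: reservations persist until FreeReservation, and even then
// the memory is reclaimed only after the GPU finishes the current command buffer
VKRSynchronizedHeapAllocator::AllocatorReservation* heapResv = heapAlloc.AllocateBufferMemory(size, 32);
heapAlloc.FreeReservation(heapResv);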
void LatteIndices_invalidateAll();
class VKRMemoryManager
@@ -140,9 +248,9 @@ class VKRMemoryManager
friend class VKRSynchronizedRingAllocator;
public:
VKRMemoryManager(class VulkanRenderer* renderer) :
m_stagingBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STAGING, 32u * 1024 * 1024),
m_indexBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::INDEX, 4u * 1024 * 1024),
m_vertexStrideMetalBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STRIDE, 4u * 1024 * 1024)
m_stagingBuffer(renderer, this, VKR_BUFFER_TYPE::STAGING, 32u * 1024 * 1024),
m_indexBuffer(this, VKR_BUFFER_TYPE::INDEX, 4u * 1024 * 1024),
m_vertexStrideMetalBuffer(renderer, this, VKR_BUFFER_TYPE::STRIDE, 4u * 1024 * 1024)
{
m_vkr = renderer;
}
@@ -167,7 +275,7 @@ public:
}
VKRSynchronizedRingAllocator& getStagingAllocator() { return m_stagingBuffer; }; // allocator for texture/attribute/uniform uploads
VKRSynchronizedRingAllocator& getIndexAllocator() { return m_indexBuffer; }; // allocator for index data
VKRSynchronizedHeapAllocator& GetIndexAllocator() { return m_indexBuffer; }; // allocator for index data
VKRSynchronizedRingAllocator& getMetalStrideWorkaroundAllocator() { return m_vertexStrideMetalBuffer; }; // allocator for stride-adjusted vertex data
void cleanupBuffers(uint64 latestFinishedCommandBufferId)
@@ -202,6 +310,6 @@ public:
private:
class VulkanRenderer* m_vkr;
VKRSynchronizedRingAllocator m_stagingBuffer;
VKRSynchronizedRingAllocator m_indexBuffer;
VKRSynchronizedHeapAllocator m_indexBuffer;
VKRSynchronizedRingAllocator m_vertexStrideMetalBuffer;
};

View File

@@ -3699,7 +3699,7 @@ void VulkanRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uin
void VulkanRenderer::AppendOverlayDebugInfo()
{
ImGui::Text("--- Vulkan info ---");
ImGui::Text("--- Vulkan debug info ---");
ImGui::Text("GfxPipelines %u", performanceMonitor.vk.numGraphicPipelines.get());
ImGui::Text("DescriptorSets %u", performanceMonitor.vk.numDescriptorSets.get());
ImGui::Text("DS ImgSamplers %u", performanceMonitor.vk.numDescriptorSamplerTextures.get());
@@ -3716,7 +3716,7 @@ void VulkanRenderer::AppendOverlayDebugInfo()
ImGui::Text("BeginRP/f %u", performanceMonitor.vk.numBeginRenderpassPerFrame.get());
ImGui::Text("Barriers/f %u", performanceMonitor.vk.numDrawBarriersPerFrame.get());
ImGui::Text("--- Cache info ---");
ImGui::Text("--- Cache debug info ---");
uint32 bufferCacheHeapSize = 0;
uint32 bufferCacheAllocationSize = 0;
@@ -3736,7 +3736,7 @@ void VulkanRenderer::AppendOverlayDebugInfo()
ImGui::SameLine(60.0f);
ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
memoryManager->getIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
ImGui::Text("Index");
ImGui::SameLine(60.0f);
ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);

View File

@@ -328,8 +328,9 @@ public:
RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) override;
void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override;
void indexData_uploadIndexMemory(uint32 offset, uint32 size) override;
IndexAllocation indexData_reserveIndexMemory(uint32 size) override;
void indexData_releaseIndexMemory(IndexAllocation& allocation) override;
void indexData_uploadIndexMemory(IndexAllocation& allocation) override;
// externally callable
void GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, sint32 width, sint32 height, FormatInfoVK* formatInfoOut);

View File

@@ -357,18 +357,20 @@ PipelineInfo* VulkanRenderer::draw_getOrCreateGraphicsPipeline(uint32 indexCount
return draw_createGraphicsPipeline(indexCount);
}
void* VulkanRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex)
Renderer::IndexAllocation VulkanRenderer::indexData_reserveIndexMemory(uint32 size)
{
auto& indexAllocator = this->memoryManager->getIndexAllocator();
auto resv = indexAllocator.AllocateBufferMemory(size, 32);
offset = resv.bufferOffset;
bufferIndex = resv.bufferIndex;
return resv.memPtr;
VKRSynchronizedHeapAllocator::AllocatorReservation* resv = memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 32);
return { resv->memPtr, resv };
}
void VulkanRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size)
void VulkanRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation)
{
// does nothing since the index buffer memory is coherent
memoryManager->GetIndexAllocator().FreeReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal);
}
void VulkanRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation)
{
memoryManager->GetIndexAllocator().FlushReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal);
}
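
Note how the backend round-trips its concrete AllocatorReservation through the type-erased IndexAllocation::rendererInternal pointer, so the shared Renderer interface never needs to know the Vulkan types. The pattern in generic form (all names hypothetical):

struct Handle { void* mem; void* backendInternal; }; // shape of Renderer::IndexAllocation

struct BackendReservation { /* backend-specific bookkeeping */ };

Handle Reserve()
{
	auto* resv = new BackendReservation();
	return { /*mem*/ nullptr, resv }; // hand out the opaque pointer
}

void Release(Handle& h)
{
	// only the backend casts it back to the concrete type
	delete static_cast<BackendReservation*>(h.backendInternal);
}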
float s_vkUniformData[512 * 4];
@@ -1415,14 +1417,15 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
uint32 hostIndexCount;
uint32 indexMin = 0;
uint32 indexMax = 0;
uint32 indexBufferOffset = 0;
uint32 indexBufferIndex = 0;
LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex);
Renderer::IndexAllocation indexAllocation;
LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation);
VKRSynchronizedHeapAllocator::AllocatorReservation* indexReservation = (VKRSynchronizedHeapAllocator::AllocatorReservation*)indexAllocation.rendererInternal;
// update index binding
bool isPrevIndexData = false;
if (hostIndexType != INDEX_TYPE::NONE)
{
uint32 indexBufferIndex = indexReservation->bufferIndex;
uint32 indexBufferOffset = indexReservation->bufferOffset;
if (m_state.activeIndexBufferOffset != indexBufferOffset || m_state.activeIndexBufferIndex != indexBufferIndex || m_state.activeIndexType != hostIndexType)
{
m_state.activeIndexType = hostIndexType;
@@ -1435,7 +1438,7 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
vkType = VK_INDEX_TYPE_UINT32;
else
cemu_assert(false);
vkCmdBindIndexBuffer(m_state.currentCommandBuffer, memoryManager->getIndexAllocator().GetBufferByIndex(indexBufferIndex), indexBufferOffset, vkType);
vkCmdBindIndexBuffer(m_state.currentCommandBuffer, indexReservation->vkBuffer, indexBufferOffset, vkType);
}
else
isPrevIndexData = true;