Latte/Vulkan: Add multiple entry LRU cache support for indices

Exzap 2024-11-23 18:25:58 +01:00
parent 409f12b13a
commit 13979d490f
13 changed files with 395 additions and 104 deletions

View File

@@ -1,6 +1,7 @@
#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
#include "Common/cpu_features.h"
#if defined(ARCH_X86_64) && defined(__GNUC__)
@@ -9,32 +10,53 @@
struct
{
const void* lastPtr;
uint32 lastCount;
LattePrimitiveMode lastPrimitiveMode;
LatteIndexType lastIndexType;
// output
uint32 indexMin;
uint32 indexMax;
Renderer::INDEX_TYPE renderIndexType;
uint32 outputCount;
uint32 indexBufferOffset;
uint32 indexBufferIndex;
struct CacheEntry
{
// input data
const void* lastPtr;
uint32 lastCount;
LattePrimitiveMode lastPrimitiveMode;
LatteIndexType lastIndexType;
uint64 lastUsed;
// output
uint32 indexMin;
uint32 indexMax;
Renderer::INDEX_TYPE renderIndexType;
uint32 outputCount;
Renderer::IndexAllocation indexAllocation;
};
std::array<CacheEntry, 8> entry;
uint64 currentUsageCounter{0};
}LatteIndexCache{};
void LatteIndices_invalidate(const void* memPtr, uint32 size)
{
if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) )
for(auto& entry : LatteIndexCache.entry)
{
LatteIndexCache.lastPtr = nullptr;
LatteIndexCache.lastCount = 0;
if (entry.lastPtr >= memPtr && (entry.lastPtr < ((uint8*)memPtr + size)) )
{
if(entry.lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
entry.lastPtr = nullptr;
entry.lastCount = 0;
}
}
}
void LatteIndices_invalidateAll()
{
LatteIndexCache.lastPtr = nullptr;
LatteIndexCache.lastCount = 0;
for(auto& entry : LatteIndexCache.entry)
{
if (entry.lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
entry.lastPtr = nullptr;
entry.lastCount = 0;
}
}
uint64 LatteIndices_GetNextUsageIndex()
{
return LatteIndexCache.currentUsageCounter++;
}
uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count)
@@ -532,7 +554,7 @@ void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIn
}
}
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex)
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation)
{
// what this should do:
// [x] use fast SIMD-based index decoding
@@ -542,17 +564,18 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
// [ ] better cache implementation, allow to cache across frames
// reuse from cache if data didn't change
if (LatteIndexCache.lastPtr == indexData &&
LatteIndexCache.lastCount == count &&
LatteIndexCache.lastPrimitiveMode == primitiveMode &&
LatteIndexCache.lastIndexType == indexType)
auto cacheEntry = std::find_if(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [indexData, count, primitiveMode, indexType](const auto& entry)
{
indexMin = LatteIndexCache.indexMin;
indexMax = LatteIndexCache.indexMax;
renderIndexType = LatteIndexCache.renderIndexType;
outputCount = LatteIndexCache.outputCount;
indexBufferOffset = LatteIndexCache.indexBufferOffset;
indexBufferIndex = LatteIndexCache.indexBufferIndex;
return entry.lastPtr == indexData && entry.lastCount == count && entry.lastPrimitiveMode == primitiveMode && entry.lastIndexType == indexType;
});
if (cacheEntry != LatteIndexCache.entry.end())
{
indexMin = cacheEntry->indexMin;
indexMax = cacheEntry->indexMax;
renderIndexType = cacheEntry->renderIndexType;
outputCount = cacheEntry->outputCount;
indexAllocation = cacheEntry->indexAllocation;
cacheEntry->lastUsed = LatteIndices_GetNextUsageIndex();
return;
}
@@ -576,10 +599,12 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
indexMin = 0;
indexMax = std::max(count, 1u)-1;
renderIndexType = Renderer::INDEX_TYPE::NONE;
indexAllocation = {};
return; // no indices
}
// query index buffer from renderer
void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex);
indexAllocation = g_renderer->indexData_reserveIndexMemory(indexOutputSize);
void* indexOutputPtr = indexAllocation.mem;
// decode indices
indexMin = std::numeric_limits<uint32>::max();
@@ -704,16 +729,25 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
// recalculate index range but filter out primitive restart index
LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax);
}
g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize);
g_renderer->indexData_uploadIndexMemory(indexAllocation);
performanceMonitor.cycle[performanceMonitor.cycleIndex].indexDataUploaded += indexOutputSize;
// get least recently used cache entry
auto lruEntry = std::min_element(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [](const auto& a, const auto& b)
{
return a.lastUsed < b.lastUsed;
});
// invalidate previous allocation
if(lruEntry->lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(lruEntry->indexAllocation);
// update cache
LatteIndexCache.lastPtr = indexData;
LatteIndexCache.lastCount = count;
LatteIndexCache.lastPrimitiveMode = primitiveMode;
LatteIndexCache.lastIndexType = indexType;
LatteIndexCache.indexMin = indexMin;
LatteIndexCache.indexMax = indexMax;
LatteIndexCache.renderIndexType = renderIndexType;
LatteIndexCache.outputCount = outputCount;
LatteIndexCache.indexBufferOffset = indexBufferOffset;
LatteIndexCache.indexBufferIndex = indexBufferIndex;
lruEntry->lastPtr = indexData;
lruEntry->lastCount = count;
lruEntry->lastPrimitiveMode = primitiveMode;
lruEntry->lastIndexType = indexType;
lruEntry->indexMin = indexMin;
lruEntry->indexMax = indexMax;
lruEntry->renderIndexType = renderIndexType;
lruEntry->outputCount = outputCount;
lruEntry->indexAllocation = indexAllocation;
lruEntry->lastUsed = LatteIndices_GetNextUsageIndex();
}
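
The cache policy above reduces to a fixed-size array plus a monotonically increasing usage stamp: a hit refreshes the stamp, a miss evicts the entry with the smallest stamp. A minimal self-contained sketch of just that policy follows; only the 8-entry size mirrors the commit, the names and the int payload are illustrative:

#include <algorithm>
#include <array>
#include <cstdint>

struct LruCache
{
	struct Entry
	{
		const void* key{nullptr}; // nullptr marks a free slot
		uint64_t lastUsed{0};
		int payload{0};
	};
	std::array<Entry, 8> entries;
	uint64_t usageCounter{0};

	Entry* Lookup(const void* key)
	{
		auto it = std::find_if(entries.begin(), entries.end(),
			[key](const Entry& e) { return e.key == key; });
		if (it == entries.end())
			return nullptr;
		it->lastUsed = usageCounter++; // refresh the stamp on every hit
		return &*it;
	}

	Entry* Insert(const void* key, int payload)
	{
		// evict the slot with the smallest stamp; free slots (stamp 0) go first
		auto lru = std::min_element(entries.begin(), entries.end(),
			[](const Entry& a, const Entry& b) { return a.lastUsed < b.lastUsed; });
		*lru = Entry{key, usageCounter++, payload};
		return &*lru;
	}
};

Unlike this sketch, the commit also releases the evicted entry's index allocation (indexData_releaseIndexMemory) before the slot is reused.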

View File

@@ -4,4 +4,4 @@
void LatteIndices_invalidate(const void* memPtr, uint32 size);
void LatteIndices_invalidateAll();
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex);
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation);

View File

@@ -107,7 +107,13 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
ImGui::Text("VRAM: %dMB / %dMB", g_state.vramUsage, g_state.vramTotal);
if (config.overlay.debug)
{
// general debug info
ImGui::Text("--- Debug info ---");
ImGui::Text("IndexUploadPerFrame: %dKB", (performanceMonitor.stats.indexDataUploadPerFrame+1023)/1024);
// backend specific info
g_renderer->AppendOverlayDebugInfo();
}
position.y += (ImGui::GetWindowSize().y + 10.0f) * direction;
}
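
The KB figure printed above uses round-up integer division, so any non-zero upload shows as at least 1KB. The idiom checked in isolation:

// (bytes + 1023) / 1024 rounds up to whole KB in unsigned arithmetic
static_assert((0u + 1023u) / 1024u == 0u);
static_assert((1u + 1023u) / 1024u == 1u);
static_assert((1024u + 1023u) / 1024u == 1u);
static_assert((1025u + 1023u) / 1024u == 2u);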

View File

@@ -74,7 +74,6 @@ void LattePerformanceMonitor_frameEnd()
uniformBankDataUploadedPerFrame /= 1024ULL;
uint32 uniformBankCountUploadedPerFrame = (uint32)(uniformBankUploadedCount / (uint64)elapsedFrames);
uint64 indexDataUploadPerFrame = (indexDataUploaded / (uint64)elapsedFrames);
indexDataUploadPerFrame /= 1024ULL;
double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;
@@ -82,7 +81,7 @@ void LattePerformanceMonitor_frameEnd()
uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
uint32 tlps = (uint32)((uint64)threadLeaveCount * 1000ULL / (uint64)totalElapsedTime);
// set stats
performanceMonitor.stats.indexDataUploadPerFrame = indexDataUploadPerFrame;
// next counter cycle
sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;

View File

@@ -131,6 +131,12 @@ typedef struct
LattePerfStatCounter numDrawBarriersPerFrame;
LattePerfStatCounter numBeginRenderpassPerFrame;
}vk;
// calculated stats (per frame)
struct
{
uint32 indexDataUploadPerFrame;
}stats;
}performanceMonitor_t;
extern performanceMonitor_t performanceMonitor;

View File

@@ -11,7 +11,6 @@
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
#include "Cafe/GraphicPack/GraphicPack2.h"
#include "config/ActiveSettings.h"
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h"
#include "gui/guiWrapper.h"
#include "Cafe/OS/libs/erreula/erreula.h"
#include "input/InputManager.h"

View File

@@ -102,16 +102,21 @@ public:
static void SetAttributeArrayState(uint32 index, bool isEnabled, sint32 aluDivisor);
static void SetArrayElementBuffer(GLuint arrayElementBuffer);
// index
void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override
// index (not used by OpenGL renderer yet)
IndexAllocation indexData_reserveIndexMemory(uint32 size) override
{
assert_dbg();
return nullptr;
cemu_assert_unimplemented();
return {};
}
void indexData_uploadIndexMemory(uint32 offset, uint32 size) override
void indexData_releaseIndexMemory(IndexAllocation& allocation) override
{
assert_dbg();
cemu_assert_unimplemented();
}
void indexData_uploadIndexMemory(IndexAllocation& allocation) override
{
cemu_assert_unimplemented();
}
// uniform

View File

@@ -138,8 +138,15 @@ public:
virtual void draw_endSequence() = 0;
// index
virtual void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) = 0;
virtual void indexData_uploadIndexMemory(uint32 offset, uint32 size) = 0;
struct IndexAllocation
{
void* mem; // pointer to index data inside buffer
void* rendererInternal; // for renderer use
};
virtual IndexAllocation indexData_reserveIndexMemory(uint32 size) = 0;
virtual void indexData_releaseIndexMemory(IndexAllocation& allocation) = 0;
virtual void indexData_uploadIndexMemory(IndexAllocation& allocation) = 0;
// occlusion queries
virtual LatteQueryObject* occlusionQuery_create() = 0;
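
Taken together, the new interface replaces the old offset/bufferIndex out-parameters with an allocation handle that the caller keeps for as long as the cached data lives. A hedged sketch of the intended call sequence (byteSize and decodedIndices are placeholders, not names from the commit):

// reserve mapped space inside a backend-owned index buffer
Renderer::IndexAllocation alloc = g_renderer->indexData_reserveIndexMemory(byteSize);
memcpy(alloc.mem, decodedIndices, byteSize);     // write decoded indices through the mapping
g_renderer->indexData_uploadIndexMemory(alloc);  // flush, in case the memory is not host-coherent
// ... draws can reference the allocation while the cache entry lives ...
g_renderer->indexData_releaseIndexMemory(alloc); // on eviction or invalidation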

View File

@@ -23,11 +23,11 @@ void VKRSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeReq
AllocatorBuffer_t newBuffer{};
newBuffer.writeIndex = 0;
newBuffer.basePtr = nullptr;
if (m_bufferType == BUFFER_TYPE::STAGING)
if (m_bufferType == VKR_BUFFER_TYPE::STAGING)
m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
else if (m_bufferType == BUFFER_TYPE::INDEX)
else if (m_bufferType == VKR_BUFFER_TYPE::INDEX)
m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
else if (m_bufferType == BUFFER_TYPE::STRIDE)
else if (m_bufferType == VKR_BUFFER_TYPE::STRIDE)
m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
else
cemu_assert_debug(false);
@@ -53,7 +53,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato
uint32 distanceToSyncPoint;
if (!itr.queue_syncPoints.empty())
{
if(itr.queue_syncPoints.front().offset < itr.writeIndex)
if (itr.queue_syncPoints.front().offset < itr.writeIndex)
distanceToSyncPoint = 0xFFFFFFFF;
else
distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex;
@@ -100,7 +100,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato
void VKRSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation)
{
cemu_assert_debug(m_bufferType == BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent
cemu_assert_debug(m_bufferType == VKR_BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent
// todo - use nonCoherentAtomSize for flush size (instead of hardcoded constant)
VkMappedMemoryRange flushedRange{};
flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
@@ -167,6 +167,70 @@ void VKRSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBuf
}
}
/* VKRSynchronizedHeapAllocator */
VKRSynchronizedHeapAllocator::VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize)
: m_vkrMemMgr(vkMemoryManager), m_chunkedHeap(bufferType, minimumBufferAllocSize) {};
VKRSynchronizedHeapAllocator::AllocatorReservation* VKRSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment)
{
CHAddr addr = m_chunkedHeap.alloc(size, alignment);
m_activeAllocations.emplace_back(addr);
AllocatorReservation* res = m_poolAllocatorReservation.allocObj();
res->bufferIndex = addr.chunkIndex;
res->bufferOffset = addr.offset;
res->size = size;
res->memPtr = m_chunkedHeap.GetChunkPtr(addr.chunkIndex) + addr.offset;
m_chunkedHeap.GetChunkVkMemInfo(addr.chunkIndex, res->vkBuffer, res->vkMem);
return res;
}
void VKRSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation)
{
// put the allocation on a delayed release queue for the current command buffer
uint64 currentCommandBufferId = VulkanRenderer::GetInstance()->GetCurrentCommandBufferId();
auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [&uploadReservation](const TrackedAllocation& allocation) { return allocation.allocation.chunkIndex == uploadReservation->bufferIndex && allocation.allocation.offset == uploadReservation->bufferOffset; });
cemu_assert_debug(it != m_activeAllocations.end());
m_releaseQueue[currentCommandBufferId].emplace_back(it->allocation);
m_activeAllocations.erase(it);
m_poolAllocatorReservation.freeObj(uploadReservation);
}
void VKRSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation)
{
if (m_chunkedHeap.RequiresFlush(uploadReservation->bufferIndex))
{
VkMappedMemoryRange flushedRange{};
flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
flushedRange.memory = uploadReservation->vkMem;
flushedRange.offset = uploadReservation->bufferOffset;
flushedRange.size = uploadReservation->size;
vkFlushMappedMemoryRanges(VulkanRenderer::GetInstance()->GetLogicalDevice(), 1, &flushedRange);
}
}
void VKRSynchronizedHeapAllocator::CleanupBuffer(uint64 latestFinishedCommandBufferId)
{
auto it = m_releaseQueue.begin();
while (it != m_releaseQueue.end())
{
if (it->first <= latestFinishedCommandBufferId)
{
// release allocations
for(auto& addr : it->second)
m_chunkedHeap.free(addr);
it = m_releaseQueue.erase(it);
continue;
}
it++;
}
}
void VKRSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
{
m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize);
}
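
FreeReservation and CleanupBuffer together implement deferred release: a freed block may still be read by in-flight GPU work, so it is queued under the current command buffer id and only returned to the heap once that command buffer has retired. The idea stripped to its essence (Alloc stands in for CHAddr, std::map instead of the unordered_map used above):

#include <cstdint>
#include <map>
#include <vector>

struct Alloc { uint32_t offset; uint32_t size; };

class DeferredFreeQueue
{
	std::map<uint64_t, std::vector<Alloc>> m_releaseQueue; // keyed by command buffer id
public:
	void Free(uint64_t currentCmdBufferId, const Alloc& a)
	{
		// the GPU may still read this memory, so only queue it for release
		m_releaseQueue[currentCmdBufferId].push_back(a);
	}
	void Cleanup(uint64_t latestFinishedCmdBufferId)
	{
		// everything queued under an id the GPU has finished is safe to free now
		for (auto it = m_releaseQueue.begin(); it != m_releaseQueue.end();)
		{
			if (it->first <= latestFinishedCmdBufferId)
				it = m_releaseQueue.erase(it); // the real code returns each CHAddr to the chunked heap here
			else
				++it;
		}
	}
};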
/* VkTextureChunkedHeap */
uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
@@ -175,7 +239,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
m_list_chunkInfo.resize(m_list_chunkInfo.size() + 1);
// pad minimumAllocationSize to 32KB alignment
minimumAllocationSize = (minimumAllocationSize + (32*1024-1)) & ~(32 * 1024 - 1);
minimumAllocationSize = (minimumAllocationSize + (32 * 1024 - 1)) & ~(32 * 1024 - 1);
uint32 allocationSize = 1024 * 1024 * 128;
if (chunkIndex == 0)
@@ -189,8 +253,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
std::vector<uint32> deviceLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
std::vector<uint32> hostLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, 0);
// remove device local memory types from host local vector
auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) ->bool
{
auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) -> bool {
return std::find(deviceLocalMemoryTypeIndices.begin(), deviceLocalMemoryTypeIndices.end(), v) != deviceLocalMemoryTypeIndices.end();
};
hostLocalMemoryTypeIndices.erase(std::remove_if(hostLocalMemoryTypeIndices.begin(), hostLocalMemoryTypeIndices.end(), pred), hostLocalMemoryTypeIndices.end());
@@ -206,7 +269,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
allocInfo.memoryTypeIndex = memType;
VkDeviceMemory imageMemory;
VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory);
VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory);
if (r != VK_SUCCESS)
continue;
m_list_chunkInfo[chunkIndex].mem = imageMemory;
@@ -221,7 +284,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
allocInfo.memoryTypeIndex = memType;
VkDeviceMemory imageMemory;
VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory);
VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory);
if (r != VK_SUCCESS)
continue;
m_list_chunkInfo[chunkIndex].mem = imageMemory;
@@ -238,6 +301,66 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
return 0;
}
/* VkBufferChunkedHeap */
VKRBuffer* VKRBuffer::Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties)
{
auto* memMgr = VulkanRenderer::GetInstance()->GetMemoryManager();
VkBuffer buffer;
VkDeviceMemory bufferMemory;
bool allocSuccess;
if (bufferType == VKR_BUFFER_TYPE::STAGING)
allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, properties, buffer, bufferMemory);
else if (bufferType == VKR_BUFFER_TYPE::INDEX)
allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, properties, buffer, bufferMemory);
else if (bufferType == VKR_BUFFER_TYPE::STRIDE)
allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, properties, buffer, bufferMemory);
else
cemu_assert_debug(false);
if (!allocSuccess)
return nullptr;
VKRBuffer* bufferObj = new VKRBuffer(buffer, bufferMemory);
// if host visible, then map buffer
void* data = nullptr;
if (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
{
vkMapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), bufferMemory, 0, bufferSize, 0, &data);
bufferObj->m_requiresFlush = !HAS_FLAG(properties, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
}
bufferObj->m_mappedMemory = (uint8*)data;
return bufferObj;
}
VKRBuffer::~VKRBuffer()
{
if(m_mappedMemory)
vkUnmapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory);
vkFreeMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory, nullptr);
vkDestroyBuffer(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_buffer, nullptr);
}
VkBufferChunkedHeap::~VkBufferChunkedHeap()
{
for (auto& chunk : m_chunkBuffers)
delete chunk;
}
uint32 VkBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
{
size_t allocationSize = std::max<size_t>(m_minimumBufferAllocationSize, minimumAllocationSize);
VKRBuffer* buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
if(!buffer)
buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
if(!buffer)
VulkanRenderer::GetInstance()->UnrecoverableError("Failed to allocate buffer memory for VkBufferChunkedHeap");
cemu_assert_debug(buffer);
cemu_assert_debug(m_chunkBuffers.size() == chunkIndex);
m_chunkBuffers.emplace_back(buffer);
// todo - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT might be worth it?
return allocationSize;
}
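
VKRBuffer::Create is called here with a preference for coherent host-visible memory; only if that fails does the heap settle for non-coherent memory, in which case RequiresFlush() tells FlushReservation to issue an explicit vkFlushMappedMemoryRanges after each write. The fallback order as a hypothetical stand-alone helper (not part of the commit):

VKRBuffer* CreateMappedBuffer(VKR_BUFFER_TYPE type, size_t size)
{
	// coherent memory first: CPU writes become visible to the GPU without flushes
	if (VKRBuffer* buf = VKRBuffer::Create(type, size,
			VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
		return buf;
	// non-coherent fallback: the buffer reports RequiresFlush() == true and every
	// write must be followed by vkFlushMappedMemoryRanges (see FlushReservation)
	return VKRBuffer::Create(type, size, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
}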
uint32_t VKRMemoryManager::FindMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) const
{
VkPhysicalDeviceMemoryProperties memProperties;
@@ -423,7 +546,7 @@ bool VKRMemoryManager::CreateBufferFromHostMemory(void* hostPointer, VkDeviceSiz
importHostMem.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT;
importHostMem.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
importHostMem.pHostPointer = hostPointer;
// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or
// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or
// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT
// whats the difference ?
@@ -469,7 +592,7 @@ VkImageMemAllocation* VKRMemoryManager::imageMemoryAllocate(VkImage image)
auto it = map_textureHeap.find(typeFilter);
if (it == map_textureHeap.end())
{
texHeap = new VkTextureChunkedHeap(this, typeFilter, m_vkr->GetLogicalDevice());
texHeap = new VkTextureChunkedHeap(this, typeFilter);
map_textureHeap.emplace(typeFilter, texHeap);
}
else

View File

@@ -2,6 +2,36 @@
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
#include "util/ChunkedHeap/ChunkedHeap.h"
#include "util/helpers/MemoryPool.h"
enum class VKR_BUFFER_TYPE
{
STAGING, // staging upload buffer
INDEX, // buffer for index data
STRIDE, // buffer for stride-adjusted vertex data
};
class VKRBuffer
{
public:
static VKRBuffer* Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties);
~VKRBuffer();
VkBuffer GetVkBuffer() const { return m_buffer; }
VkDeviceMemory GetVkBufferMemory() const { return m_bufferMemory; }
uint8* GetPtr() const { return m_mappedMemory; }
bool RequiresFlush() const { return m_requiresFlush; }
private:
VKRBuffer(VkBuffer buffer, VkDeviceMemory bufferMem) : m_buffer(buffer), m_bufferMemory(bufferMem) { };
VkBuffer m_buffer;
VkDeviceMemory m_bufferMemory;
uint8* m_mappedMemory;
bool m_requiresFlush{false};
};
struct VkImageMemAllocation
{
@@ -17,15 +47,13 @@ struct VkImageMemAllocation
class VkTextureChunkedHeap : private ChunkedHeap
{
public:
VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter, VkDevice device) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter), m_device(device) { };
VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { };
struct ChunkInfo
{
VkDeviceMemory mem;
};
uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
CHAddr allocMem(uint32 size, uint32 alignment)
{
if (alignment < 4)
@@ -43,11 +71,6 @@ public:
this->free(addr);
}
void setDevice(VkDevice dev)
{
m_device = dev;
}
VkDeviceMemory getChunkMem(uint32 index)
{
if (index >= m_list_chunkInfo.size())
@@ -61,24 +84,69 @@ public:
allocatedBytes = numAllocatedBytes;
}
VkDevice m_device;
private:
uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
uint32 m_typeFilter{ 0xFFFFFFFF };
class VKRMemoryManager* m_vkrMemoryManager;
std::vector<ChunkInfo> m_list_chunkInfo;
};
class VkBufferChunkedHeap : private ChunkedHeap
{
public:
VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { };
~VkBufferChunkedHeap();
using ChunkedHeap::alloc;
using ChunkedHeap::free;
uint8* GetChunkPtr(uint32 index) const
{
if (index >= m_chunkBuffers.size())
return nullptr;
return m_chunkBuffers[index]->GetPtr();
}
void GetChunkVkMemInfo(uint32 index, VkBuffer& buffer, VkDeviceMemory& mem)
{
if (index >= m_chunkBuffers.size())
{
buffer = VK_NULL_HANDLE;
mem = VK_NULL_HANDLE;
return;
}
buffer = m_chunkBuffers[index]->GetVkBuffer();
mem = m_chunkBuffers[index]->GetVkBufferMemory();
}
void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
{
numBuffers = m_chunkBuffers.size();
totalBufferSize = numHeapBytes;
freeBufferSize = numHeapBytes - numAllocatedBytes;
}
bool RequiresFlush(uint32 index) const
{
if (index >= m_chunkBuffers.size())
return false;
return m_chunkBuffers[index]->RequiresFlush();
}
private:
uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
VKR_BUFFER_TYPE m_bufferType;
std::vector<VKRBuffer*> m_chunkBuffers;
size_t m_minimumBufferAllocationSize;
};
// a circular ring-buffer which tracks and releases memory per command-buffer
class VKRSynchronizedRingAllocator
{
public:
enum class BUFFER_TYPE
{
STAGING, // staging upload buffer
INDEX, // buffer for index data
STRIDE, // buffer for stride-adjusted vertex data
};
VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
VKRSynchronizedRingAllocator(const VKRSynchronizedRingAllocator&) = delete; // disallow copy
struct BufferSyncPoint_t
@@ -126,13 +194,53 @@ private:
const class VulkanRenderer* m_vkr;
const class VKRMemoryManager* m_vkrMemMgr;
const BUFFER_TYPE m_bufferType;
const VKR_BUFFER_TYPE m_bufferType;
const uint32 m_minimumBufferAllocSize;
std::vector<AllocatorBuffer_t> m_buffers;
};
// heap style allocator with released memory being freed after the current command buffer finishes
class VKRSynchronizedHeapAllocator
{
struct TrackedAllocation
{
TrackedAllocation(CHAddr allocation) : allocation(allocation) {};
CHAddr allocation;
};
public:
VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize);
VKRSynchronizedHeapAllocator(const VKRSynchronizedHeapAllocator&) = delete; // disallow copy
struct AllocatorReservation
{
VkBuffer vkBuffer;
VkDeviceMemory vkMem;
uint8* memPtr;
uint32 bufferOffset;
uint32 size;
uint32 bufferIndex;
};
AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment);
void FreeReservation(AllocatorReservation* uploadReservation);
void FlushReservation(AllocatorReservation* uploadReservation);
void CleanupBuffer(uint64 latestFinishedCommandBufferId);
void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const;
private:
const class VKRMemoryManager* m_vkrMemMgr;
VkBufferChunkedHeap m_chunkedHeap;
// allocations
std::vector<TrackedAllocation> m_activeAllocations;
MemoryPool<AllocatorReservation> m_poolAllocatorReservation{32};
// release queue
std::unordered_map<uint64, std::vector<CHAddr>> m_releaseQueue;
};
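
This header shows the commit's central memory-management change: index data moves off the per-frame VKRSynchronizedRingAllocator onto the new VKRSynchronizedHeapAllocator, whose reservations persist until explicitly freed, which is what lets cached index data survive across frames. Contrasted in a short sketch (ringAlloc, heapAlloc and size are placeholders; both AllocateBufferMemory calls appear elsewhere in this diff):

// ring allocator: reservations are recycled implicitly once their command
// buffer completes, so cached data could not outlive the frame
VKRSynchronizedRingAllocator::AllocatorReservation_t ringResv = ringAlloc.AllocateBufferMemory(size, 32);
// heap allocator: reservations persist until FreeReservation, and even then
// the memory is reclaimed only after the GPU finishes the current command buffer
VKRSynchronizedHeapAllocator::AllocatorReservation* heapResv = heapAlloc.AllocateBufferMemory(size, 32);
heapAlloc.FreeReservation(heapResv);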
void LatteIndices_invalidateAll();
class VKRMemoryManager
@@ -140,9 +248,9 @@ class VKRMemoryManager
friend class VKRSynchronizedRingAllocator;
public:
VKRMemoryManager(class VulkanRenderer* renderer) :
m_stagingBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STAGING, 32u * 1024 * 1024),
m_indexBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::INDEX, 4u * 1024 * 1024),
m_vertexStrideMetalBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STRIDE, 4u * 1024 * 1024)
m_stagingBuffer(renderer, this, VKR_BUFFER_TYPE::STAGING, 32u * 1024 * 1024),
m_indexBuffer(this, VKR_BUFFER_TYPE::INDEX, 4u * 1024 * 1024),
m_vertexStrideMetalBuffer(renderer, this, VKR_BUFFER_TYPE::STRIDE, 4u * 1024 * 1024)
{
m_vkr = renderer;
}
@@ -167,7 +275,7 @@ public:
}
VKRSynchronizedRingAllocator& getStagingAllocator() { return m_stagingBuffer; }; // allocator for texture/attribute/uniform uploads
VKRSynchronizedRingAllocator& getIndexAllocator() { return m_indexBuffer; }; // allocator for index data
VKRSynchronizedHeapAllocator& GetIndexAllocator() { return m_indexBuffer; }; // allocator for index data
VKRSynchronizedRingAllocator& getMetalStrideWorkaroundAllocator() { return m_vertexStrideMetalBuffer; }; // allocator for stride-adjusted vertex data
void cleanupBuffers(uint64 latestFinishedCommandBufferId)
@@ -202,6 +310,6 @@ public:
private:
class VulkanRenderer* m_vkr;
VKRSynchronizedRingAllocator m_stagingBuffer;
VKRSynchronizedRingAllocator m_indexBuffer;
VKRSynchronizedHeapAllocator m_indexBuffer;
VKRSynchronizedRingAllocator m_vertexStrideMetalBuffer;
};

View File

@@ -3699,7 +3699,7 @@ void VulkanRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uin
void VulkanRenderer::AppendOverlayDebugInfo()
{
ImGui::Text("--- Vulkan info ---");
ImGui::Text("--- Vulkan debug info ---");
ImGui::Text("GfxPipelines %u", performanceMonitor.vk.numGraphicPipelines.get());
ImGui::Text("DescriptorSets %u", performanceMonitor.vk.numDescriptorSets.get());
ImGui::Text("DS ImgSamplers %u", performanceMonitor.vk.numDescriptorSamplerTextures.get());
@@ -3716,7 +3716,7 @@ void VulkanRenderer::AppendOverlayDebugInfo()
ImGui::Text("BeginRP/f %u", performanceMonitor.vk.numBeginRenderpassPerFrame.get());
ImGui::Text("Barriers/f %u", performanceMonitor.vk.numDrawBarriersPerFrame.get());
ImGui::Text("--- Cache info ---");
ImGui::Text("--- Cache debug info ---");
uint32 bufferCacheHeapSize = 0;
uint32 bufferCacheAllocationSize = 0;
@@ -3736,7 +3736,7 @@ void VulkanRenderer::AppendOverlayDebugInfo()
ImGui::SameLine(60.0f);
ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
memoryManager->getIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
ImGui::Text("Index");
ImGui::SameLine(60.0f);
ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);

View File

@@ -328,8 +328,9 @@ public:
RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) override;
void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override;
void indexData_uploadIndexMemory(uint32 offset, uint32 size) override;
IndexAllocation indexData_reserveIndexMemory(uint32 size) override;
void indexData_releaseIndexMemory(IndexAllocation& allocation) override;
void indexData_uploadIndexMemory(IndexAllocation& allocation) override;
// externally callable
void GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, sint32 width, sint32 height, FormatInfoVK* formatInfoOut);

View File

@@ -357,18 +357,20 @@ PipelineInfo* VulkanRenderer::draw_getOrCreateGraphicsPipeline(uint32 indexCount
return draw_createGraphicsPipeline(indexCount);
}
void* VulkanRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex)
Renderer::IndexAllocation VulkanRenderer::indexData_reserveIndexMemory(uint32 size)
{
auto& indexAllocator = this->memoryManager->getIndexAllocator();
auto resv = indexAllocator.AllocateBufferMemory(size, 32);
offset = resv.bufferOffset;
bufferIndex = resv.bufferIndex;
return resv.memPtr;
VKRSynchronizedHeapAllocator::AllocatorReservation* resv = memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 32);
return { resv->memPtr, resv };
}
void VulkanRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size)
void VulkanRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation)
{
// does nothing since the index buffer memory is coherent
memoryManager->GetIndexAllocator().FreeReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal);
}
void VulkanRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation)
{
memoryManager->GetIndexAllocator().FlushReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal);
}
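
Note how the backend round-trips its concrete AllocatorReservation through the type-erased IndexAllocation::rendererInternal pointer, so the shared Renderer interface never needs to know the Vulkan types. The pattern in generic form (all names hypothetical):

struct Handle { void* mem; void* backendInternal; }; // shape of Renderer::IndexAllocation

struct BackendReservation { /* backend-specific bookkeeping */ };

Handle Reserve()
{
	auto* resv = new BackendReservation();
	return { /*mem*/ nullptr, resv }; // hand out the opaque pointer
}

void Release(Handle& h)
{
	// only the backend casts it back to the concrete type
	delete static_cast<BackendReservation*>(h.backendInternal);
}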
float s_vkUniformData[512 * 4];
@@ -1415,14 +1417,15 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
uint32 hostIndexCount;
uint32 indexMin = 0;
uint32 indexMax = 0;
uint32 indexBufferOffset = 0;
uint32 indexBufferIndex = 0;
LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex);
Renderer::IndexAllocation indexAllocation;
LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation);
VKRSynchronizedHeapAllocator::AllocatorReservation* indexReservation = (VKRSynchronizedHeapAllocator::AllocatorReservation*)indexAllocation.rendererInternal;
// update index binding
bool isPrevIndexData = false;
if (hostIndexType != INDEX_TYPE::NONE)
{
uint32 indexBufferIndex = indexReservation->bufferIndex;
uint32 indexBufferOffset = indexReservation->bufferOffset;
if (m_state.activeIndexBufferOffset != indexBufferOffset || m_state.activeIndexBufferIndex != indexBufferIndex || m_state.activeIndexType != hostIndexType)
{
m_state.activeIndexType = hostIndexType;
@@ -1435,7 +1438,7 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
vkType = VK_INDEX_TYPE_UINT32;
else
cemu_assert(false);
vkCmdBindIndexBuffer(m_state.currentCommandBuffer, memoryManager->getIndexAllocator().GetBufferByIndex(indexBufferIndex), indexBufferOffset, vkType);
vkCmdBindIndexBuffer(m_state.currentCommandBuffer, indexReservation->vkBuffer, indexBufferOffset, vkType);
}
else
isPrevIndexData = true;