mirror of https://github.com/cemu-project/Cemu.git
Optimize ChunkedHeap
This commit is contained in:
parent
13979d490f
commit
e97493b2a1
|
@ -44,7 +44,7 @@ struct VkImageMemAllocation
|
|||
uint32 getAllocationSize() { return allocationSize; }
|
||||
};
|
||||
|
||||
class VkTextureChunkedHeap : private ChunkedHeap
|
||||
class VkTextureChunkedHeap : private ChunkedHeap<>
|
||||
{
|
||||
public:
|
||||
VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { };
|
||||
|
@ -80,8 +80,8 @@ public:
|
|||
|
||||
void getStatistics(uint32& totalHeapSize, uint32& allocatedBytes) const
|
||||
{
|
||||
totalHeapSize = numHeapBytes;
|
||||
allocatedBytes = numAllocatedBytes;
|
||||
totalHeapSize = m_numHeapBytes;
|
||||
allocatedBytes = m_numAllocatedBytes;
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -92,7 +92,7 @@ public:
|
|||
std::vector<ChunkInfo> m_list_chunkInfo;
|
||||
};
|
||||
|
||||
class VkBufferChunkedHeap : private ChunkedHeap
|
||||
class VkBufferChunkedHeap : private ChunkedHeap<>
|
||||
{
|
||||
public:
|
||||
VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { };
|
||||
|
@ -123,8 +123,8 @@ class VkBufferChunkedHeap : private ChunkedHeap
|
|||
void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
|
||||
{
|
||||
numBuffers = m_chunkBuffers.size();
|
||||
totalBufferSize = numHeapBytes;
|
||||
freeBufferSize = numHeapBytes - numAllocatedBytes;
|
||||
totalBufferSize = m_numHeapBytes;
|
||||
freeBufferSize = m_numHeapBytes - m_numAllocatedBytes;
|
||||
}
|
||||
|
||||
bool RequiresFlush(uint32 index) const
|
||||
|
|
|
@ -274,6 +274,25 @@ inline uint64 _udiv128(uint64 highDividend, uint64 lowDividend, uint64 divisor,
|
|||
#define NOEXPORT __attribute__ ((visibility ("hidden")))
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define FORCE_INLINE __forceinline
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#define FORCE_INLINE inline __attribute__((always_inline))
|
||||
#else
|
||||
#define FORCE_INLINE
|
||||
#endif
|
||||
|
||||
FORCE_INLINE inline int BSF(uint32 v) // returns index of first bit set, counting from LSB. If v is 0 then result is undefined
|
||||
{
|
||||
#if defined(_MSC_VER)
|
||||
return _tzcnt_u32(v); // TZCNT requires BMI1. But if not supported it will execute as BSF
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
return __builtin_ctz(v);
|
||||
#else
|
||||
return std::countr_zero(v);
|
||||
#endif
|
||||
}
|
||||
|
||||
// On aarch64 we handle some of the x86 intrinsics by implementing them as wrappers
|
||||
#if defined(__aarch64__)
|
||||
|
||||
|
|
|
@ -1,35 +1,39 @@
|
|||
#pragma once
|
||||
|
||||
#include <util/helpers/MemoryPool.h>
|
||||
|
||||
struct CHAddr
|
||||
{
|
||||
uint32 offset;
|
||||
uint32 chunkIndex;
|
||||
void* internal; // AllocRange
|
||||
|
||||
CHAddr(uint32 _offset, uint32 _chunkIndex) : offset(_offset), chunkIndex(_chunkIndex) {};
|
||||
CHAddr(uint32 _offset, uint32 _chunkIndex, void* internal = nullptr) : offset(_offset), chunkIndex(_chunkIndex), internal(internal) {};
|
||||
CHAddr() : offset(0xFFFFFFFF), chunkIndex(0xFFFFFFFF) {};
|
||||
|
||||
bool isValid() { return chunkIndex != 0xFFFFFFFF; };
|
||||
static CHAddr getInvalid() { return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); };
|
||||
};
|
||||
|
||||
template<uint32 TMinimumAlignment = 32>
|
||||
class ChunkedHeap
|
||||
{
|
||||
struct allocRange_t
|
||||
struct AllocRange
|
||||
{
|
||||
allocRange_t* nextFree{};
|
||||
allocRange_t* prevFree{};
|
||||
allocRange_t* prevOrdered{};
|
||||
allocRange_t* nextOrdered{};
|
||||
AllocRange* nextFree{};
|
||||
AllocRange* prevFree{};
|
||||
AllocRange* prevOrdered{};
|
||||
AllocRange* nextOrdered{};
|
||||
uint32 offset;
|
||||
uint32 chunkIndex;
|
||||
uint32 size;
|
||||
bool isFree;
|
||||
allocRange_t(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
|
||||
AllocRange(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
|
||||
};
|
||||
|
||||
struct chunk_t
|
||||
struct Chunk
|
||||
{
|
||||
std::unordered_map<uint32, allocRange_t*> map_allocatedRange;
|
||||
uint32 size;
|
||||
};
|
||||
|
||||
public:
|
||||
|
@ -47,45 +51,32 @@ public:
|
|||
_free(addr);
|
||||
}
|
||||
|
||||
virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) = 0;
|
||||
|
||||
private:
|
||||
unsigned ulog2(uint32 v)
|
||||
{
|
||||
static const unsigned MUL_DE_BRUIJN_BIT[] =
|
||||
{
|
||||
0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
|
||||
8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
|
||||
};
|
||||
|
||||
v |= v >> 1;
|
||||
v |= v >> 2;
|
||||
v |= v >> 4;
|
||||
v |= v >> 8;
|
||||
v |= v >> 16;
|
||||
|
||||
return MUL_DE_BRUIJN_BIT[(v * 0x07C4ACDDu) >> 27];
|
||||
cemu_assert_debug(v != 0);
|
||||
return 31 - std::countl_zero(v);
|
||||
}
|
||||
|
||||
void trackFreeRange(allocRange_t* range)
|
||||
void trackFreeRange(AllocRange* range)
|
||||
{
|
||||
// get index of msb
|
||||
cemu_assert_debug(range->size != 0); // size of zero is not allowed
|
||||
uint32 bucketIndex = ulog2(range->size);
|
||||
range->nextFree = bucketFreeRange[bucketIndex];
|
||||
if (bucketFreeRange[bucketIndex])
|
||||
bucketFreeRange[bucketIndex]->prevFree = range;
|
||||
range->nextFree = m_bucketFreeRange[bucketIndex];
|
||||
if (m_bucketFreeRange[bucketIndex])
|
||||
m_bucketFreeRange[bucketIndex]->prevFree = range;
|
||||
range->prevFree = nullptr;
|
||||
bucketFreeRange[bucketIndex] = range;
|
||||
m_bucketFreeRange[bucketIndex] = range;
|
||||
m_bucketUseMask |= (1u << bucketIndex);
|
||||
}
|
||||
|
||||
void forgetFreeRange(allocRange_t* range, uint32 bucketIndex)
|
||||
void forgetFreeRange(AllocRange* range, uint32 bucketIndex)
|
||||
{
|
||||
allocRange_t* prevRange = range->prevFree;
|
||||
allocRange_t* nextRange = range->nextFree;
|
||||
AllocRange* prevRange = range->prevFree;
|
||||
AllocRange* nextRange = range->nextFree;
|
||||
if (prevRange)
|
||||
{
|
||||
prevRange->nextFree = nextRange;
|
||||
|
@ -94,36 +85,42 @@ private:
|
|||
}
|
||||
else
|
||||
{
|
||||
if (bucketFreeRange[bucketIndex] != range)
|
||||
assert_dbg();
|
||||
bucketFreeRange[bucketIndex] = nextRange;
|
||||
cemu_assert_debug(m_bucketFreeRange[bucketIndex] == range);
|
||||
m_bucketFreeRange[bucketIndex] = nextRange;
|
||||
if (nextRange)
|
||||
nextRange->prevFree = nullptr;
|
||||
else
|
||||
m_bucketUseMask &= ~(1u << bucketIndex);
|
||||
}
|
||||
}
|
||||
|
||||
bool allocateChunk(uint32 minimumAllocationSize)
|
||||
{
|
||||
uint32 chunkIndex = (uint32)list_chunks.size();
|
||||
list_chunks.emplace_back(new chunk_t());
|
||||
uint32 chunkIndex = (uint32)m_chunks.size();
|
||||
m_chunks.emplace_back();
|
||||
uint32 chunkSize = allocateNewChunk(chunkIndex, minimumAllocationSize);
|
||||
cemu_assert_debug((chunkSize%TMinimumAlignment) == 0); // chunk size should be a multiple of the minimum alignment
|
||||
if (chunkSize == 0)
|
||||
return false;
|
||||
allocRange_t* range = new allocRange_t(0, chunkIndex, chunkSize, true);
|
||||
cemu_assert_debug(chunkSize < 0x80000000u); // chunk size must be below 2GB
|
||||
AllocRange* range = m_allocEntriesPool.allocObj(0, chunkIndex, chunkSize, true);
|
||||
trackFreeRange(range);
|
||||
numHeapBytes += chunkSize;
|
||||
m_numHeapBytes += chunkSize;
|
||||
return true;
|
||||
}
|
||||
|
||||
void _allocFrom(allocRange_t* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
|
||||
void _allocFrom(AllocRange* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
|
||||
{
|
||||
cemu_assert_debug(allocSize > 0);
|
||||
// remove the range from the chain of free ranges
|
||||
forgetFreeRange(range, bucketIndex);
|
||||
// split head, allocation and tail into separate ranges
|
||||
if (allocOffset > range->offset)
|
||||
uint32 headBytes = allocOffset - range->offset;
|
||||
if (headBytes > 0)
|
||||
{
|
||||
// alignment padding -> create free range
|
||||
allocRange_t* head = new allocRange_t(range->offset, range->chunkIndex, allocOffset - range->offset, true);
|
||||
cemu_assert_debug(headBytes >= TMinimumAlignment);
|
||||
AllocRange* head = m_allocEntriesPool.allocObj(range->offset, range->chunkIndex, headBytes, true);
|
||||
trackFreeRange(head);
|
||||
if (range->prevOrdered)
|
||||
range->prevOrdered->nextOrdered = head;
|
||||
|
@ -131,10 +128,12 @@ private:
|
|||
head->nextOrdered = range;
|
||||
range->prevOrdered = head;
|
||||
}
|
||||
if ((allocOffset + allocSize) < (range->offset + range->size)) // todo - create only if it's more than a couple of bytes?
|
||||
uint32 tailBytes = (range->offset + range->size) - (allocOffset + allocSize);
|
||||
if (tailBytes > 0)
|
||||
{
|
||||
// tail -> create free range
|
||||
allocRange_t* tail = new allocRange_t((allocOffset + allocSize), range->chunkIndex, (range->offset + range->size) - (allocOffset + allocSize), true);
|
||||
cemu_assert_debug(tailBytes >= TMinimumAlignment);
|
||||
AllocRange* tail = m_allocEntriesPool.allocObj((allocOffset + allocSize), range->chunkIndex, tailBytes, true);
|
||||
trackFreeRange(tail);
|
||||
if (range->nextOrdered)
|
||||
range->nextOrdered->prevOrdered = tail;
|
||||
|
@ -149,36 +148,51 @@ private:
|
|||
|
||||
CHAddr _alloc(uint32 size, uint32 alignment)
|
||||
{
|
||||
cemu_assert_debug(size <= (0x7FFFFFFFu-TMinimumAlignment));
|
||||
// make sure size is not zero and align it
|
||||
if(size == 0) [[unlikely]]
|
||||
size = TMinimumAlignment;
|
||||
else
|
||||
size = (size + (TMinimumAlignment - 1)) & ~(TMinimumAlignment - 1);
|
||||
// find smallest bucket to scan
|
||||
uint32 alignmentM1 = alignment - 1;
|
||||
uint32 bucketIndex = ulog2(size);
|
||||
while (bucketIndex < 32)
|
||||
// check if the bucket is available
|
||||
if( !(m_bucketUseMask & (1u << bucketIndex)) )
|
||||
{
|
||||
allocRange_t* range = bucketFreeRange[bucketIndex];
|
||||
// skip to next non-empty bucket
|
||||
uint32 nextIndex = BSF(m_bucketUseMask>>bucketIndex);
|
||||
bucketIndex += nextIndex;
|
||||
}
|
||||
while (bucketIndex < 31)
|
||||
{
|
||||
AllocRange* range = m_bucketFreeRange[bucketIndex];
|
||||
while (range)
|
||||
{
|
||||
if (range->size >= size)
|
||||
{
|
||||
// verify if aligned allocation fits
|
||||
uint32 alignedOffset = (range->offset + alignmentM1) & ~alignmentM1;
|
||||
uint32 alignmentLoss = alignedOffset - range->offset;
|
||||
if (alignmentLoss < range->size && (range->size - alignmentLoss) >= size)
|
||||
uint32 endOffset = alignedOffset + size;
|
||||
if((range->offset+range->size) >= endOffset)
|
||||
{
|
||||
_allocFrom(range, bucketIndex, alignedOffset, size);
|
||||
list_chunks[range->chunkIndex]->map_allocatedRange.emplace(alignedOffset, range);
|
||||
numAllocatedBytes += size;
|
||||
return CHAddr(alignedOffset, range->chunkIndex);
|
||||
m_numAllocatedBytes += size;
|
||||
return CHAddr(alignedOffset, range->chunkIndex, range);
|
||||
}
|
||||
}
|
||||
range = range->nextFree;
|
||||
}
|
||||
bucketIndex++; // try higher bucket
|
||||
// check next non-empty bucket or skip to end
|
||||
bucketIndex++;
|
||||
uint32 emptyBuckets = BSF(m_bucketUseMask>>bucketIndex);
|
||||
bucketIndex += emptyBuckets;
|
||||
}
|
||||
if(allocationLimitReached)
|
||||
if(m_allocationLimitReached)
|
||||
return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
|
||||
if (!allocateChunk(size))
|
||||
{
|
||||
allocationLimitReached = true;
|
||||
m_allocationLimitReached = true;
|
||||
return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
|
||||
}
|
||||
return _alloc(size, alignment);
|
||||
|
@ -186,24 +200,16 @@ private:
|
|||
|
||||
void _free(CHAddr addr)
|
||||
{
|
||||
auto it = list_chunks[addr.chunkIndex]->map_allocatedRange.find(addr.offset);
|
||||
if (it == list_chunks[addr.chunkIndex]->map_allocatedRange.end())
|
||||
if(!addr.internal)
|
||||
{
|
||||
cemuLog_log(LogType::Force, "Internal heap error. {:08x} {:08x}", addr.chunkIndex, addr.offset);
|
||||
cemuLog_log(LogType::Force, "Debug info:");
|
||||
for (auto& rangeItr : list_chunks[addr.chunkIndex]->map_allocatedRange)
|
||||
{
|
||||
cemuLog_log(LogType::Force, "{:08x} {:08x}", rangeItr.second->offset, rangeItr.second->size);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
allocRange_t* range = it->second;
|
||||
numAllocatedBytes -= it->second->size;
|
||||
list_chunks[range->chunkIndex]->map_allocatedRange.erase(it);
|
||||
AllocRange* range = (AllocRange*)addr.internal;
|
||||
m_numAllocatedBytes -= range->size;
|
||||
// try merge left or right
|
||||
allocRange_t* prevRange = range->prevOrdered;
|
||||
allocRange_t* nextRange = range->nextOrdered;
|
||||
AllocRange* prevRange = range->prevOrdered;
|
||||
AllocRange* nextRange = range->nextOrdered;
|
||||
if (prevRange && prevRange->isFree)
|
||||
{
|
||||
if (nextRange && nextRange->isFree)
|
||||
|
@ -216,8 +222,8 @@ private:
|
|||
forgetFreeRange(prevRange, ulog2(prevRange->size));
|
||||
prevRange->size = newSize;
|
||||
trackFreeRange(prevRange);
|
||||
delete range;
|
||||
delete nextRange;
|
||||
m_allocEntriesPool.freeObj(range);
|
||||
m_allocEntriesPool.freeObj(nextRange);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -228,7 +234,7 @@ private:
|
|||
forgetFreeRange(prevRange, ulog2(prevRange->size));
|
||||
prevRange->size = newSize;
|
||||
trackFreeRange(prevRange);
|
||||
delete range;
|
||||
m_allocEntriesPool.freeObj(range);
|
||||
}
|
||||
}
|
||||
else if (nextRange && nextRange->isFree)
|
||||
|
@ -242,7 +248,7 @@ private:
|
|||
range->prevOrdered->nextOrdered = nextRange;
|
||||
nextRange->prevOrdered = range->prevOrdered;
|
||||
trackFreeRange(nextRange);
|
||||
delete range;
|
||||
m_allocEntriesPool.freeObj(range);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -265,7 +271,7 @@ private:
|
|||
|
||||
for (uint32 i = 0; i < 32; i++)
|
||||
{
|
||||
allocRange_t* ar = bucketFreeRange[i];
|
||||
AllocRange* ar = m_bucketFreeRange[i];
|
||||
while (ar)
|
||||
{
|
||||
availableRange_t dbgRange;
|
||||
|
@ -278,7 +284,7 @@ private:
|
|||
if (itr.chunkIndex != dbgRange.chunkIndex)
|
||||
continue;
|
||||
if (itr.offset < (dbgRange.offset + dbgRange.size) && (itr.offset + itr.size) >(dbgRange.offset))
|
||||
assert_dbg();
|
||||
cemu_assert_error();
|
||||
}
|
||||
|
||||
availRanges.emplace_back(dbgRange);
|
||||
|
@ -290,14 +296,16 @@ private:
|
|||
}
|
||||
|
||||
private:
|
||||
std::vector<chunk_t*> list_chunks;
|
||||
allocRange_t* bucketFreeRange[32]{};
|
||||
bool allocationLimitReached = false;
|
||||
std::vector<Chunk> m_chunks;
|
||||
uint32 m_bucketUseMask{0x80000000}; // bitmask indicating non-empty buckets. MSB always set to provide an upper bound for BSF instruction
|
||||
AllocRange* m_bucketFreeRange[32]{}; // we are only using 31 entries since the MSB is reserved (thus chunks equal or larger than 2^31 are not allowed)
|
||||
bool m_allocationLimitReached = false;
|
||||
MemoryPool<AllocRange> m_allocEntriesPool{64};
|
||||
|
||||
public:
|
||||
// statistics
|
||||
uint32 numHeapBytes{}; // total size of the heap
|
||||
uint32 numAllocatedBytes{};
|
||||
uint32 m_numHeapBytes{}; // total size of the heap
|
||||
uint32 m_numAllocatedBytes{};
|
||||
};
|
||||
|
||||
class VGenericHeap
|
||||
|
@ -633,7 +641,7 @@ public:
|
|||
|
||||
uint32 getCurrentBlockOffset() const { return m_currentBlockOffset; }
|
||||
uint8* getCurrentBlockPtr() const { return m_currentBlockPtr; }
|
||||
|
||||
|
||||
private:
|
||||
void allocateAdditionalChunk()
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue