mirror of https://github.com/cemu-project/Cemu.git
Linux/MacOS: Greatly improve performance (#370)
std::unordered_set is implemented as a flat hashtable on libstdc++ which makes clearing expensive due to invoking memset on the entire table. To get the best performance across all platforms this replaces the unordered_set with a custom high-performance sparse bitset
This commit is contained in:
parent
a19ed46b2a
commit
ada8bbb3b4
|
@ -1005,8 +1005,67 @@ void LatteBufferCache_getStats(uint32& heapSize, uint32& allocationSize, uint32&
|
|||
}
|
||||
|
||||
FSpinlock g_spinlockDCFlushQueue;
|
||||
std::unordered_set<uint32>* g_DCFlushQueue = new std::unordered_set<uint32>(); // queued pages
|
||||
std::unordered_set<uint32>* g_DCFlushQueueAlternate = new std::unordered_set<uint32>();
|
||||
|
||||
class SparseBitset
|
||||
{
|
||||
static inline constexpr size_t TABLE_MASK = 0xFF;
|
||||
|
||||
public:
|
||||
bool Empty() const
|
||||
{
|
||||
return m_numNonEmptyVectors == 0;
|
||||
}
|
||||
|
||||
void Set(uint32 index)
|
||||
{
|
||||
auto& v = m_bits[index & TABLE_MASK];
|
||||
if (std::find(v.cbegin(), v.cend(), index) != v.end())
|
||||
return;
|
||||
if (v.empty())
|
||||
{
|
||||
m_nonEmptyVectors[m_numNonEmptyVectors] = &v;
|
||||
m_numNonEmptyVectors++;
|
||||
}
|
||||
v.emplace_back(index);
|
||||
}
|
||||
|
||||
template<typename TFunc>
|
||||
void ForAllAndClear(TFunc callbackFunc)
|
||||
{
|
||||
auto vCurrent = m_nonEmptyVectors + 0;
|
||||
auto vEnd = m_nonEmptyVectors + m_numNonEmptyVectors;
|
||||
while (vCurrent < vEnd)
|
||||
{
|
||||
std::vector<uint32>* vec = *vCurrent;
|
||||
vCurrent++;
|
||||
for (const auto& it : *vec)
|
||||
callbackFunc(it);
|
||||
vec->clear();
|
||||
}
|
||||
m_numNonEmptyVectors = 0;
|
||||
}
|
||||
|
||||
void Clear()
|
||||
{
|
||||
auto vCurrent = m_nonEmptyVectors + 0;
|
||||
auto vEnd = m_nonEmptyVectors + m_numNonEmptyVectors;
|
||||
while (vCurrent < vEnd)
|
||||
{
|
||||
std::vector<uint32>* vec = *vCurrent;
|
||||
vCurrent++;
|
||||
vec->clear();
|
||||
}
|
||||
m_numNonEmptyVectors = 0;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<uint32> m_bits[TABLE_MASK + 1];
|
||||
std::vector<uint32>* m_nonEmptyVectors[TABLE_MASK + 1];
|
||||
size_t m_numNonEmptyVectors{ 0 };
|
||||
};
|
||||
|
||||
SparseBitset* s_DCFlushQueue = new SparseBitset();
|
||||
SparseBitset* s_DCFlushQueueAlternate = new SparseBitset();
|
||||
|
||||
void LatteBufferCache_notifyDCFlush(MPTR address, uint32 size)
|
||||
{
|
||||
|
@ -1017,20 +1076,18 @@ void LatteBufferCache_notifyDCFlush(MPTR address, uint32 size)
|
|||
uint32 lastPage = (address + size - 1) / CACHE_PAGE_SIZE;
|
||||
g_spinlockDCFlushQueue.acquire();
|
||||
for (uint32 i = firstPage; i <= lastPage; i++)
|
||||
g_DCFlushQueue->emplace(i);
|
||||
s_DCFlushQueue->Set(i);
|
||||
g_spinlockDCFlushQueue.release();
|
||||
}
|
||||
|
||||
void LatteBufferCache_processDCFlushQueue()
|
||||
{
|
||||
if (g_DCFlushQueue->empty()) // accessing this outside of the lock is technically undefined/unsafe behavior but on all known implementations this is fine and we can avoid the spinlock
|
||||
if (s_DCFlushQueue->Empty()) // quick check to avoid locking if there is no work to do
|
||||
return;
|
||||
g_spinlockDCFlushQueue.acquire();
|
||||
std::swap(g_DCFlushQueue, g_DCFlushQueueAlternate);
|
||||
std::swap(s_DCFlushQueue, s_DCFlushQueueAlternate);
|
||||
g_spinlockDCFlushQueue.release();
|
||||
for (auto& itr : *g_DCFlushQueueAlternate)
|
||||
LatteBufferCache_invalidatePage(itr * CACHE_PAGE_SIZE);
|
||||
g_DCFlushQueueAlternate->clear();
|
||||
s_DCFlushQueueAlternate->ForAllAndClear([](uint32 index) {LatteBufferCache_invalidatePage(index * CACHE_PAGE_SIZE); });
|
||||
}
|
||||
|
||||
void LatteBufferCache_notifyDrawDone()
|
||||
|
|
Loading…
Reference in New Issue