CPU/CodeCache: Always dynamically allocate code buffer

Reduces .bss size.
This commit is contained in:
Stenzek
2024-06-29 18:12:30 +10:00
parent 0d3e674500
commit be8fbafd71
12 changed files with 453 additions and 398 deletions

View File

@@ -123,22 +123,27 @@ PerfScope MIPSPerfScope("MIPS");
#endif
// Currently remapping the code buffer doesn't work in macOS. TODO: Make dynamic instead...
#ifndef __APPLE__
#define USE_STATIC_CODE_BUFFER 1
#endif
#if defined(CPU_ARCH_ARM32)
// Use a smaller code buffer size on AArch32 to have a better chance of being in range.
static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 16 * 1024 * 1024;
static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 8 * 1024 * 1024;
static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 20 * 1024 * 1024;
static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 4 * 1024 * 1024;
#else
static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 32 * 1024 * 1024;
static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 48 * 1024 * 1024;
static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 16 * 1024 * 1024;
#endif
#ifdef USE_STATIC_CODE_BUFFER
alignas(HOST_PAGE_SIZE) static u8 s_code_storage[RECOMPILER_CODE_CACHE_SIZE + RECOMPILER_FAR_CODE_CACHE_SIZE];
// On Linux ARM32/ARM64, we use a dedicated section in the ELF for storing code.
// This is because without ASLR, or on certain ASLR offsets, the sbrk() heap ends up immediately following the text/data
// sections, which means there isn't a large enough gap to fit within range on ARM32.
#if defined(__linux__) && (defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64))
#define USE_CODE_BUFFER_SECTION 1
#ifdef __clang__
#pragma clang section bss = ".jitstorage"
__attribute__((aligned(HOST_PAGE_SIZE))) static u8 s_code_buffer_ptr[RECOMPILER_CODE_CACHE_SIZE];
#pragma clang section bss = ""
#endif
#else
static u8* s_code_buffer_ptr = nullptr;
#endif
static JitCodeBuffer s_code_buffer;
@@ -162,20 +167,26 @@ bool CPU::CodeCache::IsUsingFastmem()
bool CPU::CodeCache::ProcessStartup(Error* error)
{
AllocateLUTs();
#ifdef USE_STATIC_CODE_BUFFER
const bool has_buffer =
s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE, HOST_PAGE_SIZE);
#ifdef USE_CODE_BUFFER_SECTION
const u8* module_base = static_cast<const u8*>(MemMap::GetBaseAddress());
INFO_LOG("Using JIT buffer section of size {} at {} (0x{:X} bytes / {} MB away)", sizeof(s_code_buffer_ptr),
static_cast<void*>(s_code_buffer_ptr), std::abs(static_cast<ptrdiff_t>(s_code_buffer_ptr - module_base)),
(std::abs(static_cast<ptrdiff_t>(s_code_buffer_ptr - module_base)) + (1024 * 1024 - 1)) / (1024 * 1024));
const bool code_buffer_allocated =
MemMap::MemProtect(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE, PageProtect::ReadWriteExecute);
#else
const bool has_buffer = false;
s_code_buffer_ptr = static_cast<u8*>(MemMap::AllocateJITMemory(RECOMPILER_CODE_CACHE_SIZE));
const bool code_buffer_allocated = (s_code_buffer_ptr != nullptr);
#endif
if (!has_buffer && !s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE))
if (!code_buffer_allocated) [[unlikely]]
{
Error::SetStringView(error, "Failed to initialize code space");
Error::SetStringView(error, "Failed to allocate code storage. The log may contain more information, you will need "
"to run DuckStation with -earlyconsole in the command line.");
return false;
}
AllocateLUTs();
if (!PageFaultHandler::Install(error))
return false;
@@ -184,17 +195,21 @@ bool CPU::CodeCache::ProcessStartup(Error* error)
void CPU::CodeCache::ProcessShutdown()
{
s_code_buffer.Destroy();
DeallocateLUTs();
#ifndef USE_CODE_BUFFER_SECTION
MemMap::ReleaseJITMemory(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE);
#endif
}
void CPU::CodeCache::Initialize()
{
Assert(s_blocks.empty());
// TODO: Reduce far code size when not using memory exceptions.
if (IsUsingAnyRecompiler())
{
s_code_buffer.Reset();
s_code_buffer.Reset(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE);
CompileASMFunctions();
ResetCodeLUT();
}
@@ -219,7 +234,7 @@ void CPU::CodeCache::Reset()
if (IsUsingAnyRecompiler())
{
ClearASMFunctions();
s_code_buffer.Reset();
s_code_buffer.Reset(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE);
CompileASMFunctions();
ResetCodeLUT();
}

View File

@@ -1,9 +1,10 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/memmap.h"
#include "cpu_code_cache_private.h"
#include "cpu_core.h"
@@ -171,7 +172,7 @@ u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
}
if (flush_icache)
JitCodeBuffer::FlushInstructionCache(code, kA32InstructionSizeInBytes);
MemMap::FlushInstructionCache(code, kA32InstructionSizeInBytes);
return kA32InstructionSizeInBytes;
}
@@ -202,7 +203,7 @@ u8* CPU::Recompiler::armGetJumpTrampoline(const void* target)
s_trampoline_targets.emplace(target, offset);
s_trampoline_used = offset + static_cast<u32>(size);
JitCodeBuffer::FlushInstructionCache(start, size);
MemMap::FlushInstructionCache(start, size);
return start;
}
@@ -1790,7 +1791,7 @@ void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::Loadstore
for (s32 i = 0; i < nops; i++)
emit.nop();
JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size);
MemMap::FlushInstructionCache(host_pc, lbi.code_size);
}
void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)

View File

@@ -1,9 +1,11 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/memmap.h"
#include "cpu_code_cache_private.h"
#include "cpu_core.h"
#include "cpu_core_private.h"
@@ -274,7 +276,7 @@ u8* CPU::Recompiler::armGetJumpTrampoline(const void* target)
s_trampoline_targets.emplace(target, offset);
s_trampoline_used = offset + static_cast<u32>(size);
JitCodeBuffer::FlushInstructionCache(start, size);
MemMap::FlushInstructionCache(start, size);
return start;
}
@@ -316,7 +318,7 @@ u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
const u32 new_code = B | Assembler::ImmUncondBranch(disp);
std::memcpy(code, &new_code, sizeof(new_code));
if (flush_icache)
JitCodeBuffer::FlushInstructionCache(code, kInstructionSize);
MemMap::FlushInstructionCache(code, kInstructionSize);
return kInstructionSize;
}
@@ -2100,7 +2102,7 @@ void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::Loadstore
for (s32 i = 0; i < nops; i++)
emit.nop();
JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size);
MemMap::FlushInstructionCache(host_pc, lbi.code_size);
}
void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)

View File

@@ -12,6 +12,7 @@
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/memmap.h"
#ifdef CPU_ARCH_X64
@@ -1768,15 +1769,8 @@ void CodeGenerator::RestoreStackAfterCall(u32 adjust_size)
void CodeGenerator::EmitCall(const void* ptr)
{
if (Xbyak::inner::IsInInt32(reinterpret_cast<size_t>(ptr) - reinterpret_cast<size_t>(m_emit->getCurr())))
{
m_emit->call(ptr);
}
else
{
m_emit->mov(GetHostReg64(RRETURN), reinterpret_cast<size_t>(ptr));
m_emit->call(GetHostReg64(RRETURN));
}
DebugAssert(Xbyak::inner::IsInInt32(reinterpret_cast<size_t>(ptr) - reinterpret_cast<size_t>(m_emit->getCurr())));
m_emit->call(ptr);
}
void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr)
@@ -2530,7 +2524,7 @@ void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::Loadstore
for (s32 i = 0; i < nops; i++)
cg.nop();
JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size);
MemMap::FlushInstructionCache(host_pc, lbi.code_size);
}
void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)

View File

@@ -318,6 +318,34 @@ void System::CheckCacheLineSize()
}
}
/// Process-wide memory setup: brings up the CPU code cache first, then the bus memory.
/// `error` is forwarded to both callees. Returns false if either step fails; when the bus
/// allocation fails, the code cache startup performed just before it is rolled back.
bool System::Internal::ProcessStartup(Error* error)
{
  Common::Timer alloc_timer;

  // The JIT allocation goes first, as early in the process lifetime as possible.
  const bool code_cache_ok = CPU::CodeCache::ProcessStartup(error);
  if (!code_cache_ok)
    return false;

  // Ordering matters: fastmem *must* be allocated after the JIT buffer, otherwise it tends to
  // eat the 4GB region after the executable on macOS.
  const bool bus_memory_ok = Bus::AllocateMemory(error);
  if (!bus_memory_ok)
  {
    // Undo the code cache startup done above before reporting failure.
    CPU::CodeCache::ProcessShutdown();
    return false;
  }

  VERBOSE_LOG("Memory allocation took {} ms.", alloc_timer.GetTimeMilliseconds());
  CheckCacheLineSize();
  return true;
}
/// Process-wide teardown counterpart to ProcessStartup(): releases the bus memory, then shuts
/// down the CPU code cache.
void System::Internal::ProcessShutdown()
{
// Torn down in the reverse of the order ProcessStartup() sets them up
// (code cache first, bus memory second).
Bus::ReleaseMemory();
CPU::CodeCache::ProcessShutdown();
}
bool System::Internal::CPUThreadInitialize(Error* error)
{
#ifdef _WIN32
@@ -332,17 +360,9 @@ bool System::Internal::CPUThreadInitialize(Error* error)
}
#endif
if (!CPU::CodeCache::ProcessStartup(error) || !Bus::AllocateMemory(error))
{
CPUThreadShutdown();
return false;
}
// This will call back to Host::LoadSettings() -> ReloadSources().
LoadSettings(false);
CheckCacheLineSize();
#ifdef ENABLE_RAINTEGRATION
if (Host::GetBaseBoolSettingValue("Cheevos", "UseRAIntegration", false))
Achievements::SwitchToRAIntegration();
@@ -377,9 +397,6 @@ void System::Internal::CPUThreadShutdown()
InputManager::CloseSources();
CPU::CodeCache::ProcessShutdown();
Bus::ReleaseMemory();
#ifdef _WIN32
CoUninitialize();
#endif

View File

@@ -504,10 +504,16 @@ namespace Internal {
/// Performs mandatory hardware checks.
bool PerformEarlyHardwareChecks(Error* error);
/// Called on process startup.
bool CPUThreadInitialize(Error* error);
/// Called on process startup, as early as possible.
bool ProcessStartup(Error* error);
/// Called on process shutdown.
void ProcessShutdown();
/// Called on CPU thread initialization.
bool CPUThreadInitialize(Error* error);
/// Called on CPU thread shutdown.
void CPUThreadShutdown();
/// Polls input, updates subsystems which are present while paused/inactive.