CDImageCHD: Add SSE/NEON implementation of CopyAndSwap()

This commit is contained in:
Stenzek
2023-10-01 14:44:28 +10:00
parent ecb082b672
commit e2efec12b7
2 changed files with 83 additions and 27 deletions

View File

@ -9,6 +9,8 @@
#include "common/error.h"
#include "common/file_system.h"
#include "common/hash_combine.h"
#include "common/heap_array.h"
#include "common/intrin.h"
#include "common/log.h"
#include "common/path.h"
#include "common/string_util.h"
@ -76,11 +78,13 @@ private:
chd_file* OpenCHD(std::string_view filename, FileSystem::ManagedCFilePtr fp, Error* error, u32 recursion_level);
bool ReadHunk(u32 hunk_index);
static void CopyAndSwap(void* dst_ptr, const u8* src_ptr);
chd_file* m_chd = nullptr;
u32 m_hunk_size = 0;
u32 m_sectors_per_hunk = 0;
std::vector<u8> m_hunk_buffer;
DynamicHeapArray<u8, 16> m_hunk_buffer;
u32 m_current_hunk_index = static_cast<u32>(-1);
bool m_precached = false;
@ -443,12 +447,39 @@ bool CDImageCHD::IsPrecached() const
return m_precached;
}
// There's probably a more efficient way of doing this with vectorization...
ALWAYS_INLINE static void CopyAndSwap(void* dst_ptr, const u8* src_ptr, u32 data_size)
ALWAYS_INLINE_RELEASE void CDImageCHD::CopyAndSwap(void* dst_ptr, const u8* src_ptr)
{
constexpr u32 data_size = RAW_SECTOR_SIZE;
u8* dst_ptr_byte = static_cast<u8*>(dst_ptr);
#if defined(CPU_ARCH_X64) || defined(CPU_ARCH_ARM64)
const u32 num_values = data_size / 8;
#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
static_assert((data_size % 16) == 0);
constexpr u32 num_values = data_size / 16;
#if defined(CPU_ARCH_SSE)
// Requires SSSE3.
//const __m128i mask = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
for (u32 i = 0; i < num_values; i++)
{
__m128i value = _mm_load_si128(reinterpret_cast<const __m128i*>(src_ptr));
//value = _mm_shuffle_epi8(value, mask);
value = _mm_or_si128(_mm_slli_epi16(value, 8), _mm_srli_epi16(value, 8));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr_byte), value);
src_ptr += sizeof(value);
dst_ptr_byte += sizeof(value);
}
#elif defined(CPU_ARCH_NEON)
for (u32 i = 0; i < num_values; i++)
{
uint16x8_t value = vld1q_u16(reinterpret_cast<const u16*>(src_ptr));
value = vorrq_u16(vshlq_n_u16(value, 8), vshrq_n_u16(value, 8));
vst1q_u16(reinterpret_cast<u16*>(dst_ptr_byte), value);
src_ptr += sizeof(value);
dst_ptr_byte += sizeof(value);
}
#endif
#elif defined(CPU_ARCH_RISCV64)
constexpr u32 num_values = data_size / 8;
for (u32 i = 0; i < num_values; i++)
{
u64 value;
@ -458,8 +489,8 @@ ALWAYS_INLINE static void CopyAndSwap(void* dst_ptr, const u8* src_ptr, u32 data
src_ptr += sizeof(value);
dst_ptr_byte += sizeof(value);
}
#elif defined(CPU_ARCH_X86) || defined(CPU_ARCH_ARM32)
const u32 num_values = data_size / 4;
#else
constexpr u32 num_values = data_size / 4;
for (u32 i = 0; i < num_values; i++)
{
u32 value;
@ -469,17 +500,6 @@ ALWAYS_INLINE static void CopyAndSwap(void* dst_ptr, const u8* src_ptr, u32 data
src_ptr += sizeof(value);
dst_ptr_byte += sizeof(value);
}
#else
const u32 num_values = data_size / sizeof(u16);
for (u32 i = 0; i < num_values; i++)
{
u16 value;
std::memcpy(&value, src_ptr, sizeof(value));
value = (value << 8) | (value >> 8);
std::memcpy(dst_ptr_byte, &value, sizeof(value));
src_ptr += sizeof(value);
dst_ptr_byte += sizeof(value);
}
#endif
}
@ -495,7 +515,7 @@ bool CDImageCHD::ReadSectorFromIndex(void* buffer, const Index& index, LBA lba_i
// Audio data is in big-endian, so we have to swap it for little endian hosts...
if (index.mode == TrackMode::Audio)
CopyAndSwap(buffer, &m_hunk_buffer[hunk_offset], RAW_SECTOR_SIZE);
CopyAndSwap(buffer, &m_hunk_buffer[hunk_offset]);
else
std::memcpy(buffer, &m_hunk_buffer[hunk_offset], RAW_SECTOR_SIZE);