GPU: Use half width vector types where appropriate

This commit is contained in:
Stenzek
2024-07-11 18:37:51 +10:00
parent d70f5ddb32
commit 59a2309a83
8 changed files with 2614 additions and 220 deletions

View File

@@ -5,7 +5,6 @@
#pragma once
#include "common/assert.h"
#include "common/types.h"
#include <algorithm>
@@ -15,7 +14,820 @@
#define GSVECTOR_HAS_UNSIGNED 1
#define GSVECTOR_HAS_SRLV 1
class GSVector2;
class GSVector2i;
class GSVector4;
class GSVector4i;
// Scalar saturation helpers: clamp 'expr' to the target lane's value range,
// then narrow to the lane type. Used by the packed add/sub/pack operations.
#define SSATURATE8(expr) static_cast<s8>(std::clamp<decltype(expr)>(expr, -128, 127))
#define USATURATE8(expr) static_cast<u8>(std::clamp<decltype(expr)>(expr, 0, 255))
#define SSATURATE16(expr) static_cast<s16>(std::clamp<decltype(expr)>(expr, -32768, 32767))
#define USATURATE16(expr) static_cast<u16>(std::clamp<decltype(expr)>(expr, 0, 65535))
// Per-lane loop helpers for the 64-bit GSVector2i: evaluate 'expr' once per
// lane index 'i' into a local 'ret', then return it.
// 8 byte lanes:
#define ALL_LANES_8(expr) \
GSVector2i ret; \
for (size_t i = 0; i < 8; i++) \
expr; \
return ret;
// 4 16-bit lanes:
#define ALL_LANES_16(expr) \
GSVector2i ret; \
for (size_t i = 0; i < 4; i++) \
expr; \
return ret;
// 2 32-bit lanes:
#define ALL_LANES_32(expr) \
GSVector2i ret; \
for (size_t i = 0; i < 2; i++) \
expr; \
return ret;
/// Two-lane 32-bit integer vector (64 bits of payload), scalar fallback
/// implementation. Mirrors the GSVector4i interface so GPU code can use the
/// half-width type interchangeably.
class alignas(16) GSVector2i
{
  // Tag type so the constexpr constructors don't collide with the runtime ones.
  struct cxpr_init_tag
  {
  };
  static constexpr cxpr_init_tag cxpr_init{};

  constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : I32{x, y} {}
  constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : I16{s0, s1, s2, s3} {}
  constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
    : I8{b0, b1, b2, b3, b4, b5, b6, b7}
  {
  }

public:
  // All views alias the same 8 bytes of lane data.
  union
  {
    struct
    {
      s32 x, y;
    };
    struct
    {
      s32 r, g;
    };
    float F32[2];
    s8 I8[8];
    s16 I16[4];
    s32 I32[2];
    s64 I64[1];
    u8 U8[8];
    u16 U16[4];
    u32 U32[2];
    u64 U64[1];
  };

  GSVector2i() = default;

  // Compile-time constructors for constants.
  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
  ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
  ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
  {
    return GSVector2i(cxpr_init, s0, s1, s2, s3);
  }
  ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
  {
    return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
  }

  ALWAYS_INLINE GSVector2i(s32 x, s32 y)
  {
    this->x = x;
    this->y = y;
  }
  ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3)
  {
    I16[0] = s0;
    I16[1] = s1;
    I16[2] = s2;
    I16[3] = s3;
  }
  ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
    : I8{b0, b1, b2, b3, b4, b5, b6, b7}
  {
  }

  // Copy constructor. Must not be 'explicit': an explicit copy constructor
  // rejects copy-initialization (GSVector2i a = b;) and by-value pass/return,
  // and GSVector4i's copy constructor is not explicit either.
  ALWAYS_INLINE GSVector2i(const GSVector2i& v) { std::memcpy(I32, v.I32, sizeof(I32)); }

  // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
  // so leave the non-constexpr version default
  ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }

  // Float->int conversion; defined after GSVector2.
  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true);

  // Bit-level reinterpretation of float lanes; defined after GSVector2.
  ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);

  ALWAYS_INLINE void operator=(const GSVector2i& v) { std::memcpy(I32, v.I32, sizeof(I32)); }
  ALWAYS_INLINE void operator=(s32 i)
  {
    x = i;
    y = i;
  }

  // Per-lane clamp to [min, max] for each width/signedness.
  ALWAYS_INLINE GSVector2i sat_i8(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_i8(min).min_i8(max);
  }
  ALWAYS_INLINE GSVector2i sat_i16(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_i16(min).min_i16(max);
  }
  ALWAYS_INLINE GSVector2i sat_i32(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_i32(min).min_i32(max);
  }
  ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u8(min).min_u8(max);
  }
  ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u16(min).min_u16(max);
  }
  ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
  {
    return max_u32(min).min_u32(max);
  }

  // Per-lane min/max.
  GSVector2i min_i8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = std::min(I8[i], v.I8[i])); }
  GSVector2i max_i8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = std::max(I8[i], v.I8[i])); }
  GSVector2i min_i16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = std::min(I16[i], v.I16[i])); }
  GSVector2i max_i16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = std::max(I16[i], v.I16[i])); }
  GSVector2i min_i32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = std::min(I32[i], v.I32[i])); }
  GSVector2i max_i32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = std::max(I32[i], v.I32[i])); }
  GSVector2i min_u8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = std::min(U8[i], v.U8[i])); }
  GSVector2i max_u8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = std::max(U8[i], v.U8[i])); }
  GSVector2i min_u16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = std::min(U16[i], v.U16[i])); }
  GSVector2i max_u16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = std::max(U16[i], v.U16[i])); }
  GSVector2i min_u32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = std::min(U32[i], v.U32[i])); }
  GSVector2i max_u32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = std::max(U32[i], v.U32[i])); }

  // Horizontal reductions across all lanes.
  u8 minv_u8() const
  {
    return std::min(
      U8[0],
      std::min(U8[1], std::min(U8[2], std::min(U8[3], std::min(U8[4], std::min(U8[5], std::min(U8[6], U8[7])))))));
  }
  // Returns u8 (was u16) for consistency with minv_u8; callers converting to
  // wider types are unaffected.
  u8 maxv_u8() const
  {
    return std::max(
      U8[0],
      std::max(U8[1], std::max(U8[2], std::max(U8[3], std::max(U8[4], std::max(U8[5], std::max(U8[6], U8[7])))))));
  }
  u16 minv_u16() const { return std::min(U16[0], std::min(U16[1], std::min(U16[2], U16[3]))); }
  u16 maxv_u16() const { return std::max(U16[0], std::max(U16[1], std::max(U16[2], U16[3]))); }
  s32 minv_s32() const { return std::min(x, y); }
  u32 minv_u32() const { return std::min(U32[0], U32[1]); }
  s32 maxv_s32() const { return std::max(x, y); }
  u32 maxv_u32() const { return std::max(U32[0], U32[1]); }

  // Saturate 16-bit lanes to [0,255] and widen the low bytes back to words.
  ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }

  // Byte-wise select: lane comes from 'v' where the mask byte's MSB is set.
  GSVector2i blend8(const GSVector2i& v, const GSVector2i& mask) const
  {
    GSVector2i ret;
    for (size_t i = 0; i < 8; i++)
      ret.U8[i] = (mask.U8[i] & 0x80) ? v.U8[i] : U8[i];
    return ret;
  }
  // Word-wise select by compile-time mask bit per lane.
  template<s32 mask>
  GSVector2i blend16(const GSVector2i& v) const
  {
    GSVector2i ret;
    for (size_t i = 0; i < 4; i++)
      ret.U16[i] = ((mask & (1 << i)) != 0) ? v.U16[i] : U16[i];
    return ret;
  }
  // Dword-wise select by compile-time mask bit per lane.
  template<s32 mask>
  GSVector2i blend32(const GSVector2i& v) const
  {
    GSVector2i ret;
    for (size_t i = 0; i < 2; i++)
      ret.U32[i] = ((mask & (1 << i)) != 0) ? v.U32[i] : U32[i];
    return ret;
  }
  // Bit-wise select: result = (v & mask) | (*this & ~mask).
  // BUGFIX: previously only (v & mask) was kept, discarding this vector's
  // contribution entirely (compare the GSVector4i implementation).
  GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
  {
    GSVector2i ret;
    ret.U64[0] = (v.U64[0] & mask.U64[0]) | (U64[0] & ~mask.U64[0]);
    return ret;
  }

  // Take odd words (bits 1 and 3 of the blend mask) from 'v'.
  ALWAYS_INLINE GSVector2i mix16(const GSVector2i& v) const { return blend16<0xa>(v); }

  // PSHUFB-style byte shuffle: MSB-set mask bytes produce zero, otherwise the
  // low mask bits index into this vector's bytes.
  // BUGFIX: index must be masked with 0x7 (8 byte lanes); 0xf allowed reads of
  // I8[8..15], past the end of the 8-byte array.
  GSVector2i shuffle8(const GSVector2i& mask) const
  {
    ALL_LANES_8(ret.I8[i] = (mask.I8[i] & 0x80) ? 0 : (I8[mask.I8[i] & 0x7]));
  }

  // Pack with saturation; the single-operand forms duplicate the packed half
  // into both halves of the result (matching the SIMD pack semantics).
  GSVector2i ps16() const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I16[(i < 4) ? i : (i - 4)])); }
  GSVector2i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[(i < 4) ? i : (i - 4)])); }
  GSVector2i ps32() const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I32[(i < 2) ? i : (i - 2)])); }
  GSVector2i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE16(U32[(i < 2) ? i : (i - 2)])); }

  // Unpack-low: interleave the low half of this vector with zeroes.
  GSVector2i upl8() const { return GSVector2i(I8[0], 0, I8[1], 0, I8[2], 0, I8[3], 0); }
  GSVector2i upl16() const { return GSVector2i(I16[0], 0, I16[1], 0); }
  GSVector2i upl32() const { return GSVector2i(I32[0], 0); }

  // Sign-extend the low four bytes to words.
  GSVector2i i8to16() const { ALL_LANES_16(ret.I16[i] = I8[i]); }

  // Whole-vector byte shifts (zero-filling), by compile-time byte count.
  template<s32 v>
  GSVector2i srl() const
  {
    GSVector2i ret = {};
    if constexpr (v < 8)
    {
      for (s32 i = 0; i < (8 - v); i++)
        ret.U8[i] = U8[v + i];
    }
    return ret;
  }
  template<s32 v>
  GSVector2i sll() const
  {
    GSVector2i ret = {};
    if constexpr (v < 8)
    {
      for (s32 i = 0; i < (8 - v); i++)
        ret.U8[v + i] = U8[i];
    }
    return ret;
  }

  // Per-lane 16-bit shifts (logical left/right, arithmetic right; immediate,
  // scalar, and per-lane-variable forms).
  template<s32 v>
  GSVector2i sll16() const
  {
    ALL_LANES_16(ret.U16[i] = U16[i] << v);
  }
  GSVector2i sll16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v); }
  GSVector2i sllv16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v.U16[i]); }
  template<s32 v>
  GSVector2i srl16() const
  {
    ALL_LANES_16(ret.U16[i] = U16[i] >> v);
  }
  GSVector2i srl16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v); }
  GSVector2i srlv16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v.U16[i]); }
  template<s32 v>
  GSVector2i sra16() const
  {
    ALL_LANES_16(ret.I16[i] = I16[i] >> v);
  }
  GSVector2i sra16(s32 v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v); }
  GSVector2i srav16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v.I16[i]); }

  // Per-lane 32-bit shifts, same variants as above.
  template<s32 v>
  GSVector2i sll32() const
  {
    ALL_LANES_32(ret.U32[i] = U32[i] << v);
  }
  GSVector2i sll32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v); }
  GSVector2i sllv32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v.U32[i]); }
  template<s32 v>
  GSVector2i srl32() const
  {
    ALL_LANES_32(ret.U32[i] = U32[i] >> v);
  }
  GSVector2i srl32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v); }
  GSVector2i srlv32(const GSVector2i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v.U32[i]); }
  template<s32 v>
  GSVector2i sra32() const
  {
    ALL_LANES_32(ret.I32[i] = I32[i] >> v);
  }
  GSVector2i sra32(s32 v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v); }
  GSVector2i srav32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v.I32[i]); }

  // Per-lane add/subtract, wrapping and saturating variants.
  GSVector2i add8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] + v.I8[i]); }
  GSVector2i add16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] + v.I16[i]); }
  GSVector2i add32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] + v.I32[i]); }
  GSVector2i adds8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] + v.I8[i])); }
  GSVector2i adds16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] + v.I16[i])); }
  GSVector2i addus8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] + v.U8[i])); }
  GSVector2i addus16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] + v.U16[i])); }
  GSVector2i sub8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] - v.I8[i]); }
  GSVector2i sub16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] - v.I16[i]); }
  GSVector2i sub32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] - v.I32[i]); }
  GSVector2i subs8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] - v.I8[i])); }
  GSVector2i subs16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] - v.I16[i])); }
  GSVector2i subus8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] - v.U8[i])); }
  GSVector2i subus16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] - v.U16[i])); }

  // Per-lane unsigned average (truncating) and low-half multiplies.
  GSVector2i avg8(const GSVector2i& v) const { ALL_LANES_8(ret.U8[i] = (U8[i] + v.U8[i]) >> 1); }
  GSVector2i avg16(const GSVector2i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i]) >> 1); }
  GSVector2i mul16l(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] * v.I16[i]); }
  GSVector2i mul32l(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] * v.I32[i]); }

  // Whole-vector equality (bitwise).
  ALWAYS_INLINE bool eq(const GSVector2i& v) const { return (std::memcmp(I32, v.I32, sizeof(I32))) == 0; }

  // Per-lane comparisons: lanes become all-ones on true, zero on false.
  GSVector2i eq8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] == v.I8[i]) ? -1 : 0); }
  GSVector2i eq16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] == v.I16[i]) ? -1 : 0); }
  GSVector2i eq32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] == v.I32[i]) ? -1 : 0); }
  GSVector2i neq8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] != v.I8[i]) ? -1 : 0); }
  GSVector2i neq16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] != v.I16[i]) ? -1 : 0); }
  GSVector2i neq32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] != v.I32[i]) ? -1 : 0); }
  GSVector2i gt8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] > v.I8[i]) ? -1 : 0); }
  GSVector2i gt16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] > v.I16[i]) ? -1 : 0); }
  GSVector2i gt32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] > v.I32[i]) ? -1 : 0); }
  GSVector2i ge8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] >= v.I8[i]) ? -1 : 0); }
  GSVector2i ge16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] >= v.I16[i]) ? -1 : 0); }
  GSVector2i ge32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] >= v.I32[i]) ? -1 : 0); }
  GSVector2i lt8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] < v.I8[i]) ? -1 : 0); }
  GSVector2i lt16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] < v.I16[i]) ? -1 : 0); }
  GSVector2i lt32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] < v.I32[i]) ? -1 : 0); }
  GSVector2i le8(const GSVector2i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] <= v.I8[i]) ? -1 : 0); }
  GSVector2i le16(const GSVector2i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] <= v.I16[i]) ? -1 : 0); }
  GSVector2i le32(const GSVector2i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] <= v.I32[i]) ? -1 : 0); }

  // (~v) & *this, matching the ANDN intrinsic operand order.
  ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const
  {
    GSVector2i ret;
    ret.U64[0] = (~v.U64[0]) & U64[0];
    return ret;
  }

  // Collect the sign bit of each byte lane into bits 0..7 (PMOVMSKB-style).
  s32 mask() const
  {
    return static_cast<s32>((static_cast<u32>(U8[0] >> 7) << 0) | (static_cast<u32>(U8[1] >> 7) << 1) |
                            (static_cast<u32>(U8[2] >> 7) << 2) | (static_cast<u32>(U8[3] >> 7) << 3) |
                            (static_cast<u32>(U8[4] >> 7) << 4) | (static_cast<u32>(U8[5] >> 7) << 5) |
                            (static_cast<u32>(U8[6] >> 7) << 6) | (static_cast<u32>(U8[7] >> 7) << 7));
  }
  ALWAYS_INLINE bool alltrue() const { return (U64[0] == 0xFFFFFFFFFFFFFFFFULL); }
  ALWAYS_INLINE bool allfalse() const { return (U64[0] == 0); }

  // Lane insert/extract by compile-time index.
  template<s32 i>
  ALWAYS_INLINE GSVector2i insert8(s32 a) const
  {
    GSVector2i ret = *this;
    ret.I8[i] = static_cast<s8>(a);
    return ret;
  }
  template<s32 i>
  ALWAYS_INLINE s32 extract8() const
  {
    return I8[i];
  }
  template<s32 i>
  ALWAYS_INLINE GSVector2i insert16(s32 a) const
  {
    GSVector2i ret = *this;
    ret.I16[i] = static_cast<s16>(a);
    return ret;
  }
  template<s32 i>
  ALWAYS_INLINE s32 extract16() const
  {
    return I16[i];
  }
  template<s32 i>
  ALWAYS_INLINE GSVector2i insert32(s32 a) const
  {
    GSVector2i ret = *this;
    ret.I32[i] = a;
    return ret;
  }
  template<s32 i>
  ALWAYS_INLINE s32 extract32() const
  {
    return I32[i];
  }

  // Load 32 bits into x, zeroing y.
  ALWAYS_INLINE static GSVector2i load32(const void* p)
  {
    GSVector2i ret;
    std::memcpy(&ret.x, p, sizeof(s32));
    ret.y = 0;
    return ret;
  }
  // Load the full 64 bits.
  ALWAYS_INLINE static GSVector2i load(const void* p)
  {
    GSVector2i ret;
    std::memcpy(ret.I32, p, sizeof(ret.I32));
    return ret;
  }
  // Load a scalar into x, zeroing y (MOVD semantics).
  // BUGFIX: y was previously left uninitialized, making any subsequent read of
  // the high lane undefined (compare load32(), which zeroes it).
  ALWAYS_INLINE static GSVector2i load(s32 i)
  {
    GSVector2i ret;
    ret.x = i;
    ret.y = 0;
    return ret;
  }
  ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { std::memcpy(p, v.I32, sizeof(I32)); }
  ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { std::memcpy(p, &v.x, sizeof(s32)); }
  // Extract the low 32 bits as a scalar (MOVD semantics).
  ALWAYS_INLINE static s32 store(const GSVector2i& v) { return v.x; }

  // Bitwise operators act on the whole 64-bit payload.
  ALWAYS_INLINE void operator&=(const GSVector2i& v) { U64[0] &= v.U64[0]; }
  ALWAYS_INLINE void operator|=(const GSVector2i& v) { U64[0] |= v.U64[0]; }
  ALWAYS_INLINE void operator^=(const GSVector2i& v) { U64[0] ^= v.U64[0]; }
  ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
  {
    GSVector2i ret;
    ret.U64[0] = v1.U64[0] & v2.U64[0];
    return ret;
  }
  ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2)
  {
    GSVector2i ret;
    ret.U64[0] = v1.U64[0] | v2.U64[0];
    return ret;
  }
  ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2)
  {
    GSVector2i ret;
    ret.U64[0] = v1.U64[0] ^ v2.U64[0];
    return ret;
  }
  ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, s32 i) { return v & GSVector2i(i); }
  ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, s32 i) { return v | GSVector2i(i); }
  ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, s32 i) { return v ^ GSVector2i(i); }
  // Complement via XOR against all-ones (v == v per-lane).
  ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); }

  ALWAYS_INLINE static constexpr GSVector2i zero() { return GSVector2i::cxpr(0, 0); }

  // Swizzles.
  ALWAYS_INLINE GSVector2i xy() const { return *this; }
  ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(x, x); }
  ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(y, x); }
  ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(y, y); }
};
/// Two-lane float vector, scalar fallback implementation mirroring the
/// GSVector4 interface for half-width use.
class alignas(16) GSVector2
{
  // Tag type so the constexpr constructors don't collide with the runtime ones.
  struct cxpr_init_tag
  {
  };
  static constexpr cxpr_init_tag cxpr_init{};

  constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}
  // Integer variant writes raw bit patterns into the lanes (for masks/constants).
  constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}

public:
  // All views alias the same 8 bytes of lane data.
  // BUGFIX: the arrays were sized for a 4-wide vector (F32[4], I8[16], ...);
  // load()/store() copy sizeof(F32) bytes, so the oversized arrays caused
  // 16-byte reads/writes against callers' 8-byte buffers.
  union
  {
    struct
    {
      float x, y;
    };
    struct
    {
      float r, g;
    };
    float F32[2];
    double F64[1];
    s8 I8[8];
    s16 I16[4];
    s32 I32[2];
    s64 I64[1];
    u8 U8[8];
    u16 U16[4];
    u32 U32[2];
    u64 U64[1];
  };

  GSVector2() = default;

  // Compile-time constructors for constants.
  constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }
  constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }
  constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }
  constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }

  ALWAYS_INLINE GSVector2(float x, float y)
  {
    this->x = x;
    this->y = y;
  }
  ALWAYS_INLINE GSVector2(int x, int y)
  {
    this->x = static_cast<float>(x);
    this->y = static_cast<float>(y);
  }
  ALWAYS_INLINE explicit GSVector2(float f) { x = y = f; }
  ALWAYS_INLINE explicit GSVector2(int i) { x = y = static_cast<float>(i); }

  // Int->float conversion; defined after both classes.
  ALWAYS_INLINE explicit GSVector2(const GSVector2i& v);
  // Bit-level reinterpretation of integer lanes; defined after both classes.
  ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);

  ALWAYS_INLINE void operator=(float f) { x = y = f; }

  GSVector2 abs() const { return GSVector2(std::fabs(x), std::fabs(y)); }
  GSVector2 neg() const { return GSVector2(-x, -y); }
  GSVector2 rcp() const { return GSVector2(1.0f / x, 1.0f / y); }
  // Reciprocal with one Newton-Raphson refinement step.
  GSVector2 rcpnr() const
  {
    GSVector2 v_ = rcp();
    return (v_ + v_) - (v_ * v_) * *this;
  }
  GSVector2 floor() const { return GSVector2(std::floor(x), std::floor(y)); }
  GSVector2 ceil() const { return GSVector2(std::ceil(x), std::ceil(y)); }

  // Per-lane clamp to [min, max].
  GSVector2 sat(const GSVector2& min, const GSVector2& max) const
  {
    return GSVector2(std::clamp(x, min.x, max.x), std::clamp(y, min.y, max.y));
  }
  GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
  GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }
  GSVector2 min(const GSVector2& v) const { return GSVector2(std::min(x, v.x), std::min(y, v.y)); }
  GSVector2 max(const GSVector2& v) const { return GSVector2(std::max(x, v.x), std::max(y, v.y)); }

  // Per-lane select by compile-time mask: lane comes from 'v' when its mask
  // bit is set, otherwise from this vector.
  // BUGFIX: previously both lanes were always taken from 'v', with the mask
  // bits misused as lane indices into v.F32.
  template<int mask>
  GSVector2 blend32(const GSVector2& v) const
  {
    return GSVector2((mask & 1) ? v.x : x, (mask & 2) ? v.y : y);
  }
  // Runtime select: lane comes from 'v' where the mask lane's sign bit is set.
  ALWAYS_INLINE GSVector2 blend32(const GSVector2& v, const GSVector2& mask) const
  {
    return GSVector2((mask.U32[0] & 0x80000000u) ? v.x : x, (mask.U32[1] & 0x80000000u) ? v.y : y);
  }

  // (~v) & *this, matching the ANDNPS operand order.
  ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const
  {
    GSVector2 ret;
    ret.U32[0] = ((~v.U32[0]) & U32[0]);
    ret.U32[1] = ((~v.U32[1]) & U32[1]);
    return ret;
  }

  // Collect the sign bit of each lane into bits 0..1 (MOVMSKPS-style).
  ALWAYS_INLINE int mask() const { return (U32[0] >> 31) | ((U32[1] >> 30) & 2); }
  ALWAYS_INLINE bool alltrue() const { return (U64[0] == 0xFFFFFFFFFFFFFFFFULL); }
  ALWAYS_INLINE bool allfalse() const { return (U64[0] == 0); }

  // Replace NaN lanes of this vector with the corresponding lanes of 'v'
  // (NaN != NaN, so the self-comparison mask is false only for NaN lanes).
  ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }

  template<int src, int dst>
  ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
  {
    GSVector2 ret = *this;
    ret.F32[dst] = v.F32[src];
    return ret;
  }
  // Extract the lane's raw 32-bit pattern as an int.
  template<int i>
  ALWAYS_INLINE int extract32() const
  {
    return I32[i];
  }

  ALWAYS_INLINE static constexpr GSVector2 zero() { return GSVector2::cxpr(0.0f, 0.0f); }
  // All-ones bit pattern (NaN as floats), used as a "true" mask.
  ALWAYS_INLINE static constexpr GSVector2 xffffffff()
  {
    GSVector2 ret = zero();
    ret.U64[0] = ~ret.U64[0];
    return ret;
  }

  ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(f, f); }
  // Load/store the full 8 bytes of lane data.
  ALWAYS_INLINE static GSVector2 load(const void* p)
  {
    GSVector2 ret;
    std::memcpy(ret.F32, p, sizeof(F32));
    return ret;
  }
  ALWAYS_INLINE static void store(void* p, const GSVector2& v) { std::memcpy(p, &v.F32, sizeof(F32)); }

  ALWAYS_INLINE GSVector2 operator-() const { return neg(); }

  void operator+=(const GSVector2& v_)
  {
    x = x + v_.x;
    y = y + v_.y;
  }
  void operator-=(const GSVector2& v_)
  {
    x = x - v_.x;
    y = y - v_.y;
  }
  void operator*=(const GSVector2& v_)
  {
    x = x * v_.x;
    y = y * v_.y;
  }
  void operator/=(const GSVector2& v_)
  {
    x = x / v_.x;
    y = y / v_.y;
  }
  void operator+=(const float v_)
  {
    x = x + v_;
    y = y + v_;
  }
  void operator-=(const float v_)
  {
    x = x - v_;
    y = y - v_;
  }
  void operator*=(const float v_)
  {
    x = x * v_;
    y = y * v_;
  }
  void operator/=(const float v_)
  {
    x = x / v_;
    y = y / v_;
  }

  // Bitwise operators act on the whole 64-bit payload.
  void operator&=(const GSVector2& v_) { U64[0] &= v_.U64[0]; }
  void operator|=(const GSVector2& v_) { U64[0] |= v_.U64[0]; }
  void operator^=(const GSVector2& v_) { U64[0] ^= v_.U64[0]; }

  friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x + v2.x, v1.y + v2.y); }
  friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x - v2.x, v1.y - v2.y); }
  friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x * v2.x, v1.y * v2.y); }
  friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2) { return GSVector2(v1.x / v2.x, v1.y / v2.y); }
  friend GSVector2 operator+(const GSVector2& v, float f) { return GSVector2(v.x + f, v.y + f); }
  friend GSVector2 operator-(const GSVector2& v, float f) { return GSVector2(v.x - f, v.y - f); }
  friend GSVector2 operator*(const GSVector2& v, float f) { return GSVector2(v.x * f, v.y * f); }
  friend GSVector2 operator/(const GSVector2& v, float f) { return GSVector2(v.x / f, v.y / f); }
  friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2)
  {
    GSVector2 ret;
    ret.U64[0] = v1.U64[0] & v2.U64[0];
    return ret;
  }
  ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2)
  {
    GSVector2 ret;
    ret.U64[0] = v1.U64[0] | v2.U64[0];
    return ret;
  }
  ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2)
  {
    GSVector2 ret;
    ret.U64[0] = v1.U64[0] ^ v2.U64[0];
    return ret;
  }

  // Comparisons produce per-lane all-ones/zero masks (CMPPS-style).
  ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2)
  {
    GSVector2 ret;
    ret.I32[0] = (v1.x == v2.x) ? -1 : 0;
    ret.I32[1] = (v1.y == v2.y) ? -1 : 0;
    return ret;
  }
  ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2)
  {
    GSVector2 ret;
    ret.I32[0] = (v1.x != v2.x) ? -1 : 0;
    ret.I32[1] = (v1.y != v2.y) ? -1 : 0;
    return ret;
  }
  ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2)
  {
    GSVector2 ret;
    ret.I32[0] = (v1.x > v2.x) ? -1 : 0;
    ret.I32[1] = (v1.y > v2.y) ? -1 : 0;
    return ret;
  }
  ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2)
  {
    GSVector2 ret;
    ret.I32[0] = (v1.x < v2.x) ? -1 : 0;
    ret.I32[1] = (v1.y < v2.y) ? -1 : 0;
    return ret;
  }
  ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2)
  {
    GSVector2 ret;
    ret.I32[0] = (v1.x >= v2.x) ? -1 : 0;
    ret.I32[1] = (v1.y >= v2.y) ? -1 : 0;
    return ret;
  }
  ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2)
  {
    GSVector2 ret;
    ret.I32[0] = (v1.x <= v2.x) ? -1 : 0;
    ret.I32[1] = (v1.y <= v2.y) ? -1 : 0;
    return ret;
  }

  // Swizzles.
  ALWAYS_INLINE GSVector2 xy() const { return *this; }
  ALWAYS_INLINE GSVector2 xx() const { return GSVector2(x, x); }
  ALWAYS_INLINE GSVector2 yx() const { return GSVector2(y, x); }
  ALWAYS_INLINE GSVector2 yy() const { return GSVector2(y, y); }
};
#undef ALL_LANES_8
#undef ALL_LANES_16
#undef ALL_LANES_32
#define ALL_LANES_8(expr) \
GSVector4i ret; \
@@ -37,10 +849,6 @@ class GSVector4;
for (size_t i = 0; i < 2; i++) \
expr; \
return ret;
// Scalar saturation helpers for the 4-wide vector code below.
// BUGFIX: the 16-bit variants must narrow to s16/u16 — casting the clamped
// value to s8/u8 (as before) truncated it back into the 8-bit range, e.g.
// SSATURATE16(40000) yielded -1 instead of 32767.
#define SSATURATE8(expr) static_cast<s8>(std::clamp<decltype(expr)>(expr, -128, 127))
#define USATURATE8(expr) static_cast<u8>(std::clamp<decltype(expr)>(expr, 0, 255))
#define SSATURATE16(expr) static_cast<s16>(std::clamp<decltype(expr)>(expr, -32768, 32767))
#define USATURATE16(expr) static_cast<u16>(std::clamp<decltype(expr)>(expr, 0, 65535))
class alignas(16) GSVector4i
{
@@ -139,14 +947,7 @@ public:
}
ALWAYS_INLINE GSVector4i(const GSVector4i& v) { std::memcpy(I32, v.I32, sizeof(I32)); }
ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v)
{
x = v.x;
y = v.y;
z = 0;
w = 0;
}
ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) : I32{v.I32[0], v.I32[1], 0, 0} {}
// MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
// so leave the non-constexpr version default
@@ -374,7 +1175,7 @@ public:
{
GSVector4i ret;
for (size_t i = 0; i < 2; i++)
ret.U64[0] = (v.U64[i] & mask.U64[i]) | (U64[i] & ~mask.U64[i]);
ret.U64[i] = (v.U64[i] & mask.U64[i]) | (U64[i] & ~mask.U64[i]);
return ret;
}
@@ -385,14 +1186,20 @@ public:
ALL_LANES_8(ret.I8[i] = (mask.I8[i] & 0x80) ? 0 : (I8[mask.I8[i] & 0xf]));
}
GSVector4i ps16(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8((i < 8) ? I16[i] : v.I16[i])); }
GSVector4i ps16() const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I16[i])); }
GSVector4i pu16(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE16((i < 8) ? U16[i] : v.U16[i])); }
GSVector4i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[i])); }
GSVector4i ps32(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = SSATURATE16((i < 8) ? I32[i] : v.I32[i])); }
GSVector4i ps32() const { ALL_LANES_16(ret.I16[i] = SSATURATE8(I32[i])); }
GSVector4i pu32(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16((i < 8) ? U32[i] : v.U32[i])); }
GSVector4i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE8(U32[i])); }
GSVector4i ps16(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8((i < 8) ? I16[i] : v.I16[i - 8])); }
GSVector4i ps16() const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I16[(i < 8) ? i : (i - 8)])); }
GSVector4i pu16(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8((i < 8) ? U16[i] : v.U16[i - 8])); }
GSVector4i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[(i < 8) ? i : (i - 8)])); }
GSVector4i ps32(const GSVector4i& v) const
{
ALL_LANES_16(ret.U16[i] = SSATURATE16((i < 4) ? I32[i] : v.I32[i - 4]));
}
GSVector4i ps32() const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I32[(i < 4) ? i : (i - 4)])); }
GSVector4i pu32(const GSVector4i& v) const
{
ALL_LANES_16(ret.U16[i] = USATURATE16((i < 4) ? U32[i] : v.U32[i - 4]));
}
GSVector4i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE16(U32[(i < 4) ? i : (i - 4)])); }
GSVector4i upl8(const GSVector4i& v) const
{
@@ -930,19 +1737,8 @@ public:
ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
ALWAYS_INLINE GSVector2i xy() const
{
GSVector2i ret;
storel(&ret, *this);
return ret;
}
ALWAYS_INLINE GSVector2i zw() const
{
GSVector2i ret;
storeh(&ret, *this);
return ret;
}
ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(x, y); }
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(z, w); }
// clang-format off
// l/h/lh not implemented until needed
@@ -1062,26 +1858,11 @@ public:
this->w = 0.0f;
}
ALWAYS_INLINE explicit GSVector4(const GSVector2& v)
{
x = v.x;
y = v.y;
z = 0.0f;
w = 0.0f;
}
ALWAYS_INLINE explicit GSVector4(const GSVector2i& v)
{
x = static_cast<float>(v.x);
y = static_cast<float>(v.y);
z = 0.0f;
w = 0.0f;
}
ALWAYS_INLINE explicit GSVector4(float f) { x = y = z = w = f; }
ALWAYS_INLINE explicit GSVector4(int i) { x = y = z = w = static_cast<float>(i); }
ALWAYS_INLINE explicit GSVector4(const GSVector2& v) : x(v.x), y(v.y), z(0.0f), w(0.0f) {}
ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
@@ -1298,7 +2079,7 @@ public:
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
{
std::memcpy(p, &v.x, sizeof(float));
std::memcpy(p, v.F32, sizeof(F32));
}
ALWAYS_INLINE static void store(float* p, const GSVector4& v) { *p = v.x; }
@@ -1589,6 +2370,33 @@ public:
}
};
// Converts a float vector to an integer vector, lane by lane.
// NOTE(review): the 'truncate' parameter is currently ignored — static_cast
// always truncates toward zero regardless of the flag, as the TODO below notes.
ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate)
{
// TODO: Truncation vs rounding...
x = static_cast<s32>(v.x);
y = static_cast<s32>(v.y);
}
// Converts an integer vector to a float vector, lane by lane (value
// conversion, not a bit reinterpretation — see cast() for the latter).
ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
{
x = static_cast<float>(v.x);
y = static_cast<float>(v.y);
}
// Reinterprets the float vector's bit pattern as integers (no value
// conversion). memcpy is the well-defined way to type-pun; both classes are
// declared alignas(16) with matching size.
ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
{
GSVector2i ret;
std::memcpy(&ret, &v, sizeof(ret));
return ret;
}
// Reinterprets the integer vector's bit pattern as floats (no value
// conversion) — the inverse of GSVector2i::cast().
ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
{
GSVector2 ret;
std::memcpy(&ret, &v, sizeof(ret));
return ret;
}
ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
{
// TODO: Truncation vs rounding...