deform optimized even more

This commit is contained in:
d1str4ught
2026-02-08 07:35:41 +01:00
parent e0be217d3f
commit 9afd6ee906
3 changed files with 140 additions and 94 deletions

View File

@@ -2,66 +2,111 @@
#include <xmmintrin.h>
#include <emmintrin.h>
namespace
{
// Reciprocal of 255; multiplied with 8-bit bone weights to turn them into float scale factors.
constexpr float kInv255 = 1.0f / 255.0f;
// Applies one 4x4 matrix (16 consecutive floats, loaded unaligned in four groups of 4)
// to a position and a normal whose components arrive already broadcast across all lanes.
//
//   position = g0*px + g1*py + g2*pz + g3*pw
//   normal   = g0*nx + g1*ny + g2*nz          (the fourth group is not applied)
//
// Both outputs are fully overwritten. The summation order is kept fixed so results are
// bit-for-bit reproducible.
inline void TransformPositionNormal(
    const float* matrix,
    const __m128 px, const __m128 py, const __m128 pz, const __m128 pw,
    const __m128 nx, const __m128 ny, const __m128 nz,
    __m128& position, __m128& normal)
{
    // Fetch all four groups up front, before any output is written.
    const __m128 group0 = _mm_loadu_ps(&matrix[0]);
    const __m128 group1 = _mm_loadu_ps(&matrix[4]);
    const __m128 group2 = _mm_loadu_ps(&matrix[8]);
    const __m128 group3 = _mm_loadu_ps(&matrix[12]);

    // Position: ((g0*x + g1*y) + g2*z) + g3*w
    const __m128 posXY = _mm_add_ps(_mm_mul_ps(group0, px), _mm_mul_ps(group1, py));
    const __m128 posXYZ = _mm_add_ps(posXY, _mm_mul_ps(group2, pz));
    position = _mm_add_ps(posXYZ, _mm_mul_ps(group3, pw));

    // Normal: (g0*x + g1*y) + g2*z
    const __m128 nrmXY = _mm_add_ps(_mm_mul_ps(group0, nx), _mm_mul_ps(group1, ny));
    normal = _mm_add_ps(nrmXY, _mm_mul_ps(group2, nz));
}
// Adds a single bone's contribution to the running blend.
//
// The broadcast position/normal components are transformed by `matrix` through
// TransformPositionNormal, both results are scaled by (weight * kInv255), and the
// scaled vectors are summed into blendedPosition / blendedNormal in place.
inline void AccumulateWeightedBone(
    __m128& blendedPosition,
    __m128& blendedNormal,
    const float* matrix,
    const granny_uint8 weight,
    const __m128 px, const __m128 py, const __m128 pz, const __m128 pw,
    const __m128 nx, const __m128 ny, const __m128 nz)
{
    // 8-bit weight converted to a float factor and splatted to every lane.
    const float scalarWeight = static_cast<float>(weight) * kInv255;
    const __m128 weightLanes = _mm_set1_ps(scalarWeight);

    __m128 bonePosition;
    __m128 boneNormal;
    TransformPositionNormal(matrix, px, py, pz, pw, nx, ny, nz, bonePosition, boneNormal);

    const __m128 weightedPosition = _mm_mul_ps(bonePosition, weightLanes);
    const __m128 weightedNormal = _mm_mul_ps(boneNormal, weightLanes);

    blendedPosition = _mm_add_ps(blendedPosition, weightedPosition);
    blendedNormal = _mm_add_ps(blendedNormal, weightedNormal);
}
// Writes lanes 0, 1 and 2 of `v` to out[0], out[1] and out[2].
// Lane 3 is dropped and nothing beyond out[2] is touched, so the destination
// only needs room for three floats.
inline void StoreVec3(float* out, const __m128 v)
{
    // Bring lanes 1 and 2 down to lane 0 so each can be read as a scalar.
    const __m128 lane1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 lane2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));

    out[0] = _mm_cvtss_f32(v);
    out[1] = _mm_cvtss_f32(lane1);
    out[2] = _mm_cvtss_f32(lane2);
}
}
void DeformPWNT3432toGrannyPNGBT33332D(granny_int32x Count, void const* SourceInit, void* DestInit,
granny_matrix_4x4 const* Transforms,
granny_int32x CopySize, granny_int32x SourceStride, granny_int32x DestStride)
granny_int32x SourceStride, granny_int32x DestStride)
{
const float inv255 = 1.0f / 255.0f;
const granny_pwnt3432_vertex* src = (const granny_pwnt3432_vertex*)SourceInit;
granny_pnt332_vertex* dst = (granny_pnt332_vertex*)DestInit;
const __m128 pw = _mm_set1_ps(1.0f);
while (Count--) {
const __m128 srcPos = _mm_set_ps(1.0f, src->Position[2], src->Position[1], src->Position[0]);
const __m128 srcNrm = _mm_set_ps(0.0f, src->Normal[2], src->Normal[1], src->Normal[0]);
const __m128 px = _mm_set1_ps(src->Position[0]);
const __m128 py = _mm_set1_ps(src->Position[1]);
const __m128 pz = _mm_set1_ps(src->Position[2]);
const __m128 px = _mm_shuffle_ps(srcPos, srcPos, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 py = _mm_shuffle_ps(srcPos, srcPos, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 pz = _mm_shuffle_ps(srcPos, srcPos, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 pw = _mm_shuffle_ps(srcPos, srcPos, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 nx = _mm_set1_ps(src->Normal[0]);
const __m128 ny = _mm_set1_ps(src->Normal[1]);
const __m128 nz = _mm_set1_ps(src->Normal[2]);
const __m128 nx = _mm_shuffle_ps(srcNrm, srcNrm, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 ny = _mm_shuffle_ps(srcNrm, srcNrm, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 nz = _mm_shuffle_ps(srcNrm, srcNrm, _MM_SHUFFLE(2, 2, 2, 2));
const granny_uint8 w0 = src->BoneWeights[0];
const granny_uint8 w1 = src->BoneWeights[1];
const granny_uint8 w2 = src->BoneWeights[2];
const granny_uint8 w3 = src->BoneWeights[3];
__m128 P = _mm_setzero_ps();
__m128 N = _mm_setzero_ps();
__m128 P;
__m128 N;
for (int i = 0; i < 4; ++i) {
const int bi = src->BoneIndices[i];
const float wS = (float)src->BoneWeights[i] * inv255;
if (wS <= 0.0f) continue;
if (w0 == 255 && ((w1 | w2 | w3) == 0)) {
const float* m = (const float*)(&Transforms[src->BoneIndices[0]]);
TransformPositionNormal(m, px, py, pz, pw, nx, ny, nz, P, N);
const float* m = (const float*)(&Transforms[bi]);
const __m128 rigidWeight = _mm_set1_ps(static_cast<float>(w0) * kInv255);
P = _mm_mul_ps(P, rigidWeight);
N = _mm_mul_ps(N, rigidWeight);
}
else {
P = _mm_setzero_ps();
N = _mm_setzero_ps();
const __m128 r0 = _mm_loadu_ps(m + 0);
const __m128 r1 = _mm_loadu_ps(m + 4);
const __m128 r2 = _mm_loadu_ps(m + 8);
const __m128 r3 = _mm_loadu_ps(m + 12);
__m128 p = _mm_add_ps(_mm_mul_ps(r0, px), _mm_mul_ps(r1, py));
p = _mm_add_ps(p, _mm_mul_ps(r2, pz));
p = _mm_add_ps(p, _mm_mul_ps(r3, pw));
const __m128 w = _mm_set1_ps(wS);
P = _mm_add_ps(P, _mm_mul_ps(p, w));
__m128 n = _mm_add_ps(_mm_mul_ps(r0, nx), _mm_mul_ps(r1, ny));
n = _mm_add_ps(n, _mm_mul_ps(r2, nz));
N = _mm_add_ps(N, _mm_mul_ps(n, w));
if (w0) {
const float* m = (const float*)(&Transforms[src->BoneIndices[0]]);
AccumulateWeightedBone(P, N, m, w0, px, py, pz, pw, nx, ny, nz);
}
if (w1) {
const float* m = (const float*)(&Transforms[src->BoneIndices[1]]);
AccumulateWeightedBone(P, N, m, w1, px, py, pz, pw, nx, ny, nz);
}
if (w2) {
const float* m = (const float*)(&Transforms[src->BoneIndices[2]]);
AccumulateWeightedBone(P, N, m, w2, px, py, pz, pw, nx, ny, nz);
}
if (w3) {
const float* m = (const float*)(&Transforms[src->BoneIndices[3]]);
AccumulateWeightedBone(P, N, m, w3, px, py, pz, pw, nx, ny, nz);
}
}
float pOut[4], nOut[4];
_mm_storeu_ps(pOut, P);
_mm_storeu_ps(nOut, N);
dst->Position[0] = pOut[0];
dst->Position[1] = pOut[1];
dst->Position[2] = pOut[2];
dst->Normal[0] = nOut[0];
dst->Normal[1] = nOut[1];
dst->Normal[2] = nOut[2];
StoreVec3(dst->Position, P);
StoreVec3(dst->Normal, N);
dst->UV[0] = src->UV[0];
dst->UV[1] = src->UV[1];
@@ -73,64 +118,66 @@ void DeformPWNT3432toGrannyPNGBT33332D(granny_int32x Count, void const* SourceIn
void DeformPWNT3432toGrannyPNGBT33332I(granny_int32x Count, void const* SourceInit, void* DestInit,
granny_int32x const* TransformTable, granny_matrix_4x4 const* Transforms,
granny_int32x CopySize, granny_int32x SourceStride, granny_int32x DestStride)
granny_int32x SourceStride, granny_int32x DestStride)
{
const float inv255 = 1.0f / 255.0f;
const granny_pwnt3432_vertex* src = (const granny_pwnt3432_vertex*)SourceInit;
granny_pnt332_vertex* dst = (granny_pnt332_vertex*)DestInit;
const __m128 pw = _mm_set1_ps(1.0f);
while (Count--) {
const __m128 srcPos = _mm_set_ps(1.0f, src->Position[2], src->Position[1], src->Position[0]);
const __m128 srcNrm = _mm_set_ps(0.0f, src->Normal[2], src->Normal[1], src->Normal[0]);
const __m128 px = _mm_set1_ps(src->Position[0]);
const __m128 py = _mm_set1_ps(src->Position[1]);
const __m128 pz = _mm_set1_ps(src->Position[2]);
const __m128 px = _mm_shuffle_ps(srcPos, srcPos, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 py = _mm_shuffle_ps(srcPos, srcPos, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 pz = _mm_shuffle_ps(srcPos, srcPos, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 pw = _mm_shuffle_ps(srcPos, srcPos, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 nx = _mm_set1_ps(src->Normal[0]);
const __m128 ny = _mm_set1_ps(src->Normal[1]);
const __m128 nz = _mm_set1_ps(src->Normal[2]);
const __m128 nx = _mm_shuffle_ps(srcNrm, srcNrm, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 ny = _mm_shuffle_ps(srcNrm, srcNrm, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 nz = _mm_shuffle_ps(srcNrm, srcNrm, _MM_SHUFFLE(2, 2, 2, 2));
const granny_uint8 w0 = src->BoneWeights[0];
const granny_uint8 w1 = src->BoneWeights[1];
const granny_uint8 w2 = src->BoneWeights[2];
const granny_uint8 w3 = src->BoneWeights[3];
__m128 P = _mm_setzero_ps();
__m128 N = _mm_setzero_ps();
for (int i = 0; i < 4; ++i) {
const int bi = TransformTable[src->BoneIndices[i]];
const float wS = (float)src->BoneWeights[i] * inv255;
if (wS <= 0.0f) continue;
__m128 P;
__m128 N;
if (w0 == 255 && ((w1 | w2 | w3) == 0)) {
const int bi = TransformTable[src->BoneIndices[0]];
const float* m = (const float*)(&Transforms[bi]);
TransformPositionNormal(m, px, py, pz, pw, nx, ny, nz, P, N);
const __m128 r0 = _mm_loadu_ps(m + 0);
const __m128 r1 = _mm_loadu_ps(m + 4);
const __m128 r2 = _mm_loadu_ps(m + 8);
const __m128 r3 = _mm_loadu_ps(m + 12);
const __m128 rigidWeight = _mm_set1_ps(static_cast<float>(w0) * kInv255);
P = _mm_mul_ps(P, rigidWeight);
N = _mm_mul_ps(N, rigidWeight);
}
else {
P = _mm_setzero_ps();
N = _mm_setzero_ps();
__m128 p = _mm_add_ps(_mm_mul_ps(r0, px), _mm_mul_ps(r1, py));
p = _mm_add_ps(p, _mm_mul_ps(r2, pz));
p = _mm_add_ps(p, _mm_mul_ps(r3, pw));
const __m128 w = _mm_set1_ps(wS);
P = _mm_add_ps(P, _mm_mul_ps(p, w));
__m128 n = _mm_add_ps(_mm_mul_ps(r0, nx), _mm_mul_ps(r1, ny));
n = _mm_add_ps(n, _mm_mul_ps(r2, nz));
N = _mm_add_ps(N, _mm_mul_ps(n, w));
if (w0) {
const int bi = TransformTable[src->BoneIndices[0]];
const float* m = (const float*)(&Transforms[bi]);
AccumulateWeightedBone(P, N, m, w0, px, py, pz, pw, nx, ny, nz);
}
if (w1) {
const int bi = TransformTable[src->BoneIndices[1]];
const float* m = (const float*)(&Transforms[bi]);
AccumulateWeightedBone(P, N, m, w1, px, py, pz, pw, nx, ny, nz);
}
if (w2) {
const int bi = TransformTable[src->BoneIndices[2]];
const float* m = (const float*)(&Transforms[bi]);
AccumulateWeightedBone(P, N, m, w2, px, py, pz, pw, nx, ny, nz);
}
if (w3) {
const int bi = TransformTable[src->BoneIndices[3]];
const float* m = (const float*)(&Transforms[bi]);
AccumulateWeightedBone(P, N, m, w3, px, py, pz, pw, nx, ny, nz);
}
}
float pOut[4], nOut[4];
_mm_storeu_ps(pOut, P);
_mm_storeu_ps(nOut, N);
dst->Position[0] = pOut[0];
dst->Position[1] = pOut[1];
dst->Position[2] = pOut[2];
dst->Normal[0] = nOut[0];
dst->Normal[1] = nOut[1];
dst->Normal[2] = nOut[2];
StoreVec3(dst->Position, P);
StoreVec3(dst->Normal, N);
dst->UV[0] = src->UV[0];
dst->UV[1] = src->UV[1];
@@ -142,12 +189,12 @@ void DeformPWNT3432toGrannyPNGBT33332I(granny_int32x Count, void const* SourceIn
// Entry point that selects the deform variant from TransformTable:
//  - non-null table: forwards to the ...I variant, passing the table through
//    (annotated [[likely]], i.e. treated as the expected path);
//  - null table: forwards to the ...D variant, which takes no table.
// All other arguments are passed along unchanged.
void DeformPWNT3432toGrannyPNGBT33332(granny_int32x Count, void const* SourceInit, void* DestInit,
granny_int32x const* TransformTable, granny_matrix_4x4 const* Transforms,
granny_int32x CopySize, granny_int32x SourceStride, granny_int32x DestStride)
granny_int32x SourceStride, granny_int32x DestStride)
{
if (TransformTable) [[likely]] {
DeformPWNT3432toGrannyPNGBT33332I(Count, SourceInit, DestInit, TransformTable, Transforms, CopySize, SourceStride, DestStride);
DeformPWNT3432toGrannyPNGBT33332I(Count, SourceInit, DestInit, TransformTable, Transforms, SourceStride, DestStride);
}
else [[unlikely]] {
DeformPWNT3432toGrannyPNGBT33332D(Count, SourceInit, DestInit, Transforms, CopySize, SourceStride, DestStride);
DeformPWNT3432toGrannyPNGBT33332D(Count, SourceInit, DestInit, Transforms, SourceStride, DestStride);
}
}

View File

@@ -3,4 +3,4 @@
void DeformPWNT3432toGrannyPNGBT33332(granny_int32x Count, void const* SourceInit, void* DestInit,
granny_int32x const* TransformTable, granny_matrix_4x4 const* Transforms,
granny_int32x CopySize, granny_int32x SourceStride, granny_int32x DestStride);
granny_int32x SourceStride, granny_int32x DestStride);

View File

@@ -69,7 +69,6 @@ void CGrannyMesh::DeformPNTVertices(void* dstBaseVertices, D3DXMATRIX* boneMatri
boneIndices,
(granny_matrix_4x4 const*)boneMatrices,
sizeof(granny_pwnt3432_vertex),
sizeof(granny_pwnt3432_vertex),
sizeof(granny_pnt332_vertex)
);
}