swr: [rasterizer core] Frontend SIMD16 WIP
Removed temporary scafolding in PA, widended the PA_STATE interface for SIMD16, and implemented PA_STATE_CUT and PA_TESS for SIMD16. PA_STATE_CUT and PA_TESS now work in SIMD16. Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
@@ -217,6 +217,12 @@ struct PA_STATE;
|
||||
typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
|
||||
uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
// function signature for pipeline stages that execute after primitive assembly
|
||||
typedef void(*PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
|
||||
uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
|
||||
|
||||
#endif
|
||||
OSALIGNLINE(struct) API_STATE
|
||||
{
|
||||
// Vertex Buffers
|
||||
|
@@ -1295,7 +1295,7 @@ void ProcessDraw(
|
||||
|
||||
while (pa.HasWork())
|
||||
{
|
||||
// PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
|
||||
// GetNextVsOutput currently has the side effect of updating some PA state machine state.
|
||||
// So we need to keep this outside of (i < endVertex) check.
|
||||
|
||||
simdmask *pvCutIndices_lo = nullptr;
|
||||
@@ -1303,8 +1303,10 @@ void ProcessDraw(
|
||||
|
||||
if (IsIndexedT::value)
|
||||
{
|
||||
pvCutIndices_lo = &pa.GetNextVsIndices();
|
||||
pvCutIndices_hi = &pa.GetNextVsIndices();
|
||||
// simd16mask <=> simdmask[2]
|
||||
|
||||
pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
|
||||
pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
|
||||
}
|
||||
|
||||
simdvertex vout_lo;
|
||||
@@ -1313,7 +1315,7 @@ void ProcessDraw(
|
||||
vsContext_lo.pVout = &vout_lo;
|
||||
vsContext_hi.pVout = &vout_hi;
|
||||
|
||||
simd16vertex &vout = pa.GetNextVsOutput_simd16();
|
||||
simd16vertex &vout = pa.GetNextVsOutput();
|
||||
|
||||
if (i < endVertex)
|
||||
{
|
||||
@@ -1433,12 +1435,13 @@ void ProcessDraw(
|
||||
{
|
||||
SWR_ASSERT(pDC->pState->pfnProcessPrims);
|
||||
|
||||
uint32_t genMask = GenMask(pa.NumPrims_simd16());
|
||||
uint32_t genMask_lo = genMask & 255;
|
||||
uint32_t genMask_hi = (genMask >> 8) & 255;
|
||||
uint32_t mask = GenMask(pa.NumPrims());
|
||||
uint32_t mask_lo = mask & 255;
|
||||
uint32_t mask_hi = (mask >> 8) & 255;
|
||||
|
||||
simdscalari getPrimId_lo = pa.GetPrimID_simd16_lo(work.startPrimID);
|
||||
simdscalari getPrimId_hi = pa.GetPrimID_simd16_hi(work.startPrimID);
|
||||
simd16scalari primid = pa.GetPrimID(work.startPrimID);
|
||||
simdscalari primid_lo = primid.lo;
|
||||
simdscalari primid_hi = primid.hi;
|
||||
|
||||
simdvector prim[MAX_NUM_VERTS_PER_PRIM];
|
||||
|
||||
@@ -1451,10 +1454,9 @@ void ProcessDraw(
|
||||
}
|
||||
|
||||
pa.useAlternateOffset = false;
|
||||
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
|
||||
genMask_lo, getPrimId_lo, _simd_set1_epi32(0));
|
||||
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, mask_lo, primid_lo, _simd_setzero_si());
|
||||
|
||||
if (genMask_hi)
|
||||
if (mask_hi)
|
||||
{
|
||||
for (uint32_t i = 0; i < 3; i += 1)
|
||||
{
|
||||
@@ -1465,8 +1467,7 @@ void ProcessDraw(
|
||||
}
|
||||
|
||||
pa.useAlternateOffset = true;
|
||||
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
|
||||
genMask_hi, getPrimId_hi, _simd_set1_epi32(0));
|
||||
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, mask_hi, primid_hi, _simd_setzero_si());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1543,7 +1544,7 @@ void ProcessDraw(
|
||||
|
||||
while (pa.HasWork())
|
||||
{
|
||||
// PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
|
||||
// GetNextVsOutput currently has the side effect of updating some PA state machine state.
|
||||
// So we need to keep this outside of (i < endVertex) check.
|
||||
simdmask* pvCutIndices = nullptr;
|
||||
if (IsIndexedT::value)
|
||||
|
@@ -34,6 +34,39 @@
|
||||
|
||||
struct PA_STATE
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
enum
|
||||
{
|
||||
SIMD_WIDTH = KNOB_SIMD16_WIDTH,
|
||||
SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
|
||||
SIMD_WIDTH_LOG2 = 4
|
||||
};
|
||||
|
||||
typedef simd16mask SIMDMASK;
|
||||
|
||||
typedef simd16scalar SIMDSCALAR;
|
||||
typedef simd16vector SIMDVECTOR;
|
||||
typedef simd16vertex SIMDVERTEX;
|
||||
|
||||
typedef simd16scalari SIMDSCALARI;
|
||||
|
||||
#else
|
||||
enum
|
||||
{
|
||||
SIMD_WIDTH = KNOB_SIMD_WIDTH,
|
||||
SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
|
||||
SIMD_WIDTH_LOG2 = 3
|
||||
};
|
||||
|
||||
typedef simdmask SIMDMASK;
|
||||
|
||||
typedef simdscalar SIMDSCALAR;
|
||||
typedef simdvector SIMDVECTOR;
|
||||
typedef simdvertex SIMDVERTEX;
|
||||
|
||||
typedef simdscalari SIMDSCALARI;
|
||||
|
||||
#endif
|
||||
DRAW_CONTEXT *pDC{ nullptr }; // draw context
|
||||
uint8_t* pStreamBase{ nullptr }; // vertex stream
|
||||
uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts
|
||||
@@ -60,24 +93,12 @@ struct PA_STATE
|
||||
#endif
|
||||
virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
|
||||
virtual bool NextPrim() = 0;
|
||||
virtual simdvertex& GetNextVsOutput() = 0;
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
virtual simdvertex& GetNextVsOutput_simd16_lo() = 0;
|
||||
virtual simdvertex& GetNextVsOutput_simd16_hi() = 0;
|
||||
virtual simd16vertex& GetNextVsOutput_simd16() = 0;
|
||||
#endif
|
||||
virtual SIMDVERTEX& GetNextVsOutput() = 0;
|
||||
virtual bool GetNextStreamOutput() = 0;
|
||||
virtual simdmask& GetNextVsIndices() = 0;
|
||||
virtual SIMDMASK& GetNextVsIndices() = 0;
|
||||
virtual uint32_t NumPrims() = 0;
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
virtual uint32_t NumPrims_simd16() = 0;
|
||||
#endif
|
||||
virtual void Reset() = 0;
|
||||
virtual simdscalari GetPrimID(uint32_t startID) = 0;
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
virtual simdscalari GetPrimID_simd16_lo(uint32_t startID) = 0;
|
||||
virtual simdscalari GetPrimID_simd16_hi(uint32_t startID) = 0;
|
||||
#endif
|
||||
virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
|
||||
};
|
||||
|
||||
// The Optimized PA is a state machine that assembles triangles from vertex shader simd
|
||||
@@ -98,7 +119,8 @@ struct PA_STATE
|
||||
// cuts
|
||||
struct PA_STATE_OPT : public PA_STATE
|
||||
{
|
||||
simdvertex leadingVertex; // For tri-fan
|
||||
SIMDVERTEX leadingVertex; // For tri-fan
|
||||
|
||||
uint32_t numPrims{ 0 }; // Total number of primitives for draw.
|
||||
uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
|
||||
|
||||
@@ -112,20 +134,22 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
bool reset{ false }; // reset state
|
||||
|
||||
uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
|
||||
simdscalari primID;
|
||||
SIMDSCALARI primID;
|
||||
|
||||
typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
|
||||
typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& state, uint32_t slot, simd16vector verts[]);
|
||||
#endif
|
||||
typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
|
||||
|
||||
PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
|
||||
#endif
|
||||
PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
|
||||
PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr }; // PA state machine function for assembling 16 triangles
|
||||
PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr }; // initial state to set on reset
|
||||
PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
|
||||
#endif
|
||||
|
||||
// state used to advance the PA when Next is called
|
||||
@@ -138,7 +162,7 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
bool nextReset{ false };
|
||||
bool isStreaming{ false };
|
||||
|
||||
simdmask tmpIndices{ 0 }; // temporary index store for unused virtual function
|
||||
SIMDMASK tmpIndices{ 0 }; // temporary index store for unused virtual function
|
||||
|
||||
PA_STATE_OPT() {}
|
||||
PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
|
||||
@@ -221,55 +245,18 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
return morePrims;
|
||||
}
|
||||
|
||||
simdvertex& GetNextVsOutput()
|
||||
SIMDVERTEX& GetNextVsOutput()
|
||||
{
|
||||
// increment cur and prev indices
|
||||
const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH;
|
||||
const uint32_t numSimdVerts = this->streamSizeInVerts / SIMD_WIDTH;
|
||||
this->prev = this->cur; // prev is undefined for first state.
|
||||
this->cur = this->counter % numSimdVerts;
|
||||
|
||||
simdvertex* pVertex = (simdvertex*)pStreamBase;
|
||||
SIMDVERTEX* pVertex = (SIMDVERTEX*)pStreamBase;
|
||||
return pVertex[this->cur];
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
simdvertex& GetNextVsOutput_simd16_lo()
|
||||
{
|
||||
// increment cur and prev indices
|
||||
const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD16_WIDTH;
|
||||
this->prev = this->cur; // prev is undefined for first state.
|
||||
this->cur = this->counter % numSimdVerts;
|
||||
|
||||
simdvertex* pVertex = (simdvertex*)pStreamBase;
|
||||
return pVertex[this->cur * 2];
|
||||
}
|
||||
|
||||
simdvertex& GetNextVsOutput_simd16_hi()
|
||||
{
|
||||
// increment cur and prev indices
|
||||
const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD16_WIDTH;
|
||||
#if 1
|
||||
this->prev = this->cur; // prev is undefined for first state.
|
||||
this->cur = this->counter % numSimdVerts;
|
||||
#endif
|
||||
|
||||
simdvertex* pVertex = (simdvertex*)pStreamBase;
|
||||
return pVertex[this->cur * 2 + 1];
|
||||
}
|
||||
|
||||
simd16vertex& GetNextVsOutput_simd16()
|
||||
{
|
||||
// increment cur and prev indices
|
||||
const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD16_WIDTH;
|
||||
this->prev = this->cur; // prev is undefined for first state.
|
||||
this->cur = this->counter % numSimdVerts;
|
||||
|
||||
simd16vertex* pVertex = (simd16vertex*)pStreamBase;
|
||||
return pVertex[this->cur];
|
||||
}
|
||||
|
||||
#endif
|
||||
simdmask& GetNextVsIndices()
|
||||
SIMDMASK& GetNextVsIndices()
|
||||
{
|
||||
// unused in optimized PA, pass tmp buffer back
|
||||
return tmpIndices;
|
||||
@@ -286,17 +273,9 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
uint32_t NumPrims()
|
||||
{
|
||||
return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
|
||||
(KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH;
|
||||
(SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
uint32_t NumPrims_simd16()
|
||||
{
|
||||
return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
|
||||
(KNOB_SIMD16_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD16_WIDTH;
|
||||
}
|
||||
|
||||
#endif
|
||||
void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
|
||||
PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
|
||||
uint32_t numSimdPrims = 0,
|
||||
@@ -343,33 +322,16 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
this->reset = false;
|
||||
}
|
||||
|
||||
simdscalari GetPrimID(uint32_t startID)
|
||||
SIMDSCALARI GetPrimID(uint32_t startID)
|
||||
{
|
||||
return _simd_add_epi32(this->primID,
|
||||
_simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH)));
|
||||
}
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
|
||||
simdscalari GetPrimID_simd16_lo(uint32_t startID)
|
||||
{
|
||||
#if 1
|
||||
return _simd_add_epi32(this->primID,
|
||||
_simd_set1_epi32(startID + (this->primIDIncr / 2) * (this->numPrimsComplete / KNOB_SIMD_WIDTH) * 2));
|
||||
#if USE_SIMD16_FRONTEND
|
||||
return _simd16_add_epi32(this->primID,
|
||||
_simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
|
||||
#else
|
||||
return _simd_set1_epi32(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
simdscalari GetPrimID_simd16_hi(uint32_t startID)
|
||||
{
|
||||
#if 1
|
||||
return _simd_add_epi32(this->primID,
|
||||
_simd_set1_epi32(startID + (this->primIDIncr / 2) * ((this->numPrimsComplete / KNOB_SIMD_WIDTH) * 2 + 1)));
|
||||
#else
|
||||
return _simd_set1_epi32(0);
|
||||
_simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
// helper C wrappers to avoid having to rewrite all the PA topology state functions
|
||||
@@ -489,22 +451,26 @@ INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
|
||||
// Cut-aware primitive assembler.
|
||||
struct PA_STATE_CUT : public PA_STATE
|
||||
{
|
||||
simdmask* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex
|
||||
SIMDMASK* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex
|
||||
uint32_t numVerts{ 0 }; // number of vertices available in buffer store
|
||||
uint32_t numAttribs{ 0 }; // number of attributes
|
||||
int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled
|
||||
uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw
|
||||
OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather
|
||||
simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
|
||||
#else
|
||||
OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
|
||||
#endif
|
||||
SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
|
||||
uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled
|
||||
uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store
|
||||
uint32_t tailVertex{ 0 }; // beginning vertex currently assembling
|
||||
uint32_t curVertex{ 0 }; // current unprocessed vertex
|
||||
uint32_t startPrimId{ 0 }; // starting prim id
|
||||
simdscalari vPrimId; // vector of prim ID
|
||||
SIMDSCALARI vPrimId; // vector of prim ID
|
||||
bool needOffsets{ false }; // need to compute gather offsets for current SIMD
|
||||
uint32_t vertsPerPrim{ 0 };
|
||||
simdvertex tmpVertex; // temporary simdvertex for unimplemented API
|
||||
SIMDVERTEX tmpVertex; // temporary simdvertex for unimplemented API
|
||||
bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they
|
||||
// are ignored. Fetch shader sends invalid verts on cuts that should be ignored
|
||||
// while the GS sends valid verts for every index
|
||||
@@ -518,7 +484,7 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert
|
||||
|
||||
PA_STATE_CUT() {}
|
||||
PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
|
||||
PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, SIMDMASK* in_pIndices, uint32_t in_numVerts,
|
||||
uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
|
||||
: PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
|
||||
{
|
||||
@@ -535,7 +501,11 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
curIndex = 0;
|
||||
pCutIndices = in_pIndices;
|
||||
memset(indices, 0, sizeof(indices));
|
||||
vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
#if USE_SIMD16_FRONTEND
|
||||
vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
#else
|
||||
vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
#endif
|
||||
reverseWinding = false;
|
||||
adjExtraVert = -1;
|
||||
|
||||
@@ -566,44 +536,18 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
}
|
||||
}
|
||||
|
||||
simdvertex& GetNextVsOutput()
|
||||
SIMDVERTEX& GetNextVsOutput()
|
||||
{
|
||||
uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
|
||||
this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts;
|
||||
uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
|
||||
this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
|
||||
this->needOffsets = true;
|
||||
return ((simdvertex*)pStreamBase)[vertexIndex];
|
||||
return ((SIMDVERTEX*)pStreamBase)[vertexIndex];
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
simdvertex& GetNextVsOutput_simd16_lo()
|
||||
SIMDMASK& GetNextVsIndices()
|
||||
{
|
||||
uint32_t vertexIndex = this->headVertex / KNOB_SIMD16_WIDTH;
|
||||
this->headVertex = (this->headVertex + KNOB_SIMD16_WIDTH) % this->numVerts;
|
||||
this->needOffsets = true;
|
||||
return ((simdvertex*)pStreamBase)[vertexIndex * 2];
|
||||
}
|
||||
|
||||
simdvertex& GetNextVsOutput_simd16_hi()
|
||||
{
|
||||
uint32_t vertexIndex = this->headVertex / KNOB_SIMD16_WIDTH;
|
||||
this->headVertex = (this->headVertex + KNOB_SIMD16_WIDTH) % this->numVerts;
|
||||
this->needOffsets = true;
|
||||
return ((simdvertex*)pStreamBase)[vertexIndex * 2 + 1];
|
||||
}
|
||||
|
||||
simd16vertex& GetNextVsOutput_simd16()
|
||||
{
|
||||
uint32_t vertexIndex = this->headVertex / KNOB_SIMD16_WIDTH;
|
||||
this->headVertex = (this->headVertex + KNOB_SIMD16_WIDTH) % this->numVerts;
|
||||
this->needOffsets = true;
|
||||
return ((simd16vertex*)pStreamBase)[vertexIndex];
|
||||
}
|
||||
|
||||
#endif
|
||||
simdmask& GetNextVsIndices()
|
||||
{
|
||||
uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
|
||||
simdmask* pCurCutIndex = this->pCutIndices + vertexIndex;
|
||||
uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
|
||||
SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
|
||||
return *pCurCutIndex;
|
||||
}
|
||||
|
||||
@@ -611,7 +555,8 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
{
|
||||
// unused
|
||||
SWR_ASSERT(0 && "Not implemented");
|
||||
return this->tmpVertex.attrib[0];
|
||||
static simdvector junk;
|
||||
return junk;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
@@ -626,28 +571,20 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
#endif
|
||||
bool GetNextStreamOutput()
|
||||
{
|
||||
this->headVertex += KNOB_SIMD_WIDTH;
|
||||
this->headVertex += SIMD_WIDTH;
|
||||
this->needOffsets = true;
|
||||
return HasWork();
|
||||
}
|
||||
|
||||
simdscalari GetPrimID(uint32_t startID)
|
||||
SIMDSCALARI GetPrimID(uint32_t startID)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
|
||||
#else
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
simdscalari GetPrimID_simd16_lo(uint32_t startID)
|
||||
{
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
|
||||
}
|
||||
|
||||
simdscalari GetPrimID_simd16_hi(uint32_t startID)
|
||||
{
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID + KNOB_SIMD_WIDTH), this->vPrimId);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void Reset()
|
||||
{
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
@@ -662,7 +599,11 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
this->headVertex = 0;
|
||||
this->reverseWinding = false;
|
||||
this->adjExtraVert = -1;
|
||||
this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
#if USE_SIMD16_FRONTEND
|
||||
this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
#else
|
||||
this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
bool HasWork()
|
||||
@@ -672,7 +613,7 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
|
||||
bool IsVertexStoreFull()
|
||||
{
|
||||
return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex;
|
||||
return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
|
||||
}
|
||||
|
||||
void RestartTopology()
|
||||
@@ -684,8 +625,8 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
|
||||
bool IsCutIndex(uint32_t vertex)
|
||||
{
|
||||
uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH;
|
||||
uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1);
|
||||
uint32_t vertexIndex = vertex / SIMD_WIDTH;
|
||||
uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
|
||||
return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
|
||||
}
|
||||
|
||||
@@ -693,7 +634,7 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
// have assembled SIMD prims
|
||||
void ProcessVerts()
|
||||
{
|
||||
while (this->numPrimsAssembled != KNOB_SIMD_WIDTH &&
|
||||
while (this->numPrimsAssembled != SIMD_WIDTH &&
|
||||
this->numRemainingVerts > 0 &&
|
||||
this->curVertex != this->headVertex)
|
||||
{
|
||||
@@ -724,7 +665,7 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
}
|
||||
|
||||
// special case last primitive for tri strip w/ adj
|
||||
if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
|
||||
if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
|
||||
{
|
||||
(this->*pfnPa)(this->curVertex, true);
|
||||
}
|
||||
@@ -736,13 +677,17 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
// advance tail to the current unsubmitted vertex
|
||||
this->tailVertex = this->curVertex;
|
||||
this->numPrimsAssembled = 0;
|
||||
this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH));
|
||||
#if USE_SIMD16_FRONTEND
|
||||
this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
|
||||
#else
|
||||
this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
|
||||
#endif
|
||||
}
|
||||
|
||||
bool NextPrim()
|
||||
{
|
||||
// if we've assembled enough prims, we can advance to the next set of verts
|
||||
if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0)
|
||||
if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
|
||||
{
|
||||
Advance();
|
||||
}
|
||||
@@ -753,27 +698,37 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
{
|
||||
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
|
||||
{
|
||||
simdscalari vIndices = *(simdscalari*)&this->indices[v][0];
|
||||
SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
|
||||
|
||||
// step to simdvertex batch
|
||||
const uint32_t simdShift = 3; // @todo make knob
|
||||
simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
|
||||
this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex)));
|
||||
const uint32_t simdShift = SIMD_WIDTH_LOG2;
|
||||
#if USE_SIMD16_FRONTEND
|
||||
SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
|
||||
this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(sizeof(SIMDVERTEX)));
|
||||
#else
|
||||
SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
|
||||
this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(SIMDVERTEX)));
|
||||
#endif
|
||||
|
||||
// step to index
|
||||
const uint32_t simdMask = 0x7; // @todo make knob
|
||||
simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
|
||||
const uint32_t simdMask = SIMD_WIDTH - 1;
|
||||
#if USE_SIMD16_FRONTEND
|
||||
SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
|
||||
this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
|
||||
#else
|
||||
SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
|
||||
this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
bool Assemble(uint32_t slot, simdvector result[])
|
||||
bool Assemble(uint32_t slot, simdvector verts[])
|
||||
{
|
||||
// process any outstanding verts
|
||||
ProcessVerts();
|
||||
|
||||
// return false if we don't have enough prims assembled
|
||||
if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0)
|
||||
if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@@ -787,18 +742,28 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
|
||||
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
|
||||
{
|
||||
simdscalari offsets = this->vOffsets[v];
|
||||
SIMDSCALARI offsets = this->vOffsets[v];
|
||||
|
||||
// step to attribute
|
||||
offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
|
||||
#if USE_SIMD16_FRONTEND
|
||||
offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
|
||||
#else
|
||||
offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
|
||||
#endif
|
||||
|
||||
float* pBase = (float*)this->pStreamBase;
|
||||
for (uint32_t c = 0; c < 4; ++c)
|
||||
{
|
||||
result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
|
||||
#if USE_SIMD16_FRONTEND
|
||||
simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
|
||||
|
||||
verts[v].v[c] = useAlternateOffset ? temp.hi : temp.lo;
|
||||
#else
|
||||
verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
|
||||
#endif
|
||||
|
||||
// move base to next component
|
||||
pBase += KNOB_SIMD_WIDTH;
|
||||
pBase += SIMD_WIDTH;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -808,8 +773,49 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
bool Assemble_simd16(uint32_t slot, simd16vector verts[])
|
||||
{
|
||||
SWR_ASSERT(false);
|
||||
return false;
|
||||
// process any outstanding verts
|
||||
ProcessVerts();
|
||||
|
||||
// return false if we don't have enough prims assembled
|
||||
if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// cache off gather offsets given the current SIMD set of indices the first time we get an assemble
|
||||
if (this->needOffsets)
|
||||
{
|
||||
ComputeOffsets();
|
||||
this->needOffsets = false;
|
||||
}
|
||||
|
||||
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
|
||||
{
|
||||
SIMDSCALARI offsets = this->vOffsets[v];
|
||||
|
||||
// step to attribute
|
||||
#if USE_SIMD16_FRONTEND
|
||||
offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
|
||||
#else
|
||||
offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
|
||||
#endif
|
||||
|
||||
float* pBase = (float*)this->pStreamBase;
|
||||
for (uint32_t c = 0; c < 4; ++c)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
|
||||
#else
|
||||
verts[v].v[c].lo = _simd_i32gather_ps(pBase, offsets, 1);
|
||||
verts[v].v[c].hi = _simd_setzero_ps();
|
||||
#endif
|
||||
|
||||
// move base to next component
|
||||
pBase += SIMD_WIDTH;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -819,14 +825,18 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
|
||||
{
|
||||
uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
|
||||
#if USE_SIMD16_FRONTEND
|
||||
uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
|
||||
#else
|
||||
uint32_t offset = pOffset[triIndex];
|
||||
offset += sizeof(simdvector) * slot;
|
||||
#endif
|
||||
offset += sizeof(SIMDVECTOR) * slot;
|
||||
float* pVert = (float*)&tri[v];
|
||||
for (uint32_t c = 0; c < 4; ++c)
|
||||
{
|
||||
float* pComponent = (float*)(this->pStreamBase + offset);
|
||||
pVert[c] = *pComponent;
|
||||
offset += KNOB_SIMD_WIDTH * sizeof(float);
|
||||
offset += SIMD_WIDTH * sizeof(float);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -836,13 +846,6 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
return this->numPrimsAssembled;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
uint32_t NumPrims_simd16()
|
||||
{
|
||||
return this->numPrimsAssembled;
|
||||
}
|
||||
|
||||
#endif
|
||||
// Per-topology functions
|
||||
void ProcessVertTriStrip(uint32_t index, bool finish)
|
||||
{
|
||||
@@ -1188,7 +1191,7 @@ struct PA_TESS : PA_STATE
|
||||
{
|
||||
PA_TESS(
|
||||
DRAW_CONTEXT *in_pDC,
|
||||
const simdscalar* in_pVertData,
|
||||
const SIMDSCALAR* in_pVertData,
|
||||
uint32_t in_attributeStrideInVectors,
|
||||
uint32_t in_numAttributes,
|
||||
uint32_t* (&in_ppIndices)[3],
|
||||
@@ -1201,7 +1204,11 @@ struct PA_TESS : PA_STATE
|
||||
m_numAttributes(in_numAttributes),
|
||||
m_numPrims(in_numPrims)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
m_vPrimId = _simd16_setzero_si();
|
||||
#else
|
||||
m_vPrimId = _simd_setzero_si();
|
||||
#endif
|
||||
binTopology = in_binTopology;
|
||||
m_ppIndices[0] = in_ppIndices[0];
|
||||
m_ppIndices[1] = in_ppIndices[1];
|
||||
@@ -1248,40 +1255,30 @@ struct PA_TESS : PA_STATE
|
||||
}
|
||||
|
||||
#endif
|
||||
static simdscalari GenPrimMask(uint32_t numPrims)
|
||||
static SIMDSCALARI GenPrimMask(uint32_t numPrims)
|
||||
{
|
||||
SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
#else
|
||||
#error "Help, help, I can't get up!"
|
||||
#endif
|
||||
|
||||
return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]);
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
static simd16scalari GenPrimMask_simd16(uint32_t numPrims)
|
||||
{
|
||||
SWR_ASSERT(numPrims <= KNOB_SIMD16_WIDTH);
|
||||
|
||||
static const OSALIGNSIMD16(int32_t) maskGen_16[KNOB_SIMD16_WIDTH * 2] =
|
||||
SWR_ASSERT(numPrims <= SIMD_WIDTH);
|
||||
#if USE_SIMD16_FRONTEND
|
||||
static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
|
||||
return _simd16_loadu_si((const simd16scalari*)&maskGen_16[KNOB_SIMD16_WIDTH - numPrims]);
|
||||
return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
|
||||
#else
|
||||
static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
|
||||
return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
bool Assemble(uint32_t slot, simdvector verts[])
|
||||
{
|
||||
static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented");
|
||||
SWR_ASSERT(slot < m_numAttributes);
|
||||
|
||||
uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
|
||||
@@ -1290,23 +1287,38 @@ struct PA_TESS : PA_STATE
|
||||
return false;
|
||||
}
|
||||
|
||||
simdscalari mask = GenPrimMask(numPrimsToAssemble);
|
||||
SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
|
||||
|
||||
const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
|
||||
for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
|
||||
{
|
||||
simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]);
|
||||
#if USE_SIMD16_FRONTEND
|
||||
SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
|
||||
#else
|
||||
SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
|
||||
#endif
|
||||
|
||||
const float* pBase = pBaseAttrib;
|
||||
for (uint32_t c = 0; c < 4; ++c)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
simd16scalar temp = _simd16_mask_i32gather_ps(
|
||||
_simd16_setzero_ps(),
|
||||
pBase,
|
||||
indices,
|
||||
mask,
|
||||
4 /* gcc doesn't like sizeof(float) */);
|
||||
|
||||
verts[i].v[c] = useAlternateOffset ? temp.hi : temp.lo;
|
||||
#else
|
||||
verts[i].v[c] = _simd_mask_i32gather_ps(
|
||||
_simd_setzero_ps(),
|
||||
pBase,
|
||||
indices,
|
||||
_simd_castsi_ps(mask),
|
||||
4 /* gcc doesn't like sizeof(float) */);
|
||||
pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
|
||||
#endif
|
||||
pBase += m_attributeStrideInVectors * SIMD_WIDTH;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1318,29 +1330,43 @@ struct PA_TESS : PA_STATE
|
||||
{
|
||||
SWR_ASSERT(slot < m_numAttributes);
|
||||
|
||||
uint32_t numPrimsToAssemble = PA_TESS::NumPrims_simd16();
|
||||
uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
|
||||
if (0 == numPrimsToAssemble)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
simd16scalari mask = GenPrimMask_simd16(numPrimsToAssemble);
|
||||
SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
|
||||
|
||||
const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
|
||||
for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
|
||||
{
|
||||
simd16scalari indices = _simd16_load_si((const simd16scalari*)m_ppIndices[i]);
|
||||
#if USE_SIMD16_FRONTEND
|
||||
SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
|
||||
#else
|
||||
SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
|
||||
#endif
|
||||
|
||||
const float* pBase = pBaseAttrib;
|
||||
for (uint32_t c = 0; c < 4; ++c)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
verts[i].v[c] = _simd16_mask_i32gather_ps(
|
||||
_simd16_setzero_ps(),
|
||||
pBase,
|
||||
indices,
|
||||
mask,
|
||||
4 /* gcc doesn't like sizeof(float) */);
|
||||
pBase += m_attributeStrideInVectors * KNOB_SIMD16_WIDTH;
|
||||
#else
|
||||
verts[i].v[c].lo = _simd_mask_i32gather_ps(
|
||||
_simd_setzero_ps(),
|
||||
pBase,
|
||||
indices,
|
||||
_simd_castsi_ps(mask),
|
||||
4 /* gcc doesn't like sizeof(float) */);
|
||||
verts[i].v[c].hi = _simd_setzero_ps();
|
||||
#endif
|
||||
pBase += m_attributeStrideInVectors * SIMD_WIDTH;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1356,14 +1382,18 @@ struct PA_TESS : PA_STATE
|
||||
const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
|
||||
for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
|
||||
#else
|
||||
uint32_t index = m_ppIndices[i][primIndex];
|
||||
#endif
|
||||
const float* pVertData = pVertDataBase;
|
||||
float* pVert = (float*)&verts[i];
|
||||
|
||||
for (uint32_t c = 0; c < 4; ++c)
|
||||
{
|
||||
pVert[c] = pVertData[index];
|
||||
pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
|
||||
pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1379,82 +1409,44 @@ struct PA_TESS : PA_STATE
|
||||
return HasWork();
|
||||
}
|
||||
|
||||
simdvertex& GetNextVsOutput()
|
||||
SIMDVERTEX& GetNextVsOutput()
|
||||
{
|
||||
SWR_ASSERT(0, "%s", __FUNCTION__);
|
||||
static simdvertex junk;
|
||||
static SIMDVERTEX junk;
|
||||
return junk;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
simdvertex& GetNextVsOutput_simd16_lo()
|
||||
{
|
||||
SWR_ASSERT(0, "%s", __FUNCTION__);
|
||||
static simdvertex junk;
|
||||
return junk;
|
||||
}
|
||||
|
||||
simdvertex& GetNextVsOutput_simd16_hi()
|
||||
{
|
||||
SWR_ASSERT(0, "%s", __FUNCTION__);
|
||||
static simdvertex junk;
|
||||
return junk;
|
||||
}
|
||||
|
||||
simd16vertex& GetNextVsOutput_simd16()
|
||||
{
|
||||
SWR_ASSERT(0, "%s", __FUNCTION__);
|
||||
static simd16vertex junk;
|
||||
return junk;
|
||||
}
|
||||
|
||||
#endif
|
||||
bool GetNextStreamOutput()
|
||||
{
|
||||
SWR_ASSERT(0, "%s", __FUNCTION__);
|
||||
return false;
|
||||
}
|
||||
|
||||
simdmask& GetNextVsIndices()
|
||||
SIMDMASK& GetNextVsIndices()
|
||||
{
|
||||
SWR_ASSERT(0, "%s", __FUNCTION__);
|
||||
static simdmask junk;
|
||||
static SIMDMASK junk;
|
||||
return junk;
|
||||
}
|
||||
|
||||
uint32_t NumPrims()
|
||||
{
|
||||
return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH);
|
||||
return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
uint32_t NumPrims_simd16()
|
||||
{
|
||||
return std::min<uint32_t>(m_numPrims, KNOB_SIMD16_WIDTH);
|
||||
}
|
||||
|
||||
#endif
|
||||
void Reset() { SWR_ASSERT(0); };
|
||||
|
||||
simdscalari GetPrimID(uint32_t startID)
|
||||
SIMDSCALARI GetPrimID(uint32_t startID)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
|
||||
#else
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
simdscalari GetPrimID_simd16_lo(uint32_t startID)
|
||||
{
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
|
||||
}
|
||||
|
||||
simdscalari GetPrimID_simd16_hi(uint32_t startID)
|
||||
{
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID + KNOB_SIMD_WIDTH), m_vPrimId);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
const simdscalar* m_pVertexData = nullptr;
|
||||
const SIMDSCALAR* m_pVertexData = nullptr;
|
||||
uint32_t m_attributeStrideInVectors = 0;
|
||||
uint32_t m_numAttributes = 0;
|
||||
uint32_t m_numPrims = 0;
|
||||
@@ -1462,7 +1454,7 @@ private:
|
||||
|
||||
uint32_t m_numVertsPerPrim = 0;
|
||||
|
||||
simdscalari m_vPrimId;
|
||||
SIMDSCALARI m_vPrimId;
|
||||
};
|
||||
|
||||
// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
|
||||
@@ -1486,7 +1478,7 @@ struct PA_FACTORY
|
||||
memset(&indexStore, 0, sizeof(indexStore));
|
||||
uint32_t numAttribs = state.feNumAttributes;
|
||||
|
||||
new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH,
|
||||
new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH,
|
||||
&this->indexStore[0], numVerts, numAttribs, state.topology, false);
|
||||
cutPA = true;
|
||||
}
|
||||
@@ -1494,7 +1486,7 @@ struct PA_FACTORY
|
||||
#endif
|
||||
{
|
||||
uint32_t numPrims = GetNumPrims(in_topo, numVerts);
|
||||
new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false);
|
||||
new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH, false);
|
||||
cutPA = false;
|
||||
}
|
||||
|
||||
@@ -1520,6 +1512,6 @@ struct PA_FACTORY
|
||||
|
||||
PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
|
||||
|
||||
simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
|
||||
simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
|
||||
PA_STATE::SIMDVERTEX vertexStore[MAX_NUM_VERTS_PER_PRIM];
|
||||
PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
|
||||
};
|
||||
|
@@ -245,6 +245,10 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
|
||||
|
||||
#elif KNOB_ARCH >= KNOB_ARCH_AVX2
|
||||
|
||||
const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
|
||||
const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
|
||||
const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
|
||||
|
||||
const simdvector &a = PaGetSimdVector(pa, 0, slot);
|
||||
const simdvector &b = PaGetSimdVector(pa, 1, slot);
|
||||
const simdvector &c = PaGetSimdVector(pa, 2, slot);
|
||||
@@ -253,10 +257,6 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
|
||||
// v1 -> a1 a4 a7 b2 b5 c0 c3 c6
|
||||
// v2 -> a2 a5 b0 b3 b6 c1 c4 c7
|
||||
|
||||
const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
|
||||
const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
|
||||
const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
|
||||
|
||||
simdvector &v0 = verts[0];
|
||||
simdvector &v1 = verts[1];
|
||||
simdvector &v2 = verts[2];
|
||||
@@ -334,7 +334,7 @@ void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m12
|
||||
// We have 12 simdscalars contained within 3 simdvectors which
|
||||
// hold at least 8 triangles worth of data. We want to assemble a single
|
||||
// triangle with data in horizontal form.
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
#if USE_SIMD16_FRONTEND
|
||||
const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
|
||||
const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
|
||||
const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot);
|
||||
@@ -559,7 +559,7 @@ bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
|
||||
}
|
||||
|
||||
// store off leading vertex for attributes
|
||||
simdvertex* pVertex = (simdvertex*)pa.pStreamBase;
|
||||
PA_STATE_OPT::SIMDVERTEX* pVertex = (PA_STATE_OPT::SIMDVERTEX*)pa.pStreamBase;
|
||||
pa.leadingVertex = pVertex[pa.cur];
|
||||
|
||||
SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
|
||||
@@ -568,7 +568,7 @@ bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
|
||||
|
||||
bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
|
||||
{
|
||||
simdvector& leadVert = pa.leadingVertex.attrib[slot];
|
||||
PA_STATE_OPT::SIMDVECTOR& leadVert = pa.leadingVertex.attrib[slot];
|
||||
simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
|
||||
simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
|
||||
simdscalar s;
|
||||
@@ -579,7 +579,11 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
|
||||
simdscalar a0 = a[i];
|
||||
simdscalar b0 = b[i];
|
||||
|
||||
#if USE_SIMD16_FRONTEND
|
||||
__m256 comp = leadVert[i].lo;
|
||||
#else
|
||||
__m256 comp = leadVert[i];
|
||||
#endif
|
||||
simdvector& v0 = verts[0];
|
||||
v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
v0[i] = _mm256_permute2f128_ps(v0[i], comp, 0x00);
|
||||
@@ -599,8 +603,19 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
|
||||
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
|
||||
{
|
||||
// vert 0 from leading vertex
|
||||
simdvector& lead = pa.leadingVertex.attrib[slot];
|
||||
#if USE_SIMD16_FRONTEND
|
||||
PA_STATE_OPT::SIMDVECTOR& temp = pa.leadingVertex.attrib[slot];
|
||||
|
||||
simdvector lead;
|
||||
lead[0] = temp[0].lo;
|
||||
lead[1] = temp[1].lo;
|
||||
lead[2] = temp[2].lo;
|
||||
lead[3] = temp[3].lo;
|
||||
verts[0] = swizzleLane0(lead);
|
||||
#else
|
||||
PA_STATE_OPT::SIMDVECTOR& lead = pa.leadingVertex.attrib[slot];
|
||||
verts[0] = swizzleLane0(lead);
|
||||
#endif
|
||||
|
||||
simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
|
||||
simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
|
||||
@@ -1201,7 +1216,7 @@ void PaRectListSingle0(
|
||||
// We have 12 simdscalars contained within 3 simdvectors which
|
||||
// hold at least 8 triangles worth of data. We want to assemble a single
|
||||
// triangle with data in horizontal form.
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
#if USE_SIMD16_FRONTEND
|
||||
const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
|
||||
const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
|
||||
|
||||
@@ -1417,11 +1432,15 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t*
|
||||
this->pfnPaFuncReset_simd16 = this->pfnPaFunc_simd16;
|
||||
#endif
|
||||
|
||||
// simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
|
||||
// simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3);
|
||||
simdscalari id8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
simdscalari id4 = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
|
||||
#if USE_SIMD16_FRONTEND
|
||||
simd16scalari id16 = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
simd16scalari id82 = _simd16_set_epi32( 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
|
||||
|
||||
#else
|
||||
simdscalari id8 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
simdscalari id4 = _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
|
||||
|
||||
#endif
|
||||
switch(this->binTopology)
|
||||
{
|
||||
case TOP_TRIANGLE_LIST:
|
||||
@@ -1430,18 +1449,33 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t*
|
||||
case TOP_LINE_STRIP:
|
||||
case TOP_LINE_LIST:
|
||||
case TOP_LINE_LOOP:
|
||||
#if USE_SIMD16_FRONTEND
|
||||
this->primIDIncr = 16;
|
||||
this->primID = id16;
|
||||
#else
|
||||
this->primIDIncr = 8;
|
||||
this->primID = id8;
|
||||
#endif
|
||||
break;
|
||||
case TOP_QUAD_LIST:
|
||||
case TOP_QUAD_STRIP:
|
||||
case TOP_RECT_LIST:
|
||||
#if USE_SIMD16_FRONTEND
|
||||
this->primIDIncr = 8;
|
||||
this->primID = id82;
|
||||
#else
|
||||
this->primIDIncr = 4;
|
||||
this->primID = id4;
|
||||
#endif
|
||||
break;
|
||||
case TOP_POINT_LIST:
|
||||
#if USE_SIMD16_FRONTEND
|
||||
this->primIDIncr = 16;
|
||||
this->primID = id16;
|
||||
#else
|
||||
this->primIDIncr = 8;
|
||||
this->primID = id8;
|
||||
#endif
|
||||
break;
|
||||
case TOP_PATCHLIST_1:
|
||||
case TOP_PATCHLIST_2:
|
||||
@@ -1476,8 +1510,13 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t*
|
||||
case TOP_PATCHLIST_31:
|
||||
case TOP_PATCHLIST_32:
|
||||
// Always run KNOB_SIMD_WIDTH number of patches at a time.
|
||||
#if USE_SIMD16_FRONTEND
|
||||
this->primIDIncr = 16;
|
||||
this->primID = id16;
|
||||
#else
|
||||
this->primIDIncr = 8;
|
||||
this->primID = id8;
|
||||
#endif
|
||||
break;
|
||||
|
||||
default:
|
||||
|
Reference in New Issue
Block a user