Enable SSE2 for FS.
This commit is contained in:
@@ -56,30 +56,22 @@ quad_shade_stage(struct quad_stage *qs)
|
||||
}
|
||||
|
||||
|
||||
/*
 * Calling-convention qualifier: expands to __stdcall when WIN32 is defined
 * and to nothing otherwise.  A definition supplied earlier is left untouched.
 */
#ifndef XSTDCALL
#  ifdef WIN32
#    define XSTDCALL __stdcall
#  else
#    define XSTDCALL
#  endif
#endif
|
||||
|
||||
|
||||
/**
|
||||
* Compute quad's attribute values by linear interpolation.
|
||||
*
|
||||
* Push into the fp:
|
||||
*
|
||||
* INPUT[attr] = MAD COEF_A0[attr], COEF_DADX[attr], INPUT_WPOS.xxxx
|
||||
* INPUT[attr] = MAD INPUT[attr], COEF_DADY[attr], INPUT_WPOS.yyyy
|
||||
*/
|
||||
static INLINE void
|
||||
linterp(const struct tgsi_interp_coef *coef,
|
||||
struct tgsi_exec_vector *pos, uint ch)
|
||||
{
|
||||
uint j;
|
||||
for (j = 0; j < QUAD_SIZE; j++) {
|
||||
const float x = pos->xyzw[0].f[j];
|
||||
const float y = pos->xyzw[1].f[j];
|
||||
pos->xyzw[ch].f[j] = (coef->a0[ch] +
|
||||
coef->dadx[ch] * x +
|
||||
coef->dady[ch] * y);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(USE_X86_ASM) || defined(SLANG_X86)
/**
 * Pointer to a generated shader routine, called with the XSTDCALL
 * convention.  The five arguments are, in order: the input vectors, the
 * output vectors, the constants (rows of four floats), the temporary
 * vectors and the interpolation coefficients.
 */
typedef void (XSTDCALL *sse2_function)(const struct tgsi_exec_vector *input,
                                       struct tgsi_exec_vector *output,
                                       float (*constant)[4],
                                       struct tgsi_exec_vector *temporary,
                                       const struct tgsi_interp_coef *coef);
#endif
|
||||
|
||||
/* This should be done by the fragment shader execution unit (code
|
||||
* generated from the decl instructions). Do it here for now.
|
||||
@@ -127,12 +119,23 @@ shade_quad(
|
||||
machine.Inputs[0].xyzw[1].f[2] = fy + 1.0f;
|
||||
machine.Inputs[0].xyzw[1].f[3] = fy + 1.0f;
|
||||
|
||||
/* interp Z */
|
||||
linterp(&quad->coef[0], &machine.Inputs[0], 2); /* Z */
|
||||
linterp(&quad->coef[0], &machine.Inputs[0], 3); /* 1/W */
|
||||
|
||||
/* run shader */
|
||||
tgsi_exec_machine_run( &machine );
|
||||
if( softpipe->fs->executable != NULL ) {
|
||||
#if defined(USE_X86_ASM) || defined(SLANG_X86)
|
||||
sse2_function func = (sse2_function) softpipe->fs->executable;
|
||||
func(
|
||||
machine.Inputs,
|
||||
machine.Outputs,
|
||||
machine.Consts,
|
||||
machine.Temps,
|
||||
machine.InterpCoefs );
|
||||
#else
|
||||
assert( 0 );
|
||||
#endif
|
||||
}
|
||||
else {
|
||||
tgsi_exec_machine_run( &machine );
|
||||
}
|
||||
|
||||
/* store result color (always in output[1]) */
|
||||
memcpy(
|
||||
|
@@ -114,6 +114,23 @@ get_temp(
|
||||
(vec * 4 + chan) * 16 );
|
||||
}
|
||||
|
||||
static struct x86_reg
|
||||
get_coef_base( void )
|
||||
{
|
||||
return get_output_base();
|
||||
}
|
||||
|
||||
static struct x86_reg
|
||||
get_coef(
|
||||
unsigned vec,
|
||||
unsigned chan,
|
||||
unsigned member )
|
||||
{
|
||||
return x86_make_disp(
|
||||
get_coef_base(),
|
||||
((vec * 3 + member) * 4 + chan) * 4 );
|
||||
}
|
||||
|
||||
static struct x86_reg
|
||||
get_addr(
|
||||
unsigned vec,
|
||||
@@ -143,7 +160,7 @@ emit_const(
|
||||
}
|
||||
|
||||
static void
|
||||
emit_input(
|
||||
emit_inputf(
|
||||
struct x86_function *func,
|
||||
unsigned xmm,
|
||||
unsigned vec,
|
||||
@@ -155,6 +172,19 @@ emit_input(
|
||||
get_input( vec, chan ) );
|
||||
}
|
||||
|
||||
/**
 * Emit a store of an XMM register to an input slot (the "s" counterpart of
 * emit_inputf): a movups whose destination is the memory operand of input
 * \p vec, channel \p chan, and whose source is XMM register \p xmm.
 */
static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) );
}
|
||||
|
||||
static void
|
||||
emit_output(
|
||||
struct x86_function *func,
|
||||
@@ -182,7 +212,7 @@ emit_tempf(
|
||||
}
|
||||
|
||||
static void
|
||||
emit_temps (
|
||||
emit_temps(
|
||||
struct x86_function *func,
|
||||
unsigned xmm,
|
||||
unsigned vec,
|
||||
@@ -194,6 +224,70 @@ emit_temps (
|
||||
make_xmm( xmm ) );
|
||||
}
|
||||
|
||||
static void
|
||||
emit_coef(
|
||||
struct x86_function *func,
|
||||
unsigned xmm,
|
||||
unsigned vec,
|
||||
unsigned chan,
|
||||
unsigned member )
|
||||
{
|
||||
sse_movss(
|
||||
func,
|
||||
make_xmm( xmm ),
|
||||
get_coef( vec, chan, member ) );
|
||||
sse_shufps(
|
||||
func,
|
||||
make_xmm( xmm ),
|
||||
make_xmm( xmm ),
|
||||
SHUF( 0, 0, 0, 0 ) );
|
||||
}
|
||||
|
||||
/**
 * Emit a load of the a0 coefficient (member 0) of set \p vec, channel
 * \p chan, into XMM register \p xmm.
 */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   const unsigned member_a0 = 0;

   emit_coef( func, xmm, vec, chan, member_a0 );
}
|
||||
|
||||
/**
 * Emit a load of the dadx coefficient (member 1) of set \p vec, channel
 * \p chan, into XMM register \p xmm.
 */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   const unsigned member_dadx = 1;

   emit_coef( func, xmm, vec, chan, member_dadx );
}
|
||||
|
||||
/**
 * Emit a load of the dady coefficient (member 2) of set \p vec, channel
 * \p chan, into XMM register \p xmm.
 */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   const unsigned member_dady = 2;

   emit_coef( func, xmm, vec, chan, member_dady );
}
|
||||
|
||||
static void
|
||||
emit_addrf(
|
||||
struct x86_function *func,
|
||||
@@ -676,7 +770,7 @@ emit_fetch(
|
||||
break;
|
||||
|
||||
case TGSI_FILE_INPUT:
|
||||
emit_input(
|
||||
emit_inputf(
|
||||
func,
|
||||
xmm,
|
||||
reg->SrcRegister.Index,
|
||||
@@ -1658,6 +1752,76 @@ emit_instruction(
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
emit_declaration(
|
||||
struct x86_function *func,
|
||||
struct tgsi_full_declaration *decl )
|
||||
{
|
||||
if( decl->Declaration.File == TGSI_FILE_INPUT ) {
|
||||
unsigned first, last, mask;
|
||||
unsigned i, j;
|
||||
|
||||
assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
|
||||
|
||||
first = decl->u.DeclarationRange.First;
|
||||
last = decl->u.DeclarationRange.Last;
|
||||
mask = decl->Declaration.UsageMask;
|
||||
|
||||
/* Do not touch WPOS.xy */
|
||||
if( first == 0 ) {
|
||||
mask &= ~TGSI_WRITEMASK_XY;
|
||||
if( mask == TGSI_WRITEMASK_NONE ) {
|
||||
first++;
|
||||
}
|
||||
}
|
||||
|
||||
for( i = first; i <= last; i++ ) {
|
||||
for( j = 0; j < NUM_CHANNELS; j++ ) {
|
||||
if( mask & (1 << j) ) {
|
||||
switch( decl->Interpolation.Interpolate ) {
|
||||
case TGSI_INTERPOLATE_CONSTANT:
|
||||
emit_coef_a0( func, 0, i, j );
|
||||
emit_inputs( func, 0, i, j );
|
||||
break;
|
||||
|
||||
case TGSI_INTERPOLATE_LINEAR:
|
||||
emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
|
||||
emit_coef_dadx( func, 1, i, j );
|
||||
emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
|
||||
emit_coef_dady( func, 3, i, j );
|
||||
emit_mul( func, 0, 1 ); /* x * dadx */
|
||||
emit_coef_a0( func, 4, i, j );
|
||||
emit_mul( func, 2, 3 ); /* y * dady */
|
||||
emit_add( func, 0, 4 ); /* x * dadx + a0 */
|
||||
emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
|
||||
emit_inputs( func, 0, i, j );
|
||||
break;
|
||||
|
||||
case TGSI_INTERPOLATE_PERSPECTIVE:
|
||||
emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
|
||||
emit_coef_dadx( func, 1, i, j );
|
||||
emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
|
||||
emit_coef_dady( func, 3, i, j );
|
||||
emit_mul( func, 0, 1 ); /* x * dadx */
|
||||
emit_inputf( func, 4, 0, TGSI_SWIZZLE_W );
|
||||
emit_coef_a0( func, 5, i, j );
|
||||
emit_rcp( func, 4, 4 ); /* 1.0 / w */
|
||||
emit_mul( func, 2, 3 ); /* y * dady */
|
||||
emit_add( func, 0, 5 ); /* x * dadx + a0 */
|
||||
emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
|
||||
emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
|
||||
emit_inputs( func, 0, i, j );
|
||||
break;
|
||||
|
||||
default:
|
||||
assert( 0 );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsigned
|
||||
tgsi_emit_sse2(
|
||||
struct tgsi_token *tokens,
|
||||
@@ -1715,4 +1879,82 @@ tgsi_emit_sse2(
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fragment shaders are responsible for interpolating shader inputs. Because on
|
||||
* x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
|
||||
* output, const, temp and coef), the code is split into two phases --
|
||||
* DECLARATION and INSTRUCTION phase.
|
||||
* GP register holding the output argument is aliased with the coeff argument,
|
||||
* as outputs are not needed in the DECLARATION phase.
|
||||
*/
|
||||
unsigned
|
||||
tgsi_emit_sse2_fs(
|
||||
struct tgsi_token *tokens,
|
||||
struct x86_function *func )
|
||||
{
|
||||
struct tgsi_parse_context parse;
|
||||
boolean instruction_phase = FALSE;
|
||||
|
||||
func->csr = func->store;
|
||||
|
||||
/* DECLARATION phase, do not load output argument. */
|
||||
x86_mov(
|
||||
func,
|
||||
get_input_base(),
|
||||
get_argument( 0 ) );
|
||||
x86_mov(
|
||||
func,
|
||||
get_const_base(),
|
||||
get_argument( 2 ) );
|
||||
x86_mov(
|
||||
func,
|
||||
get_temp_base(),
|
||||
get_argument( 3 ) );
|
||||
x86_mov(
|
||||
func,
|
||||
get_coef_base(),
|
||||
get_argument( 4 ) );
|
||||
|
||||
tgsi_parse_init( &parse, tokens );
|
||||
|
||||
while( !tgsi_parse_end_of_tokens( &parse ) ) {
|
||||
tgsi_parse_token( &parse );
|
||||
|
||||
switch( parse.FullToken.Token.Type ) {
|
||||
case TGSI_TOKEN_TYPE_DECLARATION:
|
||||
emit_declaration(
|
||||
func,
|
||||
&parse.FullToken.FullDeclaration );
|
||||
break;
|
||||
|
||||
case TGSI_TOKEN_TYPE_INSTRUCTION:
|
||||
if( !instruction_phase ) {
|
||||
/* INSTRUCTION phase, overwrite coeff with output. */
|
||||
instruction_phase = TRUE;
|
||||
x86_mov(
|
||||
func,
|
||||
get_output_base(),
|
||||
get_argument( 1 ) );
|
||||
}
|
||||
emit_instruction(
|
||||
func,
|
||||
&parse.FullToken.FullInstruction );
|
||||
break;
|
||||
|
||||
default:
|
||||
assert( 0 );
|
||||
}
|
||||
}
|
||||
|
||||
tgsi_parse_free( &parse );
|
||||
|
||||
#ifdef WIN32
|
||||
x86_retw( func, 16 );
|
||||
#else
|
||||
x86_ret( func );
|
||||
#endif
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -13,6 +13,11 @@ tgsi_emit_sse2(
|
||||
struct tgsi_token *tokens,
|
||||
struct x86_function *function );
|
||||
|
||||
/**
 * Fragment shader variant of the SSE2 emitter: generates code for the TGSI
 * program \p tokens into \p function.
 */
unsigned
tgsi_emit_sse2_fs( struct tgsi_token *tokens,
                   struct x86_function *function );
|
||||
|
||||
#if defined __cplusplus
|
||||
} // extern "C"
|
||||
#endif // defined __cplusplus
|
||||
|
@@ -36,6 +36,7 @@
|
||||
#include "pipe/p_defines.h"
|
||||
#include "pipe/p_winsys.h"
|
||||
#include "pipe/tgsi/mesa/mesa_to_tgsi.h"
|
||||
#include "pipe/tgsi/exec/tgsi_core.h"
|
||||
#include "pipe/tgsi/exec/tgsi_dump.h"
|
||||
|
||||
#include "st_context.h"
|
||||
@@ -163,6 +164,14 @@ st_translate_fragment_shader(struct st_context *st,
|
||||
if (TGSI_DEBUG)
|
||||
tgsi_dump( stfp->tokens, 0/*TGSI_DUMP_VERBOSE*/ );
|
||||
|
||||
#if defined(USE_X86_ASM) || defined(SLANG_X86)
|
||||
if (stfp->sse2_program.csr == stfp->sse2_program.store)
|
||||
tgsi_emit_sse2_fs( stfp->tokens, &stfp->sse2_program );
|
||||
|
||||
if (!cso->state.executable)
|
||||
((struct cso_fragment_shader*)cso)->state.executable = (void *) x86_get_func( &stfp->sse2_program );
|
||||
#endif
|
||||
|
||||
stfp->dirty = 0;
|
||||
|
||||
return cso;
|
||||
|
@@ -99,13 +99,16 @@ static struct gl_program *st_new_program( GLcontext *ctx,
|
||||
}
|
||||
|
||||
case GL_FRAGMENT_PROGRAM_ARB:
|
||||
case GL_FRAGMENT_PROGRAM_NV:
|
||||
{
|
||||
case GL_FRAGMENT_PROGRAM_NV: {
|
||||
struct st_fragment_program *prog = CALLOC_STRUCT(st_fragment_program);
|
||||
|
||||
prog->id = program_id++;
|
||||
prog->dirty = 1;
|
||||
|
||||
#if defined(USE_X86_ASM) || defined(SLANG_X86)
|
||||
x86_init_func( &prog->sse2_program );
|
||||
#endif
|
||||
|
||||
return _mesa_init_fragment_program( ctx,
|
||||
&prog->Base,
|
||||
target,
|
||||
@@ -121,8 +124,7 @@ static void st_delete_program( GLcontext *ctx,
|
||||
struct gl_program *prog )
|
||||
{
|
||||
switch( prog->Target ) {
|
||||
case GL_VERTEX_PROGRAM_ARB:
|
||||
{
|
||||
case GL_VERTEX_PROGRAM_ARB: {
|
||||
#if defined(USE_X86_ASM) || defined(SLANG_X86)
|
||||
struct st_vertex_program *p = (struct st_vertex_program *) prog;
|
||||
|
||||
@@ -130,7 +132,14 @@ static void st_delete_program( GLcontext *ctx,
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
case GL_FRAGMENT_PROGRAM_ARB: {
|
||||
#if defined(USE_X86_ASM) || defined(SLANG_X86)
|
||||
struct st_fragment_program *p = (struct st_fragment_program *) prog;
|
||||
|
||||
x86_release_func( &p->sse2_program );
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
_mesa_delete_program( ctx, prog );
|
||||
}
|
||||
@@ -156,7 +165,7 @@ static void st_program_string_notify( GLcontext *ctx,
|
||||
if (prog == &ctx->FragmentProgram._Current->Base)
|
||||
st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
|
||||
|
||||
p->id = program_id++;
|
||||
p->id = program_id++;
|
||||
p->param_state = p->Base.Base.Parameters->StateFlags;
|
||||
}
|
||||
else if (target == GL_VERTEX_PROGRAM_ARB) {
|
||||
@@ -165,7 +174,7 @@ static void st_program_string_notify( GLcontext *ctx,
|
||||
if (prog == &ctx->VertexProgram._Current->Base)
|
||||
st->dirty.st |= ST_NEW_VERTEX_PROGRAM;
|
||||
|
||||
p->id = program_id++;
|
||||
p->id = program_id++;
|
||||
p->param_state = p->Base.Base.Parameters->StateFlags;
|
||||
|
||||
/* Also tell tnl about it:
|
||||
|
@@ -49,10 +49,14 @@ struct st_fragment_program
|
||||
GLboolean error; /* If program is malformed for any reason. */
|
||||
GLuint id; /**< String id, for tracking ProgramStringNotify changes. */
|
||||
|
||||
|
||||
/** The program in TGSI format */
|
||||
struct tgsi_token tokens[ST_FP_MAX_TOKENS];
|
||||
GLboolean dirty;
|
||||
|
||||
#if defined(USE_X86_ASM) || defined(SLANG_X86)
|
||||
struct x86_function sse2_program;
|
||||
#endif
|
||||
|
||||
/** Pointer to the corresponding cached shader */
|
||||
const struct cso_fragment_shader *fs;
|
||||
|
||||
|
Reference in New Issue
Block a user