Enable SSE2 for FS.

This commit is contained in:
michal
2007-09-24 12:32:26 +01:00
parent c0afc92f00
commit c0dd02219d
6 changed files with 310 additions and 38 deletions

View File

@@ -56,30 +56,22 @@ quad_shade_stage(struct quad_stage *qs)
}
/*
 * XSTDCALL: calling-convention qualifier used in the sse2_function pointer
 * type. Expands to __stdcall when WIN32 is defined and to nothing otherwise.
 * An XSTDCALL definition that already exists is left untouched.
 */
#if !defined(XSTDCALL)
#if defined(WIN32)
#define XSTDCALL __stdcall
#else
#define XSTDCALL
#endif
#endif
/**
 * Linearly interpolate one channel of a quad's attribute.
 *
 * For each of the QUAD_SIZE fragments, channel 0 of \p pos is read as x and
 * channel 1 as y, and channel \p ch of \p pos is overwritten with
 *
 *    a0[ch] + dadx[ch] * x + dady[ch] * y
 *
 * \param coef  interpolation coefficients (a0, dadx, dady)
 * \param pos   position vector; channels 0/1 are inputs, channel ch is output
 * \param ch    channel to compute
 *
 * This work should eventually be pushed into the fragment program as:
 *
 *    INPUT[attr] = MAD COEF_A0[attr], COEF_DADX[attr], INPUT_WPOS.xxxx
 *    INPUT[attr] = MAD INPUT[attr], COEF_DADY[attr], INPUT_WPOS.yyyy
 */
static INLINE void
linterp(const struct tgsi_interp_coef *coef,
        struct tgsi_exec_vector *pos, uint ch)
{
   const float *xs = pos->xyzw[0].f;
   const float *ys = pos->xyzw[1].f;
   float *result = pos->xyzw[ch].f;
   uint frag = 0;

   while (frag < QUAD_SIZE) {
      /* Fetch both coordinates before storing; result may alias xs or ys. */
      const float x = xs[frag];
      const float y = ys[frag];

      result[frag] = (coef->a0[ch] +
                      coef->dadx[ch] * x +
                      coef->dady[ch] * y);
      frag++;
   }
}
/*
 * Pointer type for a generated fragment shader entry point. It takes five
 * pointer arguments, in this order: input vectors, output vectors, constants
 * (rows of four floats), temporary vectors and interpolation coefficients.
 * Uses the XSTDCALL calling convention. Only available when USE_X86_ASM or
 * SLANG_X86 is defined.
 */
#if defined(USE_X86_ASM) || defined(SLANG_X86)
typedef void (XSTDCALL *sse2_function)(
const struct tgsi_exec_vector *input,
struct tgsi_exec_vector *output,
float (*constant)[4],
struct tgsi_exec_vector *temporary,
const struct tgsi_interp_coef *coef );
#endif
/* This should be done by the fragment shader execution unit (code
* generated from the decl instructions). Do it here for now.
@@ -127,12 +119,23 @@ shade_quad(
machine.Inputs[0].xyzw[1].f[2] = fy + 1.0f;
machine.Inputs[0].xyzw[1].f[3] = fy + 1.0f;
/* interp Z */
linterp(&quad->coef[0], &machine.Inputs[0], 2); /* Z */
linterp(&quad->coef[0], &machine.Inputs[0], 3); /* 1/W */
/* run shader */
tgsi_exec_machine_run( &machine );
if( softpipe->fs->executable != NULL ) {
#if defined(USE_X86_ASM) || defined(SLANG_X86)
sse2_function func = (sse2_function) softpipe->fs->executable;
func(
machine.Inputs,
machine.Outputs,
machine.Consts,
machine.Temps,
machine.InterpCoefs );
#else
assert( 0 );
#endif
}
else {
tgsi_exec_machine_run( &machine );
}
/* store result color (always in output[1]) */
memcpy(

View File

@@ -114,6 +114,23 @@ get_temp(
(vec * 4 + chan) * 16 );
}
static struct x86_reg
get_coef_base( void )
{
return get_output_base();
}
/**
 * Build the memory operand addressing one interpolation coefficient.
 *
 * The displacement from get_coef_base() is computed as if each vec holds
 * three members of four channels each, every element being 4 bytes wide.
 *
 * \param vec     index of the coefficient set
 * \param chan    channel within the member
 * \param member  member index (callers in this file pass 0, 1 or 2)
 */
static struct x86_reg
get_coef(
   unsigned vec,
   unsigned chan,
   unsigned member )
{
   /* Element index counted from the coefficient base. */
   const unsigned element = (vec * 3 + member) * 4 + chan;

   return x86_make_disp( get_coef_base(), element * 4 );
}
static struct x86_reg
get_addr(
unsigned vec,
@@ -143,7 +160,7 @@ emit_const(
}
static void
emit_input(
emit_inputf(
struct x86_function *func,
unsigned xmm,
unsigned vec,
@@ -155,6 +172,19 @@ emit_input(
get_input( vec, chan ) );
}
static void
emit_inputs(
struct x86_function *func,
unsigned xmm,
unsigned vec,
unsigned chan )
{
sse_movups(
func,
get_input( vec, chan ),
make_xmm( xmm ) );
}
static void
emit_output(
struct x86_function *func,
@@ -182,7 +212,7 @@ emit_tempf(
}
static void
emit_temps (
emit_temps(
struct x86_function *func,
unsigned xmm,
unsigned vec,
@@ -194,6 +224,70 @@ emit_temps (
make_xmm( xmm ) );
}
static void
emit_coef(
struct x86_function *func,
unsigned xmm,
unsigned vec,
unsigned chan,
unsigned member )
{
sse_movss(
func,
make_xmm( xmm ),
get_coef( vec, chan, member ) );
sse_shufps(
func,
make_xmm( xmm ),
make_xmm( xmm ),
SHUF( 0, 0, 0, 0 ) );
}
/**
 * Emit a load of coefficient member 0 (a0) for (vec, chan) into \p xmm.
 */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   const unsigned member_a0 = 0;

   emit_coef( func, xmm, vec, chan, member_a0 );
}
/**
 * Emit a load of coefficient member 1 (dadx) for (vec, chan) into \p xmm.
 */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   const unsigned member_dadx = 1;

   emit_coef( func, xmm, vec, chan, member_dadx );
}
/**
 * Emit a load of coefficient member 2 (dady) for (vec, chan) into \p xmm.
 */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   const unsigned member_dady = 2;

   emit_coef( func, xmm, vec, chan, member_dady );
}
static void
emit_addrf(
struct x86_function *func,
@@ -676,7 +770,7 @@ emit_fetch(
break;
case TGSI_FILE_INPUT:
emit_input(
emit_inputf(
func,
xmm,
reg->SrcRegister.Index,
@@ -1658,6 +1752,76 @@ emit_instruction(
}
}
/**
 * Emit the interpolation code for one declaration token.
 *
 * Only TGSI_FILE_INPUT declarations generate code; anything else is ignored.
 * For every input in the declared range and every channel enabled in the
 * usage mask, the interpolated value is computed from the coefficients and
 * stored back into the input slot:
 *
 *   CONSTANT:     a0
 *   LINEAR:       x * dadx + y * dady + a0
 *   PERSPECTIVE:  (x * dadx + y * dady + a0) * rcp( input[0].w )
 *
 * where x and y are read from channels X and Y of input 0.
 *
 * Channels X and Y of input 0 are never written, because they are the x/y
 * operands of every interpolation. That exclusion applies to input 0 only:
 * the other inputs of a range that starts at 0 keep the full usage mask.
 *
 * XMM registers 0..5 are used as scratch.
 *
 * \param func  function being generated
 * \param decl  declaration to translate
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   unsigned first, last, mask;
   unsigned i, j;

   if( decl->Declaration.File != TGSI_FILE_INPUT ) {
      return;
   }

   assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );

   first = decl->u.DeclarationRange.First;
   last = decl->u.DeclarationRange.Last;
   mask = decl->Declaration.UsageMask;

   for( i = first; i <= last; i++ ) {
      unsigned chan_mask = mask;

      /* Do not touch WPOS.xy -- but only WPOS, not the rest of the range. */
      if( i == 0 ) {
         chan_mask &= ~TGSI_WRITEMASK_XY;
      }

      for( j = 0; j < NUM_CHANNELS; j++ ) {
         if( !(chan_mask & (1 << j)) ) {
            continue;
         }

         switch( decl->Interpolation.Interpolate ) {
         case TGSI_INTERPOLATE_CONSTANT:
            emit_coef_a0( func, 0, i, j );
            emit_inputs( func, 0, i, j );
            break;

         case TGSI_INTERPOLATE_LINEAR:
            emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
            emit_coef_dadx( func, 1, i, j );
            emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
            emit_coef_dady( func, 3, i, j );
            emit_mul( func, 0, 1 );    /* x * dadx */
            emit_coef_a0( func, 4, i, j );
            emit_mul( func, 2, 3 );    /* y * dady */
            emit_add( func, 0, 4 );    /* x * dadx + a0 */
            emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
            emit_inputs( func, 0, i, j );
            break;

         case TGSI_INTERPOLATE_PERSPECTIVE:
            emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
            emit_coef_dadx( func, 1, i, j );
            emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
            emit_coef_dady( func, 3, i, j );
            emit_mul( func, 0, 1 );    /* x * dadx */
            emit_inputf( func, 4, 0, TGSI_SWIZZLE_W );
            emit_coef_a0( func, 5, i, j );
            emit_rcp( func, 4, 4 );    /* 1.0 / w */
            emit_mul( func, 2, 3 );    /* y * dady */
            emit_add( func, 0, 5 );    /* x * dadx + a0 */
            emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
            emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
            emit_inputs( func, 0, i, j );
            break;

         default:
            assert( 0 );
         }
      }
   }
}
unsigned
tgsi_emit_sse2(
struct tgsi_token *tokens,
@@ -1715,4 +1879,82 @@ tgsi_emit_sse2(
return 1;
}
/**
 * Generate x86/SSE2 code for a fragment shader.
 *
 * Fragment shaders are responsible for interpolating shader inputs. Because on
 * x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
 * output, const, temp and coef), the code is split into two phases --
 * DECLARATION and INSTRUCTION phase.
 * GP register holding the output argument is aliased with the coeff argument,
 * as outputs are not needed in the DECLARATION phase. The register is reloaded
 * with the output argument right before the first instruction token is
 * translated.
 *
 * The generated function takes five pointer arguments (see the argument
 * indices loaded below: 0 = input, 1 = output, 2 = const, 3 = temp, 4 = coef).
 *
 * \param tokens  TGSI token stream of the shader
 * \param func    function object receiving the code; its write cursor is
 *                reset to the start of its store first
 * \return 1
 */
unsigned
tgsi_emit_sse2_fs(
   struct tgsi_token *tokens,
   struct x86_function *func )
{
   struct tgsi_parse_context parse;
   boolean instruction_phase = FALSE;

   func->csr = func->store;

   /* DECLARATION phase, do not load output argument. */
   x86_mov(
      func,
      get_input_base(),
      get_argument( 0 ) );
   x86_mov(
      func,
      get_const_base(),
      get_argument( 2 ) );
   x86_mov(
      func,
      get_temp_base(),
      get_argument( 3 ) );
   x86_mov(
      func,
      get_coef_base(),
      get_argument( 4 ) );

   tgsi_parse_init( &parse, tokens );

   while( !tgsi_parse_end_of_tokens( &parse ) ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         emit_declaration(
            func,
            &parse.FullToken.FullDeclaration );
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         if( !instruction_phase ) {
            /* INSTRUCTION phase, overwrite coeff with output. */
            instruction_phase = TRUE;
            x86_mov(
               func,
               get_output_base(),
               get_argument( 1 ) );
         }
         emit_instruction(
            func,
            &parse.FullToken.FullInstruction );
         break;

      default:
         assert( 0 );
      }
   }

   tgsi_parse_free( &parse );

#ifdef WIN32
   /* stdcall: the callee pops its arguments. This entry point takes five
    * 4-byte pointer arguments, so 20 bytes must be released (not 16, which
    * is the size of a four-argument frame).
    */
   x86_retw( func, 20 );
#else
   x86_ret( func );
#endif

   return 1;
}
#endif

View File

@@ -13,6 +13,11 @@ tgsi_emit_sse2(
struct tgsi_token *tokens,
struct x86_function *function );
/**
 * Generate x86/SSE2 code for a fragment shader from its TGSI token stream.
 *
 * \param tokens    TGSI tokens of the shader
 * \param function  function object that receives the generated code
 * \return 1 (the definition has no other return path)
 */
unsigned
tgsi_emit_sse2_fs(
struct tgsi_token *tokens,
struct x86_function *function );
#if defined __cplusplus
} // extern "C"
#endif // defined __cplusplus

View File

@@ -36,6 +36,7 @@
#include "pipe/p_defines.h"
#include "pipe/p_winsys.h"
#include "pipe/tgsi/mesa/mesa_to_tgsi.h"
#include "pipe/tgsi/exec/tgsi_core.h"
#include "pipe/tgsi/exec/tgsi_dump.h"
#include "st_context.h"
@@ -163,6 +164,14 @@ st_translate_fragment_shader(struct st_context *st,
if (TGSI_DEBUG)
tgsi_dump( stfp->tokens, 0/*TGSI_DUMP_VERBOSE*/ );
#if defined(USE_X86_ASM) || defined(SLANG_X86)
if (stfp->sse2_program.csr == stfp->sse2_program.store)
tgsi_emit_sse2_fs( stfp->tokens, &stfp->sse2_program );
if (!cso->state.executable)
((struct cso_fragment_shader*)cso)->state.executable = (void *) x86_get_func( &stfp->sse2_program );
#endif
stfp->dirty = 0;
return cso;

View File

@@ -99,13 +99,16 @@ static struct gl_program *st_new_program( GLcontext *ctx,
}
case GL_FRAGMENT_PROGRAM_ARB:
case GL_FRAGMENT_PROGRAM_NV:
{
case GL_FRAGMENT_PROGRAM_NV: {
struct st_fragment_program *prog = CALLOC_STRUCT(st_fragment_program);
prog->id = program_id++;
prog->dirty = 1;
#if defined(USE_X86_ASM) || defined(SLANG_X86)
x86_init_func( &prog->sse2_program );
#endif
return _mesa_init_fragment_program( ctx,
&prog->Base,
target,
@@ -121,8 +124,7 @@ static void st_delete_program( GLcontext *ctx,
struct gl_program *prog )
{
switch( prog->Target ) {
case GL_VERTEX_PROGRAM_ARB:
{
case GL_VERTEX_PROGRAM_ARB: {
#if defined(USE_X86_ASM) || defined(SLANG_X86)
struct st_vertex_program *p = (struct st_vertex_program *) prog;
@@ -130,7 +132,14 @@ static void st_delete_program( GLcontext *ctx,
#endif
break;
}
case GL_FRAGMENT_PROGRAM_ARB: {
#if defined(USE_X86_ASM) || defined(SLANG_X86)
struct st_fragment_program *p = (struct st_fragment_program *) prog;
x86_release_func( &p->sse2_program );
#endif
break;
}
}
_mesa_delete_program( ctx, prog );
}
@@ -156,7 +165,7 @@ static void st_program_string_notify( GLcontext *ctx,
if (prog == &ctx->FragmentProgram._Current->Base)
st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
p->id = program_id++;
p->id = program_id++;
p->param_state = p->Base.Base.Parameters->StateFlags;
}
else if (target == GL_VERTEX_PROGRAM_ARB) {
@@ -165,7 +174,7 @@ static void st_program_string_notify( GLcontext *ctx,
if (prog == &ctx->VertexProgram._Current->Base)
st->dirty.st |= ST_NEW_VERTEX_PROGRAM;
p->id = program_id++;
p->id = program_id++;
p->param_state = p->Base.Base.Parameters->StateFlags;
/* Also tell tnl about it:

View File

@@ -49,10 +49,14 @@ struct st_fragment_program
GLboolean error; /* If program is malformed for any reason. */
GLuint id; /**< String id, for tracking ProgramStringNotify changes. */
/** The program in TGSI format */
struct tgsi_token tokens[ST_FP_MAX_TOKENS];
GLboolean dirty;
#if defined(USE_X86_ASM) || defined(SLANG_X86)
struct x86_function sse2_program;
#endif
/** Pointer to the corresponding cached shader */
const struct cso_fragment_shader *fs;