Enable SSE2 code generation for fragment shaders (FS).

2007-09-24 12:32:26 +01:00
parent c0afc92f00
commit c0dd02219d
6 changed files with 310 additions and 38 deletions
--- a/src/mesa/pipe/softpipe/sp_quad_fs.c
+++ b/src/mesa/pipe/softpipe/sp_quad_fs.c
@@ -56,30 +56,22 @@ quad_shade_stage(struct quad_stage *qs)
 }


+#if !defined(XSTDCALL) 
+#if defined(WIN32)
+#define XSTDCALL __stdcall
+#else
+#define XSTDCALL
+#endif
+#endif

-
-/**
- * Compute quad's attribute values by linear interpolation.
- *
- * Push into the fp:
- * 
- *   INPUT[attr] = MAD COEF_A0[attr], COEF_DADX[attr], INPUT_WPOS.xxxx
- *   INPUT[attr] = MAD INPUT[attr],   COEF_DADY[attr], INPUT_WPOS.yyyy
- */
-static INLINE void
-linterp(const struct tgsi_interp_coef *coef,
-          struct tgsi_exec_vector *pos, uint ch)
-{
-   uint j;
-   for (j = 0; j < QUAD_SIZE; j++) {
-      const float x = pos->xyzw[0].f[j];
-      const float y = pos->xyzw[1].f[j];
-      pos->xyzw[ch].f[j] = (coef->a0[ch] +
-                            coef->dadx[ch] * x + 
-                            coef->dady[ch] * y);
-   }
-}
-
+#if defined(USE_X86_ASM) || defined(SLANG_X86)
+typedef void (XSTDCALL *sse2_function)(
+   const struct tgsi_exec_vector *input,
+   struct tgsi_exec_vector *output,
+   float (*constant)[4],
+   struct tgsi_exec_vector *temporary,
+   const struct tgsi_interp_coef *coef );
+#endif

 /* This should be done by the fragment shader execution unit (code
 * generated from the decl instructions).  Do it here for now.
@@ -127,12 +119,23 @@ shade_quad(
   machine.Inputs[0].xyzw[1].f[2] = fy + 1.0f;
   machine.Inputs[0].xyzw[1].f[3] = fy + 1.0f;

-   /* interp Z */
-   linterp(&quad->coef[0], &machine.Inputs[0], 2); /* Z */
-   linterp(&quad->coef[0], &machine.Inputs[0], 3); /* 1/W */
-
   /* run shader */
-   tgsi_exec_machine_run( &machine );
+   if( softpipe->fs->executable != NULL ) {
+#if defined(USE_X86_ASM) || defined(SLANG_X86)
+      sse2_function func = (sse2_function) softpipe->fs->executable;
+      func(
+         machine.Inputs,
+         machine.Outputs,
+         machine.Consts,
+         machine.Temps,
+         machine.InterpCoefs );
+#else
+      assert( 0 );
+#endif
+   }
+   else {
+      tgsi_exec_machine_run( &machine );
+   }

   /* store result color (always in output[1]) */
   memcpy(
--- a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
@@ -114,6 +114,23 @@ get_temp(
      (vec * 4 + chan) * 16 );
 }

+static struct x86_reg
+get_coef_base( void )
+{
+   return get_output_base();
+}
+
+static struct x86_reg
+get_coef(
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   return x86_make_disp(
+      get_coef_base(),
+      ((vec * 3 + member) * 4 + chan) * 4 );
+}
+
 static struct x86_reg
 get_addr(
   unsigned vec,
@@ -143,7 +160,7 @@ emit_const(
 }

 static void
-emit_input(
+emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
@@ -155,6 +172,19 @@ emit_input(
      get_input( vec, chan ) );
 }

+static void
+emit_inputs(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   sse_movups(
+      func,
+      get_input( vec, chan ),
+      make_xmm( xmm ) );
+}
+
 static void
 emit_output(
   struct x86_function *func,
@@ -182,7 +212,7 @@ emit_tempf(
 }

 static void
-emit_temps (
+emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
@@ -194,6 +224,70 @@ emit_temps (
      make_xmm( xmm ) );
 }

+static void
+emit_coef(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   sse_movss(
+      func,
+      make_xmm( xmm ),
+      get_coef( vec, chan, member ) );
+   sse_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+static void
+emit_coef_a0(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      0 );
+}
+
+static void
+emit_coef_dadx(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      1 );
+}
+
+static void
+emit_coef_dady(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      2 );
+}
+
 static void
 emit_addrf(
   struct x86_function *func,
@@ -676,7 +770,7 @@ emit_fetch(
         break;

      case TGSI_FILE_INPUT:
-         emit_input(
+         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
@@ -1658,6 +1752,76 @@ emit_instruction(
   }
 }

+/**
+ * Emit code that interpolates the fragment shader inputs named by an INPUT
+ * declaration and stores the results into the input register file.
+ * Declarations for other register files emit nothing.
+ */
+static void
+emit_declaration(
+   struct x86_function *func,
+   struct tgsi_full_declaration *decl )
+{
+   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+      unsigned first, last, mask;
+      unsigned i, j;
+
+      assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
+
+      first = decl->u.DeclarationRange.First;
+      last = decl->u.DeclarationRange.Last;
+      mask = decl->Declaration.UsageMask;
+
+      for( i = first; i <= last; i++ ) {
+         /* Do not touch WPOS.xy: only input 0 carries the window position,
+          * so the XY channels are masked out for that register alone. */
+         const unsigned chan_mask = (i == 0) ? (mask & ~TGSI_WRITEMASK_XY) : mask;
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            if( chan_mask & (1 << j) ) {
+               switch( decl->Interpolation.Interpolate ) {
+               case TGSI_INTERPOLATE_CONSTANT:
+                  emit_coef_a0( func, 0, i, j );
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_LINEAR:
+                  emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_coef_a0( func, 4, i, j );
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_PERSPECTIVE:
+                  emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_inputf( func, 4, 0, TGSI_SWIZZLE_W );
+                  emit_coef_a0( func, 5, i, j );
+                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               default:
+                  assert( 0 );
+               }
+            }
+         }
+      }
+   }
+}
+
 unsigned
 tgsi_emit_sse2(
   struct tgsi_token *tokens,
@@ -1715,4 +1879,82 @@ tgsi_emit_sse2(
   return 1;
 }

+/**
+ * Fragment shaders are responsible for interpolating shader inputs. Because on
+ * x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
+ * output, const, temp and coef), the code is split into two phases --
+ * DECLARATION and INSTRUCTION phase.
+ * The GP register holding the output argument is aliased with the coef
+ * argument, as outputs are not needed in the DECLARATION phase.
+ */
+unsigned
+tgsi_emit_sse2_fs(
+   struct tgsi_token *tokens,
+   struct x86_function *func )
+{
+   struct tgsi_parse_context parse;
+   boolean instruction_phase = FALSE;
+
+   func->csr = func->store;
+
+   /* DECLARATION phase, do not load output argument. */
+   x86_mov(
+      func,
+      get_input_base(),
+      get_argument( 0 ) );
+   x86_mov(
+      func,
+      get_const_base(),
+      get_argument( 2 ) );
+   x86_mov(
+      func,
+      get_temp_base(),
+      get_argument( 3 ) );
+   x86_mov(
+      func,
+      get_coef_base(),
+      get_argument( 4 ) );
+
+   tgsi_parse_init( &parse, tokens );
+
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         emit_declaration(
+            func,
+            &parse.FullToken.FullDeclaration );
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if( !instruction_phase ) {
+            /* INSTRUCTION phase, overwrite coeff with output. */
+            instruction_phase = TRUE;
+            x86_mov(
+               func,
+               get_output_base(),
+               get_argument( 1 ) );
+         }
+         emit_instruction(
+            func,
+            &parse.FullToken.FullInstruction );
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+
+   tgsi_parse_free( &parse );
+
+#ifdef WIN32
+   x86_retw( func, 20 );   /* stdcall: callee pops all 5 pointer args (5 * 4 bytes) */
+#else
+   x86_ret( func );
+#endif
+
+   return 1;
+}
+
 #endif
--- a/src/mesa/pipe/tgsi/exec/tgsi_sse2.h
+++ b/src/mesa/pipe/tgsi/exec/tgsi_sse2.h
@@ -13,6 +13,11 @@ tgsi_emit_sse2(
   struct tgsi_token *tokens,
   struct x86_function *function );

+unsigned
+tgsi_emit_sse2_fs(
+   struct tgsi_token *tokens,
+   struct x86_function *function );
+
 #if defined __cplusplus
 } // extern "C"
 #endif // defined __cplusplus
--- a/src/mesa/state_tracker/st_atom_fs.c
+++ b/src/mesa/state_tracker/st_atom_fs.c
@@ -36,6 +36,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
 #include "pipe/tgsi/mesa/mesa_to_tgsi.h"
+#include "pipe/tgsi/exec/tgsi_core.h"
 #include "pipe/tgsi/exec/tgsi_dump.h"

 #include "st_context.h"
@@ -163,6 +164,14 @@ st_translate_fragment_shader(struct st_context *st,
   if (TGSI_DEBUG)
      tgsi_dump( stfp->tokens, 0/*TGSI_DUMP_VERBOSE*/ );

+#if defined(USE_X86_ASM) || defined(SLANG_X86)
+   if (stfp->sse2_program.csr == stfp->sse2_program.store)
+      tgsi_emit_sse2_fs( stfp->tokens, &stfp->sse2_program );
+
+   if (!cso->state.executable)
+      ((struct cso_fragment_shader*)cso)->state.executable = (void *) x86_get_func( &stfp->sse2_program );
+#endif
+
   stfp->dirty = 0;

   return cso;
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -99,13 +99,16 @@ static struct gl_program *st_new_program( GLcontext *ctx,
   }

   case GL_FRAGMENT_PROGRAM_ARB:
-   case GL_FRAGMENT_PROGRAM_NV:
-   {
+   case GL_FRAGMENT_PROGRAM_NV: {
      struct st_fragment_program *prog = CALLOC_STRUCT(st_fragment_program);

      prog->id = program_id++;
      prog->dirty = 1;

+#if defined(USE_X86_ASM) || defined(SLANG_X86)
+      x86_init_func( &prog->sse2_program );
+#endif
+
      return _mesa_init_fragment_program( ctx, 
 					  &prog->Base,
 					  target, 
@@ -121,8 +124,7 @@ static void st_delete_program( GLcontext *ctx,
 			       struct gl_program *prog )
 {
   switch( prog->Target ) {
-   case GL_VERTEX_PROGRAM_ARB:
-   {
+   case GL_VERTEX_PROGRAM_ARB: {
 #if defined(USE_X86_ASM) || defined(SLANG_X86)
      struct st_vertex_program *p = (struct st_vertex_program *) prog;

@@ -130,7 +132,14 @@ static void st_delete_program( GLcontext *ctx,
 #endif
      break;
   }
+   case GL_FRAGMENT_PROGRAM_ARB: {
+#if defined(USE_X86_ASM) || defined(SLANG_X86)
+      struct st_fragment_program *p = (struct st_fragment_program *) prog;

+      x86_release_func( &p->sse2_program );
+#endif
+      break;
+   }
   }
   _mesa_delete_program( ctx, prog );
 }
@@ -156,7 +165,7 @@ static void st_program_string_notify( GLcontext *ctx,
      if (prog == &ctx->FragmentProgram._Current->Base)
 	 st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;

-      p->id = program_id++;      
+      p->id = program_id++;
      p->param_state = p->Base.Base.Parameters->StateFlags;
   }
   else if (target == GL_VERTEX_PROGRAM_ARB) {
@@ -165,7 +174,7 @@ static void st_program_string_notify( GLcontext *ctx,
      if (prog == &ctx->VertexProgram._Current->Base)
 	 st->dirty.st |= ST_NEW_VERTEX_PROGRAM;

-      p->id = program_id++;      
+      p->id = program_id++;
      p->param_state = p->Base.Base.Parameters->StateFlags;

      /* Also tell tnl about it:
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -49,10 +49,14 @@ struct st_fragment_program
   GLboolean error;             /* If program is malformed for any reason. */
   GLuint id; /**< String id, for tracking ProgramStringNotify changes. */

-
+   /** The program in TGSI format */
   struct tgsi_token tokens[ST_FP_MAX_TOKENS];
   GLboolean dirty;

+#if defined(USE_X86_ASM) || defined(SLANG_X86)
+   struct x86_function  sse2_program;
+#endif
+
   /** Pointer to the corresponding cached shader */
   const struct cso_fragment_shader *fs;