swrast: initial multi-threaded span rendering
Optional parallel rendering of spans using OpenMP. Initial implementation for aa triangles. A new option for scons is also provided to activate the openmp support (off by default). Signed-off-by: Brian Paul <brianp@vmware.com>
This commit is contained in:

committed by
Brian Paul

parent
fa351bd2e0
commit
e411cd7b0a
@@ -88,6 +88,7 @@ def AddOptions(opts):
|
||||
opts.Add('toolchain', 'compiler toolchain', default_toolchain)
|
||||
opts.Add(BoolOption('gles', 'EXPERIMENTAL: enable OpenGL ES support', 'no'))
|
||||
opts.Add(BoolOption('llvm', 'use LLVM', default_llvm))
|
||||
opts.Add(BoolOption('openmp', 'EXPERIMENTAL: compile with openmp (swrast)', 'no'))
|
||||
opts.Add(BoolOption('debug', 'DEPRECATED: debug build', 'yes'))
|
||||
opts.Add(BoolOption('profile', 'DEPRECATED: profile build', 'no'))
|
||||
opts.Add(BoolOption('quiet', 'DEPRECATED: profile build', 'yes'))
|
||||
|
@@ -596,6 +596,18 @@ def generate(env):
|
||||
libs += ['m', 'pthread', 'dl']
|
||||
env.Append(LIBS = libs)
|
||||
|
||||
# OpenMP
|
||||
if env['openmp']:
|
||||
if env['msvc']:
|
||||
env.Append(CCFLAGS = ['/openmp'])
|
||||
# When building openmp release VS2008 link.exe crashes with LNK1103 error.
|
||||
# Workaround: overwrite PDB flags with empty value as it isn't required anyways
|
||||
if env['build'] == 'release':
|
||||
env['PDB'] = ''
|
||||
if env['gcc']:
|
||||
env.Append(CCFLAGS = ['-fopenmp'])
|
||||
env.Append(LIBS = ['gomp'])
|
||||
|
||||
# Load tools
|
||||
env.Tool('lex')
|
||||
env.Tool('yacc')
|
||||
|
@@ -181,13 +181,20 @@
|
||||
const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
|
||||
const GLfloat dxdy = majDx / majDy;
|
||||
const GLfloat xAdj = dxdy < 0.0F ? -dxdy : 0.0F;
|
||||
GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
|
||||
GLint iy;
|
||||
for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
|
||||
#endif
|
||||
for (iy = iyMin; iy < iyMax; iy++) {
|
||||
GLfloat x = pMin[0] - (yMin - iy) * dxdy;
|
||||
GLint ix, startX = (GLint) (x - xAdj);
|
||||
GLuint count;
|
||||
GLfloat coverage = 0.0F;
|
||||
|
||||
#ifdef _OPENMP
|
||||
/* each thread needs to use a different (global) SpanArrays variable */
|
||||
span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
|
||||
#endif
|
||||
/* skip over fragments with zero coverage */
|
||||
while (startX < MAX_WIDTH) {
|
||||
coverage = compute_coveragef(pMin, pMid, pMax, startX, iy);
|
||||
@@ -228,13 +235,12 @@
|
||||
coverage = compute_coveragef(pMin, pMid, pMax, ix, iy);
|
||||
}
|
||||
|
||||
if (ix <= startX)
|
||||
continue;
|
||||
|
||||
span.x = startX;
|
||||
span.y = iy;
|
||||
span.end = (GLuint) ix - (GLuint) startX;
|
||||
_swrast_write_rgba_span(ctx, &span);
|
||||
if (ix > startX) {
|
||||
span.x = startX;
|
||||
span.y = iy;
|
||||
span.end = (GLuint) ix - (GLuint) startX;
|
||||
_swrast_write_rgba_span(ctx, &span);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
@@ -244,13 +250,20 @@
|
||||
const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
|
||||
const GLfloat dxdy = majDx / majDy;
|
||||
const GLfloat xAdj = dxdy > 0 ? dxdy : 0.0F;
|
||||
GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
|
||||
GLint iy;
|
||||
for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
|
||||
#endif
|
||||
for (iy = iyMin; iy < iyMax; iy++) {
|
||||
GLfloat x = pMin[0] - (yMin - iy) * dxdy;
|
||||
GLint ix, left, startX = (GLint) (x + xAdj);
|
||||
GLuint count, n;
|
||||
GLfloat coverage = 0.0F;
|
||||
|
||||
#ifdef _OPENMP
|
||||
/* each thread needs to use a different (global) SpanArrays variable */
|
||||
span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
|
||||
#endif
|
||||
/* make sure we're not past the window edge */
|
||||
if (startX >= ctx->DrawBuffer->_Xmax) {
|
||||
startX = ctx->DrawBuffer->_Xmax - 1;
|
||||
@@ -296,31 +309,30 @@
|
||||
ATTRIB_LOOP_END
|
||||
#endif
|
||||
|
||||
if (startX <= ix)
|
||||
continue;
|
||||
if (startX > ix) {
|
||||
n = (GLuint) startX - (GLuint) ix;
|
||||
|
||||
n = (GLuint) startX - (GLuint) ix;
|
||||
left = ix + 1;
|
||||
|
||||
left = ix + 1;
|
||||
|
||||
/* shift all values to the left */
|
||||
/* XXX this is temporary */
|
||||
{
|
||||
SWspanarrays *array = span.array;
|
||||
GLint j;
|
||||
for (j = 0; j < (GLint) n; j++) {
|
||||
array->coverage[j] = array->coverage[j + left];
|
||||
COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
|
||||
/* shift all values to the left */
|
||||
/* XXX this is temporary */
|
||||
{
|
||||
SWspanarrays *array = span.array;
|
||||
GLint j;
|
||||
for (j = 0; j < (GLint) n; j++) {
|
||||
array->coverage[j] = array->coverage[j + left];
|
||||
COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
|
||||
#ifdef DO_Z
|
||||
array->z[j] = array->z[j + left];
|
||||
array->z[j] = array->z[j + left];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
span.x = left;
|
||||
span.y = iy;
|
||||
span.end = n;
|
||||
_swrast_write_rgba_span(ctx, &span);
|
||||
span.x = left;
|
||||
span.y = iy;
|
||||
span.end = n;
|
||||
_swrast_write_rgba_span(ctx, &span);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -772,6 +772,11 @@ _swrast_CreateContext( struct gl_context *ctx )
|
||||
{
|
||||
GLuint i;
|
||||
SWcontext *swrast = (SWcontext *)CALLOC(sizeof(SWcontext));
|
||||
#ifdef _OPENMP
|
||||
const GLint maxThreads = omp_get_max_threads();
|
||||
#else
|
||||
const GLint maxThreads = 1;
|
||||
#endif
|
||||
|
||||
if (SWRAST_DEBUG) {
|
||||
_mesa_debug(ctx, "_swrast_CreateContext\n");
|
||||
@@ -806,19 +811,25 @@ _swrast_CreateContext( struct gl_context *ctx )
|
||||
for (i = 0; i < MAX_TEXTURE_IMAGE_UNITS; i++)
|
||||
swrast->TextureSample[i] = NULL;
|
||||
|
||||
swrast->SpanArrays = MALLOC_STRUCT(sw_span_arrays);
|
||||
/* SpanArrays is global and shared by all SWspan instances. However, when
|
||||
* using multiple threads, it is necessary to have one SpanArrays instance
|
||||
* per thread.
|
||||
*/
|
||||
swrast->SpanArrays = (SWspanarrays *) MALLOC(maxThreads * sizeof(SWspanarrays));
|
||||
if (!swrast->SpanArrays) {
|
||||
FREE(swrast);
|
||||
return GL_FALSE;
|
||||
}
|
||||
swrast->SpanArrays->ChanType = CHAN_TYPE;
|
||||
for(i = 0; i < maxThreads; i++) {
|
||||
swrast->SpanArrays[i].ChanType = CHAN_TYPE;
|
||||
#if CHAN_TYPE == GL_UNSIGNED_BYTE
|
||||
swrast->SpanArrays->rgba = swrast->SpanArrays->rgba8;
|
||||
swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba8;
|
||||
#elif CHAN_TYPE == GL_UNSIGNED_SHORT
|
||||
swrast->SpanArrays->rgba = swrast->SpanArrays->rgba16;
|
||||
swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba16;
|
||||
#else
|
||||
swrast->SpanArrays->rgba = swrast->SpanArrays->attribs[FRAG_ATTRIB_COL0];
|
||||
swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].attribs[FRAG_ATTRIB_COL0];
|
||||
#endif
|
||||
}
|
||||
|
||||
/* init point span buffer */
|
||||
swrast->PointSpan.primitive = GL_POINT;
|
||||
@@ -826,7 +837,10 @@ _swrast_CreateContext( struct gl_context *ctx )
|
||||
swrast->PointSpan.facing = 0;
|
||||
swrast->PointSpan.array = swrast->SpanArrays;
|
||||
|
||||
swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits *
|
||||
/* TexelBuffer is also global and normally shared by all SWspan instances;
|
||||
* when running with multiple threads, create one per thread.
|
||||
*/
|
||||
swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits * maxThreads *
|
||||
MAX_WIDTH * 4 * sizeof(GLfloat));
|
||||
if (!swrast->TexelBuffer) {
|
||||
FREE(swrast->SpanArrays);
|
||||
|
@@ -48,7 +48,11 @@ typedef float (*float4_array)[4];
|
||||
static INLINE float4_array
|
||||
get_texel_array(SWcontext *swrast, GLuint unit)
|
||||
{
|
||||
#ifdef _OPENMP
|
||||
return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4 * omp_get_num_threads() + (MAX_WIDTH * 4 * omp_get_thread_num()));
|
||||
#else
|
||||
return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@@ -146,7 +146,17 @@ void _tnl_run_pipeline( struct gl_context *ctx )
|
||||
_tnl_notify_pipeline_output_change( ctx );
|
||||
}
|
||||
|
||||
#ifndef _OPENMP
|
||||
/* Don't adjust FPU precision mode in case multiple threads are to be used.
|
||||
* This would require that the additional threads also changed the FPU mode
|
||||
* which is quite a mess as this had to be done in all parallelized sections;
|
||||
* otherwise the master thread and all other threads are running in different
|
||||
* modes, producing inconsistent results.
|
||||
* Note that all x64 implementations don't define/use START_FAST_MATH, so
|
||||
* this is "hack" is only used in i386 mode
|
||||
*/
|
||||
START_FAST_MATH(__tmp);
|
||||
#endif
|
||||
|
||||
for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
|
||||
struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
|
||||
@@ -154,7 +164,9 @@ void _tnl_run_pipeline( struct gl_context *ctx )
|
||||
break;
|
||||
}
|
||||
|
||||
#ifndef _OPENMP
|
||||
END_FAST_MATH(__tmp);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user