From b373c776933c4d2d00947d92d595368f6d36bc96 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Sat, 3 Oct 2015 07:55:32 -0600 Subject: [PATCH 001/270] mesa: remove unneeded error check in create_textures() Callers of create_textures() will either pass target=0 or a validated GL texture target enum so no need to do another error check inside the loop. Reviewed-by: Anuj Phogat Tested-by: Mark Janes --- src/mesa/main/texobj.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c index 173e43c817c..aa4b38cc210 100644 --- a/src/mesa/main/texobj.c +++ b/src/mesa/main/texobj.c @@ -1211,6 +1211,7 @@ _mesa_create_nameless_texture(struct gl_context *ctx, GLenum target) * glCreateTextures should throw errors if target = 0. This is not exposed to * the rest of Mesa to encourage Mesa internals to use nameless textures, * which do not require expensive hash lookups. + * \param target either 0 or a valid / error-checked texture target enum */ static void create_textures(struct gl_context *ctx, GLenum target, @@ -1219,6 +1220,7 @@ create_textures(struct gl_context *ctx, GLenum target, GLuint first; GLint i; const char *func = dsa ? "Create" : "Gen"; + const GLint targetIndex = _mesa_tex_target_to_index(ctx, target); if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "gl%sTextures %d\n", func, n); @@ -1241,7 +1243,6 @@ create_textures(struct gl_context *ctx, GLenum target, /* Allocate new, empty texture objects */ for (i = 0; i < n; i++) { struct gl_texture_object *texObj; - GLint targetIndex; GLuint name = first + i; texObj = ctx->Driver.NewTextureObject(ctx, name, target); if (!texObj) { @@ -1252,14 +1253,6 @@ create_textures(struct gl_context *ctx, GLenum target, /* Initialize the target index if target is non-zero. 
*/ if (target != 0) { - targetIndex = _mesa_tex_target_to_index(ctx, texObj->Target); - if (targetIndex < 0) { /* Bad Target */ - mtx_unlock(&ctx->Shared->Mutex); - _mesa_error(ctx, GL_INVALID_ENUM, "gl%sTextures(target = %s)", - func, _mesa_enum_to_string(texObj->Target)); - return; - } - assert(targetIndex < NUM_TEXTURE_TARGETS); texObj->TargetIndex = targetIndex; } From d61f492aba354283933b5d84e3daacc45a836141 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Sat, 3 Oct 2015 08:05:33 -0600 Subject: [PATCH 002/270] mesa: remove unused _mesa_create_nameless_texture() Reviewed-by: Anuj Phogat Tested-by: Mark Janes --- src/mesa/main/texobj.c | 20 -------------------- src/mesa/main/texobj.h | 3 --- 2 files changed, 23 deletions(-) diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c index aa4b38cc210..66eacf850f4 100644 --- a/src/mesa/main/texobj.c +++ b/src/mesa/main/texobj.c @@ -1185,26 +1185,6 @@ invalidate_tex_image_error_check(struct gl_context *ctx, GLuint texture, return t; } -/** - * Wrapper for the driver function. Need this because _mesa_new_texture_object - * permits a target of 0 and does not initialize targetIndex. - */ -struct gl_texture_object * -_mesa_create_nameless_texture(struct gl_context *ctx, GLenum target) -{ - struct gl_texture_object *texObj = NULL; - GLint targetIndex; - - if (target == 0) - return texObj; - - texObj = ctx->Driver.NewTextureObject(ctx, 0, target); - targetIndex = _mesa_tex_target_to_index(ctx, texObj->Target); - assert(targetIndex < NUM_TEXTURE_TARGETS); - texObj->TargetIndex = targetIndex; - - return texObj; -} /** * Helper function for glCreateTextures and glGenTextures. 
Need this because diff --git a/src/mesa/main/texobj.h b/src/mesa/main/texobj.h index 690878c85fc..8421337de4d 100644 --- a/src/mesa/main/texobj.h +++ b/src/mesa/main/texobj.h @@ -202,9 +202,6 @@ _mesa_unlock_context_textures( struct gl_context *ctx ); extern void _mesa_lock_context_textures( struct gl_context *ctx ); -extern struct gl_texture_object * -_mesa_create_nameless_texture(struct gl_context *ctx, GLenum target); - extern void _mesa_delete_nameless_texture(struct gl_context *ctx, struct gl_texture_object *texObj); From 7d7dd1871174905dfdd3ca874a09d9d7837ac743 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 5 Oct 2015 08:14:56 -0600 Subject: [PATCH 003/270] mesa,meta: move gl_texture_object::TargetIndex initializations Before, we were unconditionally assigning the TargetIndex field in _mesa_BindTexture(), even if it was already set properly. Now we initialize TargetIndex wherever we initialize the Target field, in _mesa_initialize_texture_object(), finish_texture_init(), etc. v2: also update the meta_copy_image code. In make_view() the view_tex_obj->Target field was set, but not the TargetIndex field. Also, remove a second, redundant assignment to view_tex_obj->Target. Add sanity check assertions too. 
Reviewed-by: Anuj Phogat Tested-by: Mark Janes --- src/mesa/drivers/common/meta_copy_image.c | 5 ++++- src/mesa/main/shared.c | 5 +++++ src/mesa/main/texobj.c | 27 +++++++++++++++-------- src/mesa/main/textureview.c | 2 ++ 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/src/mesa/drivers/common/meta_copy_image.c b/src/mesa/drivers/common/meta_copy_image.c index 33490ee6615..04b9cafe308 100644 --- a/src/mesa/drivers/common/meta_copy_image.c +++ b/src/mesa/drivers/common/meta_copy_image.c @@ -108,7 +108,11 @@ make_view(struct gl_context *ctx, struct gl_texture_image *tex_image, return false; } + assert(tex_obj->Target != 0); + assert(tex_obj->TargetIndex < NUM_TEXTURE_TARGETS); + view_tex_obj->Target = tex_obj->Target; + view_tex_obj->TargetIndex = tex_obj->TargetIndex; *view_tex_image = _mesa_get_tex_image(ctx, view_tex_obj, tex_obj->Target, 0); @@ -129,7 +133,6 @@ make_view(struct gl_context *ctx, struct gl_texture_image *tex_image, view_tex_obj->NumLayers = tex_obj->NumLayers; view_tex_obj->Immutable = tex_obj->Immutable; view_tex_obj->ImmutableLevels = tex_obj->ImmutableLevels; - view_tex_obj->Target = tex_obj->Target; if (ctx->Driver.TextureView != NULL && !ctx->Driver.TextureView(ctx, view_tex_obj, tex_obj)) { diff --git a/src/mesa/main/shared.c b/src/mesa/main/shared.c index 1acaf59f432..c37b31d1753 100644 --- a/src/mesa/main/shared.c +++ b/src/mesa/main/shared.c @@ -107,6 +107,11 @@ _mesa_alloc_shared_state(struct gl_context *ctx) }; STATIC_ASSERT(ARRAY_SIZE(targets) == NUM_TEXTURE_TARGETS); shared->DefaultTex[i] = ctx->Driver.NewTextureObject(ctx, 0, targets[i]); + /* Need to explicitly set/overwrite the TargetIndex field here since + * the call to _mesa_tex_target_to_index() in NewTextureObject() may + * fail if the texture target is not supported. 
+ */ + shared->DefaultTex[i]->TargetIndex = i; } /* sanity check */ diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c index 66eacf850f4..60c55aeb206 100644 --- a/src/mesa/main/texobj.c +++ b/src/mesa/main/texobj.c @@ -286,6 +286,12 @@ _mesa_initialize_texture_object( struct gl_context *ctx, obj->RefCount = 1; obj->Name = name; obj->Target = target; + if (target != 0) { + obj->TargetIndex = _mesa_tex_target_to_index(ctx, target); + } + else { + obj->TargetIndex = NUM_TEXTURE_TARGETS; /* invalid/error value */ + } obj->Priority = 1.0F; obj->BaseLevel = 0; obj->MaxLevel = 1000; @@ -340,6 +346,10 @@ finish_texture_init(struct gl_context *ctx, GLenum target, GLenum filter = GL_LINEAR; assert(obj->Target == 0); + obj->Target = target; + obj->TargetIndex = _mesa_tex_target_to_index(ctx, target); + assert(obj->TargetIndex < NUM_TEXTURE_TARGETS); + switch (target) { case GL_TEXTURE_2D_MULTISAMPLE: case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: @@ -1200,7 +1210,6 @@ create_textures(struct gl_context *ctx, GLenum target, GLuint first; GLint i; const char *func = dsa ? "Create" : "Gen"; - const GLint targetIndex = _mesa_tex_target_to_index(ctx, target); if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "gl%sTextures %d\n", func, n); @@ -1231,11 +1240,6 @@ create_textures(struct gl_context *ctx, GLenum target, return; } - /* Initialize the target index if target is non-zero. 
*/ - if (target != 0) { - texObj->TargetIndex = targetIndex; - } - /* insert into hash table */ _mesa_HashInsert(ctx->Shared->TexObjects, texObj->Name, texObj); @@ -1356,8 +1360,12 @@ unbind_texobj_from_texunits(struct gl_context *ctx, const gl_texture_index index = texObj->TargetIndex; GLuint u; - if (texObj->Target == 0) + if (texObj->Target == 0) { + /* texture was never bound */ return; + } + + assert(index < NUM_TEXTURE_TARGETS); for (u = 0; u < ctx->Texture.NumCurrentTexUsed; u++) { struct gl_texture_unit *unit = &ctx->Texture.Unit[u]; @@ -1725,10 +1733,11 @@ _mesa_BindTexture( GLenum target, GLuint texName ) _mesa_HashInsert(ctx->Shared->TexObjects, texName, newTexObj); mtx_unlock(&ctx->Shared->Mutex); } - newTexObj->Target = target; - newTexObj->TargetIndex = targetIndex; } + assert(newTexObj->Target == target); + assert(newTexObj->TargetIndex == targetIndex); + bind_texture(ctx, ctx->Texture.CurrentUnit, newTexObj); } diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c index 5a3282a40c1..04b7d73da5c 100644 --- a/src/mesa/main/textureview.c +++ b/src/mesa/main/textureview.c @@ -681,6 +681,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture, texObj->Immutable = GL_TRUE; texObj->ImmutableLevels = origTexObj->ImmutableLevels; texObj->Target = target; + texObj->TargetIndex = _mesa_tex_target_to_index(ctx, target); + assert(texObj->TargetIndex < NUM_TEXTURE_TARGETS); if (ctx->Driver.TextureView != NULL && !ctx->Driver.TextureView(ctx, texObj, origTexObj)) { From d8d0e4a81e42678cc8c8b876dfee24d5c2f4ba38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tapani=20P=C3=A4lli?= Date: Thu, 8 Oct 2015 09:43:41 +0300 Subject: [PATCH 004/270] mesa: add GL_UNSIGNED_INT_24_8 to _mesa_pack_depth_span MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch adds missing type (used with NV_read_depth) so that it gets handled correctly. 
This fixes errors seen with following CTS test: ES3-CTS.gtf.GL3Tests.packed_pixels.packed_pixels Signed-off-by: Tapani Pälli Reviewed-by: Iago Toral Quiroga Cc: "11.0" --- src/mesa/main/pack.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c index 00e31b05c99..89faf515443 100644 --- a/src/mesa/main/pack.c +++ b/src/mesa/main/pack.c @@ -1073,6 +1073,21 @@ _mesa_pack_depth_span( struct gl_context *ctx, GLuint n, GLvoid *dest, } } break; + case GL_UNSIGNED_INT_24_8: + { + const GLdouble scale = (GLdouble) 0xffffff; + GLuint *dst = (GLuint *) dest; + GLuint i; + for (i = 0; i < n; i++) { + GLuint z = (GLuint) (depthSpan[i] * scale); + assert(z <= 0xffffff); + dst[i] = (z << 8); + } + if (dstPacking->SwapBytes) { + _mesa_swap4( (GLuint *) dst, n ); + } + break; + } case GL_UNSIGNED_INT: { GLuint *dst = (GLuint *) dest; From 5be9bf2746370ecb180536eb2e5e48391b224dec Mon Sep 17 00:00:00 2001 From: Samuel Iglesias Gonsalvez Date: Fri, 2 Oct 2015 08:43:51 +0200 Subject: [PATCH 005/270] main: fix goto in program_resource_top_level_array_stride MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use found_top_level_array_stride instead of found_top_level_array_size. 
Signed-off-by: Samuel Iglesias Gonsalvez Reviewed-by: Timothy Arceri Reviewed-by: Tapani Pälli --- src/mesa/main/shader_query.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index 6d73e3bdcf2..caaa78caa53 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -1010,11 +1010,11 @@ program_resource_top_level_array_stride(struct gl_shader_program *shProg, } else { array_stride = 0; } - goto found_top_level_array_size; + goto found_top_level_array_stride; } } } -found_top_level_array_size: +found_top_level_array_stride: free(interface_name); free(var_name); return array_stride; From 77c0b64ce335c7013de5da3b9ac497cb400ef8ce Mon Sep 17 00:00:00 2001 From: Samuel Iglesias Gonsalvez Date: Thu, 1 Oct 2015 13:13:19 +0200 Subject: [PATCH 006/270] main: fix TOP_LEVEL_ARRAY_SIZE and TOP_LEVEL_ARRAY_STRIDE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the active variable is an array which is already a top-level shader storage block member, don't return its array size and stride when querying TOP_LEVEL_ARRAY_SIZE and TOP_LEVEL_ARRAY_STRIDE respectively. 
Fixes the following 12 dEQP-GLES31 tests: dEQP-GLES31.functional.ssbo.layout.single_basic_array.shared.mat3x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.shared.row_major_mat3x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.shared.column_major_mat3x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.packed.mat3x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.packed.row_major_mat3x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.packed.column_major_mat3x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.std140.mat3x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.std140.row_major_mat3x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.std140.column_major_mat3x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.std430.mat3x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.std430.row_major_mat3x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.std430.column_major_mat3x4 v2: - Fix check when the shader storage block is instanced - Write auxiliary function to do the check. v3: - Check if full_instanced_name is NULL just after allocation (Ilia) - Remove () from one strcmp() in the if statement (Ilia) Signed-off-by: Samuel Iglesias Gonsalvez Tested-by: Tapani Pälli Reviewed-by: Tapani Pälli --- src/mesa/main/shader_query.cpp | 53 +++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index caaa78caa53..a1db4c23acc 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -872,6 +872,46 @@ get_var_name(const char *name) return strndup(first_dot+1, strlen(first_dot) - 1); } +static bool +is_top_level_shader_storage_block_member(const char* name, + const char* interface_name, + const char* field_name) +{ + bool result = false; + + /* If the given variable is already a top-level shader storage + * block member, then return array_size = 1. 
+ * There are two possibilities: the shader storage block is + * instanced or it is not instanced. + * + * For the first, we create the name as it would be at top level and + * compare it with the real name. If they are the same, then + * the variable is already at top-level. + * + * Full instanced name is: interface name + '.' + var name + + * NUL character + */ + int name_length = strlen(interface_name) + 1 + strlen(field_name) + 1; + char *full_instanced_name = (char *) calloc(name_length, sizeof(char)); + if (!full_instanced_name) { + fprintf(stderr, "%s: Cannot allocate space for name\n", __func__); + return false; + } + + snprintf(full_instanced_name, name_length, "%s.%s", + interface_name, field_name); + + /* Check if it is a top-level shader storage block member of an + * instanced interface block, or of an unnamed interface block. + */ + if (strcmp(name, full_instanced_name) == 0 || + strcmp(name, field_name) == 0) + result = true; + + free(full_instanced_name); + return result; +} + static GLint program_resource_top_level_array_size(struct gl_shader_program *shProg, struct gl_program_resource *res, @@ -921,12 +961,17 @@ program_resource_top_level_array_size(struct gl_shader_program *shProg, * the top-level block member is an array with no declared size, * the value zero is written to <params>. 
*/ - if (field->type->is_unsized_array()) + if (is_top_level_shader_storage_block_member(name, + interface_name, + var_name)) + array_size = 1; + else if (field->type->is_unsized_array()) array_size = 0; else if (field->type->is_array()) array_size = field->type->length; else array_size = 1; + goto found_top_level_array_size; } } @@ -995,6 +1040,12 @@ program_resource_top_level_array_stride(struct gl_shader_program *shProg, bool row_major = matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR; const glsl_type *array_type = field->type->fields.array; + if (is_top_level_shader_storage_block_member(name, + interface_name, + var_name)) { + array_stride = 0; + goto found_top_level_array_stride; + } if (interface->interface_packing != GLSL_INTERFACE_PACKING_STD430) { if (array_type->is_record() || array_type->is_array()) { array_stride = array_type->std140_size(row_major); From 66ca8e6632b2623425f848b9efc16edbed56f306 Mon Sep 17 00:00:00 2001 From: Samuel Iglesias Gonsalvez Date: Thu, 1 Oct 2015 15:05:00 +0200 Subject: [PATCH 007/270] main: consider that unsized arrays have at least one active element From ARB_shader_storage_buffer_object: "When using the ARB_program_interface_query extension to enumerate the set of active buffer variables, only the first element of arrays (sized or unsized) will be enumerated" _mesa_program_resource_array_size() is used when getting the name (and name length) of the active variables. When it is an unsized array, we want to indicate it has one active element so the returned name would have "[0]" at the end. v2: - Use array_stride > 0 and array_elements == 0 to detect unsized arrays. 
Because of that, we don't need is_unsized_array flag (Timothy) Signed-off-by: Samuel Iglesias Gonsalvez Reviewed-by: Timothy Arceri --- src/mesa/main/shader_query.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index a1db4c23acc..ed0c89fda17 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -485,8 +485,14 @@ _mesa_program_resource_array_size(struct gl_program_resource *res) case GL_COMPUTE_SUBROUTINE_UNIFORM: case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: - case GL_BUFFER_VARIABLE: return RESOURCE_UNI(res)->array_elements; + case GL_BUFFER_VARIABLE: + /* Unsized arrays */ + if (RESOURCE_UNI(res)->array_stride > 0 && + RESOURCE_UNI(res)->array_elements == 0) + return 1; + else + return RESOURCE_UNI(res)->array_elements; case GL_VERTEX_SUBROUTINE: case GL_GEOMETRY_SUBROUTINE: case GL_FRAGMENT_SUBROUTINE: From d0992fa15a4bfaff59de50e6084a0a14882d3bdb Mon Sep 17 00:00:00 2001 From: Samuel Iglesias Gonsalvez Date: Thu, 1 Oct 2015 14:46:01 +0200 Subject: [PATCH 008/270] main: buffer array variables can have array size of 0 if they are unsized From ARB_program_interface_query: For the property ARRAY_SIZE, a single integer identifying the number of active array elements of an active variable is written to <params>. The array size returned is in units of the type associated with the property TYPE. For active variables not corresponding to an array of basic types, the value one is written to <params>. If the variable is a shader storage block member in an array with no declared size, the value zero is written to <params>. v2: - Unsized arrays of arrays have an array size different than zero v3: - Arrays and unsized arrays will have an array_stride > 0. Use it instead of is_unsized_array flag (Timothy). 
Signed-off-by: Samuel Iglesias Gonsalvez Reviewed-by: Timothy Arceri --- src/mesa/main/shader_query.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index ed0c89fda17..f1ab4904450 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -1308,8 +1308,15 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, switch (res->Type) { case GL_UNIFORM: case GL_BUFFER_VARIABLE: + /* Test if a buffer variable is an array or an unsized array. + * Unsized arrays return zero as array size. + */ + if (RESOURCE_UNI(res)->is_shader_storage && + RESOURCE_UNI(res)->array_stride > 0) + *val = RESOURCE_UNI(res)->array_elements; + else *val = MAX2(RESOURCE_UNI(res)->array_elements, 1); - return 1; + return 1; case GL_PROGRAM_INPUT: case GL_PROGRAM_OUTPUT: *val = MAX2(_mesa_program_resource_array_size(res), 1); From 3da58730eeb51d9490045260e4848ba14bf9bb65 Mon Sep 17 00:00:00 2001 From: Samuel Iglesias Gonsalvez Date: Mon, 5 Oct 2015 13:14:26 +0200 Subject: [PATCH 009/270] main: fix length of values written to glGetProgramResourceiv() for ACTIVE_VARIABLES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return the number of values written. 
Signed-off-by: Samuel Iglesias Gonsalvez Reviewed-by: Tapani Pälli Reviewed-by: Timothy Arceri --- src/mesa/main/shader_query.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index f1ab4904450..324e1a68fa4 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -1190,7 +1190,8 @@ get_buffer_property(struct gl_shader_program *shProg, (*val)++; } return 1; - case GL_ACTIVE_VARIABLES: + case GL_ACTIVE_VARIABLES: { + unsigned num_values = 0; for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) { const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName; struct gl_program_resource *uni = @@ -1200,8 +1201,10 @@ get_buffer_property(struct gl_shader_program *shProg, continue; *val++ = _mesa_program_resource_index(shProg, uni); + num_values++; } - return RESOURCE_UBO(res)->NumUniforms; + return num_values; + } } } else if (res->Type == GL_SHADER_STORAGE_BLOCK) { switch (prop) { @@ -1223,7 +1226,8 @@ get_buffer_property(struct gl_shader_program *shProg, (*val)++; } return 1; - case GL_ACTIVE_VARIABLES: + case GL_ACTIVE_VARIABLES: { + unsigned num_values = 0; for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) { const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName; struct gl_program_resource *uni = @@ -1233,8 +1237,10 @@ get_buffer_property(struct gl_shader_program *shProg, continue; *val++ = _mesa_program_resource_index(shProg, uni); + num_values++; } - return RESOURCE_UBO(res)->NumUniforms; + return num_values; + } } } else if (res->Type == GL_ATOMIC_COUNTER_BUFFER) { switch (prop) { From 0644196ab13c769570e5e2dcd738ebe5deca5754 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Sun, 17 May 2015 16:46:54 +0200 Subject: [PATCH 010/270] nvc0: add a header file for nvc0_query This will allow to split SW and HW queries in an upcoming patch. While we are at it, make use of nvc0_query struct instead of pipe_query. 
Signed-off-by: Samuel Pitoiset --- src/gallium/drivers/nouveau/Makefile.sources | 1 + .../drivers/nouveau/nvc0/nvc0_context.h | 12 +- src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 37 +--- src/gallium/drivers/nouveau/nvc0/nvc0_query.h | 191 ++++++++++++++++++ .../drivers/nouveau/nvc0/nvc0_screen.h | 142 ------------- .../drivers/nouveau/nvc0/nvc0_shader_state.c | 4 +- src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c | 4 +- 7 files changed, 202 insertions(+), 189 deletions(-) create mode 100644 src/gallium/drivers/nouveau/nvc0/nvc0_query.h diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources index 9346ea3204d..350837cd694 100644 --- a/src/gallium/drivers/nouveau/Makefile.sources +++ b/src/gallium/drivers/nouveau/Makefile.sources @@ -151,6 +151,7 @@ NVC0_C_SOURCES := \ nvc0/nvc0_program.c \ nvc0/nvc0_program.h \ nvc0/nvc0_query.c \ + nvc0/nvc0_query.h \ nvc0/nvc0_resource.c \ nvc0/nvc0_resource.h \ nvc0/nvc0_screen.c \ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 30bee3a0f8c..4af83c53224 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -15,6 +15,7 @@ #include "nvc0/nvc0_screen.h" #include "nvc0/nvc0_program.h" #include "nvc0/nvc0_resource.h" +#include "nvc0/nvc0_query.h" #include "nv50/nv50_transfer.h" @@ -231,17 +232,6 @@ uint32_t nvc0_program_symbol_offset(const struct nvc0_program *, uint32_t label); void nvc0_program_init_tcp_empty(struct nvc0_context *); -/* nvc0_query.c */ -void nvc0_init_query_functions(struct nvc0_context *); -void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *, - struct pipe_query *, unsigned result_offset); -void nvc0_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *); -void nvc0_so_target_save_offset(struct pipe_context *, - struct pipe_stream_output_target *, unsigned i, - bool *serialize); - -#define NVC0_QUERY_TFB_BUFFER_OFFSET 
(PIPE_QUERY_TYPES + 0) - /* nvc0_shader_state.c */ void nvc0_vertprog_validate(struct nvc0_context *); void nvc0_tctlprog_validate(struct nvc0_context *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index b13df6a9485..793425b7b02 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -25,6 +25,8 @@ #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING #include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query.h" + #include "nv_object.xml.h" #include "nvc0/nve4_compute.xml.h" #include "nvc0/nvc0_compute.xml.h" @@ -34,26 +36,6 @@ #define NVC0_QUERY_STATE_ENDED 2 #define NVC0_QUERY_STATE_FLUSHED 3 -struct nvc0_query { - uint32_t *data; - uint16_t type; - uint16_t index; - int8_t ctr[4]; - uint32_t sequence; - struct nouveau_bo *bo; - uint32_t base; - uint32_t offset; /* base + i * rotate */ - uint8_t state; - bool is64bit; - uint8_t rotate; - int nesting; /* only used for occlusion queries */ - union { - struct nouveau_mm_allocation *mm; - uint64_t value; - } u; - struct nouveau_fence *fence; -}; - #define NVC0_QUERY_ALLOC_SPACE 256 static boolean nvc0_hw_sm_query_begin(struct nvc0_context *, @@ -62,12 +44,6 @@ static void nvc0_hw_sm_query_end(struct nvc0_context *, struct nvc0_query *); static boolean nvc0_hw_sm_query_result(struct nvc0_context *, struct nvc0_query *, void *, boolean); -static inline struct nvc0_query * -nvc0_query(struct pipe_query *pipe) -{ - return (struct nvc0_query *)pipe; -} - static bool nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size) { @@ -523,9 +499,8 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, } void -nvc0_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq) +nvc0_query_fifo_wait(struct nouveau_pushbuf *push, struct nvc0_query *q) { - struct nvc0_query *q = nvc0_query(pq); unsigned offset = q->offset; if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 
0x20; @@ -596,7 +571,7 @@ nvc0_render_condition(struct pipe_context *pipe, } if (wait) - nvc0_query_fifo_wait(push, pq); + nvc0_query_fifo_wait(push, q); PUSH_SPACE(push, 7); PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD); @@ -611,10 +586,8 @@ nvc0_render_condition(struct pipe_context *pipe, void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push, - struct pipe_query *pq, unsigned result_offset) + struct nvc0_query *q, unsigned result_offset) { - struct nvc0_query *q = nvc0_query(pq); - #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8)) PUSH_REFN(push, q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h new file mode 100644 index 00000000000..f3e8946692c --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h @@ -0,0 +1,191 @@ +#ifndef __NVC0_QUERY_H__ +#define __NVC0_QUERY_H__ + +#include "pipe/p_context.h" + +#include "nouveau_context.h" +#include "nouveau_mm.h" + +#define NVC0_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) + +struct nvc0_context; + +struct nvc0_query { + uint32_t *data; + uint16_t type; + uint16_t index; + int8_t ctr[4]; + uint32_t sequence; + struct nouveau_bo *bo; + uint32_t base; + uint32_t offset; /* base + i * rotate */ + uint8_t state; + boolean is64bit; + uint8_t rotate; + int nesting; /* only used for occlusion queries */ + union { + struct nouveau_mm_allocation *mm; + uint64_t value; + } u; + struct nouveau_fence *fence; +}; + +static inline struct nvc0_query * +nvc0_query(struct pipe_query *pipe) +{ + return (struct nvc0_query *)pipe; +} + +/* + * Driver queries groups: + */ +#define NVC0_QUERY_MP_COUNTER_GROUP 0 +#define NVC0_QUERY_DRV_STAT_GROUP 1 + +/* + * Performance counter queries: + */ +#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) +#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1) +enum nve4_pm_queries +{ + NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, + 
NVE4_HW_SM_QUERY_ACTIVE_WARPS, + NVE4_HW_SM_QUERY_ATOM_COUNT, + NVE4_HW_SM_QUERY_BRANCH, + NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, + NVE4_HW_SM_QUERY_GLD_REQUEST, + NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, + NVE4_HW_SM_QUERY_GST_TRANSACTIONS, + NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, + NVE4_HW_SM_QUERY_GRED_COUNT, + NVE4_HW_SM_QUERY_GST_REQUEST, + NVE4_HW_SM_QUERY_INST_EXECUTED, + NVE4_HW_SM_QUERY_INST_ISSUED, + NVE4_HW_SM_QUERY_INST_ISSUED1, + NVE4_HW_SM_QUERY_INST_ISSUED2, + NVE4_HW_SM_QUERY_L1_GLD_HIT, + NVE4_HW_SM_QUERY_L1_GLD_MISS, + NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, + NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, + NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, + NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, + NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, + NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, + NVE4_HW_SM_QUERY_LOCAL_LD, + NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, + NVE4_HW_SM_QUERY_LOCAL_ST, + NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, + NVE4_HW_SM_QUERY_PROF_TRIGGER_0, + NVE4_HW_SM_QUERY_PROF_TRIGGER_1, + NVE4_HW_SM_QUERY_PROF_TRIGGER_2, + NVE4_HW_SM_QUERY_PROF_TRIGGER_3, + NVE4_HW_SM_QUERY_PROF_TRIGGER_4, + NVE4_HW_SM_QUERY_PROF_TRIGGER_5, + NVE4_HW_SM_QUERY_PROF_TRIGGER_6, + NVE4_HW_SM_QUERY_PROF_TRIGGER_7, + NVE4_HW_SM_QUERY_SHARED_LD, + NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, + NVE4_HW_SM_QUERY_SHARED_ST, + NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, + NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, + NVE4_HW_SM_QUERY_THREADS_LAUNCHED, + NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, + NVE4_HW_SM_QUERY_WARPS_LAUNCHED, + NVE4_HW_SM_QUERY_METRIC_IPC, + NVE4_HW_SM_QUERY_METRIC_IPAC, + NVE4_HW_SM_QUERY_METRIC_IPEC, + NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY, + NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY, + NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD, + NVE4_HW_SM_QUERY_COUNT +}; + +#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) +#define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) +enum nvc0_pm_queries +{ + NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, + NVC0_HW_SM_QUERY_ACTIVE_WARPS, + 
NVC0_HW_SM_QUERY_ATOM_COUNT, + NVC0_HW_SM_QUERY_BRANCH, + NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, + NVC0_HW_SM_QUERY_GLD_REQUEST, + NVC0_HW_SM_QUERY_GRED_COUNT, + NVC0_HW_SM_QUERY_GST_REQUEST, + NVC0_HW_SM_QUERY_INST_EXECUTED, + NVC0_HW_SM_QUERY_INST_ISSUED1_0, + NVC0_HW_SM_QUERY_INST_ISSUED1_1, + NVC0_HW_SM_QUERY_INST_ISSUED2_0, + NVC0_HW_SM_QUERY_INST_ISSUED2_1, + NVC0_HW_SM_QUERY_LOCAL_LD, + NVC0_HW_SM_QUERY_LOCAL_ST, + NVC0_HW_SM_QUERY_PROF_TRIGGER_0, + NVC0_HW_SM_QUERY_PROF_TRIGGER_1, + NVC0_HW_SM_QUERY_PROF_TRIGGER_2, + NVC0_HW_SM_QUERY_PROF_TRIGGER_3, + NVC0_HW_SM_QUERY_PROF_TRIGGER_4, + NVC0_HW_SM_QUERY_PROF_TRIGGER_5, + NVC0_HW_SM_QUERY_PROF_TRIGGER_6, + NVC0_HW_SM_QUERY_PROF_TRIGGER_7, + NVC0_HW_SM_QUERY_SHARED_LD, + NVC0_HW_SM_QUERY_SHARED_ST, + NVC0_HW_SM_QUERY_THREADS_LAUNCHED, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, + NVC0_HW_SM_QUERY_WARPS_LAUNCHED, + NVC0_HW_SM_QUERY_COUNT +}; + +/* + * Driver statistics queries: + */ +#define NVC0_QUERY_DRV_STAT(i) (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i)) +#define NVC0_QUERY_DRV_STAT_LAST NVC0_QUERY_DRV_STAT(NVC0_QUERY_DRV_STAT_COUNT - 1) +enum nvc0_drv_stats_queries +{ +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT = 0, + NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES, + NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT, + NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID, + NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS, + NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_READ, + NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE, + NVC0_QUERY_DRV_STAT_TEX_COPY_COUNT, + NVC0_QUERY_DRV_STAT_TEX_BLIT_COUNT, + NVC0_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT, + NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_READ, + NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE, + NVC0_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID, + NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT, + NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID, + 
NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS, + NVC0_QUERY_DRV_STAT_BUF_COPY_BYTES, + NVC0_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT, + NVC0_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT, + NVC0_QUERY_DRV_STAT_QUERY_SYNC_COUNT, + NVC0_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT, + NVC0_QUERY_DRV_STAT_DRAW_CALLS_ARRAY, + NVC0_QUERY_DRV_STAT_DRAW_CALLS_INDEXED, + NVC0_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT, + NVC0_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES, + NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT, + NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES, + NVC0_QUERY_DRV_STAT_PUSHBUF_COUNT, + NVC0_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT, +#endif + NVC0_QUERY_DRV_STAT_COUNT +}; + +void nvc0_init_query_functions(struct nvc0_context *); +void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *, struct nvc0_query *, + unsigned); +void nvc0_query_fifo_wait(struct nouveau_pushbuf *, struct nvc0_query *); +void nvc0_so_target_save_offset(struct pipe_context *, + struct pipe_stream_output_target *, unsigned, + bool *); + +#endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h index f57a316f01e..fa4f8645ffe 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h @@ -112,148 +112,6 @@ nvc0_screen(struct pipe_screen *screen) return (struct nvc0_screen *)screen; } -/* - * Performance counters groups: - */ -#define NVC0_QUERY_MP_COUNTER_GROUP 0 -#define NVC0_QUERY_DRV_STAT_GROUP 1 - -/* Performance counter queries: - */ -#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) -#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1) -enum nve4_pm_queries -{ - NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVE4_HW_SM_QUERY_ACTIVE_WARPS, - NVE4_HW_SM_QUERY_ATOM_COUNT, - NVE4_HW_SM_QUERY_BRANCH, - NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, - NVE4_HW_SM_QUERY_GLD_REQUEST, - NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GST_TRANSACTIONS, - 
NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GRED_COUNT, - NVE4_HW_SM_QUERY_GST_REQUEST, - NVE4_HW_SM_QUERY_INST_EXECUTED, - NVE4_HW_SM_QUERY_INST_ISSUED, - NVE4_HW_SM_QUERY_INST_ISSUED1, - NVE4_HW_SM_QUERY_INST_ISSUED2, - NVE4_HW_SM_QUERY_L1_GLD_HIT, - NVE4_HW_SM_QUERY_L1_GLD_MISS, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, - NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_LD, - NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_ST, - NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_PROF_TRIGGER_0, - NVE4_HW_SM_QUERY_PROF_TRIGGER_1, - NVE4_HW_SM_QUERY_PROF_TRIGGER_2, - NVE4_HW_SM_QUERY_PROF_TRIGGER_3, - NVE4_HW_SM_QUERY_PROF_TRIGGER_4, - NVE4_HW_SM_QUERY_PROF_TRIGGER_5, - NVE4_HW_SM_QUERY_PROF_TRIGGER_6, - NVE4_HW_SM_QUERY_PROF_TRIGGER_7, - NVE4_HW_SM_QUERY_SHARED_LD, - NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, - NVE4_HW_SM_QUERY_SHARED_ST, - NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, - NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, - NVE4_HW_SM_QUERY_THREADS_LAUNCHED, - NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_WARPS_LAUNCHED, - NVE4_HW_SM_QUERY_METRIC_IPC, - NVE4_HW_SM_QUERY_METRIC_IPAC, - NVE4_HW_SM_QUERY_METRIC_IPEC, - NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY, - NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY, - NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD, - NVE4_HW_SM_QUERY_COUNT -}; - -#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) -#define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) -enum nvc0_pm_queries -{ - NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVC0_HW_SM_QUERY_ACTIVE_WARPS, - NVC0_HW_SM_QUERY_ATOM_COUNT, - NVC0_HW_SM_QUERY_BRANCH, - NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, - NVC0_HW_SM_QUERY_GLD_REQUEST, - NVC0_HW_SM_QUERY_GRED_COUNT, - NVC0_HW_SM_QUERY_GST_REQUEST, - NVC0_HW_SM_QUERY_INST_EXECUTED, - NVC0_HW_SM_QUERY_INST_ISSUED1_0, - 
NVC0_HW_SM_QUERY_INST_ISSUED1_1, - NVC0_HW_SM_QUERY_INST_ISSUED2_0, - NVC0_HW_SM_QUERY_INST_ISSUED2_1, - NVC0_HW_SM_QUERY_LOCAL_LD, - NVC0_HW_SM_QUERY_LOCAL_ST, - NVC0_HW_SM_QUERY_PROF_TRIGGER_0, - NVC0_HW_SM_QUERY_PROF_TRIGGER_1, - NVC0_HW_SM_QUERY_PROF_TRIGGER_2, - NVC0_HW_SM_QUERY_PROF_TRIGGER_3, - NVC0_HW_SM_QUERY_PROF_TRIGGER_4, - NVC0_HW_SM_QUERY_PROF_TRIGGER_5, - NVC0_HW_SM_QUERY_PROF_TRIGGER_6, - NVC0_HW_SM_QUERY_PROF_TRIGGER_7, - NVC0_HW_SM_QUERY_SHARED_LD, - NVC0_HW_SM_QUERY_SHARED_ST, - NVC0_HW_SM_QUERY_THREADS_LAUNCHED, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, - NVC0_HW_SM_QUERY_WARPS_LAUNCHED, - NVC0_HW_SM_QUERY_COUNT -}; - -/* Driver statistics queries: - */ -#define NVC0_QUERY_DRV_STAT(i) (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i)) -#define NVC0_QUERY_DRV_STAT_LAST NVC0_QUERY_DRV_STAT(NVC0_QUERY_DRV_STAT_COUNT - 1) -enum nvc0_drv_stats_queries -{ -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT = 0, - NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES, - NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT, - NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID, - NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS, - NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_READ, - NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE, - NVC0_QUERY_DRV_STAT_TEX_COPY_COUNT, - NVC0_QUERY_DRV_STAT_TEX_BLIT_COUNT, - NVC0_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT, - NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_READ, - NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE, - NVC0_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID, - NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT, - NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID, - NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS, - NVC0_QUERY_DRV_STAT_BUF_COPY_BYTES, - NVC0_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT, - NVC0_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT, - NVC0_QUERY_DRV_STAT_QUERY_SYNC_COUNT, - NVC0_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT, 
- NVC0_QUERY_DRV_STAT_DRAW_CALLS_ARRAY, - NVC0_QUERY_DRV_STAT_DRAW_CALLS_INDEXED, - NVC0_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT, - NVC0_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES, - NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT, - NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES, - NVC0_QUERY_DRV_STAT_PUSHBUF_COUNT, - NVC0_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT, -#endif - NVC0_QUERY_DRV_STAT_COUNT -}; - int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned, struct pipe_driver_query_info *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index 8f8ac2d34b9..2fade982b83 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -272,14 +272,14 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) continue; if (!targ->clean) - nvc0_query_fifo_wait(push, targ->pq); + nvc0_query_fifo_wait(push, nvc0_query(targ->pq)); BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5); PUSH_DATA (push, 1); PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset); PUSH_DATA (push, buf->address + targ->pipe.buffer_offset); PUSH_DATA (push, targ->pipe.buffer_size); if (!targ->clean) { - nvc0_query_pushbuf_submit(push, targ->pq, 0x4); + nvc0_query_pushbuf_submit(push, nvc0_query(targ->pq), 0x4); } else { PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */ targ->clean = false; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 188c7d7cdc8..9be25cfe66e 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -775,7 +775,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0, res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; PUSH_SPACE(push, 2); IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0); - nvc0_query_fifo_wait(push, so->pq); + nvc0_query_fifo_wait(push, nvc0_query(so->pq)); if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS) IMMED_NVC0(push, 
NVC0_3D(VERTEX_ARRAY_FLUSH), 0); @@ -791,7 +791,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0, BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_STRIDE), 1); PUSH_DATA (push, so->stride); BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BYTES), 1); - nvc0_query_pushbuf_submit(push, so->pq, 0x4); + nvc0_query_pushbuf_submit(push, nvc0_query(so->pq), 0x4); IMMED_NVC0(push, NVC0_3D(VERTEX_END_GL), 0); mode |= NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT; From 0678530b9e60479f33eabb62f96a40af46edd714 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Sun, 4 Oct 2015 17:43:15 +0200 Subject: [PATCH 011/270] nvc0: move nvc0_so_target_save_offset() to its correct location Signed-off-by: Samuel Pitoiset --- src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 21 ------------------- src/gallium/drivers/nouveau/nvc0/nvc0_query.h | 3 --- src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 19 +++++++++++++++++ 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index 793425b7b02..69e9cdb8714 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -596,27 +596,6 @@ nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push, NVC0_IB_ENTRY_1_NO_PREFETCH); } -void -nvc0_so_target_save_offset(struct pipe_context *pipe, - struct pipe_stream_output_target *ptarg, - unsigned index, bool *serialize) -{ - struct nvc0_so_target *targ = nvc0_so_target(ptarg); - - if (*serialize) { - *serialize = false; - PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1); - IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0); - - NOUVEAU_DRV_STAT(nouveau_screen(pipe->screen), gpu_serialize_count, 1); - } - - nvc0_query(targ->pq)->index = index; - - nvc0_query_end(pipe, targ->pq); -} - - /* === DRIVER STATISTICS === */ #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h index 
f3e8946692c..65240a125b7 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h @@ -184,8 +184,5 @@ void nvc0_init_query_functions(struct nvc0_context *); void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *, struct nvc0_query *, unsigned); void nvc0_query_fifo_wait(struct nouveau_pushbuf *, struct nvc0_query *); -void nvc0_so_target_save_offset(struct pipe_context *, - struct pipe_stream_output_target *, unsigned, - bool *); #endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index c5bfd03956d..269c75b03a6 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -1090,6 +1090,25 @@ nvc0_so_target_create(struct pipe_context *pipe, return &targ->pipe; } +static void +nvc0_so_target_save_offset(struct pipe_context *pipe, + struct pipe_stream_output_target *ptarg, + unsigned index, bool *serialize) +{ + struct nvc0_so_target *targ = nvc0_so_target(ptarg); + + if (*serialize) { + *serialize = false; + PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1); + IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0); + + NOUVEAU_DRV_STAT(nouveau_screen(pipe->screen), gpu_serialize_count, 1); + } + + nvc0_query(targ->pq)->index = index; + pipe->end_query(pipe, targ->pq); +} + static void nvc0_so_target_destroy(struct pipe_context *pipe, struct pipe_stream_output_target *ptarg) From 77b6990d14e6a97eb3928c445f3524494da36ad8 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Sun, 4 Oct 2015 16:01:51 +0200 Subject: [PATCH 012/270] nvc0: move SW queries to nvc0_query_sw.c/h files Loosely based on freedreno driver. 
Signed-off-by: Samuel Pitoiset --- src/gallium/drivers/nouveau/Makefile.sources | 2 + src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 69 ++++++------- src/gallium/drivers/nouveau/nvc0/nvc0_query.h | 58 +++-------- .../drivers/nouveau/nvc0/nvc0_query_sw.c | 98 +++++++++++++++++++ .../drivers/nouveau/nvc0/nvc0_query_sw.h | 61 ++++++++++++ 5 files changed, 204 insertions(+), 84 deletions(-) create mode 100644 src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c create mode 100644 src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources index 350837cd694..0e1cb19bed7 100644 --- a/src/gallium/drivers/nouveau/Makefile.sources +++ b/src/gallium/drivers/nouveau/Makefile.sources @@ -152,6 +152,8 @@ NVC0_C_SOURCES := \ nvc0/nvc0_program.h \ nvc0/nvc0_query.c \ nvc0/nvc0_query.h \ + nvc0/nvc0_query_sw.c \ + nvc0/nvc0_query_sw.h \ nvc0/nvc0_resource.c \ nvc0/nvc0_resource.h \ nvc0/nvc0_screen.c \ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index 69e9cdb8714..9409bdea81f 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -26,6 +26,7 @@ #include "nvc0/nvc0_context.h" #include "nvc0/nvc0_query.h" +#include "nvc0/nvc0_query_sw.h" #include "nv_object.xml.h" #include "nvc0/nve4_compute.xml.h" @@ -52,16 +53,16 @@ nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size) if (q->bo) { nouveau_bo_ref(NULL, &q->bo); - if (q->u.mm) { + if (q->mm) { if (q->state == NVC0_QUERY_STATE_READY) - nouveau_mm_free(q->u.mm); + nouveau_mm_free(q->mm); else nouveau_fence_work(screen->base.fence.current, - nouveau_mm_free_work, q->u.mm); + nouveau_mm_free_work, q->mm); } } if (size) { - q->u.mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base); + q->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base); if (!q->bo) return false; 
q->offset = q->base; @@ -91,6 +92,10 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index) struct nvc0_query *q; unsigned space = NVC0_QUERY_ALLOC_SPACE; + q = nvc0_sw_create_query(nvc0, type, index); + if (q) + return (struct pipe_query *)q; + q = CALLOC_STRUCT(nvc0_query); if (!q) return NULL; @@ -126,14 +131,6 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index) space = 16; break; default: -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (type >= NVC0_QUERY_DRV_STAT(0) && type <= NVC0_QUERY_DRV_STAT_LAST) { - space = 0; - q->is64bit = true; - q->index = type - NVC0_QUERY_DRV_STAT(0); - break; - } else -#endif if (nvc0->screen->base.device->drm_version >= 0x01000101) { if (type >= NVE4_HW_SM_QUERY(0) && type <= NVE4_HW_SM_QUERY_LAST) { /* for each MP: @@ -295,12 +292,9 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq) break; default: #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (q->type >= NVC0_QUERY_DRV_STAT(0) && - q->type <= NVC0_QUERY_DRV_STAT_LAST) { - if (q->index >= 5) - q->u.value = nvc0->screen->base.stats.v[q->index]; - else - q->u.value = 0; + if (q->type >= NVC0_SW_QUERY_DRV_STAT(0) && + q->type <= NVC0_SW_QUERY_DRV_STAT_LAST) { + return q->funcs->begin_query(nvc0, q); } else #endif if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || @@ -382,9 +376,9 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq) break; default: #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (q->type >= NVC0_QUERY_DRV_STAT(0) && - q->type <= NVC0_QUERY_DRV_STAT_LAST) { - q->u.value = nvc0->screen->base.stats.v[q->index] - q->u.value; + if (q->type >= NVC0_SW_QUERY_DRV_STAT(0) && + q->type <= NVC0_SW_QUERY_DRV_STAT_LAST) { + q->funcs->end_query(nvc0, q); return; } else #endif @@ -423,10 +417,9 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, unsigned i; #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (q->type >= NVC0_QUERY_DRV_STAT(0) && - q->type <= 
NVC0_QUERY_DRV_STAT_LAST) { - res64[0] = q->u.value; - return true; + if (q->type >= NVC0_SW_QUERY_DRV_STAT(0) && + q->type <= NVC0_SW_QUERY_DRV_STAT_LAST) { + return q->funcs->get_query_result(nvc0, q, wait, result); } else #endif if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || @@ -600,7 +593,7 @@ nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push, #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS -static const char *nvc0_drv_stat_names[] = +static const char *nvc0_sw_query_drv_stat_names[] = { "drv-tex_obj_current_count", "drv-tex_obj_current_bytes", @@ -1357,7 +1350,7 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, struct nvc0_screen *screen = nvc0_screen(pscreen); int count = 0; - count += NVC0_QUERY_DRV_STAT_COUNT; + count += NVC0_SW_QUERY_DRV_STAT_COUNT; if (screen->base.device->drm_version >= 0x01000101) { if (screen->compute) { @@ -1382,29 +1375,29 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, info->group_id = -1; #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (id < NVC0_QUERY_DRV_STAT_COUNT) { - info->name = nvc0_drv_stat_names[id]; - info->query_type = NVC0_QUERY_DRV_STAT(id); + if (id < NVC0_SW_QUERY_DRV_STAT_COUNT) { + info->name = nvc0_sw_query_drv_stat_names[id]; + info->query_type = NVC0_SW_QUERY_DRV_STAT(id); info->max_value.u64 = 0; if (strstr(info->name, "bytes")) info->type = PIPE_DRIVER_QUERY_TYPE_BYTES; - info->group_id = NVC0_QUERY_DRV_STAT_GROUP; + info->group_id = NVC0_SW_QUERY_DRV_STAT_GROUP; return 1; } else #endif if (id < count) { if (screen->compute) { if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT]; - info->query_type = NVE4_HW_SM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT); + info->name = nve4_pm_query_names[id - NVC0_SW_QUERY_DRV_STAT_COUNT]; + info->query_type = NVE4_HW_SM_QUERY(id - NVC0_SW_QUERY_DRV_STAT_COUNT); info->max_value.u64 = (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 
0 : 100; info->group_id = NVC0_QUERY_MP_COUNTER_GROUP; return 1; } else if (screen->base.class_3d < NVE4_3D_CLASS) { - info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT]; - info->query_type = NVC0_HW_SM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT); + info->name = nvc0_pm_query_names[id - NVC0_SW_QUERY_DRV_STAT_COUNT]; + info->query_type = NVC0_HW_SM_QUERY(id - NVC0_SW_QUERY_DRV_STAT_COUNT); info->group_id = NVC0_QUERY_MP_COUNTER_GROUP; return 1; } @@ -1466,11 +1459,11 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, } } #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - else if (id == NVC0_QUERY_DRV_STAT_GROUP) { + else if (id == NVC0_SW_QUERY_DRV_STAT_GROUP) { info->name = "Driver statistics"; info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_CPU; - info->max_active_queries = NVC0_QUERY_DRV_STAT_COUNT; - info->num_queries = NVC0_QUERY_DRV_STAT_COUNT; + info->max_active_queries = NVC0_SW_QUERY_DRV_STAT_COUNT; + info->num_queries = NVC0_SW_QUERY_DRV_STAT_COUNT; return 1; } #endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h index 65240a125b7..dfb2fe3c749 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h @@ -9,8 +9,18 @@ #define NVC0_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) struct nvc0_context; +struct nvc0_query; + +struct nvc0_query_funcs { + void (*destroy_query)(struct nvc0_context *, struct nvc0_query *); + boolean (*begin_query)(struct nvc0_context *, struct nvc0_query *); + void (*end_query)(struct nvc0_context *, struct nvc0_query *); + boolean (*get_query_result)(struct nvc0_context *, struct nvc0_query *, + boolean, union pipe_query_result *); +}; struct nvc0_query { + const struct nvc0_query_funcs *funcs; uint32_t *data; uint16_t type; uint16_t index; @@ -23,10 +33,7 @@ struct nvc0_query { boolean is64bit; uint8_t rotate; int nesting; /* only used for occlusion queries */ - union { - struct nouveau_mm_allocation *mm; 
- uint64_t value; - } u; + struct nouveau_mm_allocation *mm; struct nouveau_fence *fence; }; @@ -40,7 +47,7 @@ nvc0_query(struct pipe_query *pipe) * Driver queries groups: */ #define NVC0_QUERY_MP_COUNTER_GROUP 0 -#define NVC0_QUERY_DRV_STAT_GROUP 1 +#define NVC0_SW_QUERY_DRV_STAT_GROUP 1 /* * Performance counter queries: @@ -139,47 +146,6 @@ enum nvc0_pm_queries NVC0_HW_SM_QUERY_COUNT }; -/* - * Driver statistics queries: - */ -#define NVC0_QUERY_DRV_STAT(i) (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i)) -#define NVC0_QUERY_DRV_STAT_LAST NVC0_QUERY_DRV_STAT(NVC0_QUERY_DRV_STAT_COUNT - 1) -enum nvc0_drv_stats_queries -{ -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT = 0, - NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES, - NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT, - NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID, - NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS, - NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_READ, - NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE, - NVC0_QUERY_DRV_STAT_TEX_COPY_COUNT, - NVC0_QUERY_DRV_STAT_TEX_BLIT_COUNT, - NVC0_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT, - NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_READ, - NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE, - NVC0_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID, - NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT, - NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID, - NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS, - NVC0_QUERY_DRV_STAT_BUF_COPY_BYTES, - NVC0_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT, - NVC0_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT, - NVC0_QUERY_DRV_STAT_QUERY_SYNC_COUNT, - NVC0_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT, - NVC0_QUERY_DRV_STAT_DRAW_CALLS_ARRAY, - NVC0_QUERY_DRV_STAT_DRAW_CALLS_INDEXED, - NVC0_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT, - NVC0_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES, - NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT, - NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES, - NVC0_QUERY_DRV_STAT_PUSHBUF_COUNT, - NVC0_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT, -#endif - 
NVC0_QUERY_DRV_STAT_COUNT -}; - void nvc0_init_query_functions(struct nvc0_context *); void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *, struct nvc0_query *, unsigned); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c new file mode 100644 index 00000000000..5f33b1e019a --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c @@ -0,0 +1,98 @@ +/* + * Copyright 2011 Christoph Bumiller + * Copyright 2015 Samuel Pitoiset + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "nvc0/nvc0_context.h" + +#include "nvc0_query_sw.h" + +static void +nvc0_sw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nvc0_sw_query *sq = nvc0_sw_query(q); + FREE(sq); +} + +static boolean +nvc0_sw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + struct nvc0_sw_query *sq = nvc0_sw_query(q); + + if (q->index >= 5) { + sq->value = nvc0->screen->base.stats.v[q->index]; + } else { + sq->value = 0; + } +#endif + return true; +} + +static void +nvc0_sw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + struct nvc0_sw_query *sq = nvc0_sw_query(q); + sq->value = nvc0->screen->base.stats.v[q->index] - sq->value; +#endif +} + +static boolean +nvc0_sw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, + boolean wait, union pipe_query_result *result) +{ +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + struct nvc0_sw_query *sq = nvc0_sw_query(q); + uint64_t *res64 = (uint64_t *)result; + + res64[0] = sq->value; +#endif + return true; +} + +static const struct nvc0_query_funcs sw_query_funcs = { + .destroy_query = nvc0_sw_destroy_query, + .begin_query = nvc0_sw_begin_query, + .end_query = nvc0_sw_end_query, + .get_query_result = nvc0_sw_get_query_result, +}; + +struct nvc0_query * +nvc0_sw_create_query(struct nvc0_context *nvcO, unsigned type, unsigned index) +{ + struct nvc0_sw_query *sq; + struct nvc0_query *q; + + if (type < NVC0_SW_QUERY_DRV_STAT(0) || type > NVC0_SW_QUERY_DRV_STAT_LAST) + return NULL; + + sq = CALLOC_STRUCT(nvc0_sw_query); + if (!sq) + return NULL; + + q = &sq->base; + q->funcs = &sw_query_funcs; + q->type = type; + q->index = type - NVC0_SW_QUERY_DRV_STAT(0); + + return q; +} diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h new file mode 100644 index 00000000000..71d23d9b41e --- /dev/null +++ 
b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h @@ -0,0 +1,61 @@ +#ifndef __NVC0_QUERY_SW_H__ +#define __NVC0_QUERY_SW_H__ + +#include "nvc0_query.h" + +struct nvc0_sw_query { + struct nvc0_query base; + uint64_t value; +}; + +static inline struct nvc0_sw_query * +nvc0_sw_query(struct nvc0_query *q) +{ + return (struct nvc0_sw_query *)q; +} + +/* + * Driver statistics queries: + */ +#define NVC0_SW_QUERY_DRV_STAT(i) (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i)) +#define NVC0_SW_QUERY_DRV_STAT_LAST NVC0_SW_QUERY_DRV_STAT(NVC0_SW_QUERY_DRV_STAT_COUNT - 1) +enum nvc0_sw_query_drv_stat +{ +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + NVC0_SW_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT = 0, + NVC0_SW_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES, + NVC0_SW_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT, + NVC0_SW_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID, + NVC0_SW_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS, + NVC0_SW_QUERY_DRV_STAT_TEX_TRANSFERS_READ, + NVC0_SW_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE, + NVC0_SW_QUERY_DRV_STAT_TEX_COPY_COUNT, + NVC0_SW_QUERY_DRV_STAT_TEX_BLIT_COUNT, + NVC0_SW_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT, + NVC0_SW_QUERY_DRV_STAT_BUF_TRANSFERS_READ, + NVC0_SW_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE, + NVC0_SW_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID, + NVC0_SW_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT, + NVC0_SW_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID, + NVC0_SW_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS, + NVC0_SW_QUERY_DRV_STAT_BUF_COPY_BYTES, + NVC0_SW_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT, + NVC0_SW_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT, + NVC0_SW_QUERY_DRV_STAT_QUERY_SYNC_COUNT, + NVC0_SW_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT, + NVC0_SW_QUERY_DRV_STAT_DRAW_CALLS_ARRAY, + NVC0_SW_QUERY_DRV_STAT_DRAW_CALLS_INDEXED, + NVC0_SW_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT, + NVC0_SW_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES, + NVC0_SW_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT, + NVC0_SW_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES, + NVC0_SW_QUERY_DRV_STAT_PUSHBUF_COUNT, + 
NVC0_SW_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT, +#endif + NVC0_SW_QUERY_DRV_STAT_COUNT +}; + +struct nvc0_query * +nvc0_sw_create_query(struct nvc0_context *, unsigned, unsigned); + +#endif From 224fec05eac136d734e6ae06f6aab44d5ba640df Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Sun, 4 Oct 2015 18:28:55 +0200 Subject: [PATCH 013/270] nvc0: move HW queries to nvc0_query_hw.c/h files Signed-off-by: Samuel Pitoiset --- src/gallium/drivers/nouveau/Makefile.sources | 2 + src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 1122 +--------------- src/gallium/drivers/nouveau/nvc0/nvc0_query.h | 115 -- .../drivers/nouveau/nvc0/nvc0_query_hw.c | 1135 +++++++++++++++++ .../drivers/nouveau/nvc0/nvc0_query_hw.h | 138 ++ .../drivers/nouveau/nvc0/nvc0_shader_state.c | 5 +- src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 3 +- src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c | 5 +- 8 files changed, 1310 insertions(+), 1215 deletions(-) create mode 100644 src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c create mode 100644 src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources index 0e1cb19bed7..e45c564f431 100644 --- a/src/gallium/drivers/nouveau/Makefile.sources +++ b/src/gallium/drivers/nouveau/Makefile.sources @@ -152,6 +152,8 @@ NVC0_C_SOURCES := \ nvc0/nvc0_program.h \ nvc0/nvc0_query.c \ nvc0/nvc0_query.h \ + nvc0/nvc0_query_hw.c \ + nvc0/nvc0_query_hw.h \ nvc0/nvc0_query_sw.c \ nvc0/nvc0_query_sw.h \ nvc0/nvc0_resource.c \ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index 9409bdea81f..a0ca3fa533c 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -27,485 +27,48 @@ #include "nvc0/nvc0_context.h" #include "nvc0/nvc0_query.h" #include "nvc0/nvc0_query_sw.h" - -#include "nv_object.xml.h" -#include "nvc0/nve4_compute.xml.h" -#include "nvc0/nvc0_compute.xml.h" - 
-#define NVC0_QUERY_STATE_READY 0 -#define NVC0_QUERY_STATE_ACTIVE 1 -#define NVC0_QUERY_STATE_ENDED 2 -#define NVC0_QUERY_STATE_FLUSHED 3 - -#define NVC0_QUERY_ALLOC_SPACE 256 - -static boolean nvc0_hw_sm_query_begin(struct nvc0_context *, - struct nvc0_query *); -static void nvc0_hw_sm_query_end(struct nvc0_context *, struct nvc0_query *); -static boolean nvc0_hw_sm_query_result(struct nvc0_context *, - struct nvc0_query *, void *, boolean); - -static bool -nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size) -{ - struct nvc0_screen *screen = nvc0->screen; - int ret; - - if (q->bo) { - nouveau_bo_ref(NULL, &q->bo); - if (q->mm) { - if (q->state == NVC0_QUERY_STATE_READY) - nouveau_mm_free(q->mm); - else - nouveau_fence_work(screen->base.fence.current, - nouveau_mm_free_work, q->mm); - } - } - if (size) { - q->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base); - if (!q->bo) - return false; - q->offset = q->base; - - ret = nouveau_bo_map(q->bo, 0, screen->base.client); - if (ret) { - nvc0_query_allocate(nvc0, q, 0); - return false; - } - q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base); - } - return true; -} - -static void -nvc0_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) -{ - nvc0_query_allocate(nvc0_context(pipe), nvc0_query(pq), 0); - nouveau_fence_ref(NULL, &nvc0_query(pq)->fence); - FREE(nvc0_query(pq)); -} +#include "nvc0/nvc0_query_hw.h" static struct pipe_query * -nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index) +nvc0_create_query(struct pipe_context *pipe, unsigned type, unsigned index) { struct nvc0_context *nvc0 = nvc0_context(pipe); struct nvc0_query *q; - unsigned space = NVC0_QUERY_ALLOC_SPACE; q = nvc0_sw_create_query(nvc0, type, index); - if (q) - return (struct pipe_query *)q; - - q = CALLOC_STRUCT(nvc0_query); if (!q) - return NULL; - - switch (type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - q->rotate = 32; - 
space = NVC0_QUERY_ALLOC_SPACE; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - q->is64bit = true; - space = 512; - break; - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - q->is64bit = true; - space = 64; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_PRIMITIVES_EMITTED: - q->is64bit = true; - q->index = index; - space = 32; - break; - case PIPE_QUERY_TIME_ELAPSED: - case PIPE_QUERY_TIMESTAMP: - case PIPE_QUERY_TIMESTAMP_DISJOINT: - case PIPE_QUERY_GPU_FINISHED: - space = 32; - break; - case NVC0_QUERY_TFB_BUFFER_OFFSET: - space = 16; - break; - default: - if (nvc0->screen->base.device->drm_version >= 0x01000101) { - if (type >= NVE4_HW_SM_QUERY(0) && type <= NVE4_HW_SM_QUERY_LAST) { - /* for each MP: - * [00] = WS0.C0 - * [04] = WS0.C1 - * [08] = WS0.C2 - * [0c] = WS0.C3 - * [10] = WS1.C0 - * [14] = WS1.C1 - * [18] = WS1.C2 - * [1c] = WS1.C3 - * [20] = WS2.C0 - * [24] = WS2.C1 - * [28] = WS2.C2 - * [2c] = WS2.C3 - * [30] = WS3.C0 - * [34] = WS3.C1 - * [38] = WS3.C2 - * [3c] = WS3.C3 - * [40] = MP.C4 - * [44] = MP.C5 - * [48] = MP.C6 - * [4c] = MP.C7 - * [50] = WS0.sequence - * [54] = WS1.sequence - * [58] = WS2.sequence - * [5c] = WS3.sequence - */ - space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t); - break; - } else - if (type >= NVC0_HW_SM_QUERY(0) && type <= NVC0_HW_SM_QUERY_LAST) { - /* for each MP: - * [00] = MP.C0 - * [04] = MP.C1 - * [08] = MP.C2 - * [0c] = MP.C3 - * [10] = MP.C4 - * [14] = MP.C5 - * [18] = MP.C6 - * [1c] = MP.C7 - * [20] = MP.sequence - */ - space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t); - break; - } - } - debug_printf("invalid query type: %u\n", type); - FREE(q); - return NULL; - } - if (!nvc0_query_allocate(nvc0, q, space)) { - FREE(q); - return NULL; - } - - q->type = type; - - if (q->rotate) { - /* we advance before query_begin ! 
*/ - q->offset -= q->rotate; - q->data -= q->rotate / sizeof(*q->data); - } else - if (!q->is64bit) - q->data[0] = 0; /* initialize sequence */ + q = nvc0_hw_create_query(nvc0, type, index); return (struct pipe_query *)q; } static void -nvc0_query_get(struct nouveau_pushbuf *push, struct nvc0_query *q, - unsigned offset, uint32_t get) +nvc0_destroy_query(struct pipe_context *pipe, struct pipe_query *pq) { - offset += q->offset; - - PUSH_SPACE(push, 5); - PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR); - BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4); - PUSH_DATAh(push, q->bo->offset + offset); - PUSH_DATA (push, q->bo->offset + offset); - PUSH_DATA (push, q->sequence); - PUSH_DATA (push, get); -} - -static void -nvc0_query_rotate(struct nvc0_context *nvc0, struct nvc0_query *q) -{ - q->offset += q->rotate; - q->data += q->rotate / sizeof(*q->data); - if (q->offset - q->base == NVC0_QUERY_ALLOC_SPACE) - nvc0_query_allocate(nvc0, q, NVC0_QUERY_ALLOC_SPACE); + struct nvc0_query *q = nvc0_query(pq); + q->funcs->destroy_query(nvc0_context(pipe), q); } static boolean -nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq) +nvc0_begin_query(struct pipe_context *pipe, struct pipe_query *pq) { - struct nvc0_context *nvc0 = nvc0_context(pipe); - struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_query *q = nvc0_query(pq); - bool ret = true; - - /* For occlusion queries we have to change the storage, because a previous - * query might set the initial render conition to false even *after* we re- - * initialized it to true. - */ - if (q->rotate) { - nvc0_query_rotate(nvc0, q); - - /* XXX: can we do this with the GPU, and sync with respect to a previous - * query ? 
- */ - q->data[0] = q->sequence; /* initialize sequence */ - q->data[1] = 1; /* initial render condition = true */ - q->data[4] = q->sequence + 1; /* for comparison COND_MODE */ - q->data[5] = 0; - } - q->sequence++; - - switch (q->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - q->nesting = nvc0->screen->num_occlusion_queries_active++; - if (q->nesting) { - nvc0_query_get(push, q, 0x10, 0x0100f002); - } else { - PUSH_SPACE(push, 3); - BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1); - PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT); - IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1); - } - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - nvc0_query_get(push, q, 0x10, 0x09005002 | (q->index << 5)); - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - nvc0_query_get(push, q, 0x10, 0x05805002 | (q->index << 5)); - break; - case PIPE_QUERY_SO_STATISTICS: - nvc0_query_get(push, q, 0x20, 0x05805002 | (q->index << 5)); - nvc0_query_get(push, q, 0x30, 0x06805002 | (q->index << 5)); - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - nvc0_query_get(push, q, 0x10, 0x03005002 | (q->index << 5)); - break; - case PIPE_QUERY_TIME_ELAPSED: - nvc0_query_get(push, q, 0x10, 0x00005002); - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - nvc0_query_get(push, q, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */ - nvc0_query_get(push, q, 0xc0 + 0x10, 0x01801002); /* VFETCH, PRIMS */ - nvc0_query_get(push, q, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */ - nvc0_query_get(push, q, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */ - nvc0_query_get(push, q, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */ - nvc0_query_get(push, q, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */ - nvc0_query_get(push, q, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */ - nvc0_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */ - nvc0_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */ - nvc0_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */ - break; - 
default: -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (q->type >= NVC0_SW_QUERY_DRV_STAT(0) && - q->type <= NVC0_SW_QUERY_DRV_STAT_LAST) { - return q->funcs->begin_query(nvc0, q); - } else -#endif - if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || - (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) { - ret = nvc0_hw_sm_query_begin(nvc0, q); - } - break; - } - q->state = NVC0_QUERY_STATE_ACTIVE; - return ret; + return q->funcs->begin_query(nvc0_context(pipe), q); } static void -nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq) +nvc0_end_query(struct pipe_context *pipe, struct pipe_query *pq) { - struct nvc0_context *nvc0 = nvc0_context(pipe); - struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_query *q = nvc0_query(pq); - - if (q->state != NVC0_QUERY_STATE_ACTIVE) { - /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */ - if (q->rotate) - nvc0_query_rotate(nvc0, q); - q->sequence++; - } - q->state = NVC0_QUERY_STATE_ENDED; - - switch (q->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - nvc0_query_get(push, q, 0, 0x0100f002); - if (--nvc0->screen->num_occlusion_queries_active == 0) { - PUSH_SPACE(push, 1); - IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0); - } - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - nvc0_query_get(push, q, 0, 0x09005002 | (q->index << 5)); - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - nvc0_query_get(push, q, 0, 0x05805002 | (q->index << 5)); - break; - case PIPE_QUERY_SO_STATISTICS: - nvc0_query_get(push, q, 0x00, 0x05805002 | (q->index << 5)); - nvc0_query_get(push, q, 0x10, 0x06805002 | (q->index << 5)); - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - /* TODO: How do we sum over all streams for render condition ? 
*/ - /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */ - nvc0_query_get(push, q, 0x00, 0x03005002 | (q->index << 5)); - nvc0_query_get(push, q, 0x20, 0x00005002); - break; - case PIPE_QUERY_TIMESTAMP: - case PIPE_QUERY_TIME_ELAPSED: - nvc0_query_get(push, q, 0, 0x00005002); - break; - case PIPE_QUERY_GPU_FINISHED: - nvc0_query_get(push, q, 0, 0x1000f010); - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - nvc0_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */ - nvc0_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */ - nvc0_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */ - nvc0_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */ - nvc0_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */ - nvc0_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */ - nvc0_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */ - nvc0_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */ - nvc0_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */ - nvc0_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */ - break; - case NVC0_QUERY_TFB_BUFFER_OFFSET: - /* indexed by TFB buffer instead of by vertex stream */ - nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5)); - break; - case PIPE_QUERY_TIMESTAMP_DISJOINT: - /* This query is not issued on GPU because disjoint is forced to false */ - q->state = NVC0_QUERY_STATE_READY; - break; - default: -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (q->type >= NVC0_SW_QUERY_DRV_STAT(0) && - q->type <= NVC0_SW_QUERY_DRV_STAT_LAST) { - q->funcs->end_query(nvc0, q); - return; - } else -#endif - if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || - (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) { - nvc0_hw_sm_query_end(nvc0, q); - } - break; - } - if (q->is64bit) - nouveau_fence_ref(nvc0->screen->base.fence.current, &q->fence); -} - -static inline void -nvc0_query_update(struct nouveau_client *cli, struct nvc0_query *q) -{ 
- if (q->is64bit) { - if (nouveau_fence_signalled(q->fence)) - q->state = NVC0_QUERY_STATE_READY; - } else { - if (q->data[0] == q->sequence) - q->state = NVC0_QUERY_STATE_READY; - } + q->funcs->end_query(nvc0_context(pipe), q); } static boolean -nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq, - boolean wait, union pipe_query_result *result) +nvc0_get_query_result(struct pipe_context *pipe, struct pipe_query *pq, + boolean wait, union pipe_query_result *result) { - struct nvc0_context *nvc0 = nvc0_context(pipe); struct nvc0_query *q = nvc0_query(pq); - uint64_t *res64 = (uint64_t*)result; - uint32_t *res32 = (uint32_t*)result; - uint8_t *res8 = (uint8_t*)result; - uint64_t *data64 = (uint64_t *)q->data; - unsigned i; - -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (q->type >= NVC0_SW_QUERY_DRV_STAT(0) && - q->type <= NVC0_SW_QUERY_DRV_STAT_LAST) { - return q->funcs->get_query_result(nvc0, q, wait, result); - } else -#endif - if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || - (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) { - return nvc0_hw_sm_query_result(nvc0, q, result, wait); - } - - if (q->state != NVC0_QUERY_STATE_READY) - nvc0_query_update(nvc0->screen->base.client, q); - - if (q->state != NVC0_QUERY_STATE_READY) { - if (!wait) { - if (q->state != NVC0_QUERY_STATE_FLUSHED) { - q->state = NVC0_QUERY_STATE_FLUSHED; - /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */ - PUSH_KICK(nvc0->base.pushbuf); - } - return false; - } - if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->screen->base.client)) - return false; - NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1); - } - q->state = NVC0_QUERY_STATE_READY; - - switch (q->type) { - case PIPE_QUERY_GPU_FINISHED: - res8[0] = true; - break; - case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */ - res64[0] = q->data[1] - q->data[5]; - break; - case PIPE_QUERY_OCCLUSION_PREDICATE: - res8[0] = q->data[1] != 
q->data[5]; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */ - case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */ - res64[0] = data64[0] - data64[2]; - break; - case PIPE_QUERY_SO_STATISTICS: - res64[0] = data64[0] - data64[4]; - res64[1] = data64[2] - data64[6]; - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - res8[0] = data64[0] != data64[2]; - break; - case PIPE_QUERY_TIMESTAMP: - res64[0] = data64[1]; - break; - case PIPE_QUERY_TIMESTAMP_DISJOINT: - res64[0] = 1000000000; - res8[8] = false; - break; - case PIPE_QUERY_TIME_ELAPSED: - res64[0] = data64[1] - data64[3]; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - for (i = 0; i < 10; ++i) - res64[i] = data64[i * 2] - data64[24 + i * 2]; - break; - case NVC0_QUERY_TFB_BUFFER_OFFSET: - res32[0] = q->data[1]; - break; - default: - assert(0); /* can't happen, we don't create queries with invalid type */ - return false; - } - - return true; -} - -void -nvc0_query_fifo_wait(struct nouveau_pushbuf *push, struct nvc0_query *q) -{ - unsigned offset = q->offset; - - if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 0x20; - - PUSH_SPACE(push, 5); - PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD); - BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4); - PUSH_DATAh(push, q->bo->offset + offset); - PUSH_DATA (push, q->bo->offset + offset); - PUSH_DATA (push, q->sequence); - PUSH_DATA (push, (1 << 12) | - NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL); + return q->funcs->get_query_result(nvc0_context(pipe), q, wait, result); } static void @@ -515,7 +78,8 @@ nvc0_render_condition(struct pipe_context *pipe, { struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = nvc0->base.pushbuf; - struct nvc0_query *q; + struct nvc0_query *q = nvc0_query(pq); + struct nvc0_hw_query *hq = nvc0_hw_query(q); uint32_t cond; bool wait = mode != PIPE_RENDER_COND_NO_WAIT && @@ -525,7 +89,6 @@ nvc0_render_condition(struct pipe_context *pipe, cond = 
NVC0_3D_COND_MODE_ALWAYS; } else { - q = nvc0_query(pq); /* NOTE: comparison of 2 queries only works if both have completed */ switch (q->type) { case PIPE_QUERY_SO_OVERFLOW_PREDICATE: @@ -536,7 +99,7 @@ nvc0_render_condition(struct pipe_context *pipe, case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: if (likely(!condition)) { - if (unlikely(q->nesting)) + if (unlikely(hq->nesting)) cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL : NVC0_3D_COND_MODE_ALWAYS; else @@ -564,29 +127,17 @@ nvc0_render_condition(struct pipe_context *pipe, } if (wait) - nvc0_query_fifo_wait(push, q); + nvc0_hw_query_fifo_wait(push, q); PUSH_SPACE(push, 7); - PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD); + PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD); BEGIN_NVC0(push, NVC0_3D(COND_ADDRESS_HIGH), 3); - PUSH_DATAh(push, q->bo->offset + q->offset); - PUSH_DATA (push, q->bo->offset + q->offset); + PUSH_DATAh(push, hq->bo->offset + hq->offset); + PUSH_DATA (push, hq->bo->offset + hq->offset); PUSH_DATA (push, cond); BEGIN_NVC0(push, NVC0_2D(COND_ADDRESS_HIGH), 2); - PUSH_DATAh(push, q->bo->offset + q->offset); - PUSH_DATA (push, q->bo->offset + q->offset); -} - -void -nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push, - struct nvc0_query *q, unsigned result_offset) -{ -#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8)) - - PUSH_REFN(push, q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART); - nouveau_pushbuf_space(push, 0, 0, 1); - nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 | - NVC0_IB_ENTRY_1_NO_PREFETCH); + PUSH_DATAh(push, hq->bo->offset + hq->offset); + PUSH_DATA (push, hq->bo->offset + hq->offset); } /* === DRIVER STATISTICS === */ @@ -628,95 +179,8 @@ static const char *nvc0_sw_query_drv_stat_names[] = #endif /* NOUVEAU_ENABLE_DRIVER_STATISTICS */ - /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ -/* Code to read out MP counters: They are accessible via mmio, too, but let's - * just avoid mapping registers in userspace. 
We'd have to know which MPs are - * enabled/present, too, and that information is not presently exposed. - * We could add a kernel interface for it, but reading the counters like this - * has the advantage of being async (if get_result isn't called immediately). - */ -static const uint64_t nve4_read_hw_sm_counters_code[] = -{ - /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20 - * mov b32 $r8 $tidx - * mov b32 $r12 $physid - * mov b32 $r0 $pm0 - * mov b32 $r1 $pm1 - * mov b32 $r2 $pm2 - * mov b32 $r3 $pm3 - * mov b32 $r4 $pm4 - * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b - * mov b32 $r5 $pm5 - * mov b32 $r6 $pm6 - * mov b32 $r7 $pm7 - * set $p0 0x1 eq u32 $r8 0x0 - * mov b32 $r10 c0[0x0] - * ext u32 $r8 $r12 0x414 - * mov b32 $r11 c0[0x4] - * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04 - * ext u32 $r9 $r12 0x208 - * (not $p0) exit - * set $p1 0x1 eq u32 $r9 0x0 - * mul $r8 u32 $r8 u32 96 - * mul $r12 u32 $r9 u32 16 - * mul $r13 u32 $r9 u32 4 - * add b32 $r9 $r8 $r13 - * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c - * add b32 $r8 $r8 $r12 - * mov b32 $r12 $r10 - * add b32 $r10 $c $r10 $r8 - * mov b32 $r13 $r11 - * add b32 $r11 $r11 0x0 $c - * add b32 $r12 $c $r12 $r9 - * st b128 wt g[$r10d] $r0q - * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00 - * mov b32 $r0 c0[0x8] - * add b32 $r13 $r13 0x0 $c - * $p1 st b128 wt g[$r12d+0x40] $r4q - * st b32 wt g[$r12d+0x50] $r0 - * exit */ - 0x2202020202020207ULL, - 0x2c00000084021c04ULL, - 0x2c0000000c031c04ULL, - 0x2c00000010001c04ULL, - 0x2c00000014005c04ULL, - 0x2c00000018009c04ULL, - 0x2c0000001c00dc04ULL, - 0x2c00000020011c04ULL, - 0x22b0420042320207ULL, - 0x2c00000024015c04ULL, - 0x2c00000028019c04ULL, - 0x2c0000002c01dc04ULL, - 0x190e0000fc81dc03ULL, - 0x2800400000029de4ULL, - 0x7000c01050c21c03ULL, - 0x280040001002dde4ULL, - 0x204282020042e047ULL, - 0x7000c00820c25c03ULL, - 0x80000000000021e7ULL, - 0x190e0000fc93dc03ULL, - 0x1000000180821c02ULL, - 0x1000000040931c02ULL, - 0x1000000010935c02ULL, - 0x4800000034825c03ULL, - 
0x22c042c042c04287ULL, - 0x4800000030821c03ULL, - 0x2800000028031de4ULL, - 0x4801000020a29c03ULL, - 0x280000002c035de4ULL, - 0x0800000000b2dc42ULL, - 0x4801000024c31c03ULL, - 0x9400000000a01fc5ULL, - 0x200002e04202c047ULL, - 0x2800400020001de4ULL, - 0x0800000000d35c42ULL, - 0x9400000100c107c5ULL, - 0x9400000140c01f85ULL, - 0x8000000000001de7ULL -}; - /* NOTE: intentionally using the same names as NV */ static const char *nve4_pm_query_names[] = { @@ -773,168 +237,7 @@ static const char *nve4_pm_query_names[] = "metric-inst_replay_overhead" /* inst_issued, inst_executed */ }; -/* For simplicity, we will allocate as many group slots as we allocate counter - * slots. This means that a single counter which wants to source from 2 groups - * will have to be declared as using 2 counter slots. This shouldn't really be - * a problem because such queries don't make much sense ... (unless someone is - * really creative). - */ -struct nvc0_mp_counter_cfg -{ - uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */ - uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */ - uint32_t num_src : 3; /* number of sources (1 - 6, only for NVC0:NVE4) */ - uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */ - uint32_t sig_sel : 8; /* signal group */ - uint64_t src_sel; /* signal selection for up to 6 sources (48 bit) */ -}; - -#define NVC0_COUNTER_OPn_SUM 0 -#define NVC0_COUNTER_OPn_OR 1 -#define NVC0_COUNTER_OPn_AND 2 -#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */ -#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */ -#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */ -#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */ - -struct nvc0_hw_sm_query_cfg -{ - struct nvc0_mp_counter_cfg ctr[4]; - uint8_t num_counters; - uint8_t op; - uint8_t norm[2]; /* normalization num,denom */ -}; - -#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, 
NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } -#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } -#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ - { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \ - {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } -#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \ - { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ - {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } -#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ - { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ - {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } - -/* NOTES: - * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps - * inst_executed etc.: we only count a single warp scheduler - * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers; - * this is inaccurate ! 
- */ -static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = -{ - _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1), - _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1), - _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1), - _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1), - _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1), - _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1), - _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1), - _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1), - _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1), - _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1), - _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1), - _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1), - _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1), - _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1), - _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1), - _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1), - _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1), - _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1), - _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1), - _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1), - _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1), - _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1), - _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1), - _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1), - _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1), - _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1), - _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1), - _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1), - _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1), - _Q1A(PROF_TRIGGER_4, 0x0001, B6, 
USER, 0x00000010, 1, 1), - _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1), - _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1), - _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1), - _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1), - _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1), - _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1), - _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1), - _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1), - _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1), - _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1), - _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1), - _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1), - _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1), - _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1), - _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1), - _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64), - _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1), -}; - -#undef _Q1A -#undef _Q1B -#undef _M2A -#undef _M2B - /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ -static const uint64_t nvc0_read_hw_sm_counters_code[] = -{ - /* mov b32 $r8 $tidx - * mov b32 $r9 $physid - * mov b32 $r0 $pm0 - * mov b32 $r1 $pm1 - * mov b32 $r2 $pm2 - * mov b32 $r3 $pm3 - * mov b32 $r4 $pm4 - * mov b32 $r5 $pm5 - * mov b32 $r6 $pm6 - * mov b32 $r7 $pm7 - * set $p0 0x1 eq u32 $r8 0x0 - * mov b32 $r10 c0[0x0] - * mov b32 $r11 c0[0x4] - * ext u32 $r8 $r9 0x414 - * (not $p0) exit - * mul $r8 u32 $r8 u32 36 - * add b32 $r10 $c $r10 $r8 - * add b32 $r11 $r11 0x0 $c - * mov b32 $r8 c0[0x8] - * st b128 wt g[$r10d+0x00] $r0q - * st b128 wt g[$r10d+0x10] $r4q - * st b32 wt g[$r10d+0x20] $r8 - * exit */ - 0x2c00000084021c04ULL, - 
0x2c0000000c025c04ULL, - 0x2c00000010001c04ULL, - 0x2c00000014005c04ULL, - 0x2c00000018009c04ULL, - 0x2c0000001c00dc04ULL, - 0x2c00000020011c04ULL, - 0x2c00000024015c04ULL, - 0x2c00000028019c04ULL, - 0x2c0000002c01dc04ULL, - 0x190e0000fc81dc03ULL, - 0x2800400000029de4ULL, - 0x280040001002dde4ULL, - 0x7000c01050921c03ULL, - 0x80000000000021e7ULL, - 0x1000000090821c02ULL, - 0x4801000020a29c03ULL, - 0x0800000000b2dc42ULL, - 0x2800400020021de4ULL, - 0x9400000000a01fc5ULL, - 0x9400000040a11fc5ULL, - 0x9400000080a21f85ULL, - 0x8000000000001de7ULL -}; - static const char *nvc0_pm_query_names[] = { /* MP counters */ @@ -971,377 +274,6 @@ static const char *nvc0_pm_query_names[] = "warps_launched", }; -#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } } - -static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] = -{ - _Q(ACTIVE_CYCLES, 0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(ACTIVE_WARPS, 0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), - _Q(ATOM_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(BRANCH, 0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00), - _Q(DIVERGENT_BRANCH, 0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00), - _Q(GLD_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(GRED_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(GST_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_EXECUTED, 0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00), - _Q(INST_ISSUED1_0, 0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED1_1, 0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED2_0, 0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED2_1, 
0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(LOCAL_LD, 0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(LOCAL_ST, 0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_0, 0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_1, 0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_2, 0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_3, 0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_4, 0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_5, 0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_6, 0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_7, 0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(SHARED_LD, 0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(SHARED_ST, 0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(THREADS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), - _Q(TH_INST_EXECUTED_0, 0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_1, 0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_2, 0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_3, 0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(WARPS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), -}; - -#undef _Q - -static const struct nvc0_hw_sm_query_cfg * -nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_query *q) -{ - struct nvc0_screen *screen = nvc0->screen; - - if (screen->base.class_3d >= NVE4_3D_CLASS) - return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; - return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)]; -} - -boolean -nvc0_hw_sm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q) -{ - struct 
nvc0_screen *screen = nvc0->screen; - struct nouveau_pushbuf *push = nvc0->base.pushbuf; - const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; - const struct nvc0_hw_sm_query_cfg *cfg; - unsigned i, c; - unsigned num_ab[2] = { 0, 0 }; - - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - - /* check if we have enough free counter slots */ - for (i = 0; i < cfg->num_counters; ++i) - num_ab[cfg->ctr[i].sig_dom]++; - - if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 || - screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) { - NOUVEAU_ERR("Not enough free MP counter slots !\n"); - return false; - } - - assert(cfg->num_counters <= 4); - PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6); - - if (!screen->pm.mp_counters_enabled) { - screen->pm.mp_counters_enabled = true; - BEGIN_NVC0(push, SUBC_SW(0x06ac), 1); - PUSH_DATA (push, 0x1fcb); - } - - /* set sequence field to 0 (used to check if result is available) */ - for (i = 0; i < screen->mp_count; ++i) - q->data[i * 10 + 10] = 0; - - for (i = 0; i < cfg->num_counters; ++i) { - const unsigned d = cfg->ctr[i].sig_dom; - - if (!screen->pm.num_hw_sm_active[d]) { - uint32_t m = (1 << 22) | (1 << (7 + (8 * !d))); - if (screen->pm.num_hw_sm_active[!d]) - m |= 1 << (7 + (8 * d)); - BEGIN_NVC0(push, SUBC_SW(0x0600), 1); - PUSH_DATA (push, m); - } - screen->pm.num_hw_sm_active[d]++; - - for (c = d * 4; c < (d * 4 + 4); ++c) { - if (!screen->pm.mp_counter[c]) { - q->ctr[i] = c; - screen->pm.mp_counter[c] = (struct pipe_query *)q; - break; - } - } - assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */ - - /* configure and reset the counter(s) */ - if (is_nve4) { - if (d == 0) - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1); - else - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1); - PUSH_DATA (push, cfg->ctr[i].sig_sel); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1); - PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3)); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1); - 
PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1); - PUSH_DATA (push, 0); - } else { - unsigned s; - - for (s = 0; s < cfg->ctr[i].num_src; s++) { - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1); - PUSH_DATA (push, cfg->ctr[i].sig_sel); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1); - PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1); - PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1); - PUSH_DATA (push, 0); - } - } - } - return true; -} - -static void -nvc0_hw_sm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q) -{ - struct nvc0_screen *screen = nvc0->screen; - struct pipe_context *pipe = &nvc0->base.pipe; - struct nouveau_pushbuf *push = nvc0->base.pushbuf; - const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; - uint32_t mask; - uint32_t input[3]; - const uint block[3] = { 32, is_nve4 ? 
4 : 1, 1 }; - const uint grid[3] = { screen->mp_count, 1, 1 }; - unsigned c; - const struct nvc0_hw_sm_query_cfg *cfg; - - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - - if (unlikely(!screen->pm.prog)) { - struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); - prog->type = PIPE_SHADER_COMPUTE; - prog->translated = true; - prog->num_gprs = 14; - prog->parm_size = 12; - if (is_nve4) { - prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; - prog->code_size = sizeof(nve4_read_hw_sm_counters_code); - } else { - prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; - prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); - } - screen->pm.prog = prog; - } - - /* disable all counting */ - PUSH_SPACE(push, 8); - for (c = 0; c < 8; ++c) - if (screen->pm.mp_counter[c]) { - if (is_nve4) { - IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0); - } else { - IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0); - } - } - /* release counters for this query */ - for (c = 0; c < 8; ++c) { - if (nvc0_query(screen->pm.mp_counter[c]) == q) { - screen->pm.num_hw_sm_active[c / 4]--; - screen->pm.mp_counter[c] = NULL; - } - } - - BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR, - q->bo); - - PUSH_SPACE(push, 1); - IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0); - - pipe->bind_compute_state(pipe, screen->pm.prog); - input[0] = (q->bo->offset + q->base); - input[1] = (q->bo->offset + q->base) >> 32; - input[2] = q->sequence; - pipe->launch_grid(pipe, block, grid, 0, input); - - nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY); - - /* re-activate other counters */ - PUSH_SPACE(push, 16); - mask = 0; - for (c = 0; c < 8; ++c) { - unsigned i; - q = nvc0_query(screen->pm.mp_counter[c]); - if (!q) - continue; - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - for (i = 0; i < cfg->num_counters; ++i) { - if (mask & (1 << q->ctr[i])) - break; - mask |= 1 << q->ctr[i]; - if (is_nve4) { - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(q->ctr[i])), 1); - } else { - 
BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(q->ctr[i])), 1); - } - PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - } - } -} - -static inline bool -nvc0_hw_sm_query_read_data(uint32_t count[32][4], - struct nvc0_context *nvc0, bool wait, - struct nvc0_query *q, - const struct nvc0_hw_sm_query_cfg *cfg, - unsigned mp_count) -{ - unsigned p, c; - - for (p = 0; p < mp_count; ++p) { - const unsigned b = (0x24 / 4) * p; - - for (c = 0; c < cfg->num_counters; ++c) { - if (q->data[b + 8] != q->sequence) { - if (!wait) - return false; - if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client)) - return false; - } - count[p][c] = q->data[b + q->ctr[c]]; - } - } - return true; -} - -static inline bool -nve4_hw_sm_query_read_data(uint32_t count[32][4], - struct nvc0_context *nvc0, bool wait, - struct nvc0_query *q, - const struct nvc0_hw_sm_query_cfg *cfg, - unsigned mp_count) -{ - unsigned p, c, d; - - for (p = 0; p < mp_count; ++p) { - const unsigned b = (0x60 / 4) * p; - - for (c = 0; c < cfg->num_counters; ++c) { - count[p][c] = 0; - for (d = 0; d < ((q->ctr[c] & ~3) ? 1 : 4); ++d) { - if (q->data[b + 20 + d] != q->sequence) { - if (!wait) - return false; - if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client)) - return false; - } - if (q->ctr[c] & ~0x3) - count[p][c] = q->data[b + 16 + (q->ctr[c] & 3)]; - else - count[p][c] += q->data[b + d * 4 + q->ctr[c]]; - } - } - } - return true; -} - -/* Metric calculations: - * sum(x) ... sum of x over all MPs - * avg(x) ... average of x over all MPs - * - * IPC : sum(inst_executed) / clock - * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued) - * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles) - * MP_EFFICIENCY : avg(active_cycles / clock) - * - * NOTE: Interpretation of IPC requires knowledge of MP count. 
- */ -static boolean -nvc0_hw_sm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, - void *result, boolean wait) -{ - uint32_t count[32][4]; - uint64_t value = 0; - unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32); - unsigned p, c; - const struct nvc0_hw_sm_query_cfg *cfg; - bool ret; - - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - - if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) - ret = nve4_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count); - else - ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count); - if (!ret) - return false; - - if (cfg->op == NVC0_COUNTER_OPn_SUM) { - for (c = 0; c < cfg->num_counters; ++c) - for (p = 0; p < mp_count; ++p) - value += count[p][c]; - value = (value * cfg->norm[0]) / cfg->norm[1]; - } else - if (cfg->op == NVC0_COUNTER_OPn_OR) { - uint32_t v = 0; - for (c = 0; c < cfg->num_counters; ++c) - for (p = 0; p < mp_count; ++p) - v |= count[p][c]; - value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1]; - } else - if (cfg->op == NVC0_COUNTER_OPn_AND) { - uint32_t v = ~0; - for (c = 0; c < cfg->num_counters; ++c) - for (p = 0; p < mp_count; ++p) - v &= count[p][c]; - value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1]; - } else - if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) { - uint64_t v[2] = { 0, 0 }; - for (p = 0; p < mp_count; ++p) { - v[0] += count[p][0]; - v[1] += count[p][1]; - } - if (v[0]) - value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]); - } else - if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) { - for (p = 0; p < mp_count; ++p) - value += count[p][0]; - if (count[0][1]) - value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]); - else - value = 0; - } else - if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) { - unsigned mp_used = 0; - for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0]) - if (count[p][1]) - value += (count[p][0] * cfg->norm[0]) / count[p][1]; - if (mp_used) - value /= (uint64_t)mp_used * cfg->norm[1]; - } else - if (cfg->op == 
NVC0_COUNTER_OP2_AVG_DIV_M0) { - unsigned mp_used = 0; - for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0]) - value += count[p][0]; - if (count[0][1] && mp_used) { - value *= cfg->norm[0]; - value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1]; - } else { - value = 0; - } - } - - *(uint64_t *)result = value; - return true; -} - int nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, unsigned id, @@ -1481,10 +413,10 @@ nvc0_init_query_functions(struct nvc0_context *nvc0) { struct pipe_context *pipe = &nvc0->base.pipe; - pipe->create_query = nvc0_query_create; - pipe->destroy_query = nvc0_query_destroy; - pipe->begin_query = nvc0_query_begin; - pipe->end_query = nvc0_query_end; - pipe->get_query_result = nvc0_query_result; + pipe->create_query = nvc0_create_query; + pipe->destroy_query = nvc0_destroy_query; + pipe->begin_query = nvc0_begin_query; + pipe->end_query = nvc0_end_query; + pipe->get_query_result = nvc0_get_query_result; pipe->render_condition = nvc0_render_condition; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h index dfb2fe3c749..c4f0cb0ec6e 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h @@ -4,9 +4,6 @@ #include "pipe/p_context.h" #include "nouveau_context.h" -#include "nouveau_mm.h" - -#define NVC0_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) struct nvc0_context; struct nvc0_query; @@ -21,20 +18,8 @@ struct nvc0_query_funcs { struct nvc0_query { const struct nvc0_query_funcs *funcs; - uint32_t *data; uint16_t type; uint16_t index; - int8_t ctr[4]; - uint32_t sequence; - struct nouveau_bo *bo; - uint32_t base; - uint32_t offset; /* base + i * rotate */ - uint8_t state; - boolean is64bit; - uint8_t rotate; - int nesting; /* only used for occlusion queries */ - struct nouveau_mm_allocation *mm; - struct nouveau_fence *fence; }; static inline struct nvc0_query * @@ -49,106 +34,6 @@ nvc0_query(struct pipe_query 
*pipe) #define NVC0_QUERY_MP_COUNTER_GROUP 0 #define NVC0_SW_QUERY_DRV_STAT_GROUP 1 -/* - * Performance counter queries: - */ -#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) -#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1) -enum nve4_pm_queries -{ - NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVE4_HW_SM_QUERY_ACTIVE_WARPS, - NVE4_HW_SM_QUERY_ATOM_COUNT, - NVE4_HW_SM_QUERY_BRANCH, - NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, - NVE4_HW_SM_QUERY_GLD_REQUEST, - NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GST_TRANSACTIONS, - NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GRED_COUNT, - NVE4_HW_SM_QUERY_GST_REQUEST, - NVE4_HW_SM_QUERY_INST_EXECUTED, - NVE4_HW_SM_QUERY_INST_ISSUED, - NVE4_HW_SM_QUERY_INST_ISSUED1, - NVE4_HW_SM_QUERY_INST_ISSUED2, - NVE4_HW_SM_QUERY_L1_GLD_HIT, - NVE4_HW_SM_QUERY_L1_GLD_MISS, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, - NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_LD, - NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_ST, - NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_PROF_TRIGGER_0, - NVE4_HW_SM_QUERY_PROF_TRIGGER_1, - NVE4_HW_SM_QUERY_PROF_TRIGGER_2, - NVE4_HW_SM_QUERY_PROF_TRIGGER_3, - NVE4_HW_SM_QUERY_PROF_TRIGGER_4, - NVE4_HW_SM_QUERY_PROF_TRIGGER_5, - NVE4_HW_SM_QUERY_PROF_TRIGGER_6, - NVE4_HW_SM_QUERY_PROF_TRIGGER_7, - NVE4_HW_SM_QUERY_SHARED_LD, - NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, - NVE4_HW_SM_QUERY_SHARED_ST, - NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, - NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, - NVE4_HW_SM_QUERY_THREADS_LAUNCHED, - NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_WARPS_LAUNCHED, - NVE4_HW_SM_QUERY_METRIC_IPC, - NVE4_HW_SM_QUERY_METRIC_IPAC, - NVE4_HW_SM_QUERY_METRIC_IPEC, - NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY, - NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY, - 
NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD, - NVE4_HW_SM_QUERY_COUNT -}; - -#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) -#define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) -enum nvc0_pm_queries -{ - NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVC0_HW_SM_QUERY_ACTIVE_WARPS, - NVC0_HW_SM_QUERY_ATOM_COUNT, - NVC0_HW_SM_QUERY_BRANCH, - NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, - NVC0_HW_SM_QUERY_GLD_REQUEST, - NVC0_HW_SM_QUERY_GRED_COUNT, - NVC0_HW_SM_QUERY_GST_REQUEST, - NVC0_HW_SM_QUERY_INST_EXECUTED, - NVC0_HW_SM_QUERY_INST_ISSUED1_0, - NVC0_HW_SM_QUERY_INST_ISSUED1_1, - NVC0_HW_SM_QUERY_INST_ISSUED2_0, - NVC0_HW_SM_QUERY_INST_ISSUED2_1, - NVC0_HW_SM_QUERY_LOCAL_LD, - NVC0_HW_SM_QUERY_LOCAL_ST, - NVC0_HW_SM_QUERY_PROF_TRIGGER_0, - NVC0_HW_SM_QUERY_PROF_TRIGGER_1, - NVC0_HW_SM_QUERY_PROF_TRIGGER_2, - NVC0_HW_SM_QUERY_PROF_TRIGGER_3, - NVC0_HW_SM_QUERY_PROF_TRIGGER_4, - NVC0_HW_SM_QUERY_PROF_TRIGGER_5, - NVC0_HW_SM_QUERY_PROF_TRIGGER_6, - NVC0_HW_SM_QUERY_PROF_TRIGGER_7, - NVC0_HW_SM_QUERY_SHARED_LD, - NVC0_HW_SM_QUERY_SHARED_ST, - NVC0_HW_SM_QUERY_THREADS_LAUNCHED, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, - NVC0_HW_SM_QUERY_WARPS_LAUNCHED, - NVC0_HW_SM_QUERY_COUNT -}; - void nvc0_init_query_functions(struct nvc0_context *); -void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *, struct nvc0_query *, - unsigned); -void nvc0_query_fifo_wait(struct nouveau_pushbuf *, struct nvc0_query *); #endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c new file mode 100644 index 00000000000..b7923d549f4 --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -0,0 +1,1135 @@ +/* + * Copyright 2011 Christoph Bumiller + * Copyright 2015 Samuel Pitoiset + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this 
software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING + +#include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw.h" + +#include "nv_object.xml.h" +#include "nvc0/nve4_compute.xml.h" +#include "nvc0/nvc0_compute.xml.h" + +#define NVC0_HW_QUERY_STATE_READY 0 +#define NVC0_HW_QUERY_STATE_ACTIVE 1 +#define NVC0_HW_QUERY_STATE_ENDED 2 +#define NVC0_HW_QUERY_STATE_FLUSHED 3 + +#define NVC0_HW_QUERY_ALLOC_SPACE 256 + +/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ + +/* Code to read out MP counters: They are accessible via mmio, too, but let's + * just avoid mapping registers in userspace. We'd have to know which MPs are + * enabled/present, too, and that information is not presently exposed. + * We could add a kernel interface for it, but reading the counters like this + * has the advantage of being async (if get_result isn't called immediately). 
+ */ +static const uint64_t nve4_read_hw_sm_counters_code[] = +{ + /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20 + * mov b32 $r8 $tidx + * mov b32 $r12 $physid + * mov b32 $r0 $pm0 + * mov b32 $r1 $pm1 + * mov b32 $r2 $pm2 + * mov b32 $r3 $pm3 + * mov b32 $r4 $pm4 + * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b + * mov b32 $r5 $pm5 + * mov b32 $r6 $pm6 + * mov b32 $r7 $pm7 + * set $p0 0x1 eq u32 $r8 0x0 + * mov b32 $r10 c0[0x0] + * ext u32 $r8 $r12 0x414 + * mov b32 $r11 c0[0x4] + * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04 + * ext u32 $r9 $r12 0x208 + * (not $p0) exit + * set $p1 0x1 eq u32 $r9 0x0 + * mul $r8 u32 $r8 u32 96 + * mul $r12 u32 $r9 u32 16 + * mul $r13 u32 $r9 u32 4 + * add b32 $r9 $r8 $r13 + * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c + * add b32 $r8 $r8 $r12 + * mov b32 $r12 $r10 + * add b32 $r10 $c $r10 $r8 + * mov b32 $r13 $r11 + * add b32 $r11 $r11 0x0 $c + * add b32 $r12 $c $r12 $r9 + * st b128 wt g[$r10d] $r0q + * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00 + * mov b32 $r0 c0[0x8] + * add b32 $r13 $r13 0x0 $c + * $p1 st b128 wt g[$r12d+0x40] $r4q + * st b32 wt g[$r12d+0x50] $r0 + * exit */ + 0x2202020202020207ULL, + 0x2c00000084021c04ULL, + 0x2c0000000c031c04ULL, + 0x2c00000010001c04ULL, + 0x2c00000014005c04ULL, + 0x2c00000018009c04ULL, + 0x2c0000001c00dc04ULL, + 0x2c00000020011c04ULL, + 0x22b0420042320207ULL, + 0x2c00000024015c04ULL, + 0x2c00000028019c04ULL, + 0x2c0000002c01dc04ULL, + 0x190e0000fc81dc03ULL, + 0x2800400000029de4ULL, + 0x7000c01050c21c03ULL, + 0x280040001002dde4ULL, + 0x204282020042e047ULL, + 0x7000c00820c25c03ULL, + 0x80000000000021e7ULL, + 0x190e0000fc93dc03ULL, + 0x1000000180821c02ULL, + 0x1000000040931c02ULL, + 0x1000000010935c02ULL, + 0x4800000034825c03ULL, + 0x22c042c042c04287ULL, + 0x4800000030821c03ULL, + 0x2800000028031de4ULL, + 0x4801000020a29c03ULL, + 0x280000002c035de4ULL, + 0x0800000000b2dc42ULL, + 0x4801000024c31c03ULL, + 0x9400000000a01fc5ULL, + 0x200002e04202c047ULL, + 0x2800400020001de4ULL, + 0x0800000000d35c42ULL, + 
0x9400000100c107c5ULL, + 0x9400000140c01f85ULL, + 0x8000000000001de7ULL +}; + +/* For simplicity, we will allocate as many group slots as we allocate counter + * slots. This means that a single counter which wants to source from 2 groups + * will have to be declared as using 2 counter slots. This shouldn't really be + * a problem because such queries don't make much sense ... (unless someone is + * really creative). + */ +struct nvc0_mp_counter_cfg +{ + uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */ + uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */ + uint32_t num_src : 3; /* number of sources (1 - 6, only for NVC0:NVE4) */ + uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */ + uint32_t sig_sel : 8; /* signal group */ + uint64_t src_sel; /* signal selection for up to 6 sources (48 bit) */ +}; + +#define NVC0_COUNTER_OPn_SUM 0 +#define NVC0_COUNTER_OPn_OR 1 +#define NVC0_COUNTER_OPn_AND 2 +#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */ +#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */ +#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */ +#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */ + +struct nvc0_hw_sm_query_cfg +{ + struct nvc0_mp_counter_cfg ctr[4]; + uint8_t num_counters; + uint8_t op; + uint8_t norm[2]; /* normalization num,denom */ +}; + +#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } +#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } +#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ + { f1, 
NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \ + {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } +#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \ + { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ + {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } +#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ + { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ + {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } + +/* NOTES: + * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps + * inst_executed etc.: we only count a single warp scheduler + * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers; + * this is inaccurate ! 
+ */ +static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = +{ + _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1), + _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1), + _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1), + _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1), + _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1), + _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1), + _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1), + _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1), + _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1), + _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1), + _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1), + _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1), + _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1), + _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1), + _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1), + _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1), + _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1), + _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1), + _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1), + _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1), + _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1), + _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1), + _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1), + _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1), + _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1), + _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1), + _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1), + _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1), + _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1), + _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1), + _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1), + _Q1A(PROF_TRIGGER_4, 0x0001, B6, 
USER, 0x00000010, 1, 1), + _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1), + _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1), + _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1), + _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1), + _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1), + _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1), + _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1), + _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1), + _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1), + _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1), + _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1), + _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1), + _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1), + _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1), + _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1), + _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64), + _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1), +}; + +#undef _Q1A +#undef _Q1B +#undef _M2A +#undef _M2B + +/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ +static const uint64_t nvc0_read_hw_sm_counters_code[] = +{ + /* mov b32 $r8 $tidx + * mov b32 $r9 $physid + * mov b32 $r0 $pm0 + * mov b32 $r1 $pm1 + * mov b32 $r2 $pm2 + * mov b32 $r3 $pm3 + * mov b32 $r4 $pm4 + * mov b32 $r5 $pm5 + * mov b32 $r6 $pm6 + * mov b32 $r7 $pm7 + * set $p0 0x1 eq u32 $r8 0x0 + * mov b32 $r10 c0[0x0] + * mov b32 $r11 c0[0x4] + * ext u32 $r8 $r9 0x414 + * (not $p0) exit + * mul $r8 u32 $r8 u32 36 + * add b32 $r10 $c $r10 $r8 + * add b32 $r11 $r11 0x0 $c + * mov b32 $r8 c0[0x8] + * st b128 wt g[$r10d+0x00] $r0q + * st b128 wt g[$r10d+0x10] $r4q + * st b32 wt g[$r10d+0x20] $r8 + * exit */ + 0x2c00000084021c04ULL, + 
0x2c0000000c025c04ULL, + 0x2c00000010001c04ULL, + 0x2c00000014005c04ULL, + 0x2c00000018009c04ULL, + 0x2c0000001c00dc04ULL, + 0x2c00000020011c04ULL, + 0x2c00000024015c04ULL, + 0x2c00000028019c04ULL, + 0x2c0000002c01dc04ULL, + 0x190e0000fc81dc03ULL, + 0x2800400000029de4ULL, + 0x280040001002dde4ULL, + 0x7000c01050921c03ULL, + 0x80000000000021e7ULL, + 0x1000000090821c02ULL, + 0x4801000020a29c03ULL, + 0x0800000000b2dc42ULL, + 0x2800400020021de4ULL, + 0x9400000000a01fc5ULL, + 0x9400000040a11fc5ULL, + 0x9400000080a21f85ULL, + 0x8000000000001de7ULL +}; + +#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } } + +static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] = +{ + _Q(ACTIVE_CYCLES, 0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(ACTIVE_WARPS, 0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), + _Q(ATOM_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(BRANCH, 0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00), + _Q(DIVERGENT_BRANCH, 0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00), + _Q(GLD_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(GRED_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(GST_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(INST_EXECUTED, 0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00), + _Q(INST_ISSUED1_0, 0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(INST_ISSUED1_1, 0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(INST_ISSUED2_0, 0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(INST_ISSUED2_1, 0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(LOCAL_LD, 0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), + 
_Q(LOCAL_ST, 0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_0, 0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_1, 0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_2, 0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_3, 0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_4, 0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_5, 0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_6, 0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_7, 0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(SHARED_LD, 0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(SHARED_ST, 0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(THREADS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), + _Q(TH_INST_EXECUTED_0, 0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), + _Q(TH_INST_EXECUTED_1, 0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), + _Q(TH_INST_EXECUTED_2, 0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), + _Q(TH_INST_EXECUTED_3, 0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), + _Q(WARPS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), +}; + +#undef _Q + +static const struct nvc0_hw_sm_query_cfg * +nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nvc0_screen *screen = nvc0->screen; + + if (screen->base.class_3d >= NVE4_3D_CLASS) + return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; + return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)]; +} + +static boolean +nvc0_hw_sm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + const bool is_nve4 = screen->base.class_3d >= 
NVE4_3D_CLASS; + struct nvc0_hw_query *hq = nvc0_hw_query(q); + const struct nvc0_hw_sm_query_cfg *cfg; + unsigned i, c; + unsigned num_ab[2] = { 0, 0 }; + + cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); + + /* check if we have enough free counter slots */ + for (i = 0; i < cfg->num_counters; ++i) + num_ab[cfg->ctr[i].sig_dom]++; + + if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 || + screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) { + NOUVEAU_ERR("Not enough free MP counter slots !\n"); + return false; + } + + assert(cfg->num_counters <= 4); + PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6); + + if (!screen->pm.mp_counters_enabled) { + screen->pm.mp_counters_enabled = true; + BEGIN_NVC0(push, SUBC_SW(0x06ac), 1); + PUSH_DATA (push, 0x1fcb); + } + + /* set sequence field to 0 (used to check if result is available) */ + for (i = 0; i < screen->mp_count; ++i) + hq->data[i * 10 + 10] = 0; + + for (i = 0; i < cfg->num_counters; ++i) { + const unsigned d = cfg->ctr[i].sig_dom; + + if (!screen->pm.num_hw_sm_active[d]) { + uint32_t m = (1 << 22) | (1 << (7 + (8 * !d))); + if (screen->pm.num_hw_sm_active[!d]) + m |= 1 << (7 + (8 * d)); + BEGIN_NVC0(push, SUBC_SW(0x0600), 1); + PUSH_DATA (push, m); + } + screen->pm.num_hw_sm_active[d]++; + + for (c = d * 4; c < (d * 4 + 4); ++c) { + if (!screen->pm.mp_counter[c]) { + hq->ctr[i] = c; + screen->pm.mp_counter[c] = (struct pipe_query *)q; + break; + } + } + assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */ + + /* configure and reset the counter(s) */ + if (is_nve4) { + if (d == 0) + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1); + else + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1); + PUSH_DATA (push, cfg->ctr[i].sig_sel); + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1); + PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3)); + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1); + PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); + BEGIN_NVC0(push, 
NVE4_COMPUTE(MP_PM_SET(c)), 1); + PUSH_DATA (push, 0); + } else { + unsigned s; + + for (s = 0; s < cfg->ctr[i].num_src; s++) { + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1); + PUSH_DATA (push, cfg->ctr[i].sig_sel); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1); + PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1); + PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1); + PUSH_DATA (push, 0); + } + } + } + return true; +} + +static void +nvc0_hw_sm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nvc0_screen *screen = nvc0->screen; + struct pipe_context *pipe = &nvc0->base.pipe; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; + struct nvc0_hw_query *hq = nvc0_hw_query(q); + uint32_t mask; + uint32_t input[3]; + const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 }; + const uint grid[3] = { screen->mp_count, 1, 1 }; + unsigned c; + const struct nvc0_hw_sm_query_cfg *cfg; + + cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); + + if (unlikely(!screen->pm.prog)) { + struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); + prog->type = PIPE_SHADER_COMPUTE; + prog->translated = true; + prog->num_gprs = 14; + prog->parm_size = 12; + if (is_nve4) { + prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; + prog->code_size = sizeof(nve4_read_hw_sm_counters_code); + } else { + prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; + prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); + } + screen->pm.prog = prog; + } + + /* disable all counting */ + PUSH_SPACE(push, 8); + for (c = 0; c < 8; ++c) + if (screen->pm.mp_counter[c]) { + if (is_nve4) { + IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0); + } else { + IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0); + } + } + /* release counters for this query */ + for (c = 0; c < 8; ++c) { + if 
(nvc0_query(screen->pm.mp_counter[c]) == q) { + screen->pm.num_hw_sm_active[c / 4]--; + screen->pm.mp_counter[c] = NULL; + } + } + + BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR, + hq->bo); + + PUSH_SPACE(push, 1); + IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0); + + pipe->bind_compute_state(pipe, screen->pm.prog); + input[0] = (hq->bo->offset + hq->base_offset); + input[1] = (hq->bo->offset + hq->base_offset) >> 32; + input[2] = hq->sequence; + pipe->launch_grid(pipe, block, grid, 0, input); + + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY); + + /* re-activate other counters */ + PUSH_SPACE(push, 16); + mask = 0; + for (c = 0; c < 8; ++c) { + unsigned i; + q = nvc0_query(screen->pm.mp_counter[c]); + if (!q) + continue; + cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); + for (i = 0; i < cfg->num_counters; ++i) { + if (mask & (1 << hq->ctr[i])) + break; + mask |= 1 << hq->ctr[i]; + if (is_nve4) { + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hq->ctr[i])), 1); + } else { + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hq->ctr[i])), 1); + } + PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); + } + } +} + +static inline bool +nvc0_hw_sm_query_read_data(uint32_t count[32][4], + struct nvc0_context *nvc0, bool wait, + struct nvc0_query *q, + const struct nvc0_hw_sm_query_cfg *cfg, + unsigned mp_count) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + unsigned p, c; + + for (p = 0; p < mp_count; ++p) { + const unsigned b = (0x24 / 4) * p; + + for (c = 0; c < cfg->num_counters; ++c) { + if (hq->data[b + 8] != hq->sequence) { + if (!wait) + return false; + if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client)) + return false; + } + count[p][c] = hq->data[b + hq->ctr[c]]; + } + } + return true; +} + +static inline bool +nve4_hw_sm_query_read_data(uint32_t count[32][4], + struct nvc0_context *nvc0, bool wait, + struct nvc0_query *q, + const struct nvc0_hw_sm_query_cfg *cfg, + unsigned mp_count) +{ + struct nvc0_hw_query 
*hq = nvc0_hw_query(q); + unsigned p, c, d; + + for (p = 0; p < mp_count; ++p) { + const unsigned b = (0x60 / 4) * p; + + for (c = 0; c < cfg->num_counters; ++c) { + count[p][c] = 0; + for (d = 0; d < ((hq->ctr[c] & ~3) ? 1 : 4); ++d) { + if (hq->data[b + 20 + d] != hq->sequence) { + if (!wait) + return false; + if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client)) + return false; + } + if (hq->ctr[c] & ~0x3) + count[p][c] = hq->data[b + 16 + (hq->ctr[c] & 3)]; + else + count[p][c] += hq->data[b + d * 4 + hq->ctr[c]]; + } + } + } + return true; +} + +/* Metric calculations: + * sum(x) ... sum of x over all MPs + * avg(x) ... average of x over all MPs + * + * IPC : sum(inst_executed) / clock + * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued) + * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles) + * MP_EFFICIENCY : avg(active_cycles / clock) + * + * NOTE: Interpretation of IPC requires knowledge of MP count. + */ +static boolean +nvc0_hw_sm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, + void *result, boolean wait) +{ + uint32_t count[32][4]; + uint64_t value = 0; + unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32); + unsigned p, c; + const struct nvc0_hw_sm_query_cfg *cfg; + bool ret; + + cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); + + if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) + ret = nve4_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count); + else + ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count); + if (!ret) + return false; + + if (cfg->op == NVC0_COUNTER_OPn_SUM) { + for (c = 0; c < cfg->num_counters; ++c) + for (p = 0; p < mp_count; ++p) + value += count[p][c]; + value = (value * cfg->norm[0]) / cfg->norm[1]; + } else + if (cfg->op == NVC0_COUNTER_OPn_OR) { + uint32_t v = 0; + for (c = 0; c < cfg->num_counters; ++c) + for (p = 0; p < mp_count; ++p) + v |= count[p][c]; + value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1]; + } else + if (cfg->op == 
NVC0_COUNTER_OPn_AND) { + uint32_t v = ~0; + for (c = 0; c < cfg->num_counters; ++c) + for (p = 0; p < mp_count; ++p) + v &= count[p][c]; + value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1]; + } else + if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) { + uint64_t v[2] = { 0, 0 }; + for (p = 0; p < mp_count; ++p) { + v[0] += count[p][0]; + v[1] += count[p][1]; + } + if (v[0]) + value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]); + } else + if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) { + for (p = 0; p < mp_count; ++p) + value += count[p][0]; + if (count[0][1]) + value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]); + else + value = 0; + } else + if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) { + unsigned mp_used = 0; + for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0]) + if (count[p][1]) + value += (count[p][0] * cfg->norm[0]) / count[p][1]; + if (mp_used) + value /= (uint64_t)mp_used * cfg->norm[1]; + } else + if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) { + unsigned mp_used = 0; + for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0]) + value += count[p][0]; + if (count[0][1] && mp_used) { + value *= cfg->norm[0]; + value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1]; + } else { + value = 0; + } + } + + *(uint64_t *)result = value; + return true; +} + +static bool +nvc0_hw_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, + int size) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + struct nvc0_screen *screen = nvc0->screen; + int ret; + + if (hq->bo) { + nouveau_bo_ref(NULL, &hq->bo); + if (hq->mm) { + if (hq->state == NVC0_HW_QUERY_STATE_READY) + nouveau_mm_free(hq->mm); + else + nouveau_fence_work(screen->base.fence.current, + nouveau_mm_free_work, hq->mm); + } + } + if (size) { + hq->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &hq->bo, + &hq->base_offset); + if (!hq->bo) + return false; + hq->offset = hq->base_offset; + + ret = nouveau_bo_map(hq->bo, 0, screen->base.client); + if (ret) { + 
nvc0_hw_query_allocate(nvc0, q, 0); + return false; + } + hq->data = (uint32_t *)((uint8_t *)hq->bo->map + hq->base_offset); + } + return true; +} + +static void +nvc0_hw_query_get(struct nouveau_pushbuf *push, struct nvc0_query *q, + unsigned offset, uint32_t get) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + + offset += hq->offset; + + PUSH_SPACE(push, 5); + PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR); + BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4); + PUSH_DATAh(push, hq->bo->offset + offset); + PUSH_DATA (push, hq->bo->offset + offset); + PUSH_DATA (push, hq->sequence); + PUSH_DATA (push, get); +} + +static void +nvc0_hw_query_rotate(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + + hq->offset += hq->rotate; + hq->data += hq->rotate / sizeof(*hq->data); + if (hq->offset - hq->base_offset == NVC0_HW_QUERY_ALLOC_SPACE) + nvc0_hw_query_allocate(nvc0, q, NVC0_HW_QUERY_ALLOC_SPACE); +} + +static inline void +nvc0_hw_query_update(struct nouveau_client *cli, struct nvc0_query *q) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + + if (hq->is64bit) { + if (nouveau_fence_signalled(hq->fence)) + hq->state = NVC0_HW_QUERY_STATE_READY; + } else { + if (hq->data[0] == hq->sequence) + hq->state = NVC0_HW_QUERY_STATE_READY; + } +} + +static void +nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + nvc0_hw_query_allocate(nvc0, q, 0); + nouveau_fence_ref(NULL, &hq->fence); + FREE(hq); +} + +static boolean +nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_hw_query *hq = nvc0_hw_query(q); + bool ret = true; + + /* For occlusion queries we have to change the storage, because a previous + * query might set the initial render conition to false even *after* we re- + * initialized it to true. 
+ */ + if (hq->rotate) { + nvc0_hw_query_rotate(nvc0, q); + + /* XXX: can we do this with the GPU, and sync with respect to a previous + * query ? + */ + hq->data[0] = hq->sequence; /* initialize sequence */ + hq->data[1] = 1; /* initial render condition = true */ + hq->data[4] = hq->sequence + 1; /* for comparison COND_MODE */ + hq->data[5] = 0; + } + hq->sequence++; + + switch (q->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + hq->nesting = nvc0->screen->num_occlusion_queries_active++; + if (hq->nesting) { + nvc0_hw_query_get(push, q, 0x10, 0x0100f002); + } else { + PUSH_SPACE(push, 3); + BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1); + PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT); + IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1); + } + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + nvc0_hw_query_get(push, q, 0x10, 0x09005002 | (q->index << 5)); + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + nvc0_hw_query_get(push, q, 0x10, 0x05805002 | (q->index << 5)); + break; + case PIPE_QUERY_SO_STATISTICS: + nvc0_hw_query_get(push, q, 0x20, 0x05805002 | (q->index << 5)); + nvc0_hw_query_get(push, q, 0x30, 0x06805002 | (q->index << 5)); + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + nvc0_hw_query_get(push, q, 0x10, 0x03005002 | (q->index << 5)); + break; + case PIPE_QUERY_TIME_ELAPSED: + nvc0_hw_query_get(push, q, 0x10, 0x00005002); + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + nvc0_hw_query_get(push, q, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */ + nvc0_hw_query_get(push, q, 0xc0 + 0x10, 0x01801002); /* VFETCH, PRIMS */ + nvc0_hw_query_get(push, q, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */ + nvc0_hw_query_get(push, q, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */ + nvc0_hw_query_get(push, q, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */ + nvc0_hw_query_get(push, q, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */ + nvc0_hw_query_get(push, q, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */ + nvc0_hw_query_get(push, 
q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */ + nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */ + nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */ + break; + default: + if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || + (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) { + ret = nvc0_hw_sm_query_begin(nvc0, q); + } + break; + } + hq->state = NVC0_HW_QUERY_STATE_ACTIVE; + return ret; +} + +static void +nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_hw_query *hq = nvc0_hw_query(q); + + if (hq->state != NVC0_HW_QUERY_STATE_ACTIVE) { + /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */ + if (hq->rotate) + nvc0_hw_query_rotate(nvc0, q); + hq->sequence++; + } + hq->state = NVC0_HW_QUERY_STATE_ENDED; + + switch (q->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + nvc0_hw_query_get(push, q, 0, 0x0100f002); + if (--nvc0->screen->num_occlusion_queries_active == 0) { + PUSH_SPACE(push, 1); + IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0); + } + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + nvc0_hw_query_get(push, q, 0, 0x09005002 | (q->index << 5)); + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + nvc0_hw_query_get(push, q, 0, 0x05805002 | (q->index << 5)); + break; + case PIPE_QUERY_SO_STATISTICS: + nvc0_hw_query_get(push, q, 0x00, 0x05805002 | (q->index << 5)); + nvc0_hw_query_get(push, q, 0x10, 0x06805002 | (q->index << 5)); + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + /* TODO: How do we sum over all streams for render condition ? 
*/ + /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */ + nvc0_hw_query_get(push, q, 0x00, 0x03005002 | (q->index << 5)); + nvc0_hw_query_get(push, q, 0x20, 0x00005002); + break; + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIME_ELAPSED: + nvc0_hw_query_get(push, q, 0, 0x00005002); + break; + case PIPE_QUERY_GPU_FINISHED: + nvc0_hw_query_get(push, q, 0, 0x1000f010); + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + nvc0_hw_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */ + nvc0_hw_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */ + nvc0_hw_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */ + nvc0_hw_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */ + nvc0_hw_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */ + nvc0_hw_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */ + nvc0_hw_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */ + nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */ + nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */ + nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */ + break; + case PIPE_QUERY_TIMESTAMP_DISJOINT: + /* This query is not issued on GPU because disjoint is forced to false */ + hq->state = NVC0_HW_QUERY_STATE_READY; + break; + case NVC0_HW_QUERY_TFB_BUFFER_OFFSET: + /* indexed by TFB buffer instead of by vertex stream */ + nvc0_hw_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5)); + break; + default: + if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || + (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) { + nvc0_hw_sm_query_end(nvc0, q); + } + break; + } + if (hq->is64bit) + nouveau_fence_ref(nvc0->screen->base.fence.current, &hq->fence); +} + +static boolean +nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, + boolean wait, union pipe_query_result *result) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + uint64_t *res64 = 
(uint64_t*)result; + uint32_t *res32 = (uint32_t*)result; + uint8_t *res8 = (uint8_t*)result; + uint64_t *data64 = (uint64_t *)hq->data; + unsigned i; + + if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || + (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) { + return nvc0_hw_sm_query_result(nvc0, q, result, wait); + } + + if (hq->state != NVC0_HW_QUERY_STATE_READY) + nvc0_hw_query_update(nvc0->screen->base.client, q); + + if (hq->state != NVC0_HW_QUERY_STATE_READY) { + if (!wait) { + if (hq->state != NVC0_HW_QUERY_STATE_FLUSHED) { + hq->state = NVC0_HW_QUERY_STATE_FLUSHED; + /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */ + PUSH_KICK(nvc0->base.pushbuf); + } + return false; + } + if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->screen->base.client)) + return false; + NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1); + } + hq->state = NVC0_HW_QUERY_STATE_READY; + + switch (q->type) { + case PIPE_QUERY_GPU_FINISHED: + res8[0] = true; + break; + case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */ + res64[0] = hq->data[1] - hq->data[5]; + break; + case PIPE_QUERY_OCCLUSION_PREDICATE: + res8[0] = hq->data[1] != hq->data[5]; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */ + case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */ + res64[0] = data64[0] - data64[2]; + break; + case PIPE_QUERY_SO_STATISTICS: + res64[0] = data64[0] - data64[4]; + res64[1] = data64[2] - data64[6]; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + res8[0] = data64[0] != data64[2]; + break; + case PIPE_QUERY_TIMESTAMP: + res64[0] = data64[1]; + break; + case PIPE_QUERY_TIMESTAMP_DISJOINT: + res64[0] = 1000000000; + res8[8] = false; + break; + case PIPE_QUERY_TIME_ELAPSED: + res64[0] = data64[1] - data64[3]; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + for (i = 0; i < 10; ++i) + res64[i] = data64[i * 2] - data64[24 + i * 2]; + break; + case 
NVC0_HW_QUERY_TFB_BUFFER_OFFSET: + res32[0] = hq->data[1]; + break; + default: + assert(0); /* can't happen, we don't create queries with invalid type */ + return false; + } + + return true; +} + +static const struct nvc0_query_funcs hw_query_funcs = { + .destroy_query = nvc0_hw_destroy_query, + .begin_query = nvc0_hw_begin_query, + .end_query = nvc0_hw_end_query, + .get_query_result = nvc0_hw_get_query_result, +}; + +struct nvc0_query * +nvc0_hw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index) +{ + struct nvc0_hw_query *hq; + struct nvc0_query *q; + unsigned space = NVC0_HW_QUERY_ALLOC_SPACE; + + hq = CALLOC_STRUCT(nvc0_hw_query); + if (!hq) + return NULL; + + q = &hq->base; + q->funcs = &hw_query_funcs; + q->type = type; + + switch (q->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + hq->rotate = 32; + space = NVC0_HW_QUERY_ALLOC_SPACE; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + hq->is64bit = true; + space = 512; + break; + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + hq->is64bit = true; + space = 64; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_PRIMITIVES_EMITTED: + hq->is64bit = true; + q->index = index; + space = 32; + break; + case PIPE_QUERY_TIME_ELAPSED: + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIMESTAMP_DISJOINT: + case PIPE_QUERY_GPU_FINISHED: + space = 32; + break; + case NVC0_HW_QUERY_TFB_BUFFER_OFFSET: + space = 16; + break; + default: + if (nvc0->screen->base.device->drm_version >= 0x01000101) { + if (type >= NVE4_HW_SM_QUERY(0) && type <= NVE4_HW_SM_QUERY_LAST) { + /* for each MP: + * [00] = WS0.C0 + * [04] = WS0.C1 + * [08] = WS0.C2 + * [0c] = WS0.C3 + * [10] = WS1.C0 + * [14] = WS1.C1 + * [18] = WS1.C2 + * [1c] = WS1.C3 + * [20] = WS2.C0 + * [24] = WS2.C1 + * [28] = WS2.C2 + * [2c] = WS2.C3 + * [30] = WS3.C0 + * [34] = WS3.C1 + * [38] = WS3.C2 + * [3c] = WS3.C3 + * [40] = MP.C4 + * [44] = MP.C5 + * [48] = MP.C6 + * [4c] = MP.C7 
+ * [50] = WS0.sequence + * [54] = WS1.sequence + * [58] = WS2.sequence + * [5c] = WS3.sequence + */ + space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t); + break; + } else + if (type >= NVC0_HW_SM_QUERY(0) && type <= NVC0_HW_SM_QUERY_LAST) { + /* for each MP: + * [00] = MP.C0 + * [04] = MP.C1 + * [08] = MP.C2 + * [0c] = MP.C3 + * [10] = MP.C4 + * [14] = MP.C5 + * [18] = MP.C6 + * [1c] = MP.C7 + * [20] = MP.sequence + */ + space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t); + break; + } + } + debug_printf("invalid query type: %u\n", type); + FREE(q); + return NULL; + } + + if (!nvc0_hw_query_allocate(nvc0, q, space)) { + FREE(hq); + return NULL; + } + + if (hq->rotate) { + /* we advance before query_begin ! */ + hq->offset -= hq->rotate; + hq->data -= hq->rotate / sizeof(*hq->data); + } else + if (!hq->is64bit) + hq->data[0] = 0; /* initialize sequence */ + + return q; +} + +void +nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push, + struct nvc0_query *q, unsigned result_offset) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + +#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8)) + + PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART); + nouveau_pushbuf_space(push, 0, 0, 1); + nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 | + NVC0_IB_ENTRY_1_NO_PREFETCH); +} + +void +nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *push, struct nvc0_query *q) +{ + struct nvc0_hw_query *hq = nvc0_hw_query(q); + unsigned offset = hq->offset; + + if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 0x20; + + PUSH_SPACE(push, 5); + PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD); + BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4); + PUSH_DATAh(push, hq->bo->offset + offset); + PUSH_DATA (push, hq->bo->offset + offset); + PUSH_DATA (push, hq->sequence); + PUSH_DATA (push, (1 << 12) | + NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL); +} diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h 
b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h new file mode 100644 index 00000000000..d998c42c213 --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h @@ -0,0 +1,138 @@ +#ifndef __NVC0_QUERY_HW_H__ +#define __NVC0_QUERY_HW_H__ + +#include "nouveau_fence.h" +#include "nouveau_mm.h" + +#include "nvc0_query.h" + +#define NVC0_HW_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) + +struct nvc0_hw_query { + struct nvc0_query base; + uint32_t *data; + int8_t ctr[4]; + uint32_t sequence; + struct nouveau_bo *bo; + uint32_t base_offset; + uint32_t offset; /* base_offset + i * rotate */ + uint8_t state; + boolean is64bit; + uint8_t rotate; + int nesting; /* only used for occlusion queries */ + struct nouveau_mm_allocation *mm; + struct nouveau_fence *fence; +}; + +static inline struct nvc0_hw_query * +nvc0_hw_query(struct nvc0_query *q) +{ + return (struct nvc0_hw_query *)q; +} + +/* + * Performance counter queries: + */ +#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) +#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1) +enum nve4_pm_queries +{ + NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, + NVE4_HW_SM_QUERY_ACTIVE_WARPS, + NVE4_HW_SM_QUERY_ATOM_COUNT, + NVE4_HW_SM_QUERY_BRANCH, + NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, + NVE4_HW_SM_QUERY_GLD_REQUEST, + NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, + NVE4_HW_SM_QUERY_GST_TRANSACTIONS, + NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, + NVE4_HW_SM_QUERY_GRED_COUNT, + NVE4_HW_SM_QUERY_GST_REQUEST, + NVE4_HW_SM_QUERY_INST_EXECUTED, + NVE4_HW_SM_QUERY_INST_ISSUED, + NVE4_HW_SM_QUERY_INST_ISSUED1, + NVE4_HW_SM_QUERY_INST_ISSUED2, + NVE4_HW_SM_QUERY_L1_GLD_HIT, + NVE4_HW_SM_QUERY_L1_GLD_MISS, + NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, + NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, + NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, + NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, + NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, + NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, + NVE4_HW_SM_QUERY_LOCAL_LD, + NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, + 
NVE4_HW_SM_QUERY_LOCAL_ST, + NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, + NVE4_HW_SM_QUERY_PROF_TRIGGER_0, + NVE4_HW_SM_QUERY_PROF_TRIGGER_1, + NVE4_HW_SM_QUERY_PROF_TRIGGER_2, + NVE4_HW_SM_QUERY_PROF_TRIGGER_3, + NVE4_HW_SM_QUERY_PROF_TRIGGER_4, + NVE4_HW_SM_QUERY_PROF_TRIGGER_5, + NVE4_HW_SM_QUERY_PROF_TRIGGER_6, + NVE4_HW_SM_QUERY_PROF_TRIGGER_7, + NVE4_HW_SM_QUERY_SHARED_LD, + NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, + NVE4_HW_SM_QUERY_SHARED_ST, + NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, + NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, + NVE4_HW_SM_QUERY_THREADS_LAUNCHED, + NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, + NVE4_HW_SM_QUERY_WARPS_LAUNCHED, + NVE4_HW_SM_QUERY_METRIC_IPC, + NVE4_HW_SM_QUERY_METRIC_IPAC, + NVE4_HW_SM_QUERY_METRIC_IPEC, + NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY, + NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY, + NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD, + NVE4_HW_SM_QUERY_COUNT +}; + +#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) +#define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) +enum nvc0_pm_queries +{ + NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, + NVC0_HW_SM_QUERY_ACTIVE_WARPS, + NVC0_HW_SM_QUERY_ATOM_COUNT, + NVC0_HW_SM_QUERY_BRANCH, + NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, + NVC0_HW_SM_QUERY_GLD_REQUEST, + NVC0_HW_SM_QUERY_GRED_COUNT, + NVC0_HW_SM_QUERY_GST_REQUEST, + NVC0_HW_SM_QUERY_INST_EXECUTED, + NVC0_HW_SM_QUERY_INST_ISSUED1_0, + NVC0_HW_SM_QUERY_INST_ISSUED1_1, + NVC0_HW_SM_QUERY_INST_ISSUED2_0, + NVC0_HW_SM_QUERY_INST_ISSUED2_1, + NVC0_HW_SM_QUERY_LOCAL_LD, + NVC0_HW_SM_QUERY_LOCAL_ST, + NVC0_HW_SM_QUERY_PROF_TRIGGER_0, + NVC0_HW_SM_QUERY_PROF_TRIGGER_1, + NVC0_HW_SM_QUERY_PROF_TRIGGER_2, + NVC0_HW_SM_QUERY_PROF_TRIGGER_3, + NVC0_HW_SM_QUERY_PROF_TRIGGER_4, + NVC0_HW_SM_QUERY_PROF_TRIGGER_5, + NVC0_HW_SM_QUERY_PROF_TRIGGER_6, + NVC0_HW_SM_QUERY_PROF_TRIGGER_7, + NVC0_HW_SM_QUERY_SHARED_LD, + NVC0_HW_SM_QUERY_SHARED_ST, + NVC0_HW_SM_QUERY_THREADS_LAUNCHED, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, + 
NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, + NVC0_HW_SM_QUERY_WARPS_LAUNCHED, + NVC0_HW_SM_QUERY_COUNT +}; + +struct nvc0_query * +nvc0_hw_create_query(struct nvc0_context *, unsigned, unsigned); +void +nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *, struct nvc0_query *, + unsigned); +void +nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *, struct nvc0_query *); + +#endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index 2fade982b83..af837fc4a33 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -26,6 +26,7 @@ #include "util/u_inlines.h" #include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw.h" static inline void nvc0_program_update_context_state(struct nvc0_context *nvc0, @@ -272,14 +273,14 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) continue; if (!targ->clean) - nvc0_query_fifo_wait(push, nvc0_query(targ->pq)); + nvc0_hw_query_fifo_wait(push, nvc0_query(targ->pq)); BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5); PUSH_DATA (push, 1); PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset); PUSH_DATA (push, buf->address + targ->pipe.buffer_offset); PUSH_DATA (push, targ->pipe.buffer_size); if (!targ->clean) { - nvc0_query_pushbuf_submit(push, nvc0_query(targ->pq), 0x4); + nvc0_hw_query_pushbuf_submit(push, nvc0_query(targ->pq), 0x4); } else { PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */ targ->clean = false; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index 269c75b03a6..742bef39247 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -29,6 +29,7 @@ #include "nvc0/nvc0_stateobj.h" #include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw.h" #include "nvc0/nvc0_3d.xml.h" #include 
"nv50/nv50_texture.xml.h" @@ -1070,7 +1071,7 @@ nvc0_so_target_create(struct pipe_context *pipe, if (!targ) return NULL; - targ->pq = pipe->create_query(pipe, NVC0_QUERY_TFB_BUFFER_OFFSET, 0); + targ->pq = pipe->create_query(pipe, NVC0_HW_QUERY_TFB_BUFFER_OFFSET, 0); if (!targ->pq) { FREE(targ); return NULL; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 9be25cfe66e..c464904d6d4 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -29,6 +29,7 @@ #include "translate/translate.h" #include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw.h" #include "nvc0/nvc0_resource.h" #include "nvc0/nvc0_3d.xml.h" @@ -775,7 +776,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0, res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; PUSH_SPACE(push, 2); IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0); - nvc0_query_fifo_wait(push, nvc0_query(so->pq)); + nvc0_hw_query_fifo_wait(push, nvc0_query(so->pq)); if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS) IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0); @@ -791,7 +792,7 @@ nvc0_draw_stream_output(struct nvc0_context *nvc0, BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_STRIDE), 1); PUSH_DATA (push, so->stride); BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BYTES), 1); - nvc0_query_pushbuf_submit(push, nvc0_query(so->pq), 0x4); + nvc0_hw_query_pushbuf_submit(push, nvc0_query(so->pq), 0x4); IMMED_NVC0(push, NVC0_3D(VERTEX_END_GL), 0); mode |= NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT; From 7129cbf5f4acaa86512c0dd6c127b8fb617fb441 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Sun, 4 Oct 2015 23:43:20 +0200 Subject: [PATCH 014/270] nvc0: move HW SM queries to nvc0_query_hw_sm.c/h files Global performance counters (PCOUNTER) will be added to nvc0_query_hw_pm.c/h files. 
Signed-off-by: Samuel Pitoiset --- src/gallium/drivers/nouveau/Makefile.sources | 2 + src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 15 +- src/gallium/drivers/nouveau/nvc0/nvc0_query.h | 2 +- .../drivers/nouveau/nvc0/nvc0_query_hw.c | 707 +---------------- .../drivers/nouveau/nvc0/nvc0_query_hw.h | 111 +-- .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 748 ++++++++++++++++++ .../drivers/nouveau/nvc0/nvc0_query_hw_sm.h | 117 +++ .../drivers/nouveau/nvc0/nvc0_screen.h | 2 +- 8 files changed, 908 insertions(+), 796 deletions(-) create mode 100644 src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c create mode 100644 src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources index e45c564f431..edc6cf41885 100644 --- a/src/gallium/drivers/nouveau/Makefile.sources +++ b/src/gallium/drivers/nouveau/Makefile.sources @@ -154,6 +154,8 @@ NVC0_C_SOURCES := \ nvc0/nvc0_query.h \ nvc0/nvc0_query_hw.c \ nvc0/nvc0_query_hw.h \ + nvc0/nvc0_query_hw_sm.c \ + nvc0/nvc0_query_hw_sm.h \ nvc0/nvc0_query_sw.c \ nvc0/nvc0_query_sw.h \ nvc0/nvc0_resource.c \ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index a0ca3fa533c..f8d4ba16237 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -28,6 +28,7 @@ #include "nvc0/nvc0_query.h" #include "nvc0/nvc0_query_sw.h" #include "nvc0/nvc0_query_hw.h" +#include "nvc0/nvc0_query_hw_sm.h" static struct pipe_query * nvc0_create_query(struct pipe_context *pipe, unsigned type, unsigned index) @@ -182,7 +183,7 @@ static const char *nvc0_sw_query_drv_stat_names[] = /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ /* NOTE: intentionally using the same names as NV */ -static const char *nve4_pm_query_names[] = +static const char *nve4_hw_sm_query_names[] = { /* MP counters */ "active_cycles", @@ -238,7 +239,7 @@ static const char 
*nve4_pm_query_names[] = }; /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ -static const char *nvc0_pm_query_names[] = +static const char *nvc0_hw_sm_query_names[] = { /* MP counters */ "active_cycles", @@ -320,17 +321,17 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, if (id < count) { if (screen->compute) { if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = nve4_pm_query_names[id - NVC0_SW_QUERY_DRV_STAT_COUNT]; + info->name = nve4_hw_sm_query_names[id - NVC0_SW_QUERY_DRV_STAT_COUNT]; info->query_type = NVE4_HW_SM_QUERY(id - NVC0_SW_QUERY_DRV_STAT_COUNT); info->max_value.u64 = (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100; - info->group_id = NVC0_QUERY_MP_COUNTER_GROUP; + info->group_id = NVC0_HW_SM_QUERY_GROUP; return 1; } else if (screen->base.class_3d < NVE4_3D_CLASS) { - info->name = nvc0_pm_query_names[id - NVC0_SW_QUERY_DRV_STAT_COUNT]; + info->name = nvc0_hw_sm_query_names[id - NVC0_SW_QUERY_DRV_STAT_COUNT]; info->query_type = NVC0_HW_SM_QUERY(id - NVC0_SW_QUERY_DRV_STAT_COUNT); - info->group_id = NVC0_QUERY_MP_COUNTER_GROUP; + info->group_id = NVC0_HW_SM_QUERY_GROUP; return 1; } } @@ -365,7 +366,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, if (!info) return count; - if (id == NVC0_QUERY_MP_COUNTER_GROUP) { + if (id == NVC0_HW_SM_QUERY_GROUP) { if (screen->compute) { info->name = "MP counters"; info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h index c4f0cb0ec6e..6883ab6ab9d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h @@ -31,7 +31,7 @@ nvc0_query(struct pipe_query *pipe) /* * Driver queries groups: */ -#define NVC0_QUERY_MP_COUNTER_GROUP 0 +#define NVC0_HW_SM_QUERY_GROUP 0 #define NVC0_SW_QUERY_DRV_STAT_GROUP 1 void nvc0_init_query_functions(struct nvc0_context *); diff --git 
a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c index b7923d549f4..16a639e3c48 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -25,10 +25,7 @@ #include "nvc0/nvc0_context.h" #include "nvc0/nvc0_query_hw.h" - -#include "nv_object.xml.h" -#include "nvc0/nve4_compute.xml.h" -#include "nvc0/nvc0_compute.xml.h" +#include "nvc0/nvc0_query_hw_sm.h" #define NVC0_HW_QUERY_STATE_READY 0 #define NVC0_HW_QUERY_STATE_ACTIVE 1 @@ -37,632 +34,7 @@ #define NVC0_HW_QUERY_ALLOC_SPACE 256 -/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ - -/* Code to read out MP counters: They are accessible via mmio, too, but let's - * just avoid mapping registers in userspace. We'd have to know which MPs are - * enabled/present, too, and that information is not presently exposed. - * We could add a kernel interface for it, but reading the counters like this - * has the advantage of being async (if get_result isn't called immediately). 
- */ -static const uint64_t nve4_read_hw_sm_counters_code[] = -{ - /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20 - * mov b32 $r8 $tidx - * mov b32 $r12 $physid - * mov b32 $r0 $pm0 - * mov b32 $r1 $pm1 - * mov b32 $r2 $pm2 - * mov b32 $r3 $pm3 - * mov b32 $r4 $pm4 - * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b - * mov b32 $r5 $pm5 - * mov b32 $r6 $pm6 - * mov b32 $r7 $pm7 - * set $p0 0x1 eq u32 $r8 0x0 - * mov b32 $r10 c0[0x0] - * ext u32 $r8 $r12 0x414 - * mov b32 $r11 c0[0x4] - * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04 - * ext u32 $r9 $r12 0x208 - * (not $p0) exit - * set $p1 0x1 eq u32 $r9 0x0 - * mul $r8 u32 $r8 u32 96 - * mul $r12 u32 $r9 u32 16 - * mul $r13 u32 $r9 u32 4 - * add b32 $r9 $r8 $r13 - * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c - * add b32 $r8 $r8 $r12 - * mov b32 $r12 $r10 - * add b32 $r10 $c $r10 $r8 - * mov b32 $r13 $r11 - * add b32 $r11 $r11 0x0 $c - * add b32 $r12 $c $r12 $r9 - * st b128 wt g[$r10d] $r0q - * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00 - * mov b32 $r0 c0[0x8] - * add b32 $r13 $r13 0x0 $c - * $p1 st b128 wt g[$r12d+0x40] $r4q - * st b32 wt g[$r12d+0x50] $r0 - * exit */ - 0x2202020202020207ULL, - 0x2c00000084021c04ULL, - 0x2c0000000c031c04ULL, - 0x2c00000010001c04ULL, - 0x2c00000014005c04ULL, - 0x2c00000018009c04ULL, - 0x2c0000001c00dc04ULL, - 0x2c00000020011c04ULL, - 0x22b0420042320207ULL, - 0x2c00000024015c04ULL, - 0x2c00000028019c04ULL, - 0x2c0000002c01dc04ULL, - 0x190e0000fc81dc03ULL, - 0x2800400000029de4ULL, - 0x7000c01050c21c03ULL, - 0x280040001002dde4ULL, - 0x204282020042e047ULL, - 0x7000c00820c25c03ULL, - 0x80000000000021e7ULL, - 0x190e0000fc93dc03ULL, - 0x1000000180821c02ULL, - 0x1000000040931c02ULL, - 0x1000000010935c02ULL, - 0x4800000034825c03ULL, - 0x22c042c042c04287ULL, - 0x4800000030821c03ULL, - 0x2800000028031de4ULL, - 0x4801000020a29c03ULL, - 0x280000002c035de4ULL, - 0x0800000000b2dc42ULL, - 0x4801000024c31c03ULL, - 0x9400000000a01fc5ULL, - 0x200002e04202c047ULL, - 0x2800400020001de4ULL, - 0x0800000000d35c42ULL, - 
0x9400000100c107c5ULL, - 0x9400000140c01f85ULL, - 0x8000000000001de7ULL -}; - -/* For simplicity, we will allocate as many group slots as we allocate counter - * slots. This means that a single counter which wants to source from 2 groups - * will have to be declared as using 2 counter slots. This shouldn't really be - * a problem because such queries don't make much sense ... (unless someone is - * really creative). - */ -struct nvc0_mp_counter_cfg -{ - uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */ - uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */ - uint32_t num_src : 3; /* number of sources (1 - 6, only for NVC0:NVE4) */ - uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */ - uint32_t sig_sel : 8; /* signal group */ - uint64_t src_sel; /* signal selection for up to 6 sources (48 bit) */ -}; - -#define NVC0_COUNTER_OPn_SUM 0 -#define NVC0_COUNTER_OPn_OR 1 -#define NVC0_COUNTER_OPn_AND 2 -#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */ -#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */ -#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */ -#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */ - -struct nvc0_hw_sm_query_cfg -{ - struct nvc0_mp_counter_cfg ctr[4]; - uint8_t num_counters; - uint8_t op; - uint8_t norm[2]; /* normalization num,denom */ -}; - -#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } -#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } -#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ - { f1, 
NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \ - {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } -#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \ - { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ - {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } -#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ - { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ - {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } - -/* NOTES: - * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps - * inst_executed etc.: we only count a single warp scheduler - * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers; - * this is inaccurate ! 
- */ -static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = -{ - _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1), - _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1), - _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1), - _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1), - _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1), - _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1), - _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1), - _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1), - _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1), - _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1), - _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1), - _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1), - _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1), - _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1), - _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1), - _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1), - _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1), - _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1), - _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1), - _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1), - _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1), - _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1), - _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1), - _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1), - _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1), - _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1), - _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1), - _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1), - _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1), - _Q1A(PROF_TRIGGER_4, 0x0001, B6, 
USER, 0x00000010, 1, 1), - _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1), - _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1), - _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1), - _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1), - _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1), - _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1), - _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1), - _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1), - _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1), - _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1), - _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1), - _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1), - _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1), - _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1), - _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1), - _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64), - _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1), -}; - -#undef _Q1A -#undef _Q1B -#undef _M2A -#undef _M2B - -/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ -static const uint64_t nvc0_read_hw_sm_counters_code[] = -{ - /* mov b32 $r8 $tidx - * mov b32 $r9 $physid - * mov b32 $r0 $pm0 - * mov b32 $r1 $pm1 - * mov b32 $r2 $pm2 - * mov b32 $r3 $pm3 - * mov b32 $r4 $pm4 - * mov b32 $r5 $pm5 - * mov b32 $r6 $pm6 - * mov b32 $r7 $pm7 - * set $p0 0x1 eq u32 $r8 0x0 - * mov b32 $r10 c0[0x0] - * mov b32 $r11 c0[0x4] - * ext u32 $r8 $r9 0x414 - * (not $p0) exit - * mul $r8 u32 $r8 u32 36 - * add b32 $r10 $c $r10 $r8 - * add b32 $r11 $r11 0x0 $c - * mov b32 $r8 c0[0x8] - * st b128 wt g[$r10d+0x00] $r0q - * st b128 wt g[$r10d+0x10] $r4q - * st b32 wt g[$r10d+0x20] $r8 - * exit */ - 0x2c00000084021c04ULL, - 
0x2c0000000c025c04ULL, - 0x2c00000010001c04ULL, - 0x2c00000014005c04ULL, - 0x2c00000018009c04ULL, - 0x2c0000001c00dc04ULL, - 0x2c00000020011c04ULL, - 0x2c00000024015c04ULL, - 0x2c00000028019c04ULL, - 0x2c0000002c01dc04ULL, - 0x190e0000fc81dc03ULL, - 0x2800400000029de4ULL, - 0x280040001002dde4ULL, - 0x7000c01050921c03ULL, - 0x80000000000021e7ULL, - 0x1000000090821c02ULL, - 0x4801000020a29c03ULL, - 0x0800000000b2dc42ULL, - 0x2800400020021de4ULL, - 0x9400000000a01fc5ULL, - 0x9400000040a11fc5ULL, - 0x9400000080a21f85ULL, - 0x8000000000001de7ULL -}; - -#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } } - -static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] = -{ - _Q(ACTIVE_CYCLES, 0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(ACTIVE_WARPS, 0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), - _Q(ATOM_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(BRANCH, 0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00), - _Q(DIVERGENT_BRANCH, 0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00), - _Q(GLD_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(GRED_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(GST_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_EXECUTED, 0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00), - _Q(INST_ISSUED1_0, 0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED1_1, 0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED2_0, 0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED2_1, 0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(LOCAL_LD, 0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - 
_Q(LOCAL_ST, 0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_0, 0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_1, 0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_2, 0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_3, 0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_4, 0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_5, 0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_6, 0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_7, 0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(SHARED_LD, 0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(SHARED_ST, 0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(THREADS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), - _Q(TH_INST_EXECUTED_0, 0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_1, 0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_2, 0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_3, 0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(WARPS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), -}; - -#undef _Q - -static const struct nvc0_hw_sm_query_cfg * -nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_query *q) -{ - struct nvc0_screen *screen = nvc0->screen; - - if (screen->base.class_3d >= NVE4_3D_CLASS) - return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; - return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)]; -} - -static boolean -nvc0_hw_sm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q) -{ - struct nvc0_screen *screen = nvc0->screen; - struct nouveau_pushbuf *push = nvc0->base.pushbuf; - const bool is_nve4 = screen->base.class_3d >= 
NVE4_3D_CLASS; - struct nvc0_hw_query *hq = nvc0_hw_query(q); - const struct nvc0_hw_sm_query_cfg *cfg; - unsigned i, c; - unsigned num_ab[2] = { 0, 0 }; - - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - - /* check if we have enough free counter slots */ - for (i = 0; i < cfg->num_counters; ++i) - num_ab[cfg->ctr[i].sig_dom]++; - - if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 || - screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) { - NOUVEAU_ERR("Not enough free MP counter slots !\n"); - return false; - } - - assert(cfg->num_counters <= 4); - PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6); - - if (!screen->pm.mp_counters_enabled) { - screen->pm.mp_counters_enabled = true; - BEGIN_NVC0(push, SUBC_SW(0x06ac), 1); - PUSH_DATA (push, 0x1fcb); - } - - /* set sequence field to 0 (used to check if result is available) */ - for (i = 0; i < screen->mp_count; ++i) - hq->data[i * 10 + 10] = 0; - - for (i = 0; i < cfg->num_counters; ++i) { - const unsigned d = cfg->ctr[i].sig_dom; - - if (!screen->pm.num_hw_sm_active[d]) { - uint32_t m = (1 << 22) | (1 << (7 + (8 * !d))); - if (screen->pm.num_hw_sm_active[!d]) - m |= 1 << (7 + (8 * d)); - BEGIN_NVC0(push, SUBC_SW(0x0600), 1); - PUSH_DATA (push, m); - } - screen->pm.num_hw_sm_active[d]++; - - for (c = d * 4; c < (d * 4 + 4); ++c) { - if (!screen->pm.mp_counter[c]) { - hq->ctr[i] = c; - screen->pm.mp_counter[c] = (struct pipe_query *)q; - break; - } - } - assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */ - - /* configure and reset the counter(s) */ - if (is_nve4) { - if (d == 0) - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1); - else - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1); - PUSH_DATA (push, cfg->ctr[i].sig_sel); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1); - PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3)); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1); - PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - BEGIN_NVC0(push, 
NVE4_COMPUTE(MP_PM_SET(c)), 1); - PUSH_DATA (push, 0); - } else { - unsigned s; - - for (s = 0; s < cfg->ctr[i].num_src; s++) { - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1); - PUSH_DATA (push, cfg->ctr[i].sig_sel); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1); - PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1); - PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1); - PUSH_DATA (push, 0); - } - } - } - return true; -} - -static void -nvc0_hw_sm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q) -{ - struct nvc0_screen *screen = nvc0->screen; - struct pipe_context *pipe = &nvc0->base.pipe; - struct nouveau_pushbuf *push = nvc0->base.pushbuf; - const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; - struct nvc0_hw_query *hq = nvc0_hw_query(q); - uint32_t mask; - uint32_t input[3]; - const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 }; - const uint grid[3] = { screen->mp_count, 1, 1 }; - unsigned c; - const struct nvc0_hw_sm_query_cfg *cfg; - - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - - if (unlikely(!screen->pm.prog)) { - struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); - prog->type = PIPE_SHADER_COMPUTE; - prog->translated = true; - prog->num_gprs = 14; - prog->parm_size = 12; - if (is_nve4) { - prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; - prog->code_size = sizeof(nve4_read_hw_sm_counters_code); - } else { - prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; - prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); - } - screen->pm.prog = prog; - } - - /* disable all counting */ - PUSH_SPACE(push, 8); - for (c = 0; c < 8; ++c) - if (screen->pm.mp_counter[c]) { - if (is_nve4) { - IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0); - } else { - IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0); - } - } - /* release counters for this query */ - for (c = 0; c < 8; ++c) { - if 
(nvc0_query(screen->pm.mp_counter[c]) == q) { - screen->pm.num_hw_sm_active[c / 4]--; - screen->pm.mp_counter[c] = NULL; - } - } - - BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR, - hq->bo); - - PUSH_SPACE(push, 1); - IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0); - - pipe->bind_compute_state(pipe, screen->pm.prog); - input[0] = (hq->bo->offset + hq->base_offset); - input[1] = (hq->bo->offset + hq->base_offset) >> 32; - input[2] = hq->sequence; - pipe->launch_grid(pipe, block, grid, 0, input); - - nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY); - - /* re-activate other counters */ - PUSH_SPACE(push, 16); - mask = 0; - for (c = 0; c < 8; ++c) { - unsigned i; - q = nvc0_query(screen->pm.mp_counter[c]); - if (!q) - continue; - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - for (i = 0; i < cfg->num_counters; ++i) { - if (mask & (1 << hq->ctr[i])) - break; - mask |= 1 << hq->ctr[i]; - if (is_nve4) { - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hq->ctr[i])), 1); - } else { - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hq->ctr[i])), 1); - } - PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - } - } -} - -static inline bool -nvc0_hw_sm_query_read_data(uint32_t count[32][4], - struct nvc0_context *nvc0, bool wait, - struct nvc0_query *q, - const struct nvc0_hw_sm_query_cfg *cfg, - unsigned mp_count) -{ - struct nvc0_hw_query *hq = nvc0_hw_query(q); - unsigned p, c; - - for (p = 0; p < mp_count; ++p) { - const unsigned b = (0x24 / 4) * p; - - for (c = 0; c < cfg->num_counters; ++c) { - if (hq->data[b + 8] != hq->sequence) { - if (!wait) - return false; - if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client)) - return false; - } - count[p][c] = hq->data[b + hq->ctr[c]]; - } - } - return true; -} - -static inline bool -nve4_hw_sm_query_read_data(uint32_t count[32][4], - struct nvc0_context *nvc0, bool wait, - struct nvc0_query *q, - const struct nvc0_hw_sm_query_cfg *cfg, - unsigned mp_count) -{ - struct nvc0_hw_query 
*hq = nvc0_hw_query(q); - unsigned p, c, d; - - for (p = 0; p < mp_count; ++p) { - const unsigned b = (0x60 / 4) * p; - - for (c = 0; c < cfg->num_counters; ++c) { - count[p][c] = 0; - for (d = 0; d < ((hq->ctr[c] & ~3) ? 1 : 4); ++d) { - if (hq->data[b + 20 + d] != hq->sequence) { - if (!wait) - return false; - if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client)) - return false; - } - if (hq->ctr[c] & ~0x3) - count[p][c] = hq->data[b + 16 + (hq->ctr[c] & 3)]; - else - count[p][c] += hq->data[b + d * 4 + hq->ctr[c]]; - } - } - } - return true; -} - -/* Metric calculations: - * sum(x) ... sum of x over all MPs - * avg(x) ... average of x over all MPs - * - * IPC : sum(inst_executed) / clock - * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued) - * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles) - * MP_EFFICIENCY : avg(active_cycles / clock) - * - * NOTE: Interpretation of IPC requires knowledge of MP count. - */ -static boolean -nvc0_hw_sm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, - void *result, boolean wait) -{ - uint32_t count[32][4]; - uint64_t value = 0; - unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32); - unsigned p, c; - const struct nvc0_hw_sm_query_cfg *cfg; - bool ret; - - cfg = nvc0_hw_sm_query_get_cfg(nvc0, q); - - if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) - ret = nve4_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count); - else - ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count); - if (!ret) - return false; - - if (cfg->op == NVC0_COUNTER_OPn_SUM) { - for (c = 0; c < cfg->num_counters; ++c) - for (p = 0; p < mp_count; ++p) - value += count[p][c]; - value = (value * cfg->norm[0]) / cfg->norm[1]; - } else - if (cfg->op == NVC0_COUNTER_OPn_OR) { - uint32_t v = 0; - for (c = 0; c < cfg->num_counters; ++c) - for (p = 0; p < mp_count; ++p) - v |= count[p][c]; - value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1]; - } else - if (cfg->op == 
NVC0_COUNTER_OPn_AND) { - uint32_t v = ~0; - for (c = 0; c < cfg->num_counters; ++c) - for (p = 0; p < mp_count; ++p) - v &= count[p][c]; - value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1]; - } else - if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) { - uint64_t v[2] = { 0, 0 }; - for (p = 0; p < mp_count; ++p) { - v[0] += count[p][0]; - v[1] += count[p][1]; - } - if (v[0]) - value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]); - } else - if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) { - for (p = 0; p < mp_count; ++p) - value += count[p][0]; - if (count[0][1]) - value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]); - else - value = 0; - } else - if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) { - unsigned mp_used = 0; - for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0]) - if (count[p][1]) - value += (count[p][0] * cfg->norm[0]) / count[p][1]; - if (mp_used) - value /= (uint64_t)mp_used * cfg->norm[1]; - } else - if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) { - unsigned mp_used = 0; - for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0]) - value += count[p][0]; - if (count[0][1] && mp_used) { - value *= cfg->norm[0]; - value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1]; - } else { - value = 0; - } - } - - *(uint64_t *)result = value; - return true; -} - -static bool +bool nvc0_hw_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size) { @@ -755,6 +127,9 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) struct nvc0_hw_query *hq = nvc0_hw_query(q); bool ret = true; + if (hq->funcs && hq->funcs->begin_query) + return hq->funcs->begin_query(nvc0, hq); + /* For occlusion queries we have to change the storage, because a previous * query might set the initial render conition to false even *after* we re- * initialized it to true. 
@@ -814,10 +189,6 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */ break; default: - if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || - (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) { - ret = nvc0_hw_sm_query_begin(nvc0, q); - } break; } hq->state = NVC0_HW_QUERY_STATE_ACTIVE; @@ -830,6 +201,11 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q) struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_hw_query *hq = nvc0_hw_query(q); + if (hq->funcs && hq->funcs->end_query) { + hq->funcs->end_query(nvc0, hq); + return; + } + if (hq->state != NVC0_HW_QUERY_STATE_ACTIVE) { /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */ if (hq->rotate) @@ -891,10 +267,6 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q) nvc0_hw_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5)); break; default: - if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || - (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) { - nvc0_hw_sm_query_end(nvc0, q); - } break; } if (hq->is64bit) @@ -912,10 +284,8 @@ nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q, uint64_t *data64 = (uint64_t *)hq->data; unsigned i; - if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) || - (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) { - return nvc0_hw_sm_query_result(nvc0, q, result, wait); - } + if (hq->funcs && hq->funcs->get_query_result) + return hq->funcs->get_query_result(nvc0, hq, wait, result); if (hq->state != NVC0_HW_QUERY_STATE_READY) nvc0_hw_query_update(nvc0->screen->base.client, q); @@ -995,6 +365,12 @@ nvc0_hw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index) struct nvc0_query *q; unsigned space = NVC0_HW_QUERY_ALLOC_SPACE; + hq = nvc0_hw_sm_create_query(nvc0, type); 
+ if (hq) { + hq->base.funcs = &hw_query_funcs; + return (struct nvc0_query *)hq; + } + hq = CALLOC_STRUCT(nvc0_hw_query); if (!hq) return NULL; @@ -1034,53 +410,6 @@ nvc0_hw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index) space = 16; break; default: - if (nvc0->screen->base.device->drm_version >= 0x01000101) { - if (type >= NVE4_HW_SM_QUERY(0) && type <= NVE4_HW_SM_QUERY_LAST) { - /* for each MP: - * [00] = WS0.C0 - * [04] = WS0.C1 - * [08] = WS0.C2 - * [0c] = WS0.C3 - * [10] = WS1.C0 - * [14] = WS1.C1 - * [18] = WS1.C2 - * [1c] = WS1.C3 - * [20] = WS2.C0 - * [24] = WS2.C1 - * [28] = WS2.C2 - * [2c] = WS2.C3 - * [30] = WS3.C0 - * [34] = WS3.C1 - * [38] = WS3.C2 - * [3c] = WS3.C3 - * [40] = MP.C4 - * [44] = MP.C5 - * [48] = MP.C6 - * [4c] = MP.C7 - * [50] = WS0.sequence - * [54] = WS1.sequence - * [58] = WS2.sequence - * [5c] = WS3.sequence - */ - space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t); - break; - } else - if (type >= NVC0_HW_SM_QUERY(0) && type <= NVC0_HW_SM_QUERY_LAST) { - /* for each MP: - * [00] = MP.C0 - * [04] = MP.C1 - * [08] = MP.C2 - * [0c] = MP.C3 - * [10] = MP.C4 - * [14] = MP.C5 - * [18] = MP.C6 - * [1c] = MP.C7 - * [20] = MP.sequence - */ - space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t); - break; - } - } debug_printf("invalid query type: %u\n", type); FREE(q); return NULL; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h index d998c42c213..d72d894cc5a 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h @@ -8,10 +8,20 @@ #define NVC0_HW_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) +struct nvc0_hw_query; + +struct nvc0_hw_query_funcs { + void (*destroy_query)(struct nvc0_context *, struct nvc0_hw_query *); + boolean (*begin_query)(struct nvc0_context *, struct nvc0_hw_query *); + void (*end_query)(struct nvc0_context *, struct nvc0_hw_query *); + boolean 
(*get_query_result)(struct nvc0_context *, struct nvc0_hw_query *, + boolean, union pipe_query_result *); +}; + struct nvc0_hw_query { struct nvc0_query base; + const struct nvc0_hw_query_funcs *funcs; uint32_t *data; - int8_t ctr[4]; uint32_t sequence; struct nouveau_bo *bo; uint32_t base_offset; @@ -30,105 +40,10 @@ nvc0_hw_query(struct nvc0_query *q) return (struct nvc0_hw_query *)q; } -/* - * Performance counter queries: - */ -#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) -#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1) -enum nve4_pm_queries -{ - NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVE4_HW_SM_QUERY_ACTIVE_WARPS, - NVE4_HW_SM_QUERY_ATOM_COUNT, - NVE4_HW_SM_QUERY_BRANCH, - NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, - NVE4_HW_SM_QUERY_GLD_REQUEST, - NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GST_TRANSACTIONS, - NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GRED_COUNT, - NVE4_HW_SM_QUERY_GST_REQUEST, - NVE4_HW_SM_QUERY_INST_EXECUTED, - NVE4_HW_SM_QUERY_INST_ISSUED, - NVE4_HW_SM_QUERY_INST_ISSUED1, - NVE4_HW_SM_QUERY_INST_ISSUED2, - NVE4_HW_SM_QUERY_L1_GLD_HIT, - NVE4_HW_SM_QUERY_L1_GLD_MISS, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, - NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_LD, - NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_ST, - NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_PROF_TRIGGER_0, - NVE4_HW_SM_QUERY_PROF_TRIGGER_1, - NVE4_HW_SM_QUERY_PROF_TRIGGER_2, - NVE4_HW_SM_QUERY_PROF_TRIGGER_3, - NVE4_HW_SM_QUERY_PROF_TRIGGER_4, - NVE4_HW_SM_QUERY_PROF_TRIGGER_5, - NVE4_HW_SM_QUERY_PROF_TRIGGER_6, - NVE4_HW_SM_QUERY_PROF_TRIGGER_7, - NVE4_HW_SM_QUERY_SHARED_LD, - NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, - NVE4_HW_SM_QUERY_SHARED_ST, - NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, - NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, 
- NVE4_HW_SM_QUERY_THREADS_LAUNCHED, - NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_WARPS_LAUNCHED, - NVE4_HW_SM_QUERY_METRIC_IPC, - NVE4_HW_SM_QUERY_METRIC_IPAC, - NVE4_HW_SM_QUERY_METRIC_IPEC, - NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY, - NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY, - NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD, - NVE4_HW_SM_QUERY_COUNT -}; - -#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) -#define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) -enum nvc0_pm_queries -{ - NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVC0_HW_SM_QUERY_ACTIVE_WARPS, - NVC0_HW_SM_QUERY_ATOM_COUNT, - NVC0_HW_SM_QUERY_BRANCH, - NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, - NVC0_HW_SM_QUERY_GLD_REQUEST, - NVC0_HW_SM_QUERY_GRED_COUNT, - NVC0_HW_SM_QUERY_GST_REQUEST, - NVC0_HW_SM_QUERY_INST_EXECUTED, - NVC0_HW_SM_QUERY_INST_ISSUED1_0, - NVC0_HW_SM_QUERY_INST_ISSUED1_1, - NVC0_HW_SM_QUERY_INST_ISSUED2_0, - NVC0_HW_SM_QUERY_INST_ISSUED2_1, - NVC0_HW_SM_QUERY_LOCAL_LD, - NVC0_HW_SM_QUERY_LOCAL_ST, - NVC0_HW_SM_QUERY_PROF_TRIGGER_0, - NVC0_HW_SM_QUERY_PROF_TRIGGER_1, - NVC0_HW_SM_QUERY_PROF_TRIGGER_2, - NVC0_HW_SM_QUERY_PROF_TRIGGER_3, - NVC0_HW_SM_QUERY_PROF_TRIGGER_4, - NVC0_HW_SM_QUERY_PROF_TRIGGER_5, - NVC0_HW_SM_QUERY_PROF_TRIGGER_6, - NVC0_HW_SM_QUERY_PROF_TRIGGER_7, - NVC0_HW_SM_QUERY_SHARED_LD, - NVC0_HW_SM_QUERY_SHARED_ST, - NVC0_HW_SM_QUERY_THREADS_LAUNCHED, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, - NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, - NVC0_HW_SM_QUERY_WARPS_LAUNCHED, - NVC0_HW_SM_QUERY_COUNT -}; - struct nvc0_query * nvc0_hw_create_query(struct nvc0_context *, unsigned, unsigned); +bool +nvc0_hw_query_allocate(struct nvc0_context *, struct nvc0_query *, int); void nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *, struct nvc0_query *, unsigned); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c 
b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c new file mode 100644 index 00000000000..3bdb90a8d7a --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -0,0 +1,748 @@ +/* + * Copyright 2011 Christoph Bumiller + * Copyright 2015 Samuel Pitoiset + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING + +#include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw_sm.h" + +#include "nv_object.xml.h" +#include "nvc0/nve4_compute.xml.h" +#include "nvc0/nvc0_compute.xml.h" + +/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ + +/* Code to read out MP counters: They are accessible via mmio, too, but let's + * just avoid mapping registers in userspace. We'd have to know which MPs are + * enabled/present, too, and that information is not presently exposed. 
+ * We could add a kernel interface for it, but reading the counters like this + * has the advantage of being async (if get_result isn't called immediately). + */ +static const uint64_t nve4_read_hw_sm_counters_code[] = +{ + /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20 + * mov b32 $r8 $tidx + * mov b32 $r12 $physid + * mov b32 $r0 $pm0 + * mov b32 $r1 $pm1 + * mov b32 $r2 $pm2 + * mov b32 $r3 $pm3 + * mov b32 $r4 $pm4 + * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b + * mov b32 $r5 $pm5 + * mov b32 $r6 $pm6 + * mov b32 $r7 $pm7 + * set $p0 0x1 eq u32 $r8 0x0 + * mov b32 $r10 c0[0x0] + * ext u32 $r8 $r12 0x414 + * mov b32 $r11 c0[0x4] + * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04 + * ext u32 $r9 $r12 0x208 + * (not $p0) exit + * set $p1 0x1 eq u32 $r9 0x0 + * mul $r8 u32 $r8 u32 96 + * mul $r12 u32 $r9 u32 16 + * mul $r13 u32 $r9 u32 4 + * add b32 $r9 $r8 $r13 + * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c + * add b32 $r8 $r8 $r12 + * mov b32 $r12 $r10 + * add b32 $r10 $c $r10 $r8 + * mov b32 $r13 $r11 + * add b32 $r11 $r11 0x0 $c + * add b32 $r12 $c $r12 $r9 + * st b128 wt g[$r10d] $r0q + * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00 + * mov b32 $r0 c0[0x8] + * add b32 $r13 $r13 0x0 $c + * $p1 st b128 wt g[$r12d+0x40] $r4q + * st b32 wt g[$r12d+0x50] $r0 + * exit */ + 0x2202020202020207ULL, + 0x2c00000084021c04ULL, + 0x2c0000000c031c04ULL, + 0x2c00000010001c04ULL, + 0x2c00000014005c04ULL, + 0x2c00000018009c04ULL, + 0x2c0000001c00dc04ULL, + 0x2c00000020011c04ULL, + 0x22b0420042320207ULL, + 0x2c00000024015c04ULL, + 0x2c00000028019c04ULL, + 0x2c0000002c01dc04ULL, + 0x190e0000fc81dc03ULL, + 0x2800400000029de4ULL, + 0x7000c01050c21c03ULL, + 0x280040001002dde4ULL, + 0x204282020042e047ULL, + 0x7000c00820c25c03ULL, + 0x80000000000021e7ULL, + 0x190e0000fc93dc03ULL, + 0x1000000180821c02ULL, + 0x1000000040931c02ULL, + 0x1000000010935c02ULL, + 0x4800000034825c03ULL, + 0x22c042c042c04287ULL, + 0x4800000030821c03ULL, + 0x2800000028031de4ULL, + 0x4801000020a29c03ULL, + 0x280000002c035de4ULL, 
+ 0x0800000000b2dc42ULL, + 0x4801000024c31c03ULL, + 0x9400000000a01fc5ULL, + 0x200002e04202c047ULL, + 0x2800400020001de4ULL, + 0x0800000000d35c42ULL, + 0x9400000100c107c5ULL, + 0x9400000140c01f85ULL, + 0x8000000000001de7ULL +}; + +/* For simplicity, we will allocate as many group slots as we allocate counter + * slots. This means that a single counter which wants to source from 2 groups + * will have to be declared as using 2 counter slots. This shouldn't really be + * a problem because such queries don't make much sense ... (unless someone is + * really creative). + */ +struct nvc0_hw_sm_counter_cfg +{ + uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */ + uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */ + uint32_t num_src : 3; /* number of sources (1 - 6, only for NVC0:NVE4) */ + uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */ + uint32_t sig_sel : 8; /* signal group */ + uint64_t src_sel; /* signal selection for up to 6 sources (48 bit) */ +}; + +#define NVC0_COUNTER_OPn_SUM 0 +#define NVC0_COUNTER_OPn_OR 1 +#define NVC0_COUNTER_OPn_AND 2 +#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */ +#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */ +#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */ +#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */ + +struct nvc0_hw_sm_query_cfg +{ + struct nvc0_hw_sm_counter_cfg ctr[4]; + uint8_t num_counters; + uint8_t op; + uint8_t norm[2]; /* normalization num,denom */ +}; + +#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } +#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } +#define _M2A(n, f0, m0, g0, s0, f1, m1, 
g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ + { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \ + {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } +#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \ + { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ + {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } +#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ + { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ + {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } + +/* NOTES: + * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps + * inst_executed etc.: we only count a single warp scheduler + * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers; + * this is inaccurate ! 
+ */ +static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = +{ + _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1), + _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1), + _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1), + _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1), + _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1), + _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1), + _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1), + _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1), + _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1), + _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1), + _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1), + _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1), + _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1), + _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1), + _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1), + _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1), + _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1), + _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1), + _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1), + _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1), + _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1), + _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1), + _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1), + _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1), + _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1), + _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1), + _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1), + _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1), + _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1), + _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1), + _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1), + _Q1A(PROF_TRIGGER_4, 0x0001, B6, 
USER, 0x00000010, 1, 1), + _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1), + _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1), + _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1), + _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1), + _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1), + _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1), + _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1), + _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1), + _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1), + _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1), + _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1), + _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1), + _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1), + _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1), + _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1), + _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64), + _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1), +}; + +#undef _Q1A +#undef _Q1B +#undef _M2A +#undef _M2B + +/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ +static const uint64_t nvc0_read_hw_sm_counters_code[] = +{ + /* mov b32 $r8 $tidx + * mov b32 $r9 $physid + * mov b32 $r0 $pm0 + * mov b32 $r1 $pm1 + * mov b32 $r2 $pm2 + * mov b32 $r3 $pm3 + * mov b32 $r4 $pm4 + * mov b32 $r5 $pm5 + * mov b32 $r6 $pm6 + * mov b32 $r7 $pm7 + * set $p0 0x1 eq u32 $r8 0x0 + * mov b32 $r10 c0[0x0] + * mov b32 $r11 c0[0x4] + * ext u32 $r8 $r9 0x414 + * (not $p0) exit + * mul $r8 u32 $r8 u32 36 + * add b32 $r10 $c $r10 $r8 + * add b32 $r11 $r11 0x0 $c + * mov b32 $r8 c0[0x8] + * st b128 wt g[$r10d+0x00] $r0q + * st b128 wt g[$r10d+0x10] $r4q + * st b32 wt g[$r10d+0x20] $r8 + * exit */ + 0x2c00000084021c04ULL, + 
0x2c0000000c025c04ULL, + 0x2c00000010001c04ULL, + 0x2c00000014005c04ULL, + 0x2c00000018009c04ULL, + 0x2c0000001c00dc04ULL, + 0x2c00000020011c04ULL, + 0x2c00000024015c04ULL, + 0x2c00000028019c04ULL, + 0x2c0000002c01dc04ULL, + 0x190e0000fc81dc03ULL, + 0x2800400000029de4ULL, + 0x280040001002dde4ULL, + 0x7000c01050921c03ULL, + 0x80000000000021e7ULL, + 0x1000000090821c02ULL, + 0x4801000020a29c03ULL, + 0x0800000000b2dc42ULL, + 0x2800400020021de4ULL, + 0x9400000000a01fc5ULL, + 0x9400000040a11fc5ULL, + 0x9400000080a21f85ULL, + 0x8000000000001de7ULL +}; + +#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } } + +static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] = +{ + _Q(ACTIVE_CYCLES, 0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(ACTIVE_WARPS, 0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), + _Q(ATOM_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(BRANCH, 0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00), + _Q(DIVERGENT_BRANCH, 0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00), + _Q(GLD_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(GRED_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(GST_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(INST_EXECUTED, 0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00), + _Q(INST_ISSUED1_0, 0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(INST_ISSUED1_1, 0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(INST_ISSUED2_0, 0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(INST_ISSUED2_1, 0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(LOCAL_LD, 0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), + 
_Q(LOCAL_ST, 0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_0, 0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_1, 0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_2, 0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_3, 0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_4, 0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_5, 0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_6, 0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(PROF_TRIGGER_7, 0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(SHARED_LD, 0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(SHARED_ST, 0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(THREADS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), + _Q(TH_INST_EXECUTED_0, 0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), + _Q(TH_INST_EXECUTED_1, 0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), + _Q(TH_INST_EXECUTED_2, 0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), + _Q(TH_INST_EXECUTED_3, 0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), + _Q(WARPS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), +}; + +#undef _Q + +static const struct nvc0_hw_sm_query_cfg * +nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_screen *screen = nvc0->screen; + struct nvc0_query *q = &hq->base; + + if (screen->base.class_3d >= NVE4_3D_CLASS) + return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; + return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)]; +} + +static void +nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_query *q = &hq->base; + q->funcs->destroy_query(nvc0, q); +} + +static boolean 
+nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
+{
+   /* Reserve one hardware MP counter slot for each counter of this query's
+    * configuration, program and reset those slots, and bump hq->sequence.
+    *
+    * Returns false, before any state has been modified, when one of the two
+    * signal domains does not have enough free slots; returns true otherwise.
+    */
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
+   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
+   const struct nvc0_hw_sm_query_cfg *cfg;
+   unsigned i, c;
+   unsigned num_ab[2] = { 0, 0 };
+
+   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
+
+   /* check if we have enough free counter slots */
+   for (i = 0; i < cfg->num_counters; ++i)
+      num_ab[cfg->ctr[i].sig_dom]++;
+
+   if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
+       screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
+      NOUVEAU_ERR("Not enough free MP counter slots !\n");
+      return false;
+   }
+
+   assert(cfg->num_counters <= 4);
+   PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6);
+
+   if (!screen->pm.mp_counters_enabled) {
+      screen->pm.mp_counters_enabled = true;
+      BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
+      PUSH_DATA (push, 0x1fcb);
+   }
+
+   /* Set the sequence field(s) to 0 (used to check if result is available).
+    *
+    * The indices must follow the per-MP result layout that is documented in
+    * nvc0_hw_sm_create_query() and read back by the *_query_read_data()
+    * helpers:
+    *  - NVE4+: 0x60 bytes per MP, one sequence word per warp scheduler at
+    *    [0x50..0x5c] (words 20..23),
+    *  - NVC0:  0x24 bytes per MP, a single sequence word at [0x20] (word 8).
+    * Using any other stride would miss the sequence words and, on NVC0,
+    * write past the end of the (8 + 1) * mp_count words that were allocated.
+    */
+   for (i = 0; i < screen->mp_count; ++i) {
+      if (is_nve4) {
+         const unsigned b = (0x60 / 4) * i;
+         for (c = 0; c < 4; ++c)
+            hq->data[b + 20 + c] = 0;
+      } else {
+         hq->data[(0x24 / 4) * i + 8] = 0;
+      }
+   }
+   hq->sequence++;
+
+   for (i = 0; i < cfg->num_counters; ++i) {
+      const unsigned d = cfg->ctr[i].sig_dom;
+
+      /* first active counter of this domain: send the enable mask for it,
+       * keeping the other domain's bit set if that one is already in use */
+      if (!screen->pm.num_hw_sm_active[d]) {
+         uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
+         if (screen->pm.num_hw_sm_active[!d])
+            m |= 1 << (7 + (8 * d));
+         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
+         PUSH_DATA (push, m);
+      }
+      screen->pm.num_hw_sm_active[d]++;
+
+      /* grab the first free slot of domain d (slots d*4 .. d*4+3) */
+      for (c = d * 4; c < (d * 4 + 4); ++c) {
+         if (!screen->pm.mp_counter[c]) {
+            hsq->ctr[i] = c;
+            screen->pm.mp_counter[c] = hsq;
+            break;
+         }
+      }
+      assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
+
+      /* configure and reset the counter(s) */
+      if (is_nve4) {
+         if (d == 0)
+            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
+         else
+            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
+         PUSH_DATA (push, cfg->ctr[i].sig_sel);
+         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
+         PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
+         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
+         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
+         PUSH_DATA (push, 0);
+      } else {
+         unsigned s;
+
+         for (s = 0; s < cfg->ctr[i].num_src; s++) {
+            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1);
+            PUSH_DATA (push, cfg->ctr[i].sig_sel);
+            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1);
+            PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff);
+            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1);
+            PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1);
+            PUSH_DATA (push, 0);
+         }
+      }
+   }
+   return true;
+}
+
+static void
+nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
+{
+   struct nvc0_screen *screen = nvc0->screen;
+   struct pipe_context *pipe = &nvc0->base.pipe;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
+   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
+   uint32_t mask;
+   uint32_t input[3];
+   const uint block[3] = { 32, is_nve4 ?
4 : 1, 1 }; + const uint grid[3] = { screen->mp_count, 1, 1 }; + unsigned c; + const struct nvc0_hw_sm_query_cfg *cfg; + + cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq); + + if (unlikely(!screen->pm.prog)) { + struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); + prog->type = PIPE_SHADER_COMPUTE; + prog->translated = true; + prog->num_gprs = 14; + prog->parm_size = 12; + if (is_nve4) { + prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; + prog->code_size = sizeof(nve4_read_hw_sm_counters_code); + } else { + prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; + prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); + } + screen->pm.prog = prog; + } + + /* disable all counting */ + PUSH_SPACE(push, 8); + for (c = 0; c < 8; ++c) + if (screen->pm.mp_counter[c]) { + if (is_nve4) { + IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0); + } else { + IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0); + } + } + /* release counters for this query */ + for (c = 0; c < 8; ++c) { + if (screen->pm.mp_counter[c] == hsq) { + screen->pm.num_hw_sm_active[c / 4]--; + screen->pm.mp_counter[c] = NULL; + } + } + + BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR, + hq->bo); + + PUSH_SPACE(push, 1); + IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0); + + pipe->bind_compute_state(pipe, screen->pm.prog); + input[0] = (hq->bo->offset + hq->base_offset); + input[1] = (hq->bo->offset + hq->base_offset) >> 32; + input[2] = hq->sequence; + pipe->launch_grid(pipe, block, grid, 0, input); + + nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY); + + /* re-activate other counters */ + PUSH_SPACE(push, 16); + mask = 0; + for (c = 0; c < 8; ++c) { + unsigned i; + + hsq = screen->pm.mp_counter[c]; + if (!hsq) + continue; + + cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base); + for (i = 0; i < cfg->num_counters; ++i) { + if (mask & (1 << hsq->ctr[i])) + break; + mask |= 1 << hsq->ctr[i]; + if (is_nve4) { + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 
1); + } else { + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1); + } + PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); + } + } +} + +static inline bool +nvc0_hw_sm_query_read_data(uint32_t count[32][4], + struct nvc0_context *nvc0, bool wait, + struct nvc0_hw_query *hq, + const struct nvc0_hw_sm_query_cfg *cfg, + unsigned mp_count) +{ + struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); + unsigned p, c; + + for (p = 0; p < mp_count; ++p) { + const unsigned b = (0x24 / 4) * p; + + for (c = 0; c < cfg->num_counters; ++c) { + if (hq->data[b + 8] != hq->sequence) { + if (!wait) + return false; + if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client)) + return false; + } + count[p][c] = hq->data[b + hsq->ctr[c]]; + } + } + return true; +} + +static inline bool +nve4_hw_sm_query_read_data(uint32_t count[32][4], + struct nvc0_context *nvc0, bool wait, + struct nvc0_hw_query *hq, + const struct nvc0_hw_sm_query_cfg *cfg, + unsigned mp_count) +{ + struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); + unsigned p, c, d; + + for (p = 0; p < mp_count; ++p) { + const unsigned b = (0x60 / 4) * p; + + for (c = 0; c < cfg->num_counters; ++c) { + count[p][c] = 0; + for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) { + if (hq->data[b + 20 + d] != hq->sequence) { + if (!wait) + return false; + if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client)) + return false; + } + if (hsq->ctr[c] & ~0x3) + count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)]; + else + count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]]; + } + } + } + return true; +} + +/* Metric calculations: + * sum(x) ... sum of x over all MPs + * avg(x) ... average of x over all MPs + * + * IPC : sum(inst_executed) / clock + * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued) + * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles) + * MP_EFFICIENCY : avg(active_cycles / clock) + * + * NOTE: Interpretation of IPC requires knowledge of MP count. 
+ */
+
+/* Read back the raw per-MP counter values of a query and combine them into a
+ * single 64-bit value according to cfg->op, scaled by
+ * cfg->norm[0] / cfg->norm[1].
+ *
+ * Returns false when the counter data could not be read (see the
+ * *_hw_sm_query_read_data() helpers), true once *result has been written.
+ */
+static boolean
+nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
+                            boolean wait, union pipe_query_result *result)
+{
+   uint32_t count[32][4];
+   uint64_t value = 0;
+   /* count[] only has room for 32 MPs */
+   unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
+   unsigned p, c;
+   const struct nvc0_hw_sm_query_cfg *cfg;
+   bool ret;
+
+   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
+
+   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
+      ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
+   else
+      ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
+   if (!ret)
+      return false;
+
+   if (cfg->op == NVC0_COUNTER_OPn_SUM) {
+      for (c = 0; c < cfg->num_counters; ++c)
+         for (p = 0; p < mp_count; ++p)
+            value += count[p][c];
+      value = (value * cfg->norm[0]) / cfg->norm[1];
+   } else
+   if (cfg->op == NVC0_COUNTER_OPn_OR) {
+      uint32_t v = 0;
+      for (c = 0; c < cfg->num_counters; ++c)
+         for (p = 0; p < mp_count; ++p)
+            v |= count[p][c];
+      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
+   } else
+   if (cfg->op == NVC0_COUNTER_OPn_AND) {
+      uint32_t v = ~0;
+      for (c = 0; c < cfg->num_counters; ++c)
+         for (p = 0; p < mp_count; ++p)
+            v &= count[p][c];
+      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
+   } else
+   if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
+      /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
+      uint64_t v[2] = { 0, 0 };
+      for (p = 0; p < mp_count; ++p) {
+         v[0] += count[p][0];
+         v[1] += count[p][1];
+      }
+      if (v[0])
+         value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
+   } else
+   if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
+      /* sum(ctr0) / (ctr1 of MP[0]) */
+      for (p = 0; p < mp_count; ++p)
+         value += count[p][0];
+      if (count[0][1])
+         /* widen before multiplying so the divisor cannot wrap at 32 bits */
+         value = (value * cfg->norm[0]) /
+                 ((uint64_t)count[0][1] * cfg->norm[1]);
+      else
+         value = 0;
+   } else
+   if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
+      /* avg(ctr0 / ctr1) over the MPs whose ctr0 is non-zero */
+      unsigned mp_used = 0;
+      for (p = 0; p < mp_count; ++p) {
+         /* Must be evaluated for the current p inside the loop body: doing
+          * it in the for-increment after ++p would look at MP p + 1 and read
+          * one row past the end of count[] on the last iteration.
+          */
+         mp_used += !!count[p][0];
+         if (count[p][1])
+            value += ((uint64_t)count[p][0] * cfg->norm[0]) / count[p][1];
+      }
+      if (mp_used)
+         value /= (uint64_t)mp_used * cfg->norm[1];
+   } else
+   if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
+      /* avg(ctr0) / (ctr1 of MP[0]) over the MPs whose ctr0 is non-zero */
+      unsigned mp_used = 0;
+      for (p = 0; p < mp_count; ++p) {
+         /* see AVG_DIV_MM above for why this is not in the for-increment */
+         mp_used += !!count[p][0];
+         value += count[p][0];
+      }
+      if (count[0][1] && mp_used) {
+         value *= cfg->norm[0];
+         value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
+      } else {
+         value = 0;
+      }
+   }
+
+   *(uint64_t *)result = value;
+   return true;
+}
+
+static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
+   .destroy_query = nvc0_hw_sm_destroy_query,
+   .begin_query = nvc0_hw_sm_begin_query,
+   .end_query = nvc0_hw_sm_end_query,
+   .get_query_result = nvc0_hw_sm_get_query_result,
+};
+
+struct nvc0_hw_query *
+nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
+{
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nvc0_hw_sm_query *hsq;
+   struct nvc0_hw_query *hq;
+   unsigned space;
+
+   if (nvc0->screen->base.device->drm_version < 0x01000101)
+      return NULL;
+
+   if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
+       (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST))
+      return NULL;
+
+   hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
+   if (!hsq)
+      return NULL;
+
+   hq = &hsq->base;
+   hq->funcs = &hw_sm_query_funcs;
+   hq->base.type = type;
+
+   if (screen->base.class_3d >= NVE4_3D_CLASS) {
+      /* for each MP:
+       * [00] = WS0.C0
+       * [04] = WS0.C1
+       * [08] = WS0.C2
+       * [0c] = WS0.C3
+       * [10] = WS1.C0
+       * [14] = WS1.C1
+       * [18] = WS1.C2
+       * [1c] = WS1.C3
+       * [20] = WS2.C0
+       * [24] = WS2.C1
+       * [28] = WS2.C2
+       * [2c] = WS2.C3
+       * [30] = WS3.C0
+       * [34] = WS3.C1
+       * [38] = WS3.C2
+       * [3c] = WS3.C3
+       * [40] = MP.C4
+       * [44] = MP.C5
+       * [48] = MP.C6
+       * [4c] = MP.C7
+       * [50] = WS0.sequence
+       * [54] = WS1.sequence
+       * [58] = WS2.sequence
+       * [5c] = WS3.sequence
+       */
+      space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
+   } else {
+      /* for each MP:
+       * [00] = MP.C0
+       * [04] = MP.C1
+       * [08] = MP.C2
+       * [0c] = MP.C3
+       * [10] = MP.C4
+       * [14] = MP.C5
+       * [18] = MP.C6
+       * [1c] = MP.C7
+       * [20] = MP.sequence
+       */
+      space = (8 + 1) *
nvc0->screen->mp_count * sizeof(uint32_t); + } + + if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) { + FREE(hq); + return NULL; + } + + return hq; +} diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h new file mode 100644 index 00000000000..bab6f34afc8 --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h @@ -0,0 +1,117 @@ +#ifndef __NVC0_QUERY_HW_SM_H__ +#define __NVC0_QUERY_HW_SM_H__ + +#include "nvc0_query_hw.h" + +struct nvc0_hw_sm_query { + struct nvc0_hw_query base; + int8_t ctr[4]; +}; + +static inline struct nvc0_hw_sm_query * +nvc0_hw_sm_query(struct nvc0_hw_query *hq) +{ + return (struct nvc0_hw_sm_query *)hq; +} + +/* + * Performance counter queries: + */ +#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) +#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1) +enum nve4_hw_sm_queries +{ + NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, + NVE4_HW_SM_QUERY_ACTIVE_WARPS, + NVE4_HW_SM_QUERY_ATOM_COUNT, + NVE4_HW_SM_QUERY_BRANCH, + NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, + NVE4_HW_SM_QUERY_GLD_REQUEST, + NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, + NVE4_HW_SM_QUERY_GST_TRANSACTIONS, + NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, + NVE4_HW_SM_QUERY_GRED_COUNT, + NVE4_HW_SM_QUERY_GST_REQUEST, + NVE4_HW_SM_QUERY_INST_EXECUTED, + NVE4_HW_SM_QUERY_INST_ISSUED, + NVE4_HW_SM_QUERY_INST_ISSUED1, + NVE4_HW_SM_QUERY_INST_ISSUED2, + NVE4_HW_SM_QUERY_L1_GLD_HIT, + NVE4_HW_SM_QUERY_L1_GLD_MISS, + NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, + NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, + NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, + NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, + NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, + NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, + NVE4_HW_SM_QUERY_LOCAL_LD, + NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, + NVE4_HW_SM_QUERY_LOCAL_ST, + NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, + NVE4_HW_SM_QUERY_PROF_TRIGGER_0, + NVE4_HW_SM_QUERY_PROF_TRIGGER_1, + NVE4_HW_SM_QUERY_PROF_TRIGGER_2, + 
NVE4_HW_SM_QUERY_PROF_TRIGGER_3, + NVE4_HW_SM_QUERY_PROF_TRIGGER_4, + NVE4_HW_SM_QUERY_PROF_TRIGGER_5, + NVE4_HW_SM_QUERY_PROF_TRIGGER_6, + NVE4_HW_SM_QUERY_PROF_TRIGGER_7, + NVE4_HW_SM_QUERY_SHARED_LD, + NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, + NVE4_HW_SM_QUERY_SHARED_ST, + NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, + NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, + NVE4_HW_SM_QUERY_THREADS_LAUNCHED, + NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, + NVE4_HW_SM_QUERY_WARPS_LAUNCHED, + NVE4_HW_SM_QUERY_METRIC_IPC, + NVE4_HW_SM_QUERY_METRIC_IPAC, + NVE4_HW_SM_QUERY_METRIC_IPEC, + NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY, + NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY, + NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD, + NVE4_HW_SM_QUERY_COUNT +}; + +#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) +#define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) +enum nvc0_hw_sm_queries +{ + NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, + NVC0_HW_SM_QUERY_ACTIVE_WARPS, + NVC0_HW_SM_QUERY_ATOM_COUNT, + NVC0_HW_SM_QUERY_BRANCH, + NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, + NVC0_HW_SM_QUERY_GLD_REQUEST, + NVC0_HW_SM_QUERY_GRED_COUNT, + NVC0_HW_SM_QUERY_GST_REQUEST, + NVC0_HW_SM_QUERY_INST_EXECUTED, + NVC0_HW_SM_QUERY_INST_ISSUED1_0, + NVC0_HW_SM_QUERY_INST_ISSUED1_1, + NVC0_HW_SM_QUERY_INST_ISSUED2_0, + NVC0_HW_SM_QUERY_INST_ISSUED2_1, + NVC0_HW_SM_QUERY_LOCAL_LD, + NVC0_HW_SM_QUERY_LOCAL_ST, + NVC0_HW_SM_QUERY_PROF_TRIGGER_0, + NVC0_HW_SM_QUERY_PROF_TRIGGER_1, + NVC0_HW_SM_QUERY_PROF_TRIGGER_2, + NVC0_HW_SM_QUERY_PROF_TRIGGER_3, + NVC0_HW_SM_QUERY_PROF_TRIGGER_4, + NVC0_HW_SM_QUERY_PROF_TRIGGER_5, + NVC0_HW_SM_QUERY_PROF_TRIGGER_6, + NVC0_HW_SM_QUERY_PROF_TRIGGER_7, + NVC0_HW_SM_QUERY_SHARED_LD, + NVC0_HW_SM_QUERY_SHARED_ST, + NVC0_HW_SM_QUERY_THREADS_LAUNCHED, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, + NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, + NVC0_HW_SM_QUERY_WARPS_LAUNCHED, + NVC0_HW_SM_QUERY_COUNT +}; + +struct 
nvc0_hw_query * +nvc0_hw_sm_create_query(struct nvc0_context *, unsigned); + +#endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h index fa4f8645ffe..8cf7560e21f 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h @@ -94,7 +94,7 @@ struct nvc0_screen { struct { struct nvc0_program *prog; /* compute state object to read MP counters */ - struct pipe_query *mp_counter[8]; /* counter to query allocation */ + struct nvc0_hw_sm_query *mp_counter[8]; /* counter to query allocation */ uint8_t num_hw_sm_active[2]; bool mp_counters_enabled; } pm; From 886d46b0897182e489e03f7302a575b54004faca Mon Sep 17 00:00:00 2001 From: Neil Roberts Date: Thu, 30 Jul 2015 12:10:08 +0100 Subject: [PATCH 015/270] nir: Add a function to determine if a source is dynamically uniform Adds nir_src_is_dynamically_uniform which returns true if the source is known to be dynamically uniform. This will be used in a later patch to add a workaround for cases that only work with dynamically uniform sources. Note that the function is not definitive, it can return false negatives (but not false positives). Currently it only detects constants and uniform accesses. It could easily be extended to include more cases. Reviewed-by: Matt Turner --- src/glsl/nir/nir.c | 29 +++++++++++++++++++++++++++++ src/glsl/nir/nir.h | 1 + 2 files changed, 30 insertions(+) diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c index e12da805281..c8fc4280f72 100644 --- a/src/glsl/nir/nir.c +++ b/src/glsl/nir/nir.c @@ -1080,6 +1080,35 @@ nir_src_as_const_value(nir_src src) return &load->value; } +/** + * Returns true if the source is known to be dynamically uniform. Otherwise it + * returns false which means it may or may not be dynamically uniform but it + * can't be determined. 
+ */ +bool +nir_src_is_dynamically_uniform(nir_src src) +{ + if (!src.is_ssa) + return false; + + /* Constants are trivially dynamically uniform */ + if (src.ssa->parent_instr->type == nir_instr_type_load_const) + return true; + + /* As are uniform variables */ + if (src.ssa->parent_instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src.ssa->parent_instr); + + if (intr->intrinsic == nir_intrinsic_load_uniform) + return true; + } + + /* XXX: this could have many more tests, such as when a sampler function is + * called with dynamically uniform arguments. + */ + return false; +} + bool nir_srcs_equal(nir_src src1, nir_src src2) { diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index bde9f49a90c..95e219168c1 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -1784,6 +1784,7 @@ bool nir_foreach_dest(nir_instr *instr, nir_foreach_dest_cb cb, void *state); bool nir_foreach_src(nir_instr *instr, nir_foreach_src_cb cb, void *state); nir_const_value *nir_src_as_const_value(nir_src src); +bool nir_src_is_dynamically_uniform(nir_src src); bool nir_srcs_equal(nir_src src1, nir_src src2); void nir_instr_rewrite_src(nir_instr *instr, nir_src *src, nir_src new_src); void nir_instr_move_src(nir_instr *dest_instr, nir_src *dest, nir_src *src); From 728d7bc85f1f101875349690bf1637037a5a1817 Mon Sep 17 00:00:00 2001 From: Neil Roberts Date: Mon, 5 Oct 2015 13:50:56 +0200 Subject: [PATCH 016/270] i965: Add a second successor to BRW_OPCODE_WHILE It is possible to directly predicate the WHILE instruction. In this case there will be a second successor block because the execution can resume from the instruction after the loop. This will be used in a subsequent patch. 
Reviewed-by: Matt Turner --- src/mesa/drivers/dri/i965/brw_cfg.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp index 91d53eff5a7..531fa16b387 100644 --- a/src/mesa/drivers/dri/i965/brw_cfg.cpp +++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp @@ -305,6 +305,10 @@ cfg_t::cfg_t(exec_list *instructions) assert(cur_do != NULL && cur_while != NULL); cur->add_successor(mem_ctx, cur_do); + + if (inst->predicate) + cur->add_successor(mem_ctx, cur_while); + set_next_block(&cur, cur_while, ip); /* Pop the stack so we're in the previous loop */ From da361acd1c899d533caec6cae5a336f6ab35e076 Mon Sep 17 00:00:00 2001 From: Neil Roberts Date: Fri, 17 Jul 2015 14:40:03 +0100 Subject: [PATCH 017/270] i965/fs: Handle non-const sample number in interpolateAtSample If a non-const sample number is given to interpolateAtSample it will now generate an indirect send message with the sample ID similar to how non-const sampler array indexing works. Previously non-const values were ignored and instead it ended up using a constant 0 value. The generator will try to determine if the sample ID is dynamically uniform via nir_src_is_dynamically_uniform. If not it will query the pixel interpolator in a loop, once for each different live sample number. The next live sample number is found using emit_uniformize. If multiple live channels have the same sample number then they will be handled in a single iteration of the loop. The loop is necessary because the indirect send message doesn't seem to have a way to specify a different value for each fragment. This fixes the following two Piglit tests: arb_gpu_shader5-interpolateAtSample-nonconst arb_gpu_shader5-interpolateAtSample-dynamically-nonuniform v2: Handle dynamically non-uniform sample ids. v3: Remove the BREAK instruction and predicate the WHILE directly. Make the tokens arrays const. 
(Matt Turner) v4: Iterate over the live channels instead of each possible sample number. v5: Don't special case immediate values in brw_pixel_interpolator_query. Make a better wrapper for the function to set up the PI send instruction. Ensure that the SHL instructions are scalar. (Francisco Jerez). Reviewed-by: Francisco Jerez --- src/mesa/drivers/dri/i965/brw_eu.h | 2 +- src/mesa/drivers/dri/i965/brw_eu_emit.c | 25 ++-- .../drivers/dri/i965/brw_fs_generator.cpp | 5 +- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 139 ++++++++++++++---- 4 files changed, 129 insertions(+), 42 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index 761aa0ec5fa..0ac1ad9378b 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -461,7 +461,7 @@ brw_pixel_interpolator_query(struct brw_codegen *p, struct brw_reg mrf, bool noperspective, unsigned mode, - unsigned data, + struct brw_reg data, unsigned msg_length, unsigned response_length); diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index dc699bb6321..bf2fee9ed48 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -3212,26 +3212,29 @@ brw_pixel_interpolator_query(struct brw_codegen *p, struct brw_reg mrf, bool noperspective, unsigned mode, - unsigned data, + struct brw_reg data, unsigned msg_length, unsigned response_length) { const struct brw_device_info *devinfo = p->devinfo; - struct brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); + struct brw_inst *insn; + const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current); - brw_set_dest(p, insn, dest); - brw_set_src0(p, insn, mrf); - brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR, - msg_length, response_length, - false /* header is never present for PI */, - false); + /* brw_send_indirect_message will automatically use a direct send message + * if data is actually immediate. 
+ */ + insn = brw_send_indirect_message(p, + GEN7_SFID_PIXEL_INTERPOLATOR, + dest, + mrf, + vec1(data)); + brw_inst_set_mlen(devinfo, insn, msg_length); + brw_inst_set_rlen(devinfo, insn, response_length); - brw_inst_set_pi_simd_mode( - devinfo, insn, brw_inst_exec_size(devinfo, insn) == BRW_EXECUTE_16); + brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16); brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */ brw_inst_set_pi_nopersp(devinfo, insn, noperspective); brw_inst_set_pi_message_type(devinfo, insn, mode); - brw_inst_set_pi_message_data(devinfo, insn, data); } void diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 6f8b75e339f..17e19cf807a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -1377,15 +1377,14 @@ fs_generator::generate_pixel_interpolator_query(fs_inst *inst, struct brw_reg msg_data, unsigned msg_type) { - assert(msg_data.file == BRW_IMMEDIATE_VALUE && - msg_data.type == BRW_REGISTER_TYPE_UD); + assert(msg_data.type == BRW_REGISTER_TYPE_UD); brw_pixel_interpolator_query(p, retype(dst, BRW_REGISTER_TYPE_UW), src, inst->pi_noperspective, msg_type, - msg_data.dw1.ud, + msg_data, inst->mlen, inst->regs_written); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 03fe6804701..bc0df6850c4 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -1180,6 +1180,36 @@ get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type) } } +static fs_inst * +emit_pixel_interpolater_send(const fs_builder &bld, + enum opcode opcode, + const fs_reg &dst, + const fs_reg &src, + const fs_reg &desc, + glsl_interp_qualifier interpolation) +{ + fs_inst *inst; + fs_reg payload; + int mlen; + + if (src.file == BAD_FILE) { + /* Dummy payload */ + payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1); + mlen = 
1; + } else { + payload = src; + mlen = 2 * bld.dispatch_width() / 8; + } + + inst = bld.emit(opcode, dst, payload, desc); + inst->mlen = mlen; + /* 2 floats per slot returned */ + inst->regs_written = 2 * bld.dispatch_width() / 8; + inst->pi_noperspective = interpolation == INTERP_QUALIFIER_NOPERSPECTIVE; + + return inst; +} + void fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { @@ -1583,28 +1613,81 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr ((struct brw_wm_prog_data *) prog_data)->pulls_bary = true; fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2); - - /* For most messages, we need one reg of ignored data; the hardware - * requires mlen==1 even when there is no payload. in the per-slot - * offset case, we'll replace this with the proper source data. - */ - fs_reg src = vgrf(glsl_type::float_type); - int mlen = 1; /* one reg unless overriden */ - fs_inst *inst; + const glsl_interp_qualifier interpolation = + (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation; switch (instr->intrinsic) { case nir_intrinsic_interp_var_at_centroid: - inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, - dst_xy, src, fs_reg(0u)); + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_CENTROID, + dst_xy, + fs_reg(), /* src */ + fs_reg(0u), + interpolation); break; case nir_intrinsic_interp_var_at_sample: { - /* XXX: We should probably handle non-constant sample id's */ nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); - assert(const_sample); - unsigned msg_data = const_sample ? 
const_sample->i[0] << 4 : 0; - inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, - fs_reg(msg_data)); + + if (const_sample) { + unsigned msg_data = const_sample->i[0] << 4; + + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dst_xy, + fs_reg(), /* src */ + fs_reg(msg_data), + interpolation); + } else { + const fs_reg sample_src = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_UD); + + if (nir_src_is_dynamically_uniform(instr->src[0])) { + const fs_reg sample_id = bld.emit_uniformize(sample_src); + const fs_reg msg_data = vgrf(glsl_type::uint_type); + bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u)); + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dst_xy, + fs_reg(), /* src */ + msg_data, + interpolation); + } else { + /* Make a loop that sends a message to the pixel interpolater + * for the sample number in each live channel. If there are + * multiple channels with the same sample number then these + * will be handled simultaneously with a single iteration of + * the loop. 
+ */ + bld.emit(BRW_OPCODE_DO); + + /* Get the next live sample number into sample_id_reg */ + const fs_reg sample_id = bld.emit_uniformize(sample_src); + + /* Set the flag register so that we can perform the send + * message on all channels that have the same sample number + */ + bld.CMP(bld.null_reg_ud(), + sample_src, sample_id, + BRW_CONDITIONAL_EQ); + const fs_reg msg_data = vgrf(glsl_type::uint_type); + bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u)); + fs_inst *inst = + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dst_xy, + fs_reg(), /* src */ + msg_data, + interpolation); + set_predicate(BRW_PREDICATE_NORMAL, inst); + + /* Continue the loop if there are any live channels left */ + set_predicate_inv(BRW_PREDICATE_NORMAL, + true, /* inverse */ + bld.emit(BRW_OPCODE_WHILE)); + } + } + break; } @@ -1615,10 +1698,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf; unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf; - inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src, - fs_reg(off_x | (off_y << 4))); + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, + dst_xy, + fs_reg(), /* src */ + fs_reg(off_x | (off_y << 4)), + interpolation); } else { - src = vgrf(glsl_type::ivec2_type); + fs_reg src = vgrf(glsl_type::ivec2_type); fs_reg offset_src = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); for (int i = 0; i < 2; i++) { @@ -1646,9 +1733,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr bld.SEL(offset(src, bld, i), itemp, fs_reg(7))); } - mlen = 2 * dispatch_width / 8; - inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src, - fs_reg(0u)); + const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; + emit_pixel_interpolater_send(bld, + opcode, + dst_xy, + src, + fs_reg(0u), + interpolation); } break; } 
@@ -1657,12 +1748,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr unreachable("Invalid intrinsic"); } - inst->mlen = mlen; - /* 2 floats per slot returned */ - inst->regs_written = 2 * dispatch_width / 8; - inst->pi_noperspective = instr->variables[0]->var->data.interpolation == - INTERP_QUALIFIER_NOPERSPECTIVE; - for (unsigned j = 0; j < instr->num_components; j++) { fs_reg src = interp_reg(instr->variables[0]->var->data.location, j); src.type = dest.type; From 20d6d812dc9d35cb082142ac6c9744971692797e Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Thu, 24 Sep 2015 00:54:52 -0400 Subject: [PATCH 018/270] nir: split out instruction comparison functions Right now nir_instrs_equal() is tied pretty tightly to CSE, but we're going to introduce the idea of an instruction set and tie it to that instead. In anticipation of that, move this into its own file where we'll add the rest of the instruction set implementation later. v2: Rebase on texture support. 
Reviewed-by: Jason Ekstrand Signed-off-by: Connor Abbott --- src/glsl/Makefile.sources | 1 + src/glsl/nir/nir.c | 27 ----- src/glsl/nir/nir_instr_set.c | 206 +++++++++++++++++++++++++++++++++++ src/glsl/nir/nir_instr_set.h | 29 +++++ src/glsl/nir/nir_opt_cse.c | 155 +------------------------- 5 files changed, 237 insertions(+), 181 deletions(-) create mode 100644 src/glsl/nir/nir_instr_set.c create mode 100644 src/glsl/nir/nir_instr_set.h diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources index 70832460af9..bb8bddc69af 100644 --- a/src/glsl/Makefile.sources +++ b/src/glsl/Makefile.sources @@ -33,6 +33,7 @@ NIR_FILES = \ nir/nir_gs_count_vertices.c \ nir/nir_intrinsics.c \ nir/nir_intrinsics.h \ + nir/nir_instr_set.c \ nir/nir_live_variables.c \ nir/nir_lower_alu_to_scalar.c \ nir/nir_lower_atomics.c \ diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c index c8fc4280f72..83670889a29 100644 --- a/src/glsl/nir/nir.c +++ b/src/glsl/nir/nir.c @@ -1109,33 +1109,6 @@ nir_src_is_dynamically_uniform(nir_src src) return false; } -bool -nir_srcs_equal(nir_src src1, nir_src src2) -{ - if (src1.is_ssa) { - if (src2.is_ssa) { - return src1.ssa == src2.ssa; - } else { - return false; - } - } else { - if (src2.is_ssa) { - return false; - } else { - if ((src1.reg.indirect == NULL) != (src2.reg.indirect == NULL)) - return false; - - if (src1.reg.indirect) { - if (!nir_srcs_equal(*src1.reg.indirect, *src2.reg.indirect)) - return false; - } - - return src1.reg.reg == src2.reg.reg && - src1.reg.base_offset == src2.reg.base_offset; - } - } -} - static void src_remove_all_uses(nir_src *src) { diff --git a/src/glsl/nir/nir_instr_set.c b/src/glsl/nir/nir_instr_set.c new file mode 100644 index 00000000000..b397442fbe5 --- /dev/null +++ b/src/glsl/nir/nir_instr_set.c @@ -0,0 +1,206 @@ +/* + * Copyright © 2014 Connor Abbott + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the 
"Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir_instr_set.h" + +bool +nir_srcs_equal(nir_src src1, nir_src src2) +{ + if (src1.is_ssa) { + if (src2.is_ssa) { + return src1.ssa == src2.ssa; + } else { + return false; + } + } else { + if (src2.is_ssa) { + return false; + } else { + if ((src1.reg.indirect == NULL) != (src2.reg.indirect == NULL)) + return false; + + if (src1.reg.indirect) { + if (!nir_srcs_equal(*src1.reg.indirect, *src2.reg.indirect)) + return false; + } + + return src1.reg.reg == src2.reg.reg && + src1.reg.base_offset == src2.reg.base_offset; + } + } +} + +static bool +nir_alu_srcs_equal(nir_alu_instr *alu1, nir_alu_instr *alu2, unsigned src1, + unsigned src2) +{ + if (alu1->src[src1].abs != alu2->src[src2].abs || + alu1->src[src1].negate != alu2->src[src2].negate) + return false; + + for (unsigned i = 0; i < nir_ssa_alu_instr_src_components(alu1, src1); i++) { + if (alu1->src[src1].swizzle[i] != alu2->src[src2].swizzle[i]) + return false; + } + + return nir_srcs_equal(alu1->src[src1].src, alu2->src[src2].src); 
+} + +bool +nir_instrs_equal(nir_instr *instr1, nir_instr *instr2) +{ + if (instr1->type != instr2->type) + return false; + + switch (instr1->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu1 = nir_instr_as_alu(instr1); + nir_alu_instr *alu2 = nir_instr_as_alu(instr2); + + if (alu1->op != alu2->op) + return false; + + /* TODO: We can probably acutally do something more inteligent such + * as allowing different numbers and taking a maximum or something + * here */ + if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components) + return false; + + if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { + assert(nir_op_infos[alu1->op].num_inputs == 2); + return (nir_alu_srcs_equal(alu1, alu2, 0, 0) && + nir_alu_srcs_equal(alu1, alu2, 1, 1)) || + (nir_alu_srcs_equal(alu1, alu2, 0, 1) && + nir_alu_srcs_equal(alu1, alu2, 1, 0)); + } else { + for (unsigned i = 0; i < nir_op_infos[alu1->op].num_inputs; i++) { + if (!nir_alu_srcs_equal(alu1, alu2, i, i)) + return false; + } + } + return true; + } + case nir_instr_type_tex: { + nir_tex_instr *tex1 = nir_instr_as_tex(instr1); + nir_tex_instr *tex2 = nir_instr_as_tex(instr2); + + if (tex1->op != tex2->op) + return false; + + if (tex1->num_srcs != tex2->num_srcs) + return false; + for (unsigned i = 0; i < tex1->num_srcs; i++) { + if (tex1->src[i].src_type != tex2->src[i].src_type || + !nir_srcs_equal(tex1->src[i].src, tex2->src[i].src)) { + return false; + } + } + + if (tex1->coord_components != tex2->coord_components || + tex1->sampler_dim != tex2->sampler_dim || + tex1->is_array != tex2->is_array || + tex1->is_shadow != tex2->is_shadow || + tex1->is_new_style_shadow != tex2->is_new_style_shadow || + memcmp(tex1->const_offset, tex2->const_offset, + sizeof(tex1->const_offset)) != 0 || + tex1->component != tex2->component || + tex1->sampler_index != tex2->sampler_index || + tex1->sampler_array_size != tex2->sampler_array_size) { + return false; + } + + /* Don't support un-lowered 
sampler derefs currently. */ + if (tex1->sampler || tex2->sampler) + return false; + + return true; + } + case nir_instr_type_load_const: { + nir_load_const_instr *load1 = nir_instr_as_load_const(instr1); + nir_load_const_instr *load2 = nir_instr_as_load_const(instr2); + + if (load1->def.num_components != load2->def.num_components) + return false; + + return memcmp(load1->value.f, load2->value.f, + load1->def.num_components * sizeof(*load2->value.f)) == 0; + } + case nir_instr_type_phi: { + nir_phi_instr *phi1 = nir_instr_as_phi(instr1); + nir_phi_instr *phi2 = nir_instr_as_phi(instr2); + + if (phi1->instr.block != phi2->instr.block) + return false; + + nir_foreach_phi_src(phi1, src1) { + nir_foreach_phi_src(phi2, src2) { + if (src1->pred == src2->pred) { + if (!nir_srcs_equal(src1->src, src2->src)) + return false; + + break; + } + } + } + + return true; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrinsic1 = nir_instr_as_intrinsic(instr1); + nir_intrinsic_instr *intrinsic2 = nir_instr_as_intrinsic(instr2); + const nir_intrinsic_info *info = + &nir_intrinsic_infos[intrinsic1->intrinsic]; + + if (intrinsic1->intrinsic != intrinsic2->intrinsic || + intrinsic1->num_components != intrinsic2->num_components) + return false; + + if (info->has_dest && intrinsic1->dest.ssa.num_components != + intrinsic2->dest.ssa.num_components) + return false; + + for (unsigned i = 0; i < info->num_srcs; i++) { + if (!nir_srcs_equal(intrinsic1->src[i], intrinsic2->src[i])) + return false; + } + + assert(info->num_variables == 0); + + for (unsigned i = 0; i < info->num_indices; i++) { + if (intrinsic1->const_index[i] != intrinsic2->const_index[i]) + return false; + } + + return true; + } + case nir_instr_type_call: + case nir_instr_type_jump: + case nir_instr_type_ssa_undef: + case nir_instr_type_parallel_copy: + default: + unreachable("Invalid instruction type"); + } + + return false; +} + diff --git a/src/glsl/nir/nir_instr_set.h b/src/glsl/nir/nir_instr_set.h new file 
mode 100644 index 00000000000..531bdade319 --- /dev/null +++ b/src/glsl/nir/nir_instr_set.h @@ -0,0 +1,29 @@ +/* + * Copyright © 2014 Connor Abbott + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#include "nir.h" + +bool nir_instrs_equal(nir_instr *instr1, nir_instr *instr2); + diff --git a/src/glsl/nir/nir_opt_cse.c b/src/glsl/nir/nir_opt_cse.c index 64c94afd480..72438dda43f 100644 --- a/src/glsl/nir/nir_opt_cse.c +++ b/src/glsl/nir/nir_opt_cse.c @@ -25,7 +25,7 @@ * */ -#include "nir.h" +#include "nir_instr_set.h" /* * Implements common subexpression elimination @@ -36,159 +36,6 @@ struct cse_state { bool progress; }; -static bool -nir_alu_srcs_equal(nir_alu_instr *alu1, nir_alu_instr *alu2, unsigned src1, - unsigned src2) -{ - if (alu1->src[src1].abs != alu2->src[src2].abs || - alu1->src[src1].negate != alu2->src[src2].negate) - return false; - - for (unsigned i = 0; i < nir_ssa_alu_instr_src_components(alu1, src1); i++) { - if (alu1->src[src1].swizzle[i] != alu2->src[src2].swizzle[i]) - return false; - } - - return nir_srcs_equal(alu1->src[src1].src, alu2->src[src2].src); -} - -static bool -nir_instrs_equal(nir_instr *instr1, nir_instr *instr2) -{ - if (instr1->type != instr2->type) - return false; - - switch (instr1->type) { - case nir_instr_type_alu: { - nir_alu_instr *alu1 = nir_instr_as_alu(instr1); - nir_alu_instr *alu2 = nir_instr_as_alu(instr2); - - if (alu1->op != alu2->op) - return false; - - /* TODO: We can probably acutally do something more inteligent such - * as allowing different numbers and taking a maximum or something - * here */ - if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components) - return false; - - if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { - assert(nir_op_infos[alu1->op].num_inputs == 2); - return (nir_alu_srcs_equal(alu1, alu2, 0, 0) && - nir_alu_srcs_equal(alu1, alu2, 1, 1)) || - (nir_alu_srcs_equal(alu1, alu2, 0, 1) && - nir_alu_srcs_equal(alu1, alu2, 1, 0)); - } else { - for (unsigned i = 0; i < nir_op_infos[alu1->op].num_inputs; i++) { - if (!nir_alu_srcs_equal(alu1, alu2, i, i)) - return false; - } - } - return true; - } - case 
nir_instr_type_tex: { - nir_tex_instr *tex1 = nir_instr_as_tex(instr1); - nir_tex_instr *tex2 = nir_instr_as_tex(instr2); - - if (tex1->op != tex2->op) - return false; - - if (tex1->num_srcs != tex2->num_srcs) - return false; - for (unsigned i = 0; i < tex1->num_srcs; i++) { - if (tex1->src[i].src_type != tex2->src[i].src_type || - !nir_srcs_equal(tex1->src[i].src, tex2->src[i].src)) { - return false; - } - } - - if (tex1->coord_components != tex2->coord_components || - tex1->sampler_dim != tex2->sampler_dim || - tex1->is_array != tex2->is_array || - tex1->is_shadow != tex2->is_shadow || - tex1->is_new_style_shadow != tex2->is_new_style_shadow || - memcmp(tex1->const_offset, tex2->const_offset, - sizeof(tex1->const_offset)) != 0 || - tex1->component != tex2->component || - tex1->sampler_index != tex2->sampler_index || - tex1->sampler_array_size != tex2->sampler_array_size) { - return false; - } - - /* Don't support un-lowered sampler derefs currently. */ - if (tex1->sampler || tex2->sampler) - return false; - - return true; - } - case nir_instr_type_load_const: { - nir_load_const_instr *load1 = nir_instr_as_load_const(instr1); - nir_load_const_instr *load2 = nir_instr_as_load_const(instr2); - - if (load1->def.num_components != load2->def.num_components) - return false; - - return memcmp(load1->value.f, load2->value.f, - load1->def.num_components * sizeof(*load2->value.f)) == 0; - } - case nir_instr_type_phi: { - nir_phi_instr *phi1 = nir_instr_as_phi(instr1); - nir_phi_instr *phi2 = nir_instr_as_phi(instr2); - - if (phi1->instr.block != phi2->instr.block) - return false; - - nir_foreach_phi_src(phi1, src1) { - nir_foreach_phi_src(phi2, src2) { - if (src1->pred == src2->pred) { - if (!nir_srcs_equal(src1->src, src2->src)) - return false; - - break; - } - } - } - - return true; - } - case nir_instr_type_intrinsic: { - nir_intrinsic_instr *intrinsic1 = nir_instr_as_intrinsic(instr1); - nir_intrinsic_instr *intrinsic2 = nir_instr_as_intrinsic(instr2); - const 
nir_intrinsic_info *info = - &nir_intrinsic_infos[intrinsic1->intrinsic]; - - if (intrinsic1->intrinsic != intrinsic2->intrinsic || - intrinsic1->num_components != intrinsic2->num_components) - return false; - - if (info->has_dest && intrinsic1->dest.ssa.num_components != - intrinsic2->dest.ssa.num_components) - return false; - - for (unsigned i = 0; i < info->num_srcs; i++) { - if (!nir_srcs_equal(intrinsic1->src[i], intrinsic2->src[i])) - return false; - } - - assert(info->num_variables == 0); - - for (unsigned i = 0; i < info->num_indices; i++) { - if (intrinsic1->const_index[i] != intrinsic2->const_index[i]) - return false; - } - - return true; - } - case nir_instr_type_call: - case nir_instr_type_jump: - case nir_instr_type_ssa_undef: - case nir_instr_type_parallel_copy: - default: - unreachable("Invalid instruction type"); - } - - return false; -} static bool src_is_ssa(nir_src *src, void *data) From d6bc35934f3d1150a1da0edfb0746cd0ae8bd63b Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 17 Mar 2015 01:03:28 -0400 Subject: [PATCH 019/270] nir: constify nir_ssa_alu_instr_src_components() Reviewed-by: Jason Ekstrand Signed-off-by: Connor Abbott --- src/glsl/nir/nir.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index 95e219168c1..befc7fce724 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -725,7 +725,7 @@ nir_alu_instr_channel_used(nir_alu_instr *instr, unsigned src, unsigned channel) * used for a source */ static inline unsigned -nir_ssa_alu_instr_src_components(nir_alu_instr *instr, unsigned src) +nir_ssa_alu_instr_src_components(const nir_alu_instr *instr, unsigned src) { assert(instr->dest.dest.is_ssa); From 005c2efb7b755ac5887dc5938baa7d95a50fe853 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Thu, 24 Sep 2015 01:05:15 -0400 Subject: [PATCH 020/270] nir: constify instruction comparison functions v2: rebase, don't constify nir_srcs_equal() as it's pass-by-value anyways 
Reviewed-by: Jason Ekstrand Signed-off-by: Connor Abbott --- src/glsl/nir/nir_instr_set.c | 6 +++--- src/glsl/nir/nir_instr_set.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/glsl/nir/nir_instr_set.c b/src/glsl/nir/nir_instr_set.c index b397442fbe5..72ab04895f6 100644 --- a/src/glsl/nir/nir_instr_set.c +++ b/src/glsl/nir/nir_instr_set.c @@ -51,8 +51,8 @@ nir_srcs_equal(nir_src src1, nir_src src2) } static bool -nir_alu_srcs_equal(nir_alu_instr *alu1, nir_alu_instr *alu2, unsigned src1, - unsigned src2) +nir_alu_srcs_equal(const nir_alu_instr *alu1, const nir_alu_instr *alu2, + unsigned src1, unsigned src2) { if (alu1->src[src1].abs != alu2->src[src2].abs || alu1->src[src1].negate != alu2->src[src2].negate) @@ -67,7 +67,7 @@ nir_alu_srcs_equal(nir_alu_instr *alu1, nir_alu_instr *alu2, unsigned src1, } bool -nir_instrs_equal(nir_instr *instr1, nir_instr *instr2) +nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) { if (instr1->type != instr2->type) return false; diff --git a/src/glsl/nir/nir_instr_set.h b/src/glsl/nir/nir_instr_set.h index 531bdade319..f5baffacb0e 100644 --- a/src/glsl/nir/nir_instr_set.h +++ b/src/glsl/nir/nir_instr_set.h @@ -25,5 +25,5 @@ #include "nir.h" -bool nir_instrs_equal(nir_instr *instr1, nir_instr *instr2); +bool nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2); From 523a28d3fe0dd371ae01b7353f263a6541480d89 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Thu, 24 Sep 2015 01:57:04 -0400 Subject: [PATCH 021/270] nir: add an instruction set API This will replace direct usage of nir_instrs_equal() in the CSE pass, which replaces an O(n^2) algorithm with an effectively O(n) one. It'll also be useful for implementing GVN on top of GCM. v2: - Add texture support. - Add more comments. - Rename instr_can_hash() to instr_can_rewrite() since it's really more about whether its uses can be rewritten, and it's implicitly used by nir_instrs_equal() as well. 
- Rename nir_instr_set_add() to nir_instr_set_add_or_rewrite() (Jason). - Make the HASH() macro less magical (Topi). - Rewrite the commit message. v3: - For sorting phi sources, use a VLA, store pointers to the sources, and compare the predecessor pointer directly (Jason). Reviewed-by: Jason Ekstrand Signed-off-by: Connor Abbott --- src/glsl/nir/nir_instr_set.c | 314 +++++++++++++++++++++++++++++++++++ src/glsl/nir/nir_instr_set.h | 35 ++++ 2 files changed, 349 insertions(+) diff --git a/src/glsl/nir/nir_instr_set.c b/src/glsl/nir/nir_instr_set.c index 72ab04895f6..7460fccba10 100644 --- a/src/glsl/nir/nir_instr_set.c +++ b/src/glsl/nir/nir_instr_set.c @@ -22,6 +22,181 @@ */ #include "nir_instr_set.h" +#include "nir_vla.h" + +#define HASH(hash, data) _mesa_fnv32_1a_accumulate((hash), (data)) + +static uint32_t +hash_src(uint32_t hash, const nir_src *src) +{ + assert(src->is_ssa); + hash = HASH(hash, src->ssa); + return hash; +} + +static uint32_t +hash_alu_src(uint32_t hash, const nir_alu_src *src, unsigned num_components) +{ + hash = HASH(hash, src->abs); + hash = HASH(hash, src->negate); + + for (unsigned i = 0; i < num_components; i++) + hash = HASH(hash, src->swizzle[i]); + + hash = hash_src(hash, &src->src); + return hash; +} + +static uint32_t +hash_alu(uint32_t hash, const nir_alu_instr *instr) +{ + hash = HASH(hash, instr->op); + hash = HASH(hash, instr->dest.dest.ssa.num_components); + + if (nir_op_infos[instr->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { + assert(nir_op_infos[instr->op].num_inputs == 2); + uint32_t hash0 = hash_alu_src(hash, &instr->src[0], + nir_ssa_alu_instr_src_components(instr, 0)); + uint32_t hash1 = hash_alu_src(hash, &instr->src[1], + nir_ssa_alu_instr_src_components(instr, 1)); + /* For commutative operations, we need some commutative way of + * combining the hashes. 
One option would be to XOR them but that + * means that anything with two identical sources will hash to 0 and + * that's common enough we probably don't want the guaranteed + * collision. Either addition or multiplication will also work. + */ + hash = hash0 * hash1; + } else { + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + hash = hash_alu_src(hash, &instr->src[i], + nir_ssa_alu_instr_src_components(instr, i)); + } + } + + return hash; +} + +static uint32_t +hash_load_const(uint32_t hash, const nir_load_const_instr *instr) +{ + hash = HASH(hash, instr->def.num_components); + + hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f, + instr->def.num_components + * sizeof(instr->value.f[0])); + + return hash; +} + +static int +cmp_phi_src(const void *data1, const void *data2) +{ + nir_phi_src *src1 = *(nir_phi_src **)data1; + nir_phi_src *src2 = *(nir_phi_src **)data2; + return src1->pred - src2->pred; +} + +static uint32_t +hash_phi(uint32_t hash, const nir_phi_instr *instr) +{ + hash = HASH(hash, instr->instr.block); + + /* sort sources by predecessor, since the order shouldn't matter */ + unsigned num_preds = instr->instr.block->predecessors->entries; + NIR_VLA(nir_phi_src *, srcs, num_preds); + unsigned i = 0; + nir_foreach_phi_src(instr, src) { + srcs[i++] = src; + } + + qsort(srcs, num_preds, sizeof(nir_phi_src *), cmp_phi_src); + + for (i = 0; i < num_preds; i++) { + hash = hash_src(hash, &srcs[i]->src); + hash = HASH(hash, srcs[i]->pred); + } + + return hash; +} + +static uint32_t +hash_intrinsic(uint32_t hash, const nir_intrinsic_instr *instr) +{ + const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; + hash = HASH(hash, instr->intrinsic); + + if (info->has_dest) + hash = HASH(hash, instr->dest.ssa.num_components); + + assert(info->num_variables == 0); + + hash = _mesa_fnv32_1a_accumulate_block(hash, instr->const_index, + info->num_indices + * sizeof(instr->const_index[0])); + return hash; +} + +static 
uint32_t +hash_tex(uint32_t hash, const nir_tex_instr *instr) +{ + hash = HASH(hash, instr->op); + hash = HASH(hash, instr->num_srcs); + + for (unsigned i = 0; i < instr->num_srcs; i++) { + hash = HASH(hash, instr->src[i].src_type); + hash = hash_src(hash, &instr->src[i].src); + } + + hash = HASH(hash, instr->coord_components); + hash = HASH(hash, instr->sampler_dim); + hash = HASH(hash, instr->is_array); + hash = HASH(hash, instr->is_shadow); + hash = HASH(hash, instr->is_new_style_shadow); + hash = HASH(hash, instr->const_offset); + unsigned component = instr->component; + hash = HASH(hash, component); + hash = HASH(hash, instr->sampler_index); + hash = HASH(hash, instr->sampler_array_size); + + assert(!instr->sampler); + + return hash; +} + +/* Computes a hash of an instruction for use in a hash table. Note that this + * will only work for instructions where instr_can_rewrite() returns true, and + * it should return identical hashes for two instructions that are the same + * according to nir_instrs_equal(). + */ + +static uint32_t +hash_instr(const void *data) +{ + const nir_instr *instr = data; + uint32_t hash = _mesa_fnv32_1a_offset_bias; + + switch (instr->type) { + case nir_instr_type_alu: + hash = hash_alu(hash, nir_instr_as_alu(instr)); + break; + case nir_instr_type_load_const: + hash = hash_load_const(hash, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_phi: + hash = hash_phi(hash, nir_instr_as_phi(instr)); + break; + case nir_instr_type_intrinsic: + hash = hash_intrinsic(hash, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_tex: + hash = hash_tex(hash, nir_instr_as_tex(instr)); + break; + default: + unreachable("Invalid instruction type"); + } + + return hash; +} bool nir_srcs_equal(nir_src src1, nir_src src2) @@ -66,6 +241,12 @@ nir_alu_srcs_equal(const nir_alu_instr *alu1, const nir_alu_instr *alu2, return nir_srcs_equal(alu1->src[src1].src, alu2->src[src2].src); } +/* Returns "true" if two instructions are equal. 
Note that this will only + * work for the subset of instructions defined by instr_can_rewrite(). Also, + * it should only return "true" for instructions that hash_instr() will return + * the same hash for (ignoring collisions, of course). + */ + bool nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) { @@ -204,3 +385,136 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) return false; } +static bool +src_is_ssa(nir_src *src, void *data) +{ + (void) data; + return src->is_ssa; +} + +static bool +dest_is_ssa(nir_dest *dest, void *data) +{ + (void) data; + return dest->is_ssa; +} + +/* This function determines if uses of an instruction can safely be rewritten + * to use another identical instruction instead. Note that this function must + * be kept in sync with hash_instr() and nir_instrs_equal() -- only + * instructions that pass this test will be handed on to those functions, and + * conversely they must handle everything that this function returns true for. + */ + +static bool +instr_can_rewrite(nir_instr *instr) +{ + /* We only handle SSA. */ + if (!nir_foreach_dest(instr, dest_is_ssa, NULL) || + !nir_foreach_src(instr, src_is_ssa, NULL)) + return false; + + switch (instr->type) { + case nir_instr_type_alu: + case nir_instr_type_load_const: + case nir_instr_type_phi: + return true; + case nir_instr_type_tex: { + nir_tex_instr *tex = nir_instr_as_tex(instr); + + /* Don't support un-lowered sampler derefs currently. 
*/ + if (tex->sampler) + return false; + + return true; + } + case nir_instr_type_intrinsic: { + const nir_intrinsic_info *info = + &nir_intrinsic_infos[nir_instr_as_intrinsic(instr)->intrinsic]; + return (info->flags & NIR_INTRINSIC_CAN_ELIMINATE) && + (info->flags & NIR_INTRINSIC_CAN_REORDER) && + info->num_variables == 0; /* not implemented yet */ + } + case nir_instr_type_call: + case nir_instr_type_jump: + case nir_instr_type_ssa_undef: + return false; + case nir_instr_type_parallel_copy: + default: + unreachable("Invalid instruction type"); + } + + return false; +} + +static nir_ssa_def * +nir_instr_get_dest_ssa_def(nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: + assert(nir_instr_as_alu(instr)->dest.dest.is_ssa); + return &nir_instr_as_alu(instr)->dest.dest.ssa; + case nir_instr_type_load_const: + return &nir_instr_as_load_const(instr)->def; + case nir_instr_type_phi: + assert(nir_instr_as_phi(instr)->dest.is_ssa); + return &nir_instr_as_phi(instr)->dest.ssa; + case nir_instr_type_intrinsic: + assert(nir_instr_as_intrinsic(instr)->dest.is_ssa); + return &nir_instr_as_intrinsic(instr)->dest.ssa; + case nir_instr_type_tex: + assert(nir_instr_as_tex(instr)->dest.is_ssa); + return &nir_instr_as_tex(instr)->dest.ssa; + default: + unreachable("We never ask for any of these"); + } +} + +static bool +cmp_func(const void *data1, const void *data2) +{ + return nir_instrs_equal(data1, data2); +} + +struct set * +nir_instr_set_create(void *mem_ctx) +{ + return _mesa_set_create(mem_ctx, hash_instr, cmp_func); +} + +void +nir_instr_set_destroy(struct set *instr_set) +{ + _mesa_set_destroy(instr_set, NULL); +} + +bool +nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr) +{ + if (!instr_can_rewrite(instr)) + return false; + + struct set_entry *entry = _mesa_set_search(instr_set, instr); + if (entry) { + nir_ssa_def *def = nir_instr_get_dest_ssa_def(instr); + nir_ssa_def *new_def = + nir_instr_get_dest_ssa_def((nir_instr *) 
entry->key); + nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(new_def)); + return true; + } + + _mesa_set_add(instr_set, instr); + return false; +} + +void +nir_instr_set_remove(struct set *instr_set, nir_instr *instr) +{ + if (!instr_can_rewrite(instr)) + return; + + struct set_entry *entry = _mesa_set_search(instr_set, instr); + if (entry) + _mesa_set_remove(instr_set, entry); +} + diff --git a/src/glsl/nir/nir_instr_set.h b/src/glsl/nir/nir_instr_set.h index f5baffacb0e..a7f6c9dd1eb 100644 --- a/src/glsl/nir/nir_instr_set.h +++ b/src/glsl/nir/nir_instr_set.h @@ -27,3 +27,38 @@ bool nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2); +/** + * This file defines functions for creating, destroying, and manipulating an + * "instruction set," which is an abstraction for finding duplicate + * instructions using a hash set. Note that the question of whether an + * instruction is actually a duplicate (e.g. whether it has any side effects) + * is handled transparently. The user can pass any instruction to + * nir_instr_set_add_or_rewrite() and nir_instr_set_remove(), and if the + * instruction isn't safe to rewrite or isn't supported, it's silently + * ignored. + */ + +/*@{*/ + +/** Creates an instruction set, using a given ralloc mem_ctx */ +struct set *nir_instr_set_create(void *mem_ctx); + +/** Destroys an instruction set. */ +void nir_instr_set_destroy(struct set *instr_set); + +/** + * Adds an instruction to an instruction set if it doesn't exist, or if it + * does already exist, rewrites all uses of it to point to the other + * already-inserted instruction. Returns 'true' if the uses of the instruction + * were rewritten. + */ +bool nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr); + +/** + * Removes an instruction from an instruction set, so that other instructions + * won't be merged with it.
+ */ +void nir_instr_set_remove(struct set *instr_set, nir_instr *instr); + +/*@}*/ + From e8308d0523f7dc78b34099cfe2c3d3daedb27d4c Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 22 May 2015 00:41:45 -0400 Subject: [PATCH 022/270] nir/cse: use the instruction set API This replaces an O(n^2) algorithm with an O(n) one, while allowing us to import most of the infrastructure required for GVN. The idea is to walk the dominance tree depth-first, similar to when converting to SSA, and remove the instructions from the set when we're done visiting the sub-tree of the dominance tree so that the only instructions in the set are the instructions that dominate the current block. No piglit regressions. No shader-db changes. Compilation time for full shader-db: Difference at 95.0% confidence -35.826 +/- 2.16018 -6.2852% +/- 0.378975% (Student's t, pooled s = 3.37504) v2: - rebase on start_block removal - remove useless state struct - change commit message Reviewed-by: Jason Ekstrand Signed-off-by: Connor Abbott --- src/glsl/nir/nir_opt_cse.c | 138 +++++++------------------------------ 1 file changed, 23 insertions(+), 115 deletions(-) diff --git a/src/glsl/nir/nir_opt_cse.c b/src/glsl/nir/nir_opt_cse.c index 72438dda43f..93a6635337a 100644 --- a/src/glsl/nir/nir_opt_cse.c +++ b/src/glsl/nir/nir_opt_cse.c @@ -22,6 +22,7 @@ * * Authors: * Jason Ekstrand (jason@jlekstrand.net) + * Connor Abbott (cwabbott0@gmail.com) * */ @@ -31,144 +32,50 @@ * Implements common subexpression elimination */ -struct cse_state { - void *mem_ctx; - bool progress; -}; - +/* + * Visits and CSE's the given block and all its descendants in the dominance + * tree recursively. Note that the instr_set is guaranteed to only ever + * contain instructions that dominate the current block.
+ */ static bool -src_is_ssa(nir_src *src, void *data) +cse_block(nir_block *block, struct set *instr_set) { - (void) data; - return src->is_ssa; -} + bool progress = false; -static bool -dest_is_ssa(nir_dest *dest, void *data) -{ - (void) data; - return dest->is_ssa; -} - -static bool -nir_instr_can_cse(nir_instr *instr) -{ - /* We only handle SSA. */ - if (!nir_foreach_dest(instr, dest_is_ssa, NULL) || - !nir_foreach_src(instr, src_is_ssa, NULL)) - return false; - - switch (instr->type) { - case nir_instr_type_alu: - case nir_instr_type_tex: - case nir_instr_type_load_const: - case nir_instr_type_phi: - return true; - case nir_instr_type_intrinsic: { - const nir_intrinsic_info *info = - &nir_intrinsic_infos[nir_instr_as_intrinsic(instr)->intrinsic]; - return (info->flags & NIR_INTRINSIC_CAN_ELIMINATE) && - (info->flags & NIR_INTRINSIC_CAN_REORDER) && - info->num_variables == 0; /* not implemented yet */ - } - case nir_instr_type_call: - case nir_instr_type_jump: - case nir_instr_type_ssa_undef: - return false; - case nir_instr_type_parallel_copy: - default: - unreachable("Invalid instruction type"); - } - - return false; -} - -static nir_ssa_def * -nir_instr_get_dest_ssa_def(nir_instr *instr) -{ - switch (instr->type) { - case nir_instr_type_alu: - assert(nir_instr_as_alu(instr)->dest.dest.is_ssa); - return &nir_instr_as_alu(instr)->dest.dest.ssa; - case nir_instr_type_tex: - assert(nir_instr_as_tex(instr)->dest.is_ssa); - return &nir_instr_as_tex(instr)->dest.ssa; - case nir_instr_type_load_const: - return &nir_instr_as_load_const(instr)->def; - case nir_instr_type_phi: - assert(nir_instr_as_phi(instr)->dest.is_ssa); - return &nir_instr_as_phi(instr)->dest.ssa; - case nir_instr_type_intrinsic: - assert(nir_instr_as_intrinsic(instr)->dest.is_ssa); - return &nir_instr_as_intrinsic(instr)->dest.ssa; - default: - unreachable("We never ask for any of these"); - } -} - -static void -nir_opt_cse_instr(nir_instr *instr, struct cse_state *state) -{ - if 
(!nir_instr_can_cse(instr)) - return; - - for (struct exec_node *node = instr->node.prev; - !exec_node_is_head_sentinel(node); node = node->prev) { - nir_instr *other = exec_node_data(nir_instr, node, node); - if (nir_instrs_equal(instr, other)) { - nir_ssa_def *other_def = nir_instr_get_dest_ssa_def(other); - nir_ssa_def_rewrite_uses(nir_instr_get_dest_ssa_def(instr), - nir_src_for_ssa(other_def)); + nir_foreach_instr_safe(block, instr) { + if (nir_instr_set_add_or_rewrite(instr_set, instr)) { + progress = true; nir_instr_remove(instr); - state->progress = true; - return; } } - for (nir_block *block = instr->block->imm_dom; - block != NULL; block = block->imm_dom) { - nir_foreach_instr_reverse(block, other) { - if (nir_instrs_equal(instr, other)) { - nir_ssa_def *other_def = nir_instr_get_dest_ssa_def(other); - nir_ssa_def_rewrite_uses(nir_instr_get_dest_ssa_def(instr), - nir_src_for_ssa(other_def)); - nir_instr_remove(instr); - state->progress = true; - return; - } - } + for (unsigned i = 0; i < block->num_dom_children; i++) { + nir_block *child = block->dom_children[i]; + progress |= cse_block(child, instr_set); } -} -static bool -nir_opt_cse_block(nir_block *block, void *void_state) -{ - struct cse_state *state = void_state; + nir_foreach_instr(block, instr) + nir_instr_set_remove(instr_set, instr); - nir_foreach_instr_safe(block, instr) - nir_opt_cse_instr(instr, state); - - return true; + return progress; } static bool nir_opt_cse_impl(nir_function_impl *impl) { - struct cse_state state; - - state.mem_ctx = ralloc_parent(impl); - state.progress = false; + struct set *instr_set = nir_instr_set_create(NULL); nir_metadata_require(impl, nir_metadata_dominance); - nir_foreach_block(impl, nir_opt_cse_block, &state); + bool progress = cse_block(nir_start_block(impl), instr_set); - if (state.progress) + if (progress) nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); - return state.progress; + nir_instr_set_destroy(instr_set); + return 
progress; } bool @@ -183,3 +90,4 @@ nir_opt_cse(nir_shader *shader) return progress; } + From bf5f931aee35e8448a6560545d86deb35f0639b3 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Thu, 24 Sep 2015 02:10:13 -0400 Subject: [PATCH 023/270] nir: make nir_instrs_equal() static This was previously tied to CSE, since it would only work for instructions where nir_instr_can_cse() (now instr_can_rewrite()) returned true. Now that CSE uses the instruction set abstraction which only uses this internally, we can make it local to nir_instr_set.c. Reviewed-by: Jason Ekstrand Signed-off-by: Connor Abbott --- src/glsl/nir/nir_instr_set.c | 2 +- src/glsl/nir/nir_instr_set.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/glsl/nir/nir_instr_set.c b/src/glsl/nir/nir_instr_set.c index 7460fccba10..7886003fe35 100644 --- a/src/glsl/nir/nir_instr_set.c +++ b/src/glsl/nir/nir_instr_set.c @@ -247,7 +247,7 @@ nir_alu_srcs_equal(const nir_alu_instr *alu1, const nir_alu_instr *alu2, * the same hash for (ignoring collisions, of course).
*/ -bool +static bool nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) { if (instr1->type != instr2->type) diff --git a/src/glsl/nir/nir_instr_set.h b/src/glsl/nir/nir_instr_set.h index a7f6c9dd1eb..939e8ddbf58 100644 --- a/src/glsl/nir/nir_instr_set.h +++ b/src/glsl/nir/nir_instr_set.h @@ -25,8 +25,6 @@ #include "nir.h" -bool nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2); - /** * This file defines functions for creating, destroying, and manipulating an * "instruction set," which is an abstraction for finding duplicate From bb59ba8634e3e5e3949103c6013918b8a4953111 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Thu, 24 Sep 2015 02:18:07 -0400 Subject: [PATCH 024/270] nir/instr_set: remove unnecessary check in nir_instrs_equal() This was originally added to nir_instrs_equal() instead of nir_instr_can_cse() incorrectly, but this was fixed when moving to the instruction set API (as it had to be, otherwise hashing wouldn't work). Now, this is dead code since instr_can_rewrite() will only return true for texture instructions that use an index, so we can turn the check into an assert. This also means that now nir_instrs_equal(instr, instr) will always return true unless it assert-fails. Reviewed-by: Jason Ekstrand Signed-off-by: Connor Abbott --- src/glsl/nir/nir_instr_set.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/glsl/nir/nir_instr_set.c b/src/glsl/nir/nir_instr_set.c index 7886003fe35..d3f939fe805 100644 --- a/src/glsl/nir/nir_instr_set.c +++ b/src/glsl/nir/nir_instr_set.c @@ -311,8 +311,7 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) } /* Don't support un-lowered sampler derefs currently. 
*/ - if (tex1->sampler || tex2->sampler) - return false; + assert(!tex1->sampler && !tex2->sampler); return true; } From 5346c1167064d6429c6338974c6342f8346fd34b Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Thu, 13 Aug 2015 15:02:05 +0300 Subject: [PATCH 025/270] i965: Don't tell the hardware about our UAV access. The hardware documentation relating to the UAV HW-assisted coherency mechanism and UAV access enable bits is scarce and sometimes contradictory, and there's quite some guesswork behind this commit, so let me summarize the background first: HSW and later hardware have infrastructure to support a stricter form of data coherency between shader invocations from separate primitives. The mechanism is controlled by the "Accesses UAV" bits on 3DSTATE_VS, _HS, _DS, _GS and _PS (or _PS_EXTRA on BDW+), and the "UAV Coherency Required" bit on the 3DPRIMITIVE command. Regardless of whether "UAV Coherency Required" is set, the hardware fixed-function units will increment a per-stage semaphore for each request received if "Accesses UAV" is set for the same or any lower stage. An implicit DC flush is emitted by the lowermost stage with "Accesses UAV" set once it's done processing the request, this also happens regardless of the value of "UAV Coherency Required". The completion of the DC flush will cause the same stage and all previous ones to decrement the semaphore, marking the UAV accesses for the primitive as coherent with L3. The "UAV Coherency Required" 3DPRIMITIVE bit will cause a pipeline stall before any threads are dispatched for the first FF stage with "Accesses UAV" set until the semaphore is cleared for the same stage. Effectively this guarantees that UAV memory accesses performed by previous primitives from any stage will be strictly ordered (and thanks to the implicit DC flush visible in memory) with UAV accesses from the following primitives. 
None of this is required by the usual image, atomic counter and SSBO GL APIs which have very relaxed cross-primitive coherency and ordering requirements, so we don't actually ever set the "UAV Coherency Required" bit -- Ordering with respect to shader invocations from previous stages on the same primitive where there is a data dependency is of course already guaranteed as the spec requires, regardless of this mechanism being enabled. We do set the "Accesses UAV" bits though since my commit ac7664e493655e290783c23a0412b9c70936da50 (which this patch partially reverts), mainly because of comments like the following from the BDW PRM: > 3DSTATE_GS >[...] > 12 Accesses UAV > Format: Enable > This field must be set when GS has a UAV access. There are similar comments in the documentation for the other 3DSTATE_*S commands. The "must" part is misleading and unjustified AFAIK. Most of the "Accesses UAV" bits don't seem to have any side effects other than the implicit DC flushes and the related book-keeping in anticipation of a subsequent primitive with "UAV Coherency Required" set, so in most cases they are unnecessary and may incur a performance penalty. There is an exception though. On Gen8+ the PS_EXTRA UAV access bit influences the calculation of the PS UAV-only and ThreadDispatchEnable signals which on previous generations were set explicitly by the driver, so we cannot always avoid enabling it on the PS stage. The primary motivation for this change is that in fact the hardware coherency mechanism is buggy and will cause a rather non-deterministic hang on Gen8 when VS is the only stage with "Accesses UAV" set and the processing of a request terminates immediately after the implicit DC flush is sent for a previous primitive with no additional vertices being emitted for the second primitive, which will cause the hardware to skip sending a second DC flush and cause the VS to stall indefinitely waiting for a response from the DC (BDWGFX HSD 1912017).
This hardware bug can be reproduced on current master with the spec@arb_shader_image_load_store@host-mem-barrier@Indirect/RaW piglit subtest (if you have the patience to run it a few dozen times). The proposed workaround is to insert CS STALLs speculatively between 3DPRIMITIVE commands when "Accesses UAV" is enabled for the VS stage only. Because this would affect one of the hottest paths in the driver and likely decrease performance even further due to the unnecessary serialization, and because we don't actually need the implicit DC flushes, it seems better to just disable them. Cc: 11.0 --- src/mesa/drivers/dri/i965/gen7_gs_state.c | 4 +-- src/mesa/drivers/dri/i965/gen7_vs_state.c | 4 +-- src/mesa/drivers/dri/i965/gen7_wm_state.c | 12 ++++++--- src/mesa/drivers/dri/i965/gen8_gs_state.c | 4 +-- src/mesa/drivers/dri/i965/gen8_ps_state.c | 32 ++++++++++++++++++++--- src/mesa/drivers/dri/i965/gen8_vs_state.c | 4 +-- 6 files changed, 41 insertions(+), 19 deletions(-) diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c index 497ecec8e45..8d6d3fe1d34 100644 --- a/src/mesa/drivers/dri/i965/gen7_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c @@ -59,9 +59,7 @@ upload_gs_state(struct brw_context *brw) OUT_BATCH(((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_GS_SAMPLER_COUNT_SHIFT) | ((brw->gs.prog_data->base.base.binding_table.size_bytes / 4) << - GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | - (brw->is_haswell && prog_data->base.nr_image_params ? 
- HSW_GS_UAV_ACCESS_ENABLE : 0)); + GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (brw->gs.prog_data->base.base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c index b7e48585482..a18dc697651 100644 --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c @@ -126,9 +126,7 @@ upload_vs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) << - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | - (brw->is_haswell && prog_data->base.nr_image_params ? - HSW_VS_UAV_ACCESS_ENABLE : 0)); + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (prog_data->base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index fd6dab5be8b..06d5e65786b 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -113,7 +113,14 @@ upload_wm_state(struct brw_context *brw) else if (prog_data->base.nr_image_params) dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC; - /* _NEW_BUFFERS | _NEW_COLOR */ + /* The "UAV access enable" bits are unnecessary on HSW because they only + * seem to have an effect on the HW-assisted coherency mechanism which we + * don't need, and the rasterization-related UAV_ONLY flag and the + * DISPATCH_ENABLE bit can be set independently from it. + * C.f. gen8_upload_ps_extra(). 
+ * + * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS | _NEW_COLOR + */ if (brw->is_haswell && !(brw_color_buffer_write_enabled(brw) || writes_depth) && prog_data->base.nr_image_params) @@ -221,9 +228,6 @@ gen7_upload_ps_state(struct brw_context *brw, _mesa_get_min_invocations_per_fragment(ctx, fp, false); assert(min_inv_per_frag >= 1); - if (brw->is_haswell && prog_data->base.nr_image_params) - dw4 |= HSW_PS_UAV_ACCESS_ENABLE; - if (prog_data->prog_offset_16 || prog_data->no_8) { dw4 |= GEN7_PS_16_DISPATCH_ENABLE; if (!prog_data->no_8 && min_inv_per_frag == 1) { diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c index 4195f4cf4a7..d766ca7bebf 100644 --- a/src/mesa/drivers/dri/i965/gen8_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c @@ -52,9 +52,7 @@ gen8_upload_gs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_GS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << - GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | - (prog_data->base.nr_image_params ? 
- HSW_GS_UAV_ACCESS_ENABLE : 0)); + GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (brw->gs.prog_data->base.base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index a686fed704f..8f0507413a7 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -25,6 +25,7 @@ #include "program/program.h" #include "brw_state.h" #include "brw_defines.h" +#include "brw_wm.h" #include "intel_batchbuffer.h" void @@ -65,8 +66,33 @@ gen8_upload_ps_extra(struct brw_context *brw, if (brw->gen >= 9 && prog_data->pulls_bary) dw1 |= GEN9_PSX_SHADER_PULLS_BARY; - if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) || - prog_data->base.nr_image_params) + /* The stricter cross-primitive coherency guarantees that the hardware + * gives us with the "Accesses UAV" bit set for at least one shader stage + * and the "UAV coherency required" bit set on the 3DPRIMITIVE command are + * redundant within the current image, atomic counter and SSBO GL APIs, + * which all have very loose ordering and coherency requirements and + * generally rely on the application to insert explicit barriers when a + * shader invocation is expected to see the memory writes performed by the + * invocations of some previous primitive. Regardless of the value of "UAV + * coherency required", the "Accesses UAV" bits will implicitly cause an in + * most cases useless DC flush when the lowermost stage with the bit set + * finishes execution. + * + * It would be nice to disable it, but in some cases we can't because on + * Gen8+ it also has an influence on rasterization via the PS UAV-only + * signal (which could be set independently from the coherency mechanism in + * the 3DSTATE_WM command on Gen7), and because in some cases it will + * determine whether the hardware skips execution of the fragment shader or + * not via the ThreadDispatchEnable signal. 
However if we know that + * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and + * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any + * difference so we may just disable it here. + * + * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR + */ + if ((_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) || + prog_data->base.nr_image_params) && + !brw_color_buffer_write_enabled(brw)) dw1 |= GEN8_PSX_SHADER_HAS_UAV; BEGIN_BATCH(2); @@ -91,7 +117,7 @@ upload_ps_extra(struct brw_context *brw) const struct brw_tracked_state gen8_ps_extra = { .dirty = { - .mesa = 0, + .mesa = _NEW_BUFFERS | _NEW_COLOR, .brw = BRW_NEW_CONTEXT | BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c index 8b5048bee7e..28f5adddf14 100644 --- a/src/mesa/drivers/dri/i965/gen8_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c @@ -53,9 +53,7 @@ upload_vs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4) / 4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | - (prog_data->base.nr_image_params ? - HSW_VS_UAV_ACCESS_ENABLE : 0)); + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (prog_data->base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, From 5152db415f4047569822d648fda09bdde4171d6d Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Sat, 29 Aug 2015 16:34:50 +0300 Subject: [PATCH 026/270] mesa: Expose function to calculate whether a shader image unit is valid. A future commit will remove all texture object-dependent derived state from the image unit struct to make validation unnecessary on texture state changes. Instead of checking gl_image_unit::_Valid drivers will be required to call this function when needed to find out whether an image unit is in a valid state and whether access from the shader is allowed. 
Tested-by: Ye Tian CC: "11.0" Reviewed-by: Ian Romanick --- src/mesa/main/shaderimage.c | 8 ++++---- src/mesa/main/shaderimage.h | 11 +++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c index bd4b7c7be3b..acc8fa91e95 100644 --- a/src/mesa/main/shaderimage.c +++ b/src/mesa/main/shaderimage.c @@ -415,8 +415,8 @@ _mesa_init_image_units(struct gl_context *ctx) ctx->ImageUnits[i] = _mesa_default_image_unit(ctx); } -static GLboolean -validate_image_unit(struct gl_context *ctx, struct gl_image_unit *u) +GLboolean +_mesa_is_image_unit_valid(struct gl_context *ctx, struct gl_image_unit *u) { struct gl_texture_object *t = u->TexObj; mesa_format tex_format; @@ -567,7 +567,7 @@ _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level, u->Access = access; u->Format = format; u->_ActualFormat = _mesa_get_shader_image_format(format); - u->_Valid = validate_image_unit(ctx, u); + u->_Valid = _mesa_is_image_unit_valid(ctx, u); if (u->TexObj && _mesa_tex_target_is_layered(u->TexObj->Target)) { u->Layered = layered; @@ -703,7 +703,7 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures) u->Access = GL_READ_WRITE; u->Format = tex_format; u->_ActualFormat = _mesa_get_shader_image_format(tex_format); - u->_Valid = validate_image_unit(ctx, u); + u->_Valid = _mesa_is_image_unit_valid(ctx, u); } else { /* Unbind the texture from the unit */ _mesa_reference_texobj(&u->TexObj, NULL); diff --git a/src/mesa/main/shaderimage.h b/src/mesa/main/shaderimage.h index bbe088a2459..14a544fca8a 100644 --- a/src/mesa/main/shaderimage.h +++ b/src/mesa/main/shaderimage.h @@ -54,6 +54,17 @@ _mesa_default_image_unit(struct gl_context *ctx); void _mesa_init_image_units(struct gl_context *ctx); +/** + * Return GL_TRUE if the state of the image unit passed as argument is valid + * and access from the shader is allowed. 
Otherwise loads from this unit + * should return zero and stores should have no effect. + * + * The result depends on context state other than the passed image unit, part + * of the _NEW_TEXTURE set. + */ +GLboolean +_mesa_is_image_unit_valid(struct gl_context *ctx, struct gl_image_unit *u); + /** * Recalculate the \c _Valid flag of a context's shader image units. * From 25d3338be37ddbfe676716034ec5f29e27323704 Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Thu, 3 Sep 2015 16:12:59 +0300 Subject: [PATCH 027/270] mesa: Skip redundant texture completeness checking during image validation. The call to _mesa_test_texobj_completeness() is unnecessary if the texture is already known to be complete. If the texture object is dirtied in the meantime _BaseComplete and _MipmapComplete will be reset to false. _mesa_is_image_unit_valid() will start to be called more frequently in a future commit, so it seems desirable to avoid the unnecessary work. Tested-by: Ye Tian CC: "11.0" Reviewed-by: Ian Romanick --- src/mesa/main/shaderimage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c index acc8fa91e95..0f0200a5b77 100644 --- a/src/mesa/main/shaderimage.c +++ b/src/mesa/main/shaderimage.c @@ -424,7 +424,8 @@ _mesa_is_image_unit_valid(struct gl_context *ctx, struct gl_image_unit *u) if (!t) return GL_FALSE; - _mesa_test_texobj_completeness(ctx, t); + if (!t->_BaseComplete && !t->_MipmapComplete) + _mesa_test_texobj_completeness(ctx, t); if (u->Level < t->BaseLevel || u->Level > t->_MaxLevel || From 2d97a78b37ddf325d90e056f5eefee0548092530 Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Sat, 29 Aug 2015 17:01:11 +0300 Subject: [PATCH 028/270] i965: Use _mesa_is_image_unit_valid() instead of gl_image_unit::_Valid. gl_image_unit::_Valid will be removed in a future commit. 
Tested-by: Ye Tian CC: "11.0" Reviewed-by: Ian Romanick --- src/mesa/drivers/dri/i965/brw_gs_surface_state.c | 3 ++- src/mesa/drivers/dri/i965/brw_vs_surface_state.c | 3 ++- src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 10 ++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c index 0bb307432d0..00125c0f405 100644 --- a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c @@ -129,7 +129,7 @@ brw_upload_gs_image_surfaces(struct brw_context *brw) ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY]; if (prog) { - /* BRW_NEW_GS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + /* BRW_NEW_GS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */ brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_GEOMETRY], &brw->gs.base, &brw->gs.prog_data->base.base); } @@ -137,6 +137,7 @@ brw_upload_gs_image_surfaces(struct brw_context *brw) const struct brw_tracked_state brw_gs_image_surfaces = { .dirty = { + .mesa = _NEW_TEXTURE, .brw = BRW_NEW_BATCH | BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_GS_PROG_DATA | diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c index 9bb48eb2e27..f65258a52a5 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c @@ -201,7 +201,7 @@ brw_upload_vs_image_surfaces(struct brw_context *brw) ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX]; if (prog) { - /* BRW_NEW_VS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + /* BRW_NEW_VS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */ brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_VERTEX], &brw->vs.base, &brw->vs.prog_data->base.base); } @@ -209,6 +209,7 @@ brw_upload_vs_image_surfaces(struct brw_context *brw) const struct brw_tracked_state brw_vs_image_surfaces = { .dirty = { + .mesa = _NEW_TEXTURE, .brw = BRW_NEW_BATCH | 
BRW_NEW_IMAGE_UNITS | BRW_NEW_VERTEX_PROGRAM | diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index c671e23827e..d73f657edc7 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -34,6 +34,7 @@ #include "main/blend.h" #include "main/mtypes.h" #include "main/samplerobj.h" +#include "main/shaderimage.h" #include "program/prog_parameter.h" #include "main/framebuffer.h" @@ -1112,7 +1113,7 @@ brw_upload_cs_image_surfaces(struct brw_context *brw) ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE]; if (prog) { - /* BRW_NEW_CS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + /* BRW_NEW_CS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */ brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_COMPUTE], &brw->cs.base, &brw->cs.prog_data->base); } @@ -1120,7 +1121,7 @@ brw_upload_cs_image_surfaces(struct brw_context *brw) const struct brw_tracked_state brw_cs_image_surfaces = { .dirty = { - .mesa = _NEW_PROGRAM, + .mesa = _NEW_TEXTURE | _NEW_PROGRAM, .brw = BRW_NEW_BATCH | BRW_NEW_CS_PROG_DATA | BRW_NEW_IMAGE_UNITS @@ -1253,7 +1254,7 @@ update_image_surface(struct brw_context *brw, uint32_t *surf_offset, struct brw_image_param *param) { - if (u->_Valid) { + if (_mesa_is_image_unit_valid(&brw->ctx, u)) { struct gl_texture_object *obj = u->TexObj; const unsigned format = get_image_format(brw, u->_ActualFormat, access); @@ -1338,7 +1339,7 @@ brw_upload_wm_image_surfaces(struct brw_context *brw) struct gl_shader_program *prog = ctx->Shader._CurrentFragmentProgram; if (prog) { - /* BRW_NEW_FS_PROG_DATA, BRW_NEW_IMAGE_UNITS */ + /* BRW_NEW_FS_PROG_DATA, BRW_NEW_IMAGE_UNITS, _NEW_TEXTURE */ brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_FRAGMENT], &brw->wm.base, &brw->wm.prog_data->base); } @@ -1346,6 +1347,7 @@ brw_upload_wm_image_surfaces(struct brw_context *brw) const struct brw_tracked_state brw_wm_image_surfaces = { .dirty = { + .mesa = 
_NEW_TEXTURE, .brw = BRW_NEW_BATCH | BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | From 7e441bf025cf8c5d088430d546acb4c0ed58d27b Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Sat, 29 Aug 2015 17:03:08 +0300 Subject: [PATCH 029/270] mesa: Get rid of texture-dependent image unit derived state. The point is to avoid having to re-validate all image units when _NEW_TEXTURE is flagged, which can be expensive if the driver exposes a large number of image units. This has been reported to fix a 36% performance regression in the Synmark2 Multithread benchmark on the i965 driver which exposes 192 image units. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91788 Reported-by: Wendy Wang Tested-by: Ye Tian CC: "11.0" Reviewed-by: Ian Romanick --- src/mesa/main/mtypes.h | 7 ------- src/mesa/main/shaderimage.c | 14 -------------- src/mesa/main/shaderimage.h | 9 --------- src/mesa/main/texstate.c | 3 --- 4 files changed, 33 deletions(-) diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 288d75790a4..5272372eb8e 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -4070,13 +4070,6 @@ struct gl_image_unit */ GLboolean Layered; - /** - * GL_TRUE if the state of this image unit is valid and access from - * the shader is allowed. Otherwise loads from this unit should - * return zero and stores should have no effect. - */ - GLboolean _Valid; - /** * Layer of the texture object bound to this unit as specified by the * application. 
diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c index 0f0200a5b77..c4ebf4201fb 100644 --- a/src/mesa/main/shaderimage.c +++ b/src/mesa/main/shaderimage.c @@ -474,17 +474,6 @@ _mesa_is_image_unit_valid(struct gl_context *ctx, struct gl_image_unit *u) return GL_TRUE; } -void -_mesa_validate_image_units(struct gl_context *ctx) -{ - unsigned i; - - for (i = 0; i < ctx->Const.MaxImageUnits; ++i) { - struct gl_image_unit *u = &ctx->ImageUnits[i]; - u->_Valid = validate_image_unit(ctx, u); - } -} - static GLboolean validate_bind_image_texture(struct gl_context *ctx, GLuint unit, GLuint texture, GLint level, GLboolean layered, @@ -568,7 +557,6 @@ _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level, u->Access = access; u->Format = format; u->_ActualFormat = _mesa_get_shader_image_format(format); - u->_Valid = _mesa_is_image_unit_valid(ctx, u); if (u->TexObj && _mesa_tex_target_is_layered(u->TexObj->Target)) { u->Layered = layered; @@ -704,7 +692,6 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures) u->Access = GL_READ_WRITE; u->Format = tex_format; u->_ActualFormat = _mesa_get_shader_image_format(tex_format); - u->_Valid = _mesa_is_image_unit_valid(ctx, u); } else { /* Unbind the texture from the unit */ _mesa_reference_texobj(&u->TexObj, NULL); @@ -714,7 +701,6 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures) u->Access = GL_READ_ONLY; u->Format = GL_R8; u->_ActualFormat = MESA_FORMAT_R_UNORM8; - u->_Valid = GL_FALSE; } } diff --git a/src/mesa/main/shaderimage.h b/src/mesa/main/shaderimage.h index 14a544fca8a..94ee814a716 100644 --- a/src/mesa/main/shaderimage.h +++ b/src/mesa/main/shaderimage.h @@ -65,15 +65,6 @@ _mesa_init_image_units(struct gl_context *ctx); GLboolean _mesa_is_image_unit_valid(struct gl_context *ctx, struct gl_image_unit *u); -/** - * Recalculate the \c _Valid flag of a context's shader image units. 
- * - * To be called when the state of any texture bound to an image unit - * changes. - */ -void -_mesa_validate_image_units(struct gl_context *ctx); - void GLAPIENTRY _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum access, diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c index 9b5928c4306..cb147fac476 100644 --- a/src/mesa/main/texstate.c +++ b/src/mesa/main/texstate.c @@ -34,7 +34,6 @@ #include "context.h" #include "enums.h" #include "macros.h" -#include "shaderimage.h" #include "texobj.h" #include "teximage.h" #include "texstate.h" @@ -741,8 +740,6 @@ update_texture_state( struct gl_context *ctx ) if (!prog[MESA_SHADER_FRAGMENT] || !prog[MESA_SHADER_VERTEX]) update_texgen(ctx); - - _mesa_validate_image_units(ctx); } From c9b982b72d443b138cfbded2f40350771c0bb321 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 8 Oct 2015 18:19:00 -0400 Subject: [PATCH 030/270] glsl: move shader_enums into nir First step towards inverting the dependency between glsl and nir (so nir can be used without glsl). Also solves this issue with 'make distclean' Making distclean in mesa make[2]: Entering directory '/mnt/sdb1/Src64/Mesa-git/mesa/src/mesa' Makefile:2486: ../glsl/.deps/shader_enums.Plo: No such file or directory make[2]: *** No rule to make target '../glsl/.deps/shader_enums.Plo'. Stop. 
make[2]: Leaving directory '/mnt/sdb1/Src64/Mesa-git/mesa/src/mesa' Makefile:684: recipe for target 'distclean-recursive' failed make[1]: *** [distclean-recursive] Error 1 make[1]: Leaving directory '/mnt/sdb1/Src64/Mesa-git/mesa/src' Makefile:615: recipe for target 'distclean-recursive' failed make: *** [distclean-recursive] Error 1 Reported-by: Andy Furniss Reviewed-by: Emil Velikov Signed-off-by: Rob Clark --- src/Makefile.am | 1 - src/gallium/auxiliary/nir/tgsi_to_nir.c | 2 +- src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 1 - src/gallium/drivers/freedreno/ir3/ir3_nir.h | 1 + src/gallium/drivers/freedreno/ir3/ir3_shader.h | 2 +- src/glsl/Makefile.sources | 7 ++++--- src/glsl/nir/nir.h | 2 +- src/glsl/{ => nir}/shader_enums.c | 2 +- src/glsl/{ => nir}/shader_enums.h | 0 src/mesa/Makefile.sources | 4 +--- src/mesa/main/mtypes.h | 2 +- 11 files changed, 11 insertions(+), 13 deletions(-) rename src/glsl/{ => nir}/shader_enums.c (99%) rename src/glsl/{ => nir}/shader_enums.h (100%) diff --git a/src/Makefile.am b/src/Makefile.am index 9e15cca5ea4..0d49bcd19ed 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -62,7 +62,6 @@ AM_CPPFLAGS = \ noinst_LTLIBRARIES = libglsl_util.la libglsl_util_la_SOURCES = \ - glsl/shader_enums.c \ mesa/main/imports.c \ mesa/program/prog_hash_table.c \ mesa/program/symbol_table.c \ diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c index cf43ef2506f..0539cfc16a1 100644 --- a/src/gallium/auxiliary/nir/tgsi_to_nir.c +++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c @@ -27,7 +27,7 @@ #include "glsl/nir/nir_control_flow.h" #include "glsl/nir/nir_builder.h" #include "glsl/list.h" -#include "glsl/shader_enums.h" +#include "glsl/nir/shader_enums.h" #include "nir/tgsi_to_nir.h" #include "tgsi/tgsi_parse.h" diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 7eddbdd3825..8c9234b3847 100644 --- 
a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -36,7 +36,6 @@ #include "tgsi/tgsi_strings.h" #include "nir/tgsi_to_nir.h" -#include "glsl/shader_enums.h" #include "freedreno_util.h" diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/src/gallium/drivers/freedreno/ir3/ir3_nir.h index f3d3075e6a6..9950782dc38 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.h @@ -30,6 +30,7 @@ #define IR3_NIR_H_ #include "glsl/nir/nir.h" +#include "glsl/nir/shader_enums.h" bool ir3_nir_lower_if_else(nir_shader *shader); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index 6dc0ce1133f..7e2c27d9765 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -30,7 +30,7 @@ #define IR3_SHADER_H_ #include "pipe/p_state.h" -#include "glsl/shader_enums.h" +#include "glsl/nir/shader_enums.h" #include "ir3.h" #include "disasm.h" diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources index bb8bddc69af..436949cd760 100644 --- a/src/glsl/Makefile.sources +++ b/src/glsl/Makefile.sources @@ -80,7 +80,9 @@ NIR_FILES = \ nir/nir_vla.h \ nir/nir_worklist.c \ nir/nir_worklist.h \ - nir/nir_types.cpp + nir/nir_types.cpp \ + nir/shader_enums.h \ + nir/shader_enums.c # libglsl @@ -204,8 +206,7 @@ LIBGLSL_FILES = \ opt_vectorize.cpp \ program.h \ s_expression.cpp \ - s_expression.h \ - shader_enums.h + s_expression.h # glsl_compiler diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index befc7fce724..112c6b5412a 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -35,7 +35,7 @@ #include "util/set.h" #include "util/bitset.h" #include "nir_types.h" -#include "glsl/shader_enums.h" +#include "shader_enums.h" #include <stdio.h> #include "nir_opcodes.h" diff --git a/src/glsl/shader_enums.c b/src/glsl/nir/shader_enums.c similarity index 99% rename from
src/glsl/shader_enums.c rename to src/glsl/nir/shader_enums.c index c196b791d4f..3722475731b 100644 --- a/src/glsl/shader_enums.c +++ b/src/glsl/nir/shader_enums.c @@ -26,7 +26,7 @@ * Rob Clark */ -#include "glsl/shader_enums.h" +#include "shader_enums.h" #include "util/macros.h" #define ENUM(x) [x] = #x diff --git a/src/glsl/shader_enums.h b/src/glsl/nir/shader_enums.h similarity index 100% rename from src/glsl/shader_enums.h rename to src/glsl/nir/shader_enums.h diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources index 0915594cea6..b40ee4d395b 100644 --- a/src/mesa/Makefile.sources +++ b/src/mesa/Makefile.sources @@ -525,9 +525,7 @@ PROGRAM_FILES = \ program/sampler.h \ program/string_to_uint_map.cpp \ program/symbol_table.c \ - program/symbol_table.h \ - ../glsl/shader_enums.c \ - ../glsl/shader_enums.h + program/symbol_table.h PROGRAM_NIR_FILES = \ program/prog_to_nir.c \ diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 5272372eb8e..0a54b2073e2 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -42,7 +42,7 @@ #include "main/config.h" #include "glapi/glapi.h" #include "math/m_matrix.h" /* GLmatrix */ -#include "glsl/shader_enums.h" +#include "glsl/nir/shader_enums.h" #include "main/formats.h" /* MESA_FORMAT_COUNT */ From 9ea2a86809577cac5006a2bc4fad29fed9cb3ccc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 03:02:42 +0200 Subject: [PATCH 031/270] mesa: call ProgramStringNotify for fixed-function vertex programs Drivers weren't notified about this at all. This allows disabling on-demand compilation in drivers. 
Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/main/ffvertex_prog.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mesa/main/ffvertex_prog.c b/src/mesa/main/ffvertex_prog.c index a6183b47e2e..34cc9218add 100644 --- a/src/mesa/main/ffvertex_prog.c +++ b/src/mesa/main/ffvertex_prog.c @@ -1690,11 +1690,10 @@ _mesa_get_fixed_func_vertex_program(struct gl_context *ctx) ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS, ctx->Const.Program[MESA_SHADER_VERTEX].MaxTemps ); -#if 0 if (ctx->Driver.ProgramStringNotify) ctx->Driver.ProgramStringNotify( ctx, GL_VERTEX_PROGRAM_ARB, &prog->Base ); -#endif + _mesa_program_cache_insert(ctx, ctx->VertexProgram.Cache, &key, sizeof(key), &prog->Base); } From 417927ebded4c6f4cee20c7e07a69c666a3f17a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 4 Oct 2015 00:02:31 +0200 Subject: [PATCH 032/270] tgsi: add a utility for emulating some GL features st/mesa will use this, but drivers can use it too. 
Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/gallium/auxiliary/Makefile.sources | 2 + src/gallium/auxiliary/tgsi/tgsi_emulate.c | 168 ++++++++++++++++++++++ src/gallium/auxiliary/tgsi/tgsi_emulate.h | 38 +++++ 3 files changed, 208 insertions(+) create mode 100644 src/gallium/auxiliary/tgsi/tgsi_emulate.c create mode 100644 src/gallium/auxiliary/tgsi/tgsi_emulate.h diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources index 1fa36416b8e..9df4e265b5b 100644 --- a/src/gallium/auxiliary/Makefile.sources +++ b/src/gallium/auxiliary/Makefile.sources @@ -137,6 +137,8 @@ C_SOURCES := \ tgsi/tgsi_dump.h \ tgsi/tgsi_exec.c \ tgsi/tgsi_exec.h \ + tgsi/tgsi_emulate.c \ + tgsi/tgsi_emulate.h \ tgsi/tgsi_info.c \ tgsi/tgsi_info.h \ tgsi/tgsi_iterate.c \ diff --git a/src/gallium/auxiliary/tgsi/tgsi_emulate.c b/src/gallium/auxiliary/tgsi/tgsi_emulate.c new file mode 100644 index 00000000000..819087261b3 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_emulate.c @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2015 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "tgsi/tgsi_transform.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_dump.h" +#include "util/u_debug.h" + +#include "tgsi_emulate.h" + +struct tgsi_emulation_context { + struct tgsi_transform_context base; + struct tgsi_shader_info info; + unsigned flags; + bool first_instruction_emitted; +}; + +static inline struct tgsi_emulation_context * +tgsi_emulation_context(struct tgsi_transform_context *tctx) +{ + return (struct tgsi_emulation_context *)tctx; +} + +static void +transform_decl(struct tgsi_transform_context *tctx, + struct tgsi_full_declaration *decl) +{ + struct tgsi_emulation_context *ctx = tgsi_emulation_context(tctx); + + if (ctx->flags & TGSI_EMU_FORCE_PERSAMPLE_INTERP && + decl->Declaration.File == TGSI_FILE_INPUT) { + assert(decl->Declaration.Interpolate); + decl->Interp.Location = TGSI_INTERPOLATE_LOC_SAMPLE; + } + + tctx->emit_declaration(tctx, decl); +} + +static void +passthrough_edgeflag(struct tgsi_transform_context *tctx) +{ + struct tgsi_emulation_context *ctx = tgsi_emulation_context(tctx); + struct tgsi_full_declaration decl; + struct tgsi_full_instruction new_inst; + + /* Input */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + decl.Range.First = decl.Range.Last = ctx->info.num_inputs; + tctx->emit_declaration(tctx, &decl); + + /* Output */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_OUTPUT; + decl.Declaration.Semantic = true; + decl.Range.First = decl.Range.Last = ctx->info.num_outputs; + decl.Semantic.Name = TGSI_SEMANTIC_EDGEFLAG; + decl.Semantic.Index = 0; + tctx->emit_declaration(tctx, &decl); + + /* MOV */ + new_inst = tgsi_default_full_instruction(); + 
new_inst.Instruction.Opcode = TGSI_OPCODE_MOV; + + new_inst.Instruction.NumDstRegs = 1; + new_inst.Dst[0].Register.File = TGSI_FILE_OUTPUT; + new_inst.Dst[0].Register.Index = ctx->info.num_outputs; + new_inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + + new_inst.Instruction.NumSrcRegs = 1; + new_inst.Src[0].Register.File = TGSI_FILE_INPUT; + new_inst.Src[0].Register.Index = ctx->info.num_inputs; + new_inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X; + new_inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_X; + new_inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_X; + new_inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_X; + + tctx->emit_instruction(tctx, &new_inst); +} + +static void +transform_instr(struct tgsi_transform_context *tctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_emulation_context *ctx = tgsi_emulation_context(tctx); + + /* Pass through edgeflags. */ + if (!ctx->first_instruction_emitted) { + ctx->first_instruction_emitted = true; + + if (ctx->flags & TGSI_EMU_PASSTHROUGH_EDGEFLAG) + passthrough_edgeflag(tctx); + } + + /* Clamp color outputs. 
*/ + if (ctx->flags & TGSI_EMU_CLAMP_COLOR_OUTPUTS) { + for (int i = 0; i < inst->Instruction.NumDstRegs; i++) { + unsigned semantic; + + if (inst->Dst[i].Register.File != TGSI_FILE_OUTPUT || + inst->Dst[i].Register.Indirect) + continue; + + semantic = + ctx->info.output_semantic_name[inst->Dst[i].Register.Index]; + + if (semantic == TGSI_SEMANTIC_COLOR || + semantic == TGSI_SEMANTIC_BCOLOR) + inst->Instruction.Saturate = true; + } + } + + tctx->emit_instruction(tctx, inst); +} + +const struct tgsi_token * +tgsi_emulate(const struct tgsi_token *tokens, unsigned flags) +{ + struct tgsi_emulation_context ctx; + struct tgsi_token *newtoks; + int newlen; + + if (!(flags & (TGSI_EMU_CLAMP_COLOR_OUTPUTS | + TGSI_EMU_PASSTHROUGH_EDGEFLAG | + TGSI_EMU_FORCE_PERSAMPLE_INTERP))) + return NULL; + + memset(&ctx, 0, sizeof(ctx)); + ctx.flags = flags; + tgsi_scan_shader(tokens, &ctx.info); + + if (flags & TGSI_EMU_FORCE_PERSAMPLE_INTERP) + ctx.base.transform_declaration = transform_decl; + + if (flags & (TGSI_EMU_CLAMP_COLOR_OUTPUTS | + TGSI_EMU_PASSTHROUGH_EDGEFLAG)) + ctx.base.transform_instruction = transform_instr; + + newlen = tgsi_num_tokens(tokens) + 20; + newtoks = tgsi_alloc_tokens(newlen); + if (!newtoks) + return NULL; + + tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base); + return newtoks; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_emulate.h b/src/gallium/auxiliary/tgsi/tgsi_emulate.h new file mode 100644 index 00000000000..425cec72ee1 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_emulate.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2015 Advanced Micro Devices, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef TGSI_GL_EMULATION_H_ +#define TGSI_GL_EMULATION_H_ + +#include "pipe/p_shader_tokens.h" + +#define TGSI_EMU_CLAMP_COLOR_OUTPUTS (1 << 0) +#define TGSI_EMU_PASSTHROUGH_EDGEFLAG (1 << 1) +#define TGSI_EMU_FORCE_PERSAMPLE_INTERP (1 << 2) + +const struct tgsi_token * +tgsi_emulate(const struct tgsi_token *tokens, unsigned flags); + +#endif /* TGSI_GL_EMULATION_H_ */ From c80c19a9d550745b68ce1df5f6c73e89a41514fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 00:08:30 +0200 Subject: [PATCH 033/270] tgsi/scan: add info about declared samplers (v2) v2: get it from declarations, not instructions --- src/gallium/auxiliary/tgsi/tgsi_scan.c | 2 ++ src/gallium/auxiliary/tgsi/tgsi_scan.h | 1 + 2 files changed, 3 insertions(+) diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c index d76dddbf7d9..b84a1753eeb 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -409,6 +409,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens, info->writes_edgeflag = TRUE; } } + } else if (file == TGSI_FILE_SAMPLER) { + info->samplers_declared |= 1 << reg; } } } diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h index 3ceb55717ee..d60ccabda6d 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.h +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -64,6 +64,7 @@ struct tgsi_shader_info uint file_count[TGSI_FILE_COUNT]; /**< number of declared registers */ int file_max[TGSI_FILE_COUNT]; /**< highest index of declared registers */ int const_file_max[PIPE_MAX_CONSTANT_BUFFERS]; + unsigned samplers_declared; /**< bitmask of declared samplers */ ubyte input_array_first[PIPE_MAX_SHADER_INPUTS]; ubyte input_array_last[PIPE_MAX_SHADER_INPUTS]; From 4a21edf067b3fdcc58d9df5cbdcd04430b8077f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 3 Oct 2015 22:35:22 +0200 Subject: [PATCH 034/270] st/mesa: 
inline st_prepare_vertex_program No other shader stage has a "prepare" function. This will allow removing some variables from st_vertex_program. Also, prepare_fragment_program was a dead prototype. Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_program.c | 41 ++++++++--------------------- src/mesa/state_tracker/st_program.h | 10 ------- 2 files changed, 11 insertions(+), 40 deletions(-) diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index a07f8fec309..63ffad76426 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -228,24 +228,25 @@ st_release_tep_variants(struct st_context *st, struct st_tesseval_program *sttep /** - * Translate a Mesa vertex shader into a TGSI shader. - * \param outputMapping to map vertex program output registers (VARYING_SLOT_x) - * to TGSI output slots - * \param tokensOut destination for TGSI tokens - * \return pointer to cached pipe_shader object. + * Translate a vertex program to create a new variant. 
*/ -void -st_prepare_vertex_program(struct gl_context *ctx, - struct st_vertex_program *stvp) +static struct st_vp_variant * +st_translate_vertex_program(struct st_context *st, + struct st_vertex_program *stvp, + const struct st_vp_variant_key *key) { - struct st_context *st = st_context(ctx); + struct st_vp_variant *vpv = CALLOC_STRUCT(st_vp_variant); + struct pipe_context *pipe = st->pipe; + struct ureg_program *ureg; + enum pipe_error error; + unsigned num_outputs; GLuint attr; stvp->num_inputs = 0; stvp->num_outputs = 0; if (stvp->Base.IsPositionInvariant) - _mesa_insert_mvp_code(ctx, &stvp->Base); + _mesa_insert_mvp_code(st->ctx, &stvp->Base); /* * Determine number of inputs, the mappings between VERT_ATTRIB_x @@ -361,29 +362,9 @@ st_prepare_vertex_program(struct gl_context *ctx, stvp->result_to_output[VARYING_SLOT_EDGE] = stvp->num_outputs; stvp->output_semantic_name[stvp->num_outputs] = TGSI_SEMANTIC_EDGEFLAG; stvp->output_semantic_index[stvp->num_outputs] = 0; -} - - -/** - * Translate a vertex program to create a new variant. 
- */ -static struct st_vp_variant * -st_translate_vertex_program(struct st_context *st, - struct st_vertex_program *stvp, - const struct st_vp_variant_key *key) -{ - struct st_vp_variant *vpv = CALLOC_STRUCT(st_vp_variant); - struct pipe_context *pipe = st->pipe; - struct ureg_program *ureg; - enum pipe_error error; - unsigned num_outputs; - - st_prepare_vertex_program(st->ctx, stvp); if (!stvp->glsl_to_tgsi) - { _mesa_remove_output_reads(&stvp->Base.Base, PROGRAM_OUTPUT); - } ureg = ureg_create_with_screen(TGSI_PROCESSOR_VERTEX, st->pipe->screen); if (ureg == NULL) { diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index 7013993fe38..f54cf83c727 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -414,16 +414,6 @@ st_get_tep_variant(struct st_context *st, struct st_tesseval_program *stgp, const struct st_tep_variant_key *key); - -extern void -st_prepare_vertex_program(struct gl_context *ctx, - struct st_vertex_program *stvp); - -extern GLboolean -st_prepare_fragment_program(struct gl_context *ctx, - struct st_fragment_program *stfp); - - extern void st_release_vp_variants( struct st_context *st, struct st_vertex_program *stvp ); From 4bbe418b4b35039e72b72bd9dff1ebdd96014d5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 3 Oct 2015 22:44:30 +0200 Subject: [PATCH 035/270] st/mesa: decrease the size of st_vertex_program The other variables can't be moved. 
Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_program.c | 94 +++++++++++++++-------------- src/mesa/state_tracker/st_program.h | 5 -- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 63ffad76426..9e100dbd627 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -239,11 +239,14 @@ st_translate_vertex_program(struct st_context *st, struct pipe_context *pipe = st->pipe; struct ureg_program *ureg; enum pipe_error error; - unsigned num_outputs; - GLuint attr; + unsigned num_outputs = 0; + unsigned attr; + unsigned input_to_index[VERT_ATTRIB_MAX] = {0}; + unsigned output_slot_to_attr[VARYING_SLOT_MAX] = {0}; + ubyte output_semantic_name[VARYING_SLOT_MAX] = {0}; + ubyte output_semantic_index[VARYING_SLOT_MAX] = {0}; stvp->num_inputs = 0; - stvp->num_outputs = 0; if (stvp->Base.IsPositionInvariant) _mesa_insert_mvp_code(st->ctx, &stvp->Base); @@ -254,7 +257,7 @@ st_translate_vertex_program(struct st_context *st, */ for (attr = 0; attr < VERT_ATTRIB_MAX; attr++) { if ((stvp->Base.Base.InputsRead & BITFIELD64_BIT(attr)) != 0) { - stvp->input_to_index[attr] = stvp->num_inputs; + input_to_index[attr] = stvp->num_inputs; stvp->index_to_input[stvp->num_inputs] = attr; stvp->num_inputs++; if ((stvp->Base.Base.DoubleInputsRead & BITFIELD64_BIT(attr)) != 0) { @@ -265,7 +268,7 @@ st_translate_vertex_program(struct st_context *st, } } /* bit of a hack, presetup potentially unused edgeflag input */ - stvp->input_to_index[VERT_ATTRIB_EDGEFLAG] = stvp->num_inputs; + input_to_index[VERT_ATTRIB_EDGEFLAG] = stvp->num_inputs; stvp->index_to_input[stvp->num_inputs] = VERT_ATTRIB_EDGEFLAG; /* Compute mapping of vertex program outputs to slots. 
@@ -275,62 +278,62 @@ st_translate_vertex_program(struct st_context *st, stvp->result_to_output[attr] = ~0; } else { - unsigned slot = stvp->num_outputs++; + unsigned slot = num_outputs++; stvp->result_to_output[attr] = slot; - stvp->output_slot_to_attr[slot] = attr; + output_slot_to_attr[slot] = attr; switch (attr) { case VARYING_SLOT_POS: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_POSITION; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_POSITION; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_COL0: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_COL1: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; - stvp->output_semantic_index[slot] = 1; + output_semantic_name[slot] = TGSI_SEMANTIC_COLOR; + output_semantic_index[slot] = 1; break; case VARYING_SLOT_BFC0: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_BFC1: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; - stvp->output_semantic_index[slot] = 1; + output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR; + output_semantic_index[slot] = 1; break; case VARYING_SLOT_FOGC: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_FOG; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_FOG; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_PSIZ: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_CLIP_DIST0: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; - stvp->output_semantic_index[slot] = 0; + 
output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_CLIP_DIST1: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; - stvp->output_semantic_index[slot] = 1; + output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST; + output_semantic_index[slot] = 1; break; case VARYING_SLOT_EDGE: assert(0); break; case VARYING_SLOT_CLIP_VERTEX: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_LAYER: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_LAYER; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_LAYER; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_VIEWPORT: - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX; - stvp->output_semantic_index[slot] = 0; + output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX; + output_semantic_index[slot] = 0; break; case VARYING_SLOT_TEX0: @@ -342,8 +345,8 @@ st_translate_vertex_program(struct st_context *st, case VARYING_SLOT_TEX6: case VARYING_SLOT_TEX7: if (st->needs_texcoord_semantic) { - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD; - stvp->output_semantic_index[slot] = attr - VARYING_SLOT_TEX0; + output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD; + output_semantic_index[slot] = attr - VARYING_SLOT_TEX0; break; } /* fall through */ @@ -351,17 +354,17 @@ st_translate_vertex_program(struct st_context *st, default: assert(attr >= VARYING_SLOT_VAR0 || (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7)); - stvp->output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC; - stvp->output_semantic_index[slot] = + output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC; + output_semantic_index[slot] = st_get_generic_varying_index(st, attr); break; } } } /* similar hack to above, presetup potentially unused edgeflag output */ - 
stvp->result_to_output[VARYING_SLOT_EDGE] = stvp->num_outputs; - stvp->output_semantic_name[stvp->num_outputs] = TGSI_SEMANTIC_EDGEFLAG; - stvp->output_semantic_index[stvp->num_outputs] = 0; + stvp->result_to_output[VARYING_SLOT_EDGE] = num_outputs; + output_semantic_name[num_outputs] = TGSI_SEMANTIC_EDGEFLAG; + output_semantic_index[num_outputs] = 0; if (!stvp->glsl_to_tgsi) _mesa_remove_output_reads(&stvp->Base.Base, PROGRAM_OUTPUT); @@ -375,7 +378,6 @@ st_translate_vertex_program(struct st_context *st, vpv->key = *key; vpv->num_inputs = stvp->num_inputs; - num_outputs = stvp->num_outputs; if (key->passthrough_edgeflags) { vpv->num_inputs++; num_outputs++; @@ -395,7 +397,7 @@ st_translate_vertex_program(struct st_context *st, &stvp->Base.Base, /* inputs */ vpv->num_inputs, - stvp->input_to_index, + input_to_index, NULL, /* inputSlotToAttr */ NULL, /* input semantic name */ NULL, /* input semantic index */ @@ -404,9 +406,9 @@ st_translate_vertex_program(struct st_context *st, /* outputs */ num_outputs, stvp->result_to_output, - stvp->output_slot_to_attr, - stvp->output_semantic_name, - stvp->output_semantic_index, + output_slot_to_attr, + output_semantic_name, + output_semantic_index, key->passthrough_edgeflags, key->clamp_color); else @@ -416,15 +418,15 @@ st_translate_vertex_program(struct st_context *st, &stvp->Base.Base, /* inputs */ vpv->num_inputs, - stvp->input_to_index, + input_to_index, NULL, /* input semantic name */ NULL, /* input semantic index */ NULL, /* outputs */ num_outputs, stvp->result_to_output, - stvp->output_semantic_name, - stvp->output_semantic_index, + output_semantic_name, + output_semantic_index, key->passthrough_edgeflags, key->clamp_color); diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index f54cf83c727..c60d2d5f803 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -156,17 +156,12 @@ struct st_vertex_program struct glsl_to_tgsi_visitor* glsl_to_tgsi; 
/** maps a Mesa VERT_ATTRIB_x to a packed TGSI input index */ - GLuint input_to_index[VERT_ATTRIB_MAX]; /** maps a TGSI input index back to a Mesa VERT_ATTRIB_x */ GLuint index_to_input[PIPE_MAX_SHADER_INPUTS]; GLuint num_inputs; /** Maps VARYING_SLOT_x to slot */ GLuint result_to_output[VARYING_SLOT_MAX]; - GLuint output_slot_to_attr[VARYING_SLOT_MAX]; - ubyte output_semantic_name[VARYING_SLOT_MAX]; - ubyte output_semantic_index[VARYING_SLOT_MAX]; - GLuint num_outputs; /** List of translated variants of this vertex program. */ From 941721ee2a90811b225db3241e280ea4ab27ea40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 4 Oct 2015 00:33:11 +0200 Subject: [PATCH 036/270] st/mesa: use TGSI utility to emulate features for VS variants Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_program.c | 41 ++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 9e100dbd627..6ace35295a2 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -43,6 +43,8 @@ #include "pipe/p_shader_tokens.h" #include "draw/draw_context.h" #include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_emulate.h" +#include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_ureg.h" #include "st_debug.h" @@ -377,12 +379,6 @@ st_translate_vertex_program(struct st_context *st, vpv->key = *key; - vpv->num_inputs = stvp->num_inputs; - if (key->passthrough_edgeflags) { - vpv->num_inputs++; - num_outputs++; - } - if (ST_DEBUG & DEBUG_MESA) { _mesa_print_program(&stvp->Base.Base); _mesa_print_program_parameters(st->ctx, &stvp->Base.Base); @@ -396,7 +392,7 @@ st_translate_vertex_program(struct st_context *st, stvp->glsl_to_tgsi, &stvp->Base.Base, /* inputs */ - vpv->num_inputs, + stvp->num_inputs, input_to_index, NULL, /* inputSlotToAttr */ NULL, /* input semantic name */ @@ -409,15 +405,15 @@ 
st_translate_vertex_program(struct st_context *st, output_slot_to_attr, output_semantic_name, output_semantic_index, - key->passthrough_edgeflags, - key->clamp_color); + false, + false); else error = st_translate_mesa_program(st->ctx, TGSI_PROCESSOR_VERTEX, ureg, &stvp->Base.Base, /* inputs */ - vpv->num_inputs, + stvp->num_inputs, input_to_index, NULL, /* input semantic name */ NULL, /* input semantic index */ @@ -427,8 +423,8 @@ st_translate_vertex_program(struct st_context *st, stvp->result_to_output, output_semantic_name, output_semantic_index, - key->passthrough_edgeflags, - key->clamp_color); + false, + false); if (error) goto fail; @@ -445,6 +441,27 @@ st_translate_vertex_program(struct st_context *st, &vpv->tgsi.stream_output); } + vpv->num_inputs = stvp->num_inputs; + + /* Emulate features. */ + if (key->clamp_color || key->passthrough_edgeflags) { + const struct tgsi_token *tokens; + unsigned flags = + (key->clamp_color ? TGSI_EMU_CLAMP_COLOR_OUTPUTS : 0) | + (key->passthrough_edgeflags ? 
TGSI_EMU_PASSTHROUGH_EDGEFLAG : 0); + + tokens = tgsi_emulate(vpv->tgsi.tokens, flags); + + if (tokens) { + tgsi_free_tokens(vpv->tgsi.tokens); + vpv->tgsi.tokens = tokens; + + if (key->passthrough_edgeflags) + vpv->num_inputs++; + } else + fprintf(stderr, "mesa: cannot emulate deprecated features\n"); + } + if (ST_DEBUG & DEBUG_TGSI) { tgsi_dump(vpv->tgsi.tokens, 0); debug_printf("\n"); From c04e91a0e9abb424cb68c7e310ca9e5adf7f7be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 4 Oct 2015 00:33:11 +0200 Subject: [PATCH 037/270] st/mesa: use TGSI utility to emulate features for FS variants Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_program.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 6ace35295a2..bf6b492e80b 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -620,8 +620,7 @@ st_translate_fragment_program(struct st_context *st, interpLocation[slot] = TGSI_INTERPOLATE_LOC_CENTER; if (stfp->Base.Base.SystemValuesRead & (SYSTEM_BIT_SAMPLE_ID | - SYSTEM_BIT_SAMPLE_POS) || - key->persample_shading) + SYSTEM_BIT_SAMPLE_POS)) interpLocation[slot] = TGSI_INTERPOLATE_LOC_SAMPLE; switch (attr) { @@ -861,7 +860,7 @@ st_translate_fragment_program(struct st_context *st, NULL, fs_output_semantic_name, fs_output_semantic_index, FALSE, - key->clamp_color ); + false); else st_translate_mesa_program(st->ctx, TGSI_PROCESSOR_FRAGMENT, @@ -878,10 +877,26 @@ st_translate_fragment_program(struct st_context *st, outputMapping, fs_output_semantic_name, fs_output_semantic_index, FALSE, - key->clamp_color); + false); - variant->tgsi.tokens = ureg_get_tokens( ureg, NULL ); - ureg_destroy( ureg ); + variant->tgsi.tokens = ureg_get_tokens(ureg, NULL); + ureg_destroy(ureg); + + /* Emulate features. 
*/ + if (key->clamp_color || key->persample_shading) { + const struct tgsi_token *tokens; + unsigned flags = + (key->clamp_color ? TGSI_EMU_CLAMP_COLOR_OUTPUTS : 0) | + (key->persample_shading ? TGSI_EMU_FORCE_PERSAMPLE_INTERP : 0); + + tokens = tgsi_emulate(variant->tgsi.tokens, flags); + + if (tokens) { + tgsi_free_tokens(variant->tgsi.tokens); + variant->tgsi.tokens = tokens; + } else + fprintf(stderr, "mesa: cannot emulate deprecated features\n"); + } if (ST_DEBUG & DEBUG_TGSI) { tgsi_dump(variant->tgsi.tokens, 0/*TGSI_DUMP_VERBOSE*/); From 3eedb633710733b38f612bdd5b2b490a7f854c9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 4 Oct 2015 01:01:16 +0200 Subject: [PATCH 038/270] st/mesa: remove old emulation for VS and FS variants Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 48 +++------------------ src/mesa/state_tracker/st_glsl_to_tgsi.h | 4 +- src/mesa/state_tracker/st_mesa_to_tgsi.c | 50 +++------------------- src/mesa/state_tracker/st_mesa_to_tgsi.h | 4 +- src/mesa/state_tracker/st_program.c | 18 +++----- 5 files changed, 17 insertions(+), 107 deletions(-) diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index 633e90ffa38..1488ea07c1d 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -4852,7 +4852,7 @@ src_register(struct st_translate *t, const st_src_reg *reg) static struct ureg_dst translate_dst(struct st_translate *t, const st_dst_reg *dst_reg, - bool saturate, bool clamp_color) + bool saturate) { struct ureg_dst dst = dst_register(t, dst_reg->file, dst_reg->index, dst_reg->array_id); @@ -4864,28 +4864,6 @@ translate_dst(struct st_translate *t, if (saturate) dst = ureg_saturate(dst); - else if (clamp_color && dst_reg->file == PROGRAM_OUTPUT) { - /* Clamp colors for ARB_color_buffer_float. 
*/ - switch (t->procType) { - case TGSI_PROCESSOR_VERTEX: - /* This can only occur with a compatibility profile, which doesn't - * support geometry shaders. */ - if (dst_reg->index == VARYING_SLOT_COL0 || - dst_reg->index == VARYING_SLOT_COL1 || - dst_reg->index == VARYING_SLOT_BFC0 || - dst_reg->index == VARYING_SLOT_BFC1) { - dst = ureg_saturate(dst); - } - break; - - case TGSI_PROCESSOR_FRAGMENT: - if (dst_reg->index == FRAG_RESULT_COLOR || - dst_reg->index >= FRAG_RESULT_DATA0) { - dst = ureg_saturate(dst); - } - break; - } - } if (dst_reg->reladdr != NULL) { assert(dst_reg->file != PROGRAM_TEMPORARY); @@ -4991,8 +4969,7 @@ translate_tex_offset(struct st_translate *t, static void compile_tgsi_instruction(struct st_translate *t, - const glsl_to_tgsi_instruction *inst, - bool clamp_dst_color_output) + const glsl_to_tgsi_instruction *inst) { struct ureg_program *ureg = t->ureg; GLuint i; @@ -5010,8 +4987,7 @@ compile_tgsi_instruction(struct st_translate *t, for (i = 0; i < num_dst; i++) dst[i] = translate_dst(t, &inst->dst[i], - inst->saturate, - clamp_dst_color_output); + inst->saturate); for (i = 0; i < num_src; i++) src[i] = translate_src(t, &inst->src[i]); @@ -5286,16 +5262,6 @@ emit_face_var(struct gl_context *ctx, struct st_translate *t) t->inputs[t->inputMapping[VARYING_SLOT_FACE]] = ureg_src(face_temp); } -static void -emit_edgeflags(struct st_translate *t) -{ - struct ureg_program *ureg = t->ureg; - struct ureg_dst edge_dst = t->outputs[t->outputMapping[VARYING_SLOT_EDGE]]; - struct ureg_src edge_src = t->inputs[t->inputMapping[VERT_ATTRIB_EDGEFLAG]]; - - ureg_MOV(ureg, edge_dst, edge_src); -} - static bool find_array(unsigned attr, struct array_decl *arrays, unsigned count, unsigned *array_id, unsigned *array_size) @@ -5353,9 +5319,7 @@ st_translate_program( const GLuint outputMapping[], const GLuint outputSlotToAttr[], const ubyte outputSemanticName[], - const ubyte outputSemanticIndex[], - boolean passthrough_edgeflags, - boolean clamp_color) + const 
ubyte outputSemanticIndex[]) { struct st_translate *t; unsigned i; @@ -5544,8 +5508,6 @@ st_translate_program( t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_X); } } - if (passthrough_edgeflags) - emit_edgeflags(t); } /* Declare address register. @@ -5696,7 +5658,7 @@ st_translate_program( */ foreach_in_list(glsl_to_tgsi_instruction, inst, &program->instructions) { set_insn_start(t, ureg_get_instruction_number(ureg)); - compile_tgsi_instruction(t, inst, clamp_color); + compile_tgsi_instruction(t, inst); } /* Fix up all emitted labels: diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.h b/src/mesa/state_tracker/st_glsl_to_tgsi.h index 4af747fa9de..c29fc768e49 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.h +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.h @@ -52,9 +52,7 @@ enum pipe_error st_translate_program( const GLuint outputMapping[], const GLuint outputSlotToAttr[], const ubyte outputSemanticName[], - const ubyte outputSemanticIndex[], - boolean passthrough_edgeflags, - boolean clamp_color); + const ubyte outputSemanticIndex[]); void free_glsl_to_tgsi_visitor(struct glsl_to_tgsi_visitor *v); void get_pixel_transfer_visitor(struct st_fragment_program *fp, diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c index 896e239ee68..4b9dc994ea5 100644 --- a/src/mesa/state_tracker/st_mesa_to_tgsi.c +++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c @@ -283,8 +283,7 @@ st_translate_texture_target( GLuint textarget, static struct ureg_dst translate_dst( struct st_translate *t, const struct prog_dst_register *DstReg, - boolean saturate, - boolean clamp_color) + boolean saturate) { struct ureg_dst dst = dst_register( t, DstReg->File, @@ -295,27 +294,6 @@ translate_dst( struct st_translate *t, if (saturate) dst = ureg_saturate( dst ); - else if (clamp_color && DstReg->File == PROGRAM_OUTPUT) { - /* Clamp colors for ARB_color_buffer_float. 
*/ - switch (t->procType) { - case TGSI_PROCESSOR_VERTEX: - /* This can only occur with a compatibility profile, which doesn't - * support geometry shaders. */ - if (DstReg->Index == VARYING_SLOT_COL0 || - DstReg->Index == VARYING_SLOT_COL1 || - DstReg->Index == VARYING_SLOT_BFC0 || - DstReg->Index == VARYING_SLOT_BFC1) { - dst = ureg_saturate(dst); - } - break; - - case TGSI_PROCESSOR_FRAGMENT: - if (DstReg->Index >= FRAG_RESULT_COLOR) { - dst = ureg_saturate(dst); - } - break; - } - } if (DstReg->RelAddr) dst = ureg_dst_indirect( dst, ureg_src(t->address[0]) ); @@ -649,8 +627,7 @@ static void compile_instruction( struct gl_context *ctx, struct st_translate *t, - const struct prog_instruction *inst, - boolean clamp_dst_color_output) + const struct prog_instruction *inst) { struct ureg_program *ureg = t->ureg; GLuint i; @@ -665,8 +642,7 @@ compile_instruction( if (num_dst) dst[0] = translate_dst( t, &inst->DstReg, - inst->Saturate, - clamp_dst_color_output); + inst->Saturate); for (i = 0; i < num_src; i++) src[i] = translate_src( t, &inst->SrcReg[i] ); @@ -974,18 +950,6 @@ emit_face_var( struct st_translate *t, } -static void -emit_edgeflags( struct st_translate *t, - const struct gl_program *program ) -{ - struct ureg_program *ureg = t->ureg; - struct ureg_dst edge_dst = t->outputs[t->outputMapping[VARYING_SLOT_EDGE]]; - struct ureg_src edge_src = t->inputs[t->inputMapping[VERT_ATTRIB_EDGEFLAG]]; - - ureg_MOV( ureg, edge_dst, edge_src ); -} - - /** * Translate Mesa program to TGSI format. 
* \param program the program to translate @@ -1019,9 +983,7 @@ st_translate_mesa_program( GLuint numOutputs, const GLuint outputMapping[], const ubyte outputSemanticName[], - const ubyte outputSemanticIndex[], - boolean passthrough_edgeflags, - boolean clamp_color) + const ubyte outputSemanticIndex[]) { struct st_translate translate, *t; unsigned i; @@ -1125,8 +1087,6 @@ st_translate_mesa_program( t->outputs[i] = ureg_writemask(t->outputs[i], TGSI_WRITEMASK_X); } } - if (passthrough_edgeflags) - emit_edgeflags( t, program ); } /* Declare address register. @@ -1231,7 +1191,7 @@ st_translate_mesa_program( */ for (i = 0; i < program->NumInstructions; i++) { set_insn_start( t, ureg_get_instruction_number( ureg )); - compile_instruction( ctx, t, &program->Instructions[i], clamp_color ); + compile_instruction(ctx, t, &program->Instructions[i]); } /* Fix up all emitted labels: diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.h b/src/mesa/state_tracker/st_mesa_to_tgsi.h index 62bb654e95a..ed7a3adfe1a 100644 --- a/src/mesa/state_tracker/st_mesa_to_tgsi.h +++ b/src/mesa/state_tracker/st_mesa_to_tgsi.h @@ -58,9 +58,7 @@ st_translate_mesa_program( GLuint numOutputs, const GLuint outputMapping[], const ubyte outputSemanticName[], - const ubyte outputSemanticIndex[], - boolean passthrough_edgeflags, - boolean clamp_color); + const ubyte outputSemanticIndex[]); unsigned st_translate_texture_target(GLuint textarget, GLboolean shadow); diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index bf6b492e80b..7a6720cee7c 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -404,9 +404,7 @@ st_translate_vertex_program(struct st_context *st, stvp->result_to_output, output_slot_to_attr, output_semantic_name, - output_semantic_index, - false, - false); + output_semantic_index); else error = st_translate_mesa_program(st->ctx, TGSI_PROCESSOR_VERTEX, @@ -422,9 +420,7 @@ st_translate_vertex_program(struct st_context 
*st, num_outputs, stvp->result_to_output, output_semantic_name, - output_semantic_index, - false, - false); + output_semantic_index); if (error) goto fail; @@ -859,8 +855,7 @@ st_translate_fragment_program(struct st_context *st, outputMapping, NULL, fs_output_semantic_name, - fs_output_semantic_index, FALSE, - false); + fs_output_semantic_index); else st_translate_mesa_program(st->ctx, TGSI_PROCESSOR_FRAGMENT, @@ -876,8 +871,7 @@ st_translate_fragment_program(struct st_context *st, fs_num_outputs, outputMapping, fs_output_semantic_name, - fs_output_semantic_index, FALSE, - false); + fs_output_semantic_index); variant->tgsi.tokens = ureg_get_tokens(ureg, NULL); ureg_destroy(ureg); @@ -1206,9 +1200,7 @@ st_translate_program_common(struct st_context *st, outputMapping, outputSlotToAttr, output_semantic_name, - output_semantic_index, - FALSE, - FALSE); + output_semantic_index); out_state->tokens = ureg_get_tokens(ureg, NULL); ureg_destroy(ureg); From f4ec81032bb9c1460794d3d843d0ffe47a181291 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 4 Oct 2015 02:38:55 +0200 Subject: [PATCH 039/270] st/mesa: implement glBitmap shader transformation using tgsi_transform_shader Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/Makefile.sources | 1 + src/mesa/state_tracker/st_cb_bitmap.c | 145 ---------------- src/mesa/state_tracker/st_cb_bitmap.h | 11 +- src/mesa/state_tracker/st_cb_bitmap_shader.c | 174 +++++++++++++++++++ src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 78 --------- src/mesa/state_tracker/st_glsl_to_tgsi.h | 3 - src/mesa/state_tracker/st_program.c | 34 ++-- 7 files changed, 202 insertions(+), 244 deletions(-) create mode 100644 src/mesa/state_tracker/st_cb_bitmap_shader.c diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources index b40ee4d395b..2dabea83075 100644 --- a/src/mesa/Makefile.sources +++ b/src/mesa/Makefile.sources @@ -415,6 +415,7 @@ STATETRACKER_FILES = \ state_tracker/st_cache.h 
\ state_tracker/st_cb_bitmap.c \ state_tracker/st_cb_bitmap.h \ + state_tracker/st_cb_bitmap_shader.c \ state_tracker/st_cb_blit.c \ state_tracker/st_cb_blit.h \ state_tracker/st_cb_bufferobjects.c \ diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c index 230eba8c4a5..bb6dfe85644 100644 --- a/src/mesa/state_tracker/st_cb_bitmap.c +++ b/src/mesa/state_tracker/st_cb_bitmap.c @@ -107,151 +107,6 @@ struct bitmap_cache #define Z_EPSILON 1e-06 -/** - * Make fragment program for glBitmap: - * Sample the texture and kill the fragment if the bit is 0. - * This program will be combined with the user's fragment program. - */ -static struct st_fragment_program * -make_bitmap_fragment_program(struct gl_context *ctx, GLuint samplerIndex) -{ - struct st_context *st = st_context(ctx); - struct st_fragment_program *stfp; - struct gl_program *p; - GLuint ic = 0; - - p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - if (!p) - return NULL; - - p->NumInstructions = 3; - - p->Instructions = _mesa_alloc_instructions(p->NumInstructions); - if (!p->Instructions) { - ctx->Driver.DeleteProgram(ctx, p); - return NULL; - } - _mesa_init_instructions(p->Instructions, p->NumInstructions); - - /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */ - p->Instructions[ic].Opcode = OPCODE_TEX; - p->Instructions[ic].DstReg.File = PROGRAM_TEMPORARY; - p->Instructions[ic].DstReg.Index = 0; - p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; - p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_TEX0; - p->Instructions[ic].TexSrcUnit = samplerIndex; - p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - - /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */ - p->Instructions[ic].Opcode = OPCODE_KIL; - p->Instructions[ic].SrcReg[0].File = PROGRAM_TEMPORARY; - - if (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM) - p->Instructions[ic].SrcReg[0].Swizzle = SWIZZLE_XXXX; - - p->Instructions[ic].SrcReg[0].Index = 0; - 
p->Instructions[ic].SrcReg[0].Negate = NEGATE_XYZW; - ic++; - - /* END; */ - p->Instructions[ic++].Opcode = OPCODE_END; - - assert(ic == p->NumInstructions); - - p->InputsRead = VARYING_BIT_TEX0; - p->OutputsWritten = 0x0; - p->SamplersUsed = (1 << samplerIndex); - - stfp = (struct st_fragment_program *) p; - stfp->Base.UsesKill = GL_TRUE; - - return stfp; -} - - -static struct gl_program * -make_bitmap_fragment_program_glsl(struct st_context *st, - struct st_fragment_program *orig, - GLuint samplerIndex) -{ - struct gl_context *ctx = st->ctx; - struct st_fragment_program *fp = (struct st_fragment_program *) - ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - - if (!fp) - return NULL; - - get_bitmap_visitor(fp, orig->glsl_to_tgsi, samplerIndex); - return &fp->Base.Base; -} - - -static int -find_free_bit(uint bitfield) -{ - int i; - for (i = 0; i < 32; i++) { - if ((bitfield & (1 << i)) == 0) { - return i; - } - } - return -1; -} - - -/** - * Combine basic bitmap fragment program with the user-defined program. - * \param st current context - * \param fpIn the incoming fragment program - * \param fpOut the new fragment program which does fragment culling - * \param bitmap_sampler sampler number for the bitmap texture - */ -void -st_make_bitmap_fragment_program(struct st_context *st, - struct gl_fragment_program *fpIn, - struct gl_fragment_program **fpOut, - GLuint *bitmap_sampler) -{ - struct st_fragment_program *bitmap_prog; - struct st_fragment_program *stfpIn = (struct st_fragment_program *) fpIn; - struct gl_program *newProg; - uint sampler; - - /* - * Generate new program which is the user-defined program prefixed - * with the bitmap sampler/kill instructions. 
- */ - sampler = find_free_bit(fpIn->Base.SamplersUsed); - - if (stfpIn->glsl_to_tgsi) - newProg = make_bitmap_fragment_program_glsl(st, stfpIn, sampler); - else { - bitmap_prog = make_bitmap_fragment_program(st->ctx, sampler); - - newProg = _mesa_combine_programs(st->ctx, - &bitmap_prog->Base.Base, - &fpIn->Base); - /* done with this after combining */ - st_reference_fragprog(st, &bitmap_prog, NULL); - } - -#if 0 - { - printf("Combined bitmap program:\n"); - _mesa_print_program(newProg); - printf("InputsRead: 0x%x\n", newProg->InputsRead); - printf("OutputsWritten: 0x%x\n", newProg->OutputsWritten); - _mesa_print_parameter_list(newProg->Parameters); - } -#endif - - /* return results */ - *fpOut = (struct gl_fragment_program *) newProg; - *bitmap_sampler = sampler; -} - - /** * Copy user-provide bitmap bits into texture buffer, expanding * bits into texels. diff --git a/src/mesa/state_tracker/st_cb_bitmap.h b/src/mesa/state_tracker/st_cb_bitmap.h index b4254ca1eeb..dc7e5cb5c9e 100644 --- a/src/mesa/state_tracker/st_cb_bitmap.h +++ b/src/mesa/state_tracker/st_cb_bitmap.h @@ -31,6 +31,7 @@ #include "main/compiler.h" +#include struct dd_function_table; struct st_context; @@ -46,14 +47,12 @@ st_init_bitmap(struct st_context *st); extern void st_destroy_bitmap(struct st_context *st); -extern void -st_make_bitmap_fragment_program(struct st_context *st, - struct gl_fragment_program *fpIn, - struct gl_fragment_program **fpOut, - GLuint *bitmap_sampler); - extern void st_flush_bitmap_cache(struct st_context *st); +extern const struct tgsi_token * +st_get_bitmap_shader(const struct tgsi_token *tokens, + unsigned sampler_index, + bool use_texcoord, bool swizzle_xxxx); #endif /* ST_CB_BITMAP_H */ diff --git a/src/mesa/state_tracker/st_cb_bitmap_shader.c b/src/mesa/state_tracker/st_cb_bitmap_shader.c new file mode 100644 index 00000000000..cddea36d4f6 --- /dev/null +++ b/src/mesa/state_tracker/st_cb_bitmap_shader.c @@ -0,0 +1,174 @@ 
+/************************************************************************** + * + * Copyright (C) 2015 Advanced Micro Devices, Inc. + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "st_cb_bitmap.h" +#include "tgsi/tgsi_transform.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_dump.h" +#include "util/u_debug.h" + +struct tgsi_bitmap_transform { + struct tgsi_transform_context base; + struct tgsi_shader_info info; + unsigned sampler_index; + bool use_texcoord; + bool swizzle_xxxx; + bool first_instruction_emitted; +}; + +static inline struct tgsi_bitmap_transform * +tgsi_bitmap_transform(struct tgsi_transform_context *tctx) +{ + return (struct tgsi_bitmap_transform *)tctx; +} + +static void +transform_instr(struct tgsi_transform_context *tctx, + struct tgsi_full_instruction *current_inst) +{ + struct tgsi_bitmap_transform *ctx = tgsi_bitmap_transform(tctx); + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + unsigned i, semantic; + int texcoord_index = -1; + + if (ctx->first_instruction_emitted) { + tctx->emit_instruction(tctx, current_inst); + return; + } + + ctx->first_instruction_emitted = true; + + /* Add TEMP[0] if it's missing. */ + if (ctx->info.file_max[TGSI_FILE_TEMPORARY] == -1) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_TEMPORARY; + tctx->emit_declaration(tctx, &decl); + } + + /* Add TEXCOORD[0] if it's missing. */ + semantic = ctx->use_texcoord ? 
TGSI_SEMANTIC_TEXCOORD : + TGSI_SEMANTIC_GENERIC; + for (i = 0; i < ctx->info.num_inputs; i++) { + if (ctx->info.input_semantic_name[i] == semantic && + ctx->info.input_semantic_index[i] == 0) { + texcoord_index = i; + break; + } + } + + if (texcoord_index == -1) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + decl.Declaration.Semantic = 1; + decl.Semantic.Name = semantic; + decl.Declaration.Interpolate = 1; + decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE; + decl.Range.First = decl.Range.Last = ctx->info.num_inputs; + texcoord_index = ctx->info.num_inputs; + tctx->emit_declaration(tctx, &decl); + } + + /* Declare the sampler. */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_SAMPLER; + decl.Range.First = decl.Range.Last = ctx->sampler_index; + tctx->emit_declaration(tctx, &decl); + + /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_TEX; + inst.Instruction.Texture = 1; + inst.Texture.Texture = TGSI_TEXTURE_2D; + + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = 0; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + + inst.Instruction.NumSrcRegs = 2; + inst.Src[0].Register.File = TGSI_FILE_INPUT; + inst.Src[0].Register.Index = texcoord_index; + inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X; + inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_Y; + inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_Z; + inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_W; + inst.Src[1].Register.File = TGSI_FILE_SAMPLER; + inst.Src[1].Register.Index = ctx->sampler_index; + + tctx->emit_instruction(tctx, &inst); + + /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_KILL_IF; + inst.Instruction.NumDstRegs = 0; + inst.Instruction.NumSrcRegs = 1; + + inst.Src[0].Register.File = 
TGSI_FILE_TEMPORARY; + inst.Src[0].Register.Index = 0; + inst.Src[0].Register.Negate = 1; + inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X; + if (ctx->swizzle_xxxx) { + inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_X; + inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_X; + inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_X; + } else { + inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_Y; + inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_Z; + inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_W; + } + tctx->emit_instruction(tctx, &inst); + + /* And emit the instruction we got. */ + tctx->emit_instruction(tctx, current_inst); +} + +const struct tgsi_token * +st_get_bitmap_shader(const struct tgsi_token *tokens, + unsigned sampler_index, + bool use_texcoord, bool swizzle_xxxx) +{ + struct tgsi_bitmap_transform ctx; + struct tgsi_token *newtoks; + int newlen; + + memset(&ctx, 0, sizeof(ctx)); + ctx.base.transform_instruction = transform_instr; + ctx.sampler_index = sampler_index; + ctx.use_texcoord = use_texcoord; + ctx.swizzle_xxxx = swizzle_xxxx; + tgsi_scan_shader(tokens, &ctx.info); + + newlen = tgsi_num_tokens(tokens) + 20; + newtoks = tgsi_alloc_tokens(newlen); + if (!newtoks) + return NULL; + + tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base); + return newtoks; +} diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index 1488ea07c1d..a54ee17173a 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -4466,84 +4466,6 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp, fp->glsl_to_tgsi = v; } -/** - * Make fragment program for glBitmap: - * Sample the texture and kill the fragment if the bit is 0. - * This program will be combined with the user's fragment program. - * - * Based on make_bitmap_fragment_program in st_cb_bitmap.c. 
- */ -extern "C" void -get_bitmap_visitor(struct st_fragment_program *fp, - glsl_to_tgsi_visitor *original, int samplerIndex) -{ - glsl_to_tgsi_visitor *v = new glsl_to_tgsi_visitor(); - struct st_context *st = st_context(original->ctx); - struct gl_program *prog = &fp->Base.Base; - st_src_reg coord, src0; - st_dst_reg dst0; - glsl_to_tgsi_instruction *inst; - - /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. */ - v->ctx = original->ctx; - v->prog = prog; - v->shader_program = NULL; - v->shader = NULL; - v->glsl_version = original->glsl_version; - v->native_integers = original->native_integers; - v->options = original->options; - v->next_temp = original->next_temp; - v->num_address_regs = original->num_address_regs; - v->samplers_used = prog->SamplersUsed = original->samplers_used; - v->indirect_addr_consts = original->indirect_addr_consts; - memcpy(&v->immediates, &original->immediates, sizeof(v->immediates)); - v->num_immediates = original->num_immediates; - - /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */ - coord = st_src_reg(PROGRAM_INPUT, VARYING_SLOT_TEX0, glsl_type::vec2_type); - src0 = v->get_temp(glsl_type::vec4_type); - dst0 = st_dst_reg(src0); - inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, dst0, coord); - inst->sampler.index = samplerIndex; - inst->sampler_array_size = 1; - inst->tex_target = TEXTURE_2D_INDEX; - - prog->InputsRead |= VARYING_BIT_TEX0; - prog->SamplersUsed |= (1 << samplerIndex); /* mark sampler as used */ - v->samplers_used |= (1 << samplerIndex); - - /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */ - src0.negate = NEGATE_XYZW; - if (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM) - src0.swizzle = SWIZZLE_XXXX; - inst = v->emit_asm(NULL, TGSI_OPCODE_KILL_IF, undef_dst, src0); - - /* Now copy the instructions from the original glsl_to_tgsi_visitor into the - * new visitor. 
*/ - foreach_in_list(glsl_to_tgsi_instruction, inst, &original->instructions) { - glsl_to_tgsi_instruction *newinst; - st_src_reg src_regs[4]; - - if (inst->dst[0].file == PROGRAM_OUTPUT) - prog->OutputsWritten |= BITFIELD64_BIT(inst->dst[0].index); - - for (int i = 0; i < 4; i++) { - src_regs[i] = inst->src[i]; - if (src_regs[i].file == PROGRAM_INPUT) - prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index); - } - - newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2], src_regs[3]); - newinst->tex_target = inst->tex_target; - newinst->sampler_array_size = inst->sampler_array_size; - } - - /* Make modifications to fragment program info. */ - prog->Parameters = _mesa_clone_parameter_list(original->prog->Parameters); - count_resources(v, prog); - fp->glsl_to_tgsi = v; -} - /* ------------------------- TGSI conversion stuff -------------------------- */ struct label { unsigned branch_target; diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.h b/src/mesa/state_tracker/st_glsl_to_tgsi.h index c29fc768e49..dcdfbebcbdc 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.h +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.h @@ -58,9 +58,6 @@ void free_glsl_to_tgsi_visitor(struct glsl_to_tgsi_visitor *v); void get_pixel_transfer_visitor(struct st_fragment_program *fp, struct glsl_to_tgsi_visitor *original, int scale_and_bias, int pixel_maps); -void get_bitmap_visitor(struct st_fragment_program *fp, - struct glsl_to_tgsi_visitor *original, - int samplerIndex); GLboolean st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog); diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 7a6720cee7c..fba661b5405 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -567,18 +567,7 @@ st_translate_fragment_program(struct st_context *st, assert(!(key->bitmap && key->drawpixels)); memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr)); - if (key->bitmap) { - /* 
glBitmap drawing */ - struct gl_fragment_program *fp; /* we free this temp program below */ - - st_make_bitmap_fragment_program(st, &stfp->Base, - &fp, &variant->bitmap_sampler); - - variant->parameters = _mesa_clone_parameter_list(fp->Base.Parameters); - stfp = st_fragment_program(fp); - deleteFP = GL_TRUE; - } - else if (key->drawpixels) { + if (key->drawpixels) { /* glDrawPixels drawing */ struct gl_fragment_program *fp; /* we free this temp program below */ @@ -892,6 +881,27 @@ st_translate_fragment_program(struct st_context *st, fprintf(stderr, "mesa: cannot emulate deprecated features\n"); } + /* glBitmap */ + if (key->bitmap) { + const struct tgsi_token *tokens; + + variant->bitmap_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1; + + tokens = st_get_bitmap_shader(variant->tgsi.tokens, + variant->bitmap_sampler, + st->needs_texcoord_semantic, + st->bitmap.tex_format == + PIPE_FORMAT_L8_UNORM); + + if (tokens) { + tgsi_free_tokens(variant->tgsi.tokens); + variant->tgsi.tokens = tokens; + variant->parameters = + _mesa_clone_parameter_list(stfp->Base.Base.Parameters); + } else + fprintf(stderr, "mesa: cannot create a shader for glBitmap\n"); + } + if (ST_DEBUG & DEBUG_TGSI) { tgsi_dump(variant->tgsi.tokens, 0/*TGSI_DUMP_VERBOSE*/); debug_printf("\n"); From b55b986dc9c89a3a4fb3956dcd269216f59b06ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 4 Oct 2015 18:23:33 +0200 Subject: [PATCH 040/270] st/mesa: make Z/S drawpix shaders independent of variants, don't use Mesa IR v2 - there is no connection to user fragment shaders, so having these as shader variants makes no sense - don't use Mesa IR, use TGSI - don't create gl_fragment_program, just create the shader CSO v2: generate exactly the same shader as before to fix llvmpipe Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_cb_drawpixels.c | 171 +++++++--------------- src/mesa/state_tracker/st_cb_drawpixels.h | 6 - 
src/mesa/state_tracker/st_context.h | 2 +- src/mesa/state_tracker/st_program.c | 15 +- src/mesa/state_tracker/st_program.h | 2 - 5 files changed, 60 insertions(+), 136 deletions(-) diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c index 152160e1dd2..f77d3049ae6 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.c +++ b/src/mesa/state_tracker/st_cb_drawpixels.c @@ -189,100 +189,70 @@ st_make_drawpix_fragment_program(struct st_context *st, * stencil value value, then writes to FRAG_RESULT_DEPTH/FRAG_RESULT_STENCIL. * Used for glDrawPixels(GL_DEPTH_COMPONENT / GL_STENCIL_INDEX). * Pass fragment color through as-is. - * \return pointer to the gl_fragment program + * + * \return CSO of the fragment shader. */ -struct gl_fragment_program * -st_make_drawpix_z_stencil_program(struct st_context *st, - GLboolean write_depth, - GLboolean write_stencil) +static void * +get_drawpix_z_stencil_program(struct st_context *st, + GLboolean write_depth, + GLboolean write_stencil) { - struct gl_context *ctx = st->ctx; - struct gl_program *p; - struct gl_fragment_program *fp; - GLuint ic = 0; + struct ureg_program *ureg; + struct ureg_src depth_sampler, stencil_sampler; + struct ureg_src texcoord, color; + struct ureg_dst out_color, out_depth, out_stencil; const GLuint shaderIndex = write_depth * 2 + write_stencil; + void *cso; - assert(shaderIndex < ARRAY_SIZE(st->drawpix.shaders)); + assert(shaderIndex < ARRAY_SIZE(st->drawpix.zs_shaders)); - if (st->drawpix.shaders[shaderIndex]) { + if (st->drawpix.zs_shaders[shaderIndex]) { /* already have the proper shader */ - return st->drawpix.shaders[shaderIndex]; + return st->drawpix.zs_shaders[shaderIndex]; } - /* - * Create shader now - */ - p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - if (!p) + ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT); + if (ureg == NULL) return NULL; - p->NumInstructions = write_depth ? 3 : 1; - p->NumInstructions += write_stencil ? 
1 : 0; - - p->Instructions = _mesa_alloc_instructions(p->NumInstructions); - if (!p->Instructions) { - ctx->Driver.DeleteProgram(ctx, p); - return NULL; - } - _mesa_init_instructions(p->Instructions, p->NumInstructions); + ureg_property(ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, TRUE); if (write_depth) { - /* TEX result.depth, fragment.texcoord[0], texture[0], 2D; */ - p->Instructions[ic].Opcode = OPCODE_TEX; - p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT; - p->Instructions[ic].DstReg.Index = FRAG_RESULT_DEPTH; - p->Instructions[ic].DstReg.WriteMask = WRITEMASK_Z; - p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; - p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_TEX0; - p->Instructions[ic].TexSrcUnit = 0; - p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - /* MOV result.color, fragment.color; */ - p->Instructions[ic].Opcode = OPCODE_MOV; - p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT; - p->Instructions[ic].DstReg.Index = FRAG_RESULT_COLOR; - p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; - p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_COL0; - ic++; + color = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, + TGSI_INTERPOLATE_COLOR); + out_color = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0); + + depth_sampler = ureg_DECL_sampler(ureg, 0); + out_depth = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); } if (write_stencil) { - /* TEX result.stencil, fragment.texcoord[0], texture[0], 2D; */ - p->Instructions[ic].Opcode = OPCODE_TEX; - p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT; - p->Instructions[ic].DstReg.Index = FRAG_RESULT_STENCIL; - p->Instructions[ic].DstReg.WriteMask = WRITEMASK_Y; - p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT; - p->Instructions[ic].SrcReg[0].Index = VARYING_SLOT_TEX0; - p->Instructions[ic].TexSrcUnit = 1; - p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; + stencil_sampler = ureg_DECL_sampler(ureg, 1); + out_stencil = ureg_DECL_output(ureg, TGSI_SEMANTIC_STENCIL, 0); } - /* 
END; */ - p->Instructions[ic++].Opcode = OPCODE_END; + texcoord = ureg_DECL_fs_input(ureg, + st->needs_texcoord_semantic ? + TGSI_SEMANTIC_TEXCOORD : + TGSI_SEMANTIC_GENERIC, + 0, TGSI_INTERPOLATE_LINEAR); - assert(ic == p->NumInstructions); - - p->InputsRead = VARYING_BIT_TEX0 | VARYING_BIT_COL0; - p->OutputsWritten = 0; if (write_depth) { - p->OutputsWritten |= BITFIELD64_BIT(FRAG_RESULT_DEPTH); - p->OutputsWritten |= BITFIELD64_BIT(FRAG_RESULT_COLOR); + ureg_TEX(ureg, ureg_writemask(out_depth, TGSI_WRITEMASK_Z), + TGSI_TEXTURE_2D, texcoord, depth_sampler); + ureg_MOV(ureg, out_color, color); } - if (write_stencil) - p->OutputsWritten |= BITFIELD64_BIT(FRAG_RESULT_STENCIL); - p->SamplersUsed = 0x1; /* sampler 0 (bit 0) is used */ if (write_stencil) - p->SamplersUsed |= 1 << 1; + ureg_TEX(ureg, ureg_writemask(out_stencil, TGSI_WRITEMASK_Y), + TGSI_TEXTURE_2D, texcoord, stencil_sampler); - fp = (struct gl_fragment_program *) p; + ureg_END(ureg); + cso = ureg_create_shader_and_destroy(ureg, st->pipe); /* save the new shader */ - st->drawpix.shaders[shaderIndex] = fp; - - return fp; + st->drawpix.zs_shaders[shaderIndex] = cso; + return cso; } @@ -1047,30 +1017,6 @@ get_color_fp_variant(struct st_context *st) } -/** - * Get fragment program variant for a glDrawPixels or glCopyPixels - * command for depth/stencil data. - */ -static struct st_fp_variant * -get_depth_stencil_fp_variant(struct st_context *st, GLboolean write_depth, - GLboolean write_stencil) -{ - struct st_fp_variant_key key; - struct st_fp_variant *fpv; - - memset(&key, 0, sizeof(key)); - - key.st = st; - key.drawpixels = 1; - key.drawpixels_z = write_depth; - key.drawpixels_stencil = write_stencil; - - fpv = st_get_fp_variant(st, st->fp, &key); - - return fpv; -} - - /** * Clamp glDrawPixels width and height to the maximum texture size. 
*/ @@ -1109,7 +1055,6 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, GLboolean write_stencil = GL_FALSE, write_depth = GL_FALSE; struct pipe_sampler_view *sv[2] = { NULL }; int num_sampler_view = 1; - struct st_fp_variant *fpv; struct gl_pixelstore_attrib clippedUnpack; /* Mesa state should be up to date by now */ @@ -1144,19 +1089,15 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, * Get vertex/fragment shaders */ if (write_depth || write_stencil) { - fpv = get_depth_stencil_fp_variant(st, write_depth, write_stencil); - - driver_fp = fpv->driver_shader; - + driver_fp = get_drawpix_z_stencil_program(st, write_depth, + write_stencil); driver_vp = make_passthrough_vertex_shader(st, GL_TRUE); - color = ctx->Current.RasterColor; } else { - fpv = get_color_fp_variant(st); + struct st_fp_variant *fpv = get_color_fp_variant(st); driver_fp = fpv->driver_shader; - driver_vp = make_passthrough_vertex_shader(st, GL_FALSE); color = NULL; @@ -1165,10 +1106,10 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, st->pixel_xfer.pixelmap_sampler_view); num_sampler_view++; } - } - /* update fragment program constants */ - st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); + /* update fragment program constants */ + st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); + } /* draw with textured quad */ { @@ -1459,7 +1400,6 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, GLboolean invertTex = GL_FALSE; GLint readX, readY, readW, readH; struct gl_pixelstore_attrib pack = ctx->DefaultPacking; - struct st_fp_variant *fpv; st_validate_state(st); @@ -1491,12 +1431,12 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, * Get vertex/fragment shaders */ if (type == GL_COLOR) { + struct st_fp_variant *fpv = get_color_fp_variant(st); + rbRead = st_get_color_read_renderbuffer(ctx); color = NULL; - fpv = get_color_fp_variant(st); driver_fp = fpv->driver_shader; - driver_vp = make_passthrough_vertex_shader(st, 
GL_FALSE); if (st->pixel_xfer.pixelmap_enabled) { @@ -1504,6 +1444,9 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, st->pixel_xfer.pixelmap_sampler_view); num_sampler_view++; } + + /* update fragment program constants */ + st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); } else { assert(type == GL_DEPTH); @@ -1511,15 +1454,10 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, Attachment[BUFFER_DEPTH].Renderbuffer); color = ctx->Current.Attrib[VERT_ATTRIB_COLOR0]; - fpv = get_depth_stencil_fp_variant(st, GL_TRUE, GL_FALSE); - driver_fp = fpv->driver_shader; - + driver_fp = get_drawpix_z_stencil_program(st, GL_TRUE, GL_FALSE); driver_vp = make_passthrough_vertex_shader(st, GL_TRUE); } - /* update fragment program constants */ - st_upload_constants(st, fpv->parameters, PIPE_SHADER_FRAGMENT); - /* Choose the format for the temporary texture. */ srcFormat = rbRead->texture->format; srcBind = PIPE_BIND_SAMPLER_VIEW | @@ -1666,9 +1604,10 @@ st_destroy_drawpix(struct st_context *st) { GLuint i; - for (i = 0; i < ARRAY_SIZE(st->drawpix.shaders); i++) { - if (st->drawpix.shaders[i]) - _mesa_reference_fragprog(st->ctx, &st->drawpix.shaders[i], NULL); + for (i = 0; i < ARRAY_SIZE(st->drawpix.zs_shaders); i++) { + if (st->drawpix.zs_shaders[i]) + cso_delete_fragment_shader(st->cso_context, + st->drawpix.zs_shaders[i]); } st_reference_fragprog(st, &st->pixel_xfer.combined_prog, NULL); diff --git a/src/mesa/state_tracker/st_cb_drawpixels.h b/src/mesa/state_tracker/st_cb_drawpixels.h index c707ace2f9f..c6649cbd51c 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.h +++ b/src/mesa/state_tracker/st_cb_drawpixels.h @@ -45,10 +45,4 @@ st_make_drawpix_fragment_program(struct st_context *st, struct gl_fragment_program *fpIn, struct gl_fragment_program **fpOut); -extern struct gl_fragment_program * -st_make_drawpix_z_stencil_program(struct st_context *st, - GLboolean write_depth, - GLboolean write_stencil); - - #endif /* 
ST_CB_DRAWPIXELS_H */ diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h index a4cda29059d..262581eceeb 100644 --- a/src/mesa/state_tracker/st_context.h +++ b/src/mesa/state_tracker/st_context.h @@ -184,7 +184,7 @@ struct st_context /** for glDraw/CopyPixels */ struct { - struct gl_fragment_program *shaders[4]; + void *zs_shaders[4]; void *vert_shaders[2]; /**< ureg shaders */ } drawpix; diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index fba661b5405..d900ede7265 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -568,19 +568,12 @@ st_translate_fragment_program(struct st_context *st, memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr)); if (key->drawpixels) { - /* glDrawPixels drawing */ + /* glDrawPixels color drawing */ struct gl_fragment_program *fp; /* we free this temp program below */ - if (key->drawpixels_z || key->drawpixels_stencil) { - fp = st_make_drawpix_z_stencil_program(st, key->drawpixels_z, - key->drawpixels_stencil); - } - else { - /* RGBA */ - st_make_drawpix_fragment_program(st, &stfp->Base, &fp); - variant->parameters = _mesa_clone_parameter_list(fp->Base.Parameters); - deleteFP = GL_TRUE; - } + st_make_drawpix_fragment_program(st, &stfp->Base, &fp); + variant->parameters = _mesa_clone_parameter_list(fp->Base.Parameters); + deleteFP = GL_TRUE; stfp = st_fragment_program(fp); } diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index c60d2d5f803..2927d542dfc 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -59,8 +59,6 @@ struct st_fp_variant_key GLuint drawpixels:1; /**< glDrawPixels variant */ GLuint scaleAndBias:1; /**< glDrawPixels w/ scale and/or bias? */ GLuint pixelMaps:1; /**< glDrawPixels w/ pixel lookup map? 
*/ - GLuint drawpixels_z:1; /**< glDrawPixels(GL_DEPTH) */ - GLuint drawpixels_stencil:1; /**< glDrawPixels(GL_STENCIL) */ /** for ARB_color_buffer_float */ GLuint clamp_color:1; From f15bb3e633d577fe6d8d7bc2c64497c6ac4c2021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 4 Oct 2015 02:38:55 +0200 Subject: [PATCH 041/270] st/mesa: implement DrawPixels shader transformation using tgsi_transform_shader Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/Makefile.sources | 1 + .../state_tracker/st_atom_pixeltransfer.c | 225 +--------------- src/mesa/state_tracker/st_cb_drawpixels.c | 118 +------- src/mesa/state_tracker/st_cb_drawpixels.h | 9 +- .../state_tracker/st_cb_drawpixels_shader.c | 255 ++++++++++++++++++ src/mesa/state_tracker/st_context.c | 6 +- src/mesa/state_tracker/st_context.h | 7 - src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 132 --------- src/mesa/state_tracker/st_glsl_to_tgsi.h | 3 - src/mesa/state_tracker/st_program.c | 51 ++-- 10 files changed, 303 insertions(+), 504 deletions(-) create mode 100644 src/mesa/state_tracker/st_cb_drawpixels_shader.c diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources index 2dabea83075..13208b5e421 100644 --- a/src/mesa/Makefile.sources +++ b/src/mesa/Makefile.sources @@ -426,6 +426,7 @@ STATETRACKER_FILES = \ state_tracker/st_cb_condrender.h \ state_tracker/st_cb_drawpixels.c \ state_tracker/st_cb_drawpixels.h \ + state_tracker/st_cb_drawpixels_shader.c \ state_tracker/st_cb_drawtex.c \ state_tracker/st_cb_drawtex.h \ state_tracker/st_cb_eglimage.c \ diff --git a/src/mesa/state_tracker/st_atom_pixeltransfer.c b/src/mesa/state_tracker/st_atom_pixeltransfer.c index a04163cc137..f94c358afba 100644 --- a/src/mesa/state_tracker/st_atom_pixeltransfer.c +++ b/src/mesa/state_tracker/st_atom_pixeltransfer.c @@ -25,65 +25,17 @@ * **************************************************************************/ -/* - * Generate fragment programs to implement 
pixel transfer ops, such as - * scale/bias, colortable, convolution... - * - * Authors: +/* Authors: * Brian Paul */ -#include "main/imports.h" -#include "main/image.h" -#include "main/macros.h" -#include "program/program.h" -#include "program/prog_cache.h" -#include "program/prog_instruction.h" -#include "program/prog_parameter.h" -#include "program/prog_print.h" - #include "st_context.h" -#include "st_format.h" #include "st_texture.h" -#include "pipe/p_screen.h" -#include "pipe/p_context.h" #include "util/u_inlines.h" #include "util/u_pack_color.h" -struct state_key -{ - GLuint scaleAndBias:1; - GLuint pixelMaps:1; - -#if 0 - GLfloat Maps[3][256][4]; - int NumMaps; - GLint NumStages; - pipeline_stage Stages[STAGE_MAX]; - GLboolean StagesUsed[STAGE_MAX]; - GLfloat Scale1[4], Bias1[4]; - GLfloat Scale2[4], Bias2[4]; -#endif -}; - -static void -make_state_key(struct gl_context *ctx, struct state_key *key) -{ - memset(key, 0, sizeof(*key)); - - if (ctx->Pixel.RedBias != 0.0 || ctx->Pixel.RedScale != 1.0 || - ctx->Pixel.GreenBias != 0.0 || ctx->Pixel.GreenScale != 1.0 || - ctx->Pixel.BlueBias != 0.0 || ctx->Pixel.BlueScale != 1.0 || - ctx->Pixel.AlphaBias != 0.0 || ctx->Pixel.AlphaScale != 1.0) { - key->scaleAndBias = 1; - } - - key->pixelMaps = ctx->Pixel.MapColorFlag; -} - - /** * Update the pixelmap texture with the contents of the R/G/B/A pixel maps. */ @@ -128,74 +80,15 @@ load_color_map_texture(struct gl_context *ctx, struct pipe_resource *pt) pipe_transfer_unmap(pipe, transfer); } - - -#define MAX_INST 100 - /** - * Returns a fragment program which implements the current pixel transfer ops. + * Upload the pixel transfer color map texture. 
*/ -static struct gl_fragment_program * -get_pixel_transfer_program(struct gl_context *ctx, const struct state_key *key) +static void +update_pixel_transfer(struct st_context *st) { - struct st_context *st = st_context(ctx); - struct prog_instruction inst[MAX_INST]; - struct gl_program_parameter_list *params; - struct gl_fragment_program *fp; - GLuint ic = 0; - const GLuint colorTemp = 0; - - fp = (struct gl_fragment_program *) - ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - if (!fp) - return NULL; - - params = _mesa_new_parameter_list(); - - /* - * Get initial pixel color from the texture. - * TEX colorTemp, fragment.texcoord[0], texture[0], 2D; - */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_TEX; - inst[ic].DstReg.File = PROGRAM_TEMPORARY; - inst[ic].DstReg.Index = colorTemp; - inst[ic].SrcReg[0].File = PROGRAM_INPUT; - inst[ic].SrcReg[0].Index = VARYING_SLOT_TEX0; - inst[ic].TexSrcUnit = 0; - inst[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - fp->Base.InputsRead = BITFIELD64_BIT(VARYING_SLOT_TEX0); - fp->Base.OutputsWritten = BITFIELD64_BIT(FRAG_RESULT_COLOR); - fp->Base.SamplersUsed = 0x1; /* sampler 0 (bit 0) is used */ - - if (key->scaleAndBias) { - static const gl_state_index scale_state[STATE_LENGTH] = - { STATE_INTERNAL, STATE_PT_SCALE, 0, 0, 0 }; - static const gl_state_index bias_state[STATE_LENGTH] = - { STATE_INTERNAL, STATE_PT_BIAS, 0, 0, 0 }; - GLint scale_p, bias_p; - - scale_p = _mesa_add_state_reference(params, scale_state); - bias_p = _mesa_add_state_reference(params, bias_state); - - /* MAD colorTemp, colorTemp, scale, bias; */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_MAD; - inst[ic].DstReg.File = PROGRAM_TEMPORARY; - inst[ic].DstReg.Index = colorTemp; - inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY; - inst[ic].SrcReg[0].Index = colorTemp; - inst[ic].SrcReg[1].File = PROGRAM_STATE_VAR; - inst[ic].SrcReg[1].Index = scale_p; - inst[ic].SrcReg[2].File = PROGRAM_STATE_VAR; - 
inst[ic].SrcReg[2].Index = bias_p; - ic++; - } - - if (key->pixelMaps) { - const GLuint temp = 1; + struct gl_context *ctx = st->ctx; + if (ctx->Pixel.MapColorFlag) { /* create the colormap/texture now if not already done */ if (!st->pixel_xfer.pixelmap_texture) { st->pixel_xfer.pixelmap_texture = st_create_color_map_texture(ctx); @@ -203,117 +96,11 @@ get_pixel_transfer_program(struct gl_context *ctx, const struct state_key *key) st_create_texture_sampler_view(st->pipe, st->pixel_xfer.pixelmap_texture); } - - /* with a little effort, we can do four pixel map look-ups with - * two TEX instructions: - */ - - /* TEX temp.rg, colorTemp.rgba, texture[1], 2D; */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_TEX; - inst[ic].DstReg.File = PROGRAM_TEMPORARY; - inst[ic].DstReg.Index = temp; - inst[ic].DstReg.WriteMask = WRITEMASK_XY; /* write R,G */ - inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY; - inst[ic].SrcReg[0].Index = colorTemp; - inst[ic].TexSrcUnit = 1; - inst[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - - /* TEX temp.ba, colorTemp.baba, texture[1], 2D; */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_TEX; - inst[ic].DstReg.File = PROGRAM_TEMPORARY; - inst[ic].DstReg.Index = temp; - inst[ic].DstReg.WriteMask = WRITEMASK_ZW; /* write B,A */ - inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY; - inst[ic].SrcReg[0].Index = colorTemp; - inst[ic].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, - SWIZZLE_Z, SWIZZLE_W); - inst[ic].TexSrcUnit = 1; - inst[ic].TexSrcTarget = TEXTURE_2D_INDEX; - ic++; - - /* MOV colorTemp, temp; */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_MOV; - inst[ic].DstReg.File = PROGRAM_TEMPORARY; - inst[ic].DstReg.Index = colorTemp; - inst[ic].SrcReg[0].File = PROGRAM_TEMPORARY; - inst[ic].SrcReg[0].Index = temp; - ic++; - - fp->Base.SamplersUsed |= (1 << 1); /* sampler 1 is used */ - } - - /* Modify last instruction's dst reg to write to result.color */ - { - struct 
prog_instruction *last = &inst[ic - 1]; - last->DstReg.File = PROGRAM_OUTPUT; - last->DstReg.Index = FRAG_RESULT_COLOR; - } - - /* END; */ - _mesa_init_instructions(inst + ic, 1); - inst[ic].Opcode = OPCODE_END; - ic++; - - assert(ic <= MAX_INST); - - - fp->Base.Instructions = _mesa_alloc_instructions(ic); - if (!fp->Base.Instructions) { - _mesa_error(ctx, GL_OUT_OF_MEMORY, - "generating pixel transfer program"); - _mesa_free_parameter_list(params); - return NULL; - } - - _mesa_copy_instructions(fp->Base.Instructions, inst, ic); - fp->Base.NumInstructions = ic; - fp->Base.Parameters = params; - -#if 0 - printf("========= pixel transfer prog\n"); - _mesa_print_program(&fp->Base); - _mesa_print_parameter_list(fp->Base.Parameters); -#endif - - return fp; -} - - - -/** - * Update st->pixel_xfer.program in response to new pixel-transfer state. - */ -static void -update_pixel_transfer(struct st_context *st) -{ - struct gl_context *ctx = st->ctx; - struct state_key key; - struct gl_fragment_program *fp; - - make_state_key(st->ctx, &key); - - fp = (struct gl_fragment_program *) - _mesa_search_program_cache(st->pixel_xfer.cache, &key, sizeof(key)); - if (!fp) { - fp = get_pixel_transfer_program(st->ctx, &key); - _mesa_program_cache_insert(st->ctx, st->pixel_xfer.cache, - &key, sizeof(key), &fp->Base); - } - - if (ctx->Pixel.MapColorFlag) { load_color_map_texture(ctx, st->pixel_xfer.pixelmap_texture); } - st->pixel_xfer.pixelmap_enabled = ctx->Pixel.MapColorFlag; - - st->pixel_xfer.program = (struct st_fragment_program *) fp; } - const struct st_tracked_state st_update_pixel_transfer = { "st_update_pixel_transfer", /* name */ { /* dirty */ diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c index f77d3049ae6..34163ede4a2 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.c +++ b/src/mesa/state_tracker/st_cb_drawpixels.c @@ -71,119 +71,6 @@ #include "cso_cache/cso_context.h" -/** - * Check if the given program is: - * 0: MOVE 
result.color, fragment.color; - * 1: END; - */ -static GLboolean -is_passthrough_program(const struct gl_fragment_program *prog) -{ - if (prog->Base.NumInstructions == 2) { - const struct prog_instruction *inst = prog->Base.Instructions; - if (inst[0].Opcode == OPCODE_MOV && - inst[1].Opcode == OPCODE_END && - inst[0].DstReg.File == PROGRAM_OUTPUT && - inst[0].DstReg.Index == FRAG_RESULT_COLOR && - inst[0].DstReg.WriteMask == WRITEMASK_XYZW && - inst[0].SrcReg[0].File == PROGRAM_INPUT && - inst[0].SrcReg[0].Index == VARYING_SLOT_COL0 && - inst[0].SrcReg[0].Swizzle == SWIZZLE_XYZW) { - return GL_TRUE; - } - } - return GL_FALSE; -} - - -/** - * Returns a fragment program which implements the current pixel transfer ops. - */ -static struct gl_fragment_program * -get_glsl_pixel_transfer_program(struct st_context *st, - struct st_fragment_program *orig) -{ - int pixelMaps = 0, scaleAndBias = 0; - struct gl_context *ctx = st->ctx; - struct st_fragment_program *fp = (struct st_fragment_program *) - ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0); - - if (!fp) - return NULL; - - if (ctx->Pixel.RedBias != 0.0 || ctx->Pixel.RedScale != 1.0 || - ctx->Pixel.GreenBias != 0.0 || ctx->Pixel.GreenScale != 1.0 || - ctx->Pixel.BlueBias != 0.0 || ctx->Pixel.BlueScale != 1.0 || - ctx->Pixel.AlphaBias != 0.0 || ctx->Pixel.AlphaScale != 1.0) { - scaleAndBias = 1; - } - - pixelMaps = ctx->Pixel.MapColorFlag; - - if (pixelMaps) { - /* create the colormap/texture now if not already done */ - if (!st->pixel_xfer.pixelmap_texture) { - st->pixel_xfer.pixelmap_texture = st_create_color_map_texture(ctx); - st->pixel_xfer.pixelmap_sampler_view = - st_create_texture_sampler_view(st->pipe, - st->pixel_xfer.pixelmap_texture); - } - } - - get_pixel_transfer_visitor(fp, orig->glsl_to_tgsi, - scaleAndBias, pixelMaps); - - return &fp->Base; -} - - -/** - * Make fragment shader for glDraw/CopyPixels. This shader is made - * by combining the pixel transfer shader with the user-defined shader. 
- * \param fpIn the current/incoming fragment program - * \param fpOut returns the combined fragment program - */ -void -st_make_drawpix_fragment_program(struct st_context *st, - struct gl_fragment_program *fpIn, - struct gl_fragment_program **fpOut) -{ - struct gl_program *newProg; - struct st_fragment_program *stfp = (struct st_fragment_program *) fpIn; - - if (is_passthrough_program(fpIn)) { - newProg = (struct gl_program *) _mesa_clone_fragment_program(st->ctx, - &st->pixel_xfer.program->Base); - } - else if (stfp->glsl_to_tgsi != NULL) { - newProg = (struct gl_program *) get_glsl_pixel_transfer_program(st, stfp); - } - else { -#if 0 - /* debug */ - printf("Base program:\n"); - _mesa_print_program(&fpIn->Base); - printf("DrawPix program:\n"); - _mesa_print_program(&st->pixel_xfer.program->Base.Base); -#endif - newProg = _mesa_combine_programs(st->ctx, - &st->pixel_xfer.program->Base.Base, - &fpIn->Base); - } - -#if 0 - /* debug */ - printf("Combined DrawPixels program:\n"); - _mesa_print_program(newProg); - printf("InputsRead: 0x%x\n", newProg->InputsRead); - printf("OutputsWritten: 0x%x\n", newProg->OutputsWritten); - _mesa_print_parameter_list(newProg->Parameters); -#endif - - *fpOut = (struct gl_fragment_program *) newProg; -} - - /** * Create fragment program that does a TEX() instruction to get a Z and/or * stencil value value, then writes to FRAG_RESULT_DEPTH/FRAG_RESULT_STENCIL. 
@@ -1101,7 +988,7 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, driver_vp = make_passthrough_vertex_shader(st, GL_FALSE); color = NULL; - if (st->pixel_xfer.pixelmap_enabled) { + if (ctx->Pixel.MapColorFlag) { pipe_sampler_view_reference(&sv[1], st->pixel_xfer.pixelmap_sampler_view); num_sampler_view++; @@ -1439,7 +1326,7 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, driver_fp = fpv->driver_shader; driver_vp = make_passthrough_vertex_shader(st, GL_FALSE); - if (st->pixel_xfer.pixelmap_enabled) { + if (ctx->Pixel.MapColorFlag) { pipe_sampler_view_reference(&sv[1], st->pixel_xfer.pixelmap_sampler_view); num_sampler_view++; @@ -1610,7 +1497,6 @@ st_destroy_drawpix(struct st_context *st) st->drawpix.zs_shaders[i]); } - st_reference_fragprog(st, &st->pixel_xfer.combined_prog, NULL); if (st->drawpix.vert_shaders[0]) cso_delete_vertex_shader(st->cso_context, st->drawpix.vert_shaders[0]); if (st->drawpix.vert_shaders[1]) diff --git a/src/mesa/state_tracker/st_cb_drawpixels.h b/src/mesa/state_tracker/st_cb_drawpixels.h index c6649cbd51c..b8a34952141 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.h +++ b/src/mesa/state_tracker/st_cb_drawpixels.h @@ -31,6 +31,7 @@ #include "main/compiler.h" +#include <stdbool.h> struct dd_function_table; struct st_context; @@ -40,9 +41,9 @@ extern void st_init_drawpixels_functions(struct dd_function_table *functions); extern void st_destroy_drawpix(struct st_context *st); -extern void -st_make_drawpix_fragment_program(struct st_context *st, - struct gl_fragment_program *fpIn, - struct gl_fragment_program **fpOut); +extern const struct tgsi_token * +st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord, + bool scale_and_bias, unsigned scale_const, + unsigned bias_const, bool pixel_maps); #endif /* ST_CB_DRAWPIXELS_H */ diff --git a/src/mesa/state_tracker/st_cb_drawpixels_shader.c b/src/mesa/state_tracker/st_cb_drawpixels_shader.c new file mode 100644 index 00000000000..01db0b6d73e --- /dev/null +++ 
b/src/mesa/state_tracker/st_cb_drawpixels_shader.c @@ -0,0 +1,255 @@ +/************************************************************************** + * + * Copyright (C) 2015 Advanced Micro Devices, Inc. + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "st_cb_drawpixels.h" +#include "tgsi/tgsi_transform.h" +#include "tgsi/tgsi_scan.h" + +struct tgsi_drawpix_transform { + struct tgsi_transform_context base; + struct tgsi_shader_info info; + bool use_texcoord; + bool scale_and_bias; + bool pixel_maps; + bool first_instruction_emitted; + unsigned scale_const; + unsigned bias_const; + unsigned color_temp; +}; + +static inline struct tgsi_drawpix_transform * +tgsi_drawpix_transform(struct tgsi_transform_context *tctx) +{ + return (struct tgsi_drawpix_transform *)tctx; +} + +static void +set_src(struct tgsi_full_instruction *inst, unsigned i, unsigned file, unsigned index, + unsigned x, unsigned y, unsigned z, unsigned w) +{ + inst->Src[i].Register.File = file; + inst->Src[i].Register.Index = index; + inst->Src[i].Register.SwizzleX = x; + inst->Src[i].Register.SwizzleY = y; + inst->Src[i].Register.SwizzleZ = z; + inst->Src[i].Register.SwizzleW = w; +} + +#define SET_SRC(inst, i, file, index, x, y, z, w) \ + set_src(inst, i, file, index, TGSI_SWIZZLE_##x, TGSI_SWIZZLE_##y, \ + TGSI_SWIZZLE_##z, TGSI_SWIZZLE_##w) + +static void +transform_instr(struct tgsi_transform_context *tctx, + struct tgsi_full_instruction *current_inst) +{ + struct tgsi_drawpix_transform *ctx = tgsi_drawpix_transform(tctx); + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + unsigned i, semantic; + int texcoord_index = -1; + + if (ctx->first_instruction_emitted) + goto transform_inst; + + ctx->first_instruction_emitted = true; + + /* Add scale and bias constants. 
*/ + if (ctx->scale_and_bias) { + if (ctx->info.const_file_max[0] < (int)ctx->scale_const) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_CONSTANT; + decl.Range.First = decl.Range.Last = ctx->scale_const; + tctx->emit_declaration(tctx, &decl); + } + + if (ctx->info.const_file_max[0] < (int)ctx->bias_const) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_CONSTANT; + decl.Range.First = decl.Range.Last = ctx->bias_const; + tctx->emit_declaration(tctx, &decl); + } + } + + /* Add a new temp. */ + ctx->color_temp = ctx->info.file_max[TGSI_FILE_TEMPORARY] + 1; + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_TEMPORARY; + decl.Range.First = decl.Range.Last = ctx->color_temp; + tctx->emit_declaration(tctx, &decl); + + /* Add TEXCOORD[0] if it's missing. */ + semantic = ctx->use_texcoord ? TGSI_SEMANTIC_TEXCOORD : + TGSI_SEMANTIC_GENERIC; + for (i = 0; i < ctx->info.num_inputs; i++) { + if (ctx->info.input_semantic_name[i] == semantic && + ctx->info.input_semantic_index[i] == 0) { + texcoord_index = i; + break; + } + } + + if (texcoord_index == -1) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + decl.Declaration.Semantic = 1; + decl.Semantic.Name = semantic; + decl.Declaration.Interpolate = 1; + decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE; + decl.Range.First = decl.Range.Last = ctx->info.num_inputs; + texcoord_index = ctx->info.num_inputs; + tctx->emit_declaration(tctx, &decl); + } + + /* Declare the drawpix sampler if it's missing. */ + if (ctx->info.file_max[TGSI_FILE_SAMPLER] == -1) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_SAMPLER; + tctx->emit_declaration(tctx, &decl); + } + + /* Declare the pixel map sampler if it's missing. 
*/ + if (ctx->info.file_max[TGSI_FILE_SAMPLER] <= 0) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_SAMPLER; + decl.Range.First = decl.Range.Last = 1; + tctx->emit_declaration(tctx, &decl); + } + + /* Get initial pixel color from the texture. + * TEX temp, fragment.texcoord[0], texture[0], 2D; + */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_TEX; + inst.Instruction.Texture = 1; + inst.Texture.Texture = TGSI_TEXTURE_2D; + + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = ctx->color_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + + inst.Instruction.NumSrcRegs = 2; + SET_SRC(&inst, 0, TGSI_FILE_INPUT, texcoord_index, X, Y, Z, W); + inst.Src[1].Register.File = TGSI_FILE_SAMPLER; + inst.Src[1].Register.Index = 0; + + tctx->emit_instruction(tctx, &inst); + + /* Apply the scale and bias. */ + if (ctx->scale_and_bias) { + /* MAD temp, temp, scale, bias; */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_MAD; + + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = ctx->color_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + + inst.Instruction.NumSrcRegs = 3; + SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, X, Y, Z, W); + SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, ctx->scale_const, X, Y, Z, W); + SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, ctx->bias_const, X, Y, Z, W); + + tctx->emit_instruction(tctx, &inst); + } + + if (ctx->pixel_maps) { + /* do four pixel map look-ups with two TEX instructions: */ + + /* TEX temp.xy, temp.xyyy, texture[1], 2D; */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_TEX; + inst.Instruction.Texture = 1; + inst.Texture.Texture = TGSI_TEXTURE_2D; + + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + 
inst.Dst[0].Register.Index = ctx->color_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XY; + + inst.Instruction.NumSrcRegs = 2; + SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, X, Y, Y, Y); + inst.Src[1].Register.File = TGSI_FILE_SAMPLER; + inst.Src[1].Register.Index = 1; + + tctx->emit_instruction(tctx, &inst); + + /* TEX temp.zw, temp.zwww, texture[1], 2D; */ + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_ZW; + SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, Z, W, W, W); + tctx->emit_instruction(tctx, &inst); + } + + /* Now, "color_temp" should be used in place of IN:COLOR0 */ + +transform_inst: + + for (i = 0; i < current_inst->Instruction.NumSrcRegs; i++) { + struct tgsi_full_src_register *src = &current_inst->Src[i]; + unsigned reg = src->Register.Index; + + if (src->Register.File == TGSI_FILE_INPUT && + !src->Register.Indirect && + ctx->info.input_semantic_name[reg] == TGSI_SEMANTIC_COLOR && + ctx->info.input_semantic_index[reg] == 0) { + src->Register.File = TGSI_FILE_TEMPORARY; + src->Register.Index = ctx->color_temp; + } + } + + tctx->emit_instruction(tctx, current_inst); +} + +const struct tgsi_token * +st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord, + bool scale_and_bias, unsigned scale_const, + unsigned bias_const, bool pixel_maps) +{ + struct tgsi_drawpix_transform ctx; + struct tgsi_token *newtoks; + int newlen; + + memset(&ctx, 0, sizeof(ctx)); + ctx.base.transform_instruction = transform_instr; + ctx.use_texcoord = use_texcoord; + ctx.scale_and_bias = scale_and_bias; + ctx.scale_const = scale_const; + ctx.bias_const = bias_const; + ctx.pixel_maps = pixel_maps; + tgsi_scan_shader(tokens, &ctx.info); + + newlen = tgsi_num_tokens(tokens) + 30; + newtoks = tgsi_alloc_tokens(newlen); + if (!newtoks) + return NULL; + + tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base); + return newtoks; +} diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c index 
a9ab5edcf49..bef7307bb27 100644 --- a/src/mesa/state_tracker/st_context.c +++ b/src/mesa/state_tracker/st_context.c @@ -224,8 +224,6 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe, st->ctx->VertexProgram._MaintainTnlProgram = GL_TRUE; - st->pixel_xfer.cache = _mesa_new_program_cache(); - st->has_stencil_export = screen->get_param(screen, PIPE_CAP_SHADER_STENCIL_EXPORT); st->has_shader_model3 = screen->get_param(screen, PIPE_CAP_SM3); @@ -386,8 +384,8 @@ void st_destroy_context( struct st_context *st ) pipe_surface_reference(&st->state.framebuffer.cbufs[i], NULL); } pipe_surface_reference(&st->state.framebuffer.zsbuf, NULL); - - _mesa_delete_program_cache(st->ctx, st->pixel_xfer.cache); + pipe_sampler_view_reference(&st->pixel_xfer.pixelmap_sampler_view, NULL); + pipe_resource_reference(&st->pixel_xfer.pixelmap_texture, NULL); _vbo_DestroyContext(st->ctx); diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h index 262581eceeb..f187d82449b 100644 --- a/src/mesa/state_tracker/st_context.h +++ b/src/mesa/state_tracker/st_context.h @@ -162,15 +162,8 @@ struct st_context struct gl_texture_object *default_texture; struct { - struct gl_program_cache *cache; - struct st_fragment_program *program; /**< cur pixel transfer prog */ - GLuint xfer_prog_sn; /**< pixel xfer program serial no. */ - GLuint user_prog_sn; /**< user fragment program serial no. */ - struct st_fragment_program *combined_prog; - GLuint combined_prog_sn; struct pipe_resource *pixelmap_texture; struct pipe_sampler_view *pixelmap_sampler_view; - boolean pixelmap_enabled; /**< use the pixelmap texture? 
*/ } pixel_xfer; /** for glBitmap */ diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index a54ee17173a..cdd80f167d0 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -4334,138 +4334,6 @@ glsl_to_tgsi_visitor::renumber_registers(void) ralloc_free(first_reads); } -/** - * Returns a fragment program which implements the current pixel transfer ops. - * Based on get_pixel_transfer_program in st_atom_pixeltransfer.c. - */ -extern "C" void -get_pixel_transfer_visitor(struct st_fragment_program *fp, - glsl_to_tgsi_visitor *original, - int scale_and_bias, int pixel_maps) -{ - glsl_to_tgsi_visitor *v = new glsl_to_tgsi_visitor(); - struct st_context *st = st_context(original->ctx); - struct gl_program *prog = &fp->Base.Base; - struct gl_program_parameter_list *params = _mesa_new_parameter_list(); - st_src_reg coord, src0; - st_dst_reg dst0; - glsl_to_tgsi_instruction *inst; - - /* Copy attributes of the glsl_to_tgsi_visitor in the original shader. */ - v->ctx = original->ctx; - v->prog = prog; - v->shader_program = NULL; - v->shader = NULL; - v->glsl_version = original->glsl_version; - v->native_integers = original->native_integers; - v->options = original->options; - v->next_temp = original->next_temp; - v->num_address_regs = original->num_address_regs; - v->samplers_used = prog->SamplersUsed = original->samplers_used; - v->indirect_addr_consts = original->indirect_addr_consts; - memcpy(&v->immediates, &original->immediates, sizeof(v->immediates)); - v->num_immediates = original->num_immediates; - - /* - * Get initial pixel color from the texture. 
- * TEX colorTemp, fragment.texcoord[0], texture[0], 2D; - */ - coord = st_src_reg(PROGRAM_INPUT, VARYING_SLOT_TEX0, glsl_type::vec2_type); - src0 = v->get_temp(glsl_type::vec4_type); - dst0 = st_dst_reg(src0); - inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, dst0, coord); - inst->sampler_array_size = 1; - inst->tex_target = TEXTURE_2D_INDEX; - - prog->InputsRead |= VARYING_BIT_TEX0; - prog->SamplersUsed |= (1 << 0); /* mark sampler 0 as used */ - v->samplers_used |= (1 << 0); - - if (scale_and_bias) { - static const gl_state_index scale_state[STATE_LENGTH] = - { STATE_INTERNAL, STATE_PT_SCALE, - (gl_state_index) 0, (gl_state_index) 0, (gl_state_index) 0 }; - static const gl_state_index bias_state[STATE_LENGTH] = - { STATE_INTERNAL, STATE_PT_BIAS, - (gl_state_index) 0, (gl_state_index) 0, (gl_state_index) 0 }; - GLint scale_p, bias_p; - st_src_reg scale, bias; - - scale_p = _mesa_add_state_reference(params, scale_state); - bias_p = _mesa_add_state_reference(params, bias_state); - - /* MAD colorTemp, colorTemp, scale, bias; */ - scale = st_src_reg(PROGRAM_STATE_VAR, scale_p, GLSL_TYPE_FLOAT); - bias = st_src_reg(PROGRAM_STATE_VAR, bias_p, GLSL_TYPE_FLOAT); - inst = v->emit_asm(NULL, TGSI_OPCODE_MAD, dst0, src0, scale, bias); - } - - if (pixel_maps) { - st_src_reg temp = v->get_temp(glsl_type::vec4_type); - st_dst_reg temp_dst = st_dst_reg(temp); - - assert(st->pixel_xfer.pixelmap_texture); - (void) st; - - /* With a little effort, we can do four pixel map look-ups with - * two TEX instructions: - */ - - /* TEX temp.rg, colorTemp.rgba, texture[1], 2D; */ - temp_dst.writemask = WRITEMASK_XY; /* write R,G */ - inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, temp_dst, src0); - inst->sampler.index = 1; - inst->sampler_array_size = 1; - inst->tex_target = TEXTURE_2D_INDEX; - - /* TEX temp.ba, colorTemp.baba, texture[1], 2D; */ - src0.swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W); - temp_dst.writemask = WRITEMASK_ZW; /* write B,A */ - inst = v->emit_asm(NULL, 
TGSI_OPCODE_TEX, temp_dst, src0); - inst->sampler.index = 1; - inst->sampler_array_size = 1; - inst->tex_target = TEXTURE_2D_INDEX; - - prog->SamplersUsed |= (1 << 1); /* mark sampler 1 as used */ - v->samplers_used |= (1 << 1); - - /* MOV colorTemp, temp; */ - inst = v->emit_asm(NULL, TGSI_OPCODE_MOV, dst0, temp); - } - - /* Now copy the instructions from the original glsl_to_tgsi_visitor into the - * new visitor. */ - foreach_in_list(glsl_to_tgsi_instruction, inst, &original->instructions) { - glsl_to_tgsi_instruction *newinst; - st_src_reg src_regs[4]; - - if (inst->dst[0].file == PROGRAM_OUTPUT) - prog->OutputsWritten |= BITFIELD64_BIT(inst->dst[0].index); - - for (int i = 0; i < 4; i++) { - src_regs[i] = inst->src[i]; - if (src_regs[i].file == PROGRAM_INPUT && - src_regs[i].index == VARYING_SLOT_COL0) { - src_regs[i].file = PROGRAM_TEMPORARY; - src_regs[i].index = src0.index; - } - else if (src_regs[i].file == PROGRAM_INPUT) - prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index); - } - - newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2], src_regs[3]); - newinst->tex_target = inst->tex_target; - newinst->sampler_array_size = inst->sampler_array_size; - } - - /* Make modifications to fragment program info. 
*/ - prog->Parameters = _mesa_combine_parameter_lists(params, - original->prog->Parameters); - _mesa_free_parameter_list(params); - count_resources(v, prog); - fp->glsl_to_tgsi = v; -} - /* ------------------------- TGSI conversion stuff -------------------------- */ struct label { unsigned branch_target; diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.h b/src/mesa/state_tracker/st_glsl_to_tgsi.h index dcdfbebcbdc..729295bcb52 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.h +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.h @@ -55,9 +55,6 @@ enum pipe_error st_translate_program( const ubyte outputSemanticIndex[]); void free_glsl_to_tgsi_visitor(struct glsl_to_tgsi_visitor *v); -void get_pixel_transfer_visitor(struct st_fragment_program *fp, - struct glsl_to_tgsi_visitor *original, - int scale_and_bias, int pixel_maps); GLboolean st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog); diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index d900ede7265..01e33d04f74 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -540,8 +540,6 @@ st_translate_fragment_program(struct st_context *st, { struct pipe_context *pipe = st->pipe; struct st_fp_variant *variant = CALLOC_STRUCT(st_fp_variant); - GLboolean deleteFP = GL_FALSE; - GLuint outputMapping[FRAG_RESULT_MAX]; GLuint inputMapping[VARYING_SLOT_MAX]; GLuint inputSlotToAttr[VARYING_SLOT_MAX]; @@ -567,16 +565,6 @@ st_translate_fragment_program(struct st_context *st, assert(!(key->bitmap && key->drawpixels)); memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr)); - if (key->drawpixels) { - /* glDrawPixels color drawing */ - struct gl_fragment_program *fp; /* we free this temp program below */ - - st_make_drawpix_fragment_program(st, &stfp->Base, &fp); - variant->parameters = _mesa_clone_parameter_list(fp->Base.Parameters); - deleteFP = GL_TRUE; - stfp = st_fragment_program(fp); - } - if (!stfp->glsl_to_tgsi) 
_mesa_remove_output_reads(&stfp->Base.Base, PROGRAM_OUTPUT); @@ -895,6 +883,38 @@ st_translate_fragment_program(struct st_context *st, fprintf(stderr, "mesa: cannot create a shader for glBitmap\n"); } + /* glDrawPixels (color only) */ + if (key->drawpixels) { + const struct tgsi_token *tokens; + unsigned scale_const = 0, bias_const = 0; + + variant->parameters = + _mesa_clone_parameter_list(stfp->Base.Base.Parameters); + + if (key->scaleAndBias) { + static const gl_state_index scale_state[STATE_LENGTH] = + { STATE_INTERNAL, STATE_PT_SCALE }; + static const gl_state_index bias_state[STATE_LENGTH] = + { STATE_INTERNAL, STATE_PT_BIAS }; + + scale_const = _mesa_add_state_reference(variant->parameters, + scale_state); + bias_const = _mesa_add_state_reference(variant->parameters, + bias_state); + } + + tokens = st_get_drawpix_shader(variant->tgsi.tokens, + st->needs_texcoord_semantic, + key->scaleAndBias, scale_const, + bias_const, key->pixelMaps); + + if (tokens) { + tgsi_free_tokens(variant->tgsi.tokens); + variant->tgsi.tokens = tokens; + } else + fprintf(stderr, "mesa: cannot create a shader for glDrawPixels\n"); + } + if (ST_DEBUG & DEBUG_TGSI) { tgsi_dump(variant->tgsi.tokens, 0/*TGSI_DUMP_VERBOSE*/); debug_printf("\n"); @@ -903,13 +923,6 @@ st_translate_fragment_program(struct st_context *st, /* fill in variant */ variant->driver_shader = pipe->create_fs_state(pipe, &variant->tgsi); variant->key = *key; - - if (deleteFP) { - /* Free the temporary program made above */ - struct gl_fragment_program *fp = &stfp->Base; - _mesa_reference_fragprog(st->ctx, &fp, NULL); - } - return variant; } From de6a004035f3de5879648f8afb4670ae82f4ad02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 01:22:20 +0200 Subject: [PATCH 042/270] st/mesa: fix glDrawPixels with a texture The samplers for DrawPixels data and the pixel map are assigned to slots which don't overlap with the existing sampler slots. 
The texture coordinates for the user texture are uploaded as a constant. Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_cb_drawpixels.c | 52 ++++++++++++++---- src/mesa/state_tracker/st_cb_drawpixels.h | 4 +- .../state_tracker/st_cb_drawpixels_shader.c | 55 +++++++++++++------ src/mesa/state_tracker/st_program.c | 25 ++++++++- src/mesa/state_tracker/st_program.h | 4 ++ 5 files changed, 111 insertions(+), 29 deletions(-) diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c index 34163ede4a2..7e8633edc1a 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.c +++ b/src/mesa/state_tracker/st_cb_drawpixels.c @@ -525,6 +525,7 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, int num_sampler_view, void *driver_vp, void *driver_fp, + struct st_fp_variant *fpv, const GLfloat *color, GLboolean invertTex, GLboolean write_depth, GLboolean write_stencil) @@ -612,10 +613,9 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, cso_set_tesseval_shader_handle(cso, NULL); cso_set_geometry_shader_handle(cso, NULL); - /* texture sampling state: */ + /* user samplers, plus the drawpix samplers */ { struct pipe_sampler_state sampler; - const struct pipe_sampler_state *states[2] = {&sampler, &sampler}; memset(&sampler, 0, sizeof(sampler)); sampler.wrap_s = PIPE_TEX_WRAP_CLAMP; @@ -626,8 +626,25 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST; sampler.normalized_coords = normalized; - cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, - num_sampler_view > 1 ? 
2 : 1, states); + if (fpv) { + const struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS]; + uint num = MAX2(MAX2(fpv->drawpix_sampler, fpv->pixelmap_sampler) + 1, + st->state.num_samplers[PIPE_SHADER_FRAGMENT]); + uint i; + + for (i = 0; i < st->state.num_samplers[PIPE_SHADER_FRAGMENT]; i++) + samplers[i] = &st->state.samplers[PIPE_SHADER_FRAGMENT][i]; + + samplers[fpv->drawpix_sampler] = &sampler; + if (sv[1]) + samplers[fpv->pixelmap_sampler] = &sampler; + + cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, num, samplers); + } else { + const struct pipe_sampler_state *samplers[2] = {&sampler, &sampler}; + + cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, samplers); + } } /* viewport state: viewport matching window dims */ @@ -647,8 +664,21 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z, cso_set_vertex_elements(cso, 3, st->velems_util_draw); cso_set_stream_outputs(st->cso_context, 0, NULL, NULL); - /* texture state: */ - cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, sv); + /* user textures, plus the drawpix textures */ + if (fpv) { + struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS]; + uint num = MAX2(MAX2(fpv->drawpix_sampler, fpv->pixelmap_sampler) + 1, + st->state.num_sampler_views[PIPE_SHADER_FRAGMENT]); + + memcpy(sampler_views, st->state.sampler_views[PIPE_SHADER_FRAGMENT], + sizeof(sampler_views)); + + sampler_views[fpv->drawpix_sampler] = sv[0]; + if (sv[1]) + sampler_views[fpv->pixelmap_sampler] = sv[1]; + cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num, sampler_views); + } else + cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, num_sampler_view, sv); /* Compute Gallium window coords (y=0=top) with pixel zoom. 
* Recall that these coords are transformed by the current @@ -943,6 +973,7 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, struct pipe_sampler_view *sv[2] = { NULL }; int num_sampler_view = 1; struct gl_pixelstore_attrib clippedUnpack; + struct st_fp_variant *fpv = NULL; /* Mesa state should be up to date by now */ assert(ctx->NewState == 0x0); @@ -982,7 +1013,7 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, color = ctx->Current.RasterColor; } else { - struct st_fp_variant *fpv = get_color_fp_variant(st); + fpv = get_color_fp_variant(st); driver_fp = fpv->driver_shader; driver_vp = make_passthrough_vertex_shader(st, GL_FALSE); @@ -1025,7 +1056,7 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, sv, num_sampler_view, driver_vp, - driver_fp, + driver_fp, fpv, color, GL_FALSE, write_depth, write_stencil); pipe_sampler_view_reference(&sv[0], NULL); if (num_sampler_view > 1) @@ -1280,6 +1311,7 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, void *driver_vp, *driver_fp; struct pipe_resource *pt; struct pipe_sampler_view *sv[2] = { NULL }; + struct st_fp_variant *fpv = NULL; int num_sampler_view = 1; GLfloat *color; enum pipe_format srcFormat; @@ -1318,7 +1350,7 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, * Get vertex/fragment shaders */ if (type == GL_COLOR) { - struct st_fp_variant *fpv = get_color_fp_variant(st); + fpv = get_color_fp_variant(st); rbRead = st_get_color_read_renderbuffer(ctx); color = NULL; @@ -1470,7 +1502,7 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy, sv, num_sampler_view, driver_vp, - driver_fp, + driver_fp, fpv, color, invertTex, GL_FALSE, GL_FALSE); pipe_resource_reference(&pt, NULL); diff --git a/src/mesa/state_tracker/st_cb_drawpixels.h b/src/mesa/state_tracker/st_cb_drawpixels.h index b8a34952141..f1fb32dd6cf 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.h +++ b/src/mesa/state_tracker/st_cb_drawpixels.h @@ -44,6 +44,8 @@ st_destroy_drawpix(struct 
st_context *st); extern const struct tgsi_token * st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord, bool scale_and_bias, unsigned scale_const, - unsigned bias_const, bool pixel_maps); + unsigned bias_const, bool pixel_maps, + unsigned drawpix_sampler, unsigned pixelmap_sampler, + unsigned texcoord_const); #endif /* ST_CB_DRAWPIXELS_H */ diff --git a/src/mesa/state_tracker/st_cb_drawpixels_shader.c b/src/mesa/state_tracker/st_cb_drawpixels_shader.c index 01db0b6d73e..749b46cfbf7 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels_shader.c +++ b/src/mesa/state_tracker/st_cb_drawpixels_shader.c @@ -40,6 +40,9 @@ struct tgsi_drawpix_transform { unsigned scale_const; unsigned bias_const; unsigned color_temp; + unsigned drawpix_sampler; + unsigned pixelmap_sampler; + unsigned texcoord_const; }; static inline struct tgsi_drawpix_transform * @@ -71,7 +74,8 @@ transform_instr(struct tgsi_transform_context *tctx, struct tgsi_drawpix_transform *ctx = tgsi_drawpix_transform(tctx); struct tgsi_full_declaration decl; struct tgsi_full_instruction inst; - unsigned i, semantic; + unsigned i, sem_texcoord = ctx->use_texcoord ? TGSI_SEMANTIC_TEXCOORD : + TGSI_SEMANTIC_GENERIC; int texcoord_index = -1; if (ctx->first_instruction_emitted) @@ -96,6 +100,13 @@ transform_instr(struct tgsi_transform_context *tctx, } } + if (ctx->info.const_file_max[0] < (int)ctx->texcoord_const) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_CONSTANT; + decl.Range.First = decl.Range.Last = ctx->texcoord_const; + tctx->emit_declaration(tctx, &decl); + } + /* Add a new temp. */ ctx->color_temp = ctx->info.file_max[TGSI_FILE_TEMPORARY] + 1; decl = tgsi_default_full_declaration(); @@ -103,11 +114,9 @@ transform_instr(struct tgsi_transform_context *tctx, decl.Range.First = decl.Range.Last = ctx->color_temp; tctx->emit_declaration(tctx, &decl); - /* Add TEXCOORD[0] if it's missing. */ - semantic = ctx->use_texcoord ? 
TGSI_SEMANTIC_TEXCOORD : - TGSI_SEMANTIC_GENERIC; + /* Add TEXCOORD[texcoord_slot] if it's missing. */ for (i = 0; i < ctx->info.num_inputs; i++) { - if (ctx->info.input_semantic_name[i] == semantic && + if (ctx->info.input_semantic_name[i] == sem_texcoord && ctx->info.input_semantic_index[i] == 0) { texcoord_index = i; break; @@ -118,7 +127,7 @@ transform_instr(struct tgsi_transform_context *tctx, decl = tgsi_default_full_declaration(); decl.Declaration.File = TGSI_FILE_INPUT; decl.Declaration.Semantic = 1; - decl.Semantic.Name = semantic; + decl.Semantic.Name = sem_texcoord; decl.Declaration.Interpolate = 1; decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE; decl.Range.First = decl.Range.Last = ctx->info.num_inputs; @@ -127,17 +136,19 @@ transform_instr(struct tgsi_transform_context *tctx, } /* Declare the drawpix sampler if it's missing. */ - if (ctx->info.file_max[TGSI_FILE_SAMPLER] == -1) { + if (!(ctx->info.samplers_declared & (1 << ctx->drawpix_sampler))) { decl = tgsi_default_full_declaration(); decl.Declaration.File = TGSI_FILE_SAMPLER; + decl.Range.First = decl.Range.Last = ctx->drawpix_sampler; tctx->emit_declaration(tctx, &decl); } /* Declare the pixel map sampler if it's missing. 
*/ - if (ctx->info.file_max[TGSI_FILE_SAMPLER] <= 0) { + if (ctx->pixel_maps && + !(ctx->info.samplers_declared & (1 << ctx->pixelmap_sampler))) { decl = tgsi_default_full_declaration(); decl.Declaration.File = TGSI_FILE_SAMPLER; - decl.Range.First = decl.Range.Last = 1; + decl.Range.First = decl.Range.Last = ctx->pixelmap_sampler; tctx->emit_declaration(tctx, &decl); } @@ -157,7 +168,7 @@ transform_instr(struct tgsi_transform_context *tctx, inst.Instruction.NumSrcRegs = 2; SET_SRC(&inst, 0, TGSI_FILE_INPUT, texcoord_index, X, Y, Z, W); inst.Src[1].Register.File = TGSI_FILE_SAMPLER; - inst.Src[1].Register.Index = 0; + inst.Src[1].Register.Index = ctx->drawpix_sampler; tctx->emit_instruction(tctx, &inst); @@ -197,7 +208,7 @@ transform_instr(struct tgsi_transform_context *tctx, inst.Instruction.NumSrcRegs = 2; SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, X, Y, Y, Y); inst.Src[1].Register.File = TGSI_FILE_SAMPLER; - inst.Src[1].Register.Index = 1; + inst.Src[1].Register.Index = ctx->pixelmap_sampler; tctx->emit_instruction(tctx, &inst); @@ -207,7 +218,9 @@ transform_instr(struct tgsi_transform_context *tctx, tctx->emit_instruction(tctx, &inst); } - /* Now, "color_temp" should be used in place of IN:COLOR0 */ + /* Now, "color_temp" should be used in place of IN:COLOR0, + * and CONST[texcoord_slot] should be used in place of IN:TEXCOORD0. 
+ */ transform_inst: @@ -215,12 +228,17 @@ transform_inst: struct tgsi_full_src_register *src = &current_inst->Src[i]; unsigned reg = src->Register.Index; - if (src->Register.File == TGSI_FILE_INPUT && - !src->Register.Indirect && - ctx->info.input_semantic_name[reg] == TGSI_SEMANTIC_COLOR && + if (src->Register.File != TGSI_FILE_INPUT || src->Register.Indirect) + continue; + + if (ctx->info.input_semantic_name[reg] == TGSI_SEMANTIC_COLOR && ctx->info.input_semantic_index[reg] == 0) { src->Register.File = TGSI_FILE_TEMPORARY; src->Register.Index = ctx->color_temp; + } else if (ctx->info.input_semantic_name[reg] == sem_texcoord && + ctx->info.input_semantic_index[reg] == 0) { + src->Register.File = TGSI_FILE_CONSTANT; + src->Register.Index = ctx->texcoord_const; } } @@ -230,7 +248,9 @@ transform_inst: const struct tgsi_token * st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord, bool scale_and_bias, unsigned scale_const, - unsigned bias_const, bool pixel_maps) + unsigned bias_const, bool pixel_maps, + unsigned drawpix_sampler, unsigned pixelmap_sampler, + unsigned texcoord_const) { struct tgsi_drawpix_transform ctx; struct tgsi_token *newtoks; @@ -243,6 +263,9 @@ st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord, ctx.scale_and_bias = scale_and_bias; ctx.scale_const = scale_const; ctx.bias_const = bias_const; ctx.pixel_maps = pixel_maps; + ctx.drawpix_sampler = drawpix_sampler; + ctx.pixelmap_sampler = pixelmap_sampler; + ctx.texcoord_const = texcoord_const; tgsi_scan_shader(tokens, &ctx.info); newlen = tgsi_num_tokens(tokens) + 30; diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 01e33d04f74..95ad2f46827 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -886,7 +886,17 @@ st_translate_fragment_program(struct st_context *st, /* glDrawPixels (color only) */ if (key->drawpixels) { const struct tgsi_token *tokens; - unsigned scale_const = 0, bias_const = 0; + unsigned scale_const =
0, bias_const = 0, texcoord_const = 0; + + /* Find the first unused slot. */ + variant->drawpix_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1; + + if (key->pixelMaps) { + unsigned samplers_used = stfp->Base.Base.SamplersUsed | + (1 << variant->drawpix_sampler); + + variant->pixelmap_sampler = ffs(~samplers_used) - 1; + } variant->parameters = _mesa_clone_parameter_list(stfp->Base.Base.Parameters); @@ -903,10 +913,21 @@ st_translate_fragment_program(struct st_context *st, bias_state); } + { + static const gl_state_index state[STATE_LENGTH] = + { STATE_INTERNAL, STATE_CURRENT_ATTRIB, VERT_ATTRIB_TEX0 }; + + texcoord_const = _mesa_add_state_reference(variant->parameters, + state); + } + tokens = st_get_drawpix_shader(variant->tgsi.tokens, st->needs_texcoord_semantic, key->scaleAndBias, scale_const, - bias_const, key->pixelMaps); + bias_const, key->pixelMaps, + variant->drawpix_sampler, + variant->pixelmap_sampler, + texcoord_const); if (tokens) { tgsi_free_tokens(variant->tgsi.tokens); diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index 2927d542dfc..33d39f005b9 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -85,6 +85,10 @@ struct st_fp_variant struct gl_program_parameter_list *parameters; uint bitmap_sampler; + /** For glDrawPixels variants */ + unsigned drawpix_sampler; + unsigned pixelmap_sampler; + /** next in linked list */ struct st_fp_variant *next; }; From 46021ace514cf2ba91733dfcfd258073b90c0354 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 02:47:37 +0200 Subject: [PATCH 043/270] st/mesa: translate vertex shaders into TGSI when we get them The translate function is split into two: - translation to TGSI - creating the variant (TGSI transformations only) Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_cb_program.c | 4 +- src/mesa/state_tracker/st_program.c | 74
+++++++++++++------------- src/mesa/state_tracker/st_program.h | 4 ++ 3 files changed, 45 insertions(+), 37 deletions(-) diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c index 3029909d12d..745b4476d42 100644 --- a/src/mesa/state_tracker/st_cb_program.c +++ b/src/mesa/state_tracker/st_cb_program.c @@ -249,7 +249,9 @@ st_program_string_notify( struct gl_context *ctx, else if (target == GL_VERTEX_PROGRAM_ARB) { struct st_vertex_program *stvp = (struct st_vertex_program *) prog; - st_release_vp_variants( st, stvp ); + st_release_vp_variants(st, stvp); + if (!st_translate_vertex_program(st, stvp)) + return false; if (st->vp == stvp) st->dirty.st |= ST_NEW_VERTEX_PROGRAM; diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 95ad2f46827..4bdbf8537fb 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -94,6 +94,11 @@ st_release_vp_variants( struct st_context *st, } stvp->variants = NULL; + + if (stvp->tgsi.tokens) { + tgsi_free_tokens(stvp->tgsi.tokens); + stvp->tgsi.tokens = NULL; + } } @@ -230,15 +235,12 @@ st_release_tep_variants(struct st_context *st, struct st_tesseval_program *sttep /** - * Translate a vertex program to create a new variant. + * Translate a vertex program. 
*/ -static struct st_vp_variant * +bool st_translate_vertex_program(struct st_context *st, - struct st_vertex_program *stvp, - const struct st_vp_variant_key *key) + struct st_vertex_program *stvp) { - struct st_vp_variant *vpv = CALLOC_STRUCT(st_vp_variant); - struct pipe_context *pipe = st->pipe; struct ureg_program *ureg; enum pipe_error error; unsigned num_outputs = 0; @@ -372,12 +374,8 @@ st_translate_vertex_program(struct st_context *st, _mesa_remove_output_reads(&stvp->Base.Base, PROGRAM_OUTPUT); ureg = ureg_create_with_screen(TGSI_PROCESSOR_VERTEX, st->pipe->screen); - if (ureg == NULL) { - free(vpv); - return NULL; - } - - vpv->key = *key; + if (ureg == NULL) + return false; if (ST_DEBUG & DEBUG_MESA) { _mesa_print_program(&stvp->Base.Base); @@ -385,7 +383,7 @@ st_translate_vertex_program(struct st_context *st, debug_printf("\n"); } - if (stvp->glsl_to_tgsi) + if (stvp->glsl_to_tgsi) { error = st_translate_program(st->ctx, TGSI_PROCESSOR_VERTEX, ureg, @@ -405,7 +403,11 @@ st_translate_vertex_program(struct st_context *st, output_slot_to_attr, output_semantic_name, output_semantic_index); - else + + st_translate_stream_output_info(stvp->glsl_to_tgsi, + stvp->result_to_output, + &stvp->tgsi.stream_output); + } else error = st_translate_mesa_program(st->ctx, TGSI_PROCESSOR_VERTEX, ureg, @@ -422,21 +424,29 @@ st_translate_vertex_program(struct st_context *st, output_semantic_name, output_semantic_index); - if (error) - goto fail; - - vpv->tgsi.tokens = ureg_get_tokens( ureg, NULL ); - if (!vpv->tgsi.tokens) - goto fail; - - ureg_destroy( ureg ); - - if (stvp->glsl_to_tgsi) { - st_translate_stream_output_info(stvp->glsl_to_tgsi, - stvp->result_to_output, - &vpv->tgsi.stream_output); + if (error) { + debug_printf("%s: failed to translate Mesa program:\n", __func__); + _mesa_print_program(&stvp->Base.Base); + debug_assert(0); + return false; } + stvp->tgsi.tokens = ureg_get_tokens(ureg, NULL); + ureg_destroy(ureg); + return stvp->tgsi.tokens != NULL; +} + +static 
struct st_vp_variant * +st_create_vp_variant(struct st_context *st, + struct st_vertex_program *stvp, + const struct st_vp_variant_key *key) +{ + struct st_vp_variant *vpv = CALLOC_STRUCT(st_vp_variant); + struct pipe_context *pipe = st->pipe; + + vpv->key = *key; + vpv->tgsi.tokens = tgsi_dup_tokens(stvp->tgsi.tokens); + vpv->tgsi.stream_output = stvp->tgsi.stream_output; vpv->num_inputs = stvp->num_inputs; /* Emulate features. */ @@ -465,14 +475,6 @@ st_translate_vertex_program(struct st_context *st, vpv->driver_shader = pipe->create_vs_state(pipe, &vpv->tgsi); return vpv; - -fail: - debug_printf("%s: failed to translate Mesa program:\n", __func__); - _mesa_print_program(&stvp->Base.Base); - debug_assert(0); - - ureg_destroy( ureg ); - return NULL; } @@ -495,7 +497,7 @@ st_get_vp_variant(struct st_context *st, if (!vpv) { /* create now */ - vpv = st_translate_vertex_program(st, stvp, key); + vpv = st_create_vp_variant(st, stvp, key); if (vpv) { /* insert into list */ vpv->next = stvp->variants; diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index 33d39f005b9..6f4a6a1b802 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -155,6 +155,7 @@ struct st_vp_variant struct st_vertex_program { struct gl_vertex_program Base; /**< The Mesa vertex program */ + struct pipe_shader_state tgsi; struct glsl_to_tgsi_visitor* glsl_to_tgsi; /** maps a Mesa VERT_ATTRIB_x to a packed TGSI input index */ @@ -434,6 +435,9 @@ st_release_tep_variants(struct st_context *st, extern void st_destroy_program_variants(struct st_context *st); +extern bool +st_translate_vertex_program(struct st_context *st, + struct st_vertex_program *stvp); extern void st_print_current_vertex_program(void); From a907b5dd162b7911b8c21f6d54837831bc078059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 03:26:48 +0200 Subject: [PATCH 044/270] st/mesa: translate fragment shaders into TGSI when 
we get them Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_cb_program.c | 2 + src/mesa/state_tracker/st_debug.c | 2 +- src/mesa/state_tracker/st_program.c | 81 +++++++++++++++----------- src/mesa/state_tracker/st_program.h | 7 ++- 4 files changed, 55 insertions(+), 37 deletions(-) diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c index 745b4476d42..40eeb0f703e 100644 --- a/src/mesa/state_tracker/st_cb_program.c +++ b/src/mesa/state_tracker/st_cb_program.c @@ -234,6 +234,8 @@ st_program_string_notify( struct gl_context *ctx, struct st_fragment_program *stfp = (struct st_fragment_program *) prog; st_release_fp_variants(st, stfp); + if (!st_translate_fragment_program(st, stfp)) + return false; if (st->fp == stfp) st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM; diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c index 50891c112cb..6d859c6ab5b 100644 --- a/src/mesa/state_tracker/st_debug.c +++ b/src/mesa/state_tracker/st_debug.c @@ -98,7 +98,7 @@ st_print_current(void) if (st->vp->Base.Base.Parameters) _mesa_print_parameter_list(st->vp->Base.Base.Parameters); - tgsi_dump( st->fp->variants[0].tgsi.tokens, 0 ); + tgsi_dump(st->fp->tgsi.tokens, 0); if (st->fp->Base.Base.Parameters) _mesa_print_parameter_list(st->fp->Base.Base.Parameters); } diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 4bdbf8537fb..5eded93650c 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -114,8 +114,6 @@ delete_fp_variant(struct st_context *st, struct st_fp_variant *fpv) cso_delete_fragment_shader(st->cso_context, fpv->driver_shader); if (fpv->parameters) _mesa_free_parameter_list(fpv->parameters); - if (fpv->tgsi.tokens) - ureg_free_tokens(fpv->tgsi.tokens); free(fpv); } @@ -135,6 +133,11 @@ st_release_fp_variants(struct st_context *st, struct st_fragment_program *stfp) } stfp->variants = 
NULL; + + if (stfp->tgsi.tokens) { + ureg_free_tokens(stfp->tgsi.tokens); + stfp->tgsi.tokens = NULL; + } } @@ -531,17 +534,12 @@ st_translate_interp(enum glsl_interp_qualifier glsl_qual, bool is_color) /** - * Translate a Mesa fragment shader into a TGSI shader using extra info in - * the key. - * \return new fragment program variant + * Translate a Mesa fragment shader into a TGSI shader. */ -static struct st_fp_variant * +bool st_translate_fragment_program(struct st_context *st, - struct st_fragment_program *stfp, - const struct st_fp_variant_key *key) + struct st_fragment_program *stfp) { - struct pipe_context *pipe = st->pipe; - struct st_fp_variant *variant = CALLOC_STRUCT(st_fp_variant); GLuint outputMapping[FRAG_RESULT_MAX]; GLuint inputMapping[VARYING_SLOT_MAX]; GLuint inputSlotToAttr[VARYING_SLOT_MAX]; @@ -561,10 +559,6 @@ st_translate_fragment_program(struct st_context *st, ubyte fs_output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; uint fs_num_outputs = 0; - if (!variant) - return NULL; - - assert(!(key->bitmap && key->drawpixels)); memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr)); if (!stfp->glsl_to_tgsi) @@ -772,10 +766,8 @@ st_translate_fragment_program(struct st_context *st, } ureg = ureg_create_with_screen(TGSI_PROCESSOR_FRAGMENT, st->pipe->screen); - if (ureg == NULL) { - free(variant); - return NULL; - } + if (ureg == NULL) + return false; if (ST_DEBUG & DEBUG_MESA) { _mesa_print_program(&stfp->Base.Base); @@ -845,8 +837,26 @@ st_translate_fragment_program(struct st_context *st, fs_output_semantic_name, fs_output_semantic_index); - variant->tgsi.tokens = ureg_get_tokens(ureg, NULL); + stfp->tgsi.tokens = ureg_get_tokens(ureg, NULL); ureg_destroy(ureg); + return stfp->tgsi.tokens != NULL; +} + +static struct st_fp_variant * +st_create_fp_variant(struct st_context *st, + struct st_fragment_program *stfp, + const struct st_fp_variant_key *key) +{ + struct pipe_context *pipe = st->pipe; + struct st_fp_variant *variant = CALLOC_STRUCT(st_fp_variant); 
+ struct pipe_shader_state tgsi = {0}; + + if (!variant) + return NULL; + + tgsi.tokens = stfp->tgsi.tokens; + + assert(!(key->bitmap && key->drawpixels)); /* Emulate features. */ if (key->clamp_color || key->persample_shading) { @@ -855,12 +865,11 @@ st_translate_fragment_program(struct st_context *st, (key->clamp_color ? TGSI_EMU_CLAMP_COLOR_OUTPUTS : 0) | (key->persample_shading ? TGSI_EMU_FORCE_PERSAMPLE_INTERP : 0); - tokens = tgsi_emulate(variant->tgsi.tokens, flags); + tokens = tgsi_emulate(tgsi.tokens, flags); - if (tokens) { - tgsi_free_tokens(variant->tgsi.tokens); - variant->tgsi.tokens = tokens; - } else + if (tokens) + tgsi.tokens = tokens; + else fprintf(stderr, "mesa: cannot emulate deprecated features\n"); } @@ -870,15 +879,16 @@ st_translate_fragment_program(struct st_context *st, variant->bitmap_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1; - tokens = st_get_bitmap_shader(variant->tgsi.tokens, + tokens = st_get_bitmap_shader(tgsi.tokens, variant->bitmap_sampler, st->needs_texcoord_semantic, st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM); if (tokens) { - tgsi_free_tokens(variant->tgsi.tokens); - variant->tgsi.tokens = tokens; + if (tgsi.tokens != stfp->tgsi.tokens) + tgsi_free_tokens(tgsi.tokens); + tgsi.tokens = tokens; variant->parameters = _mesa_clone_parameter_list(stfp->Base.Base.Parameters); } else @@ -923,7 +933,7 @@ st_translate_fragment_program(struct st_context *st, state); } - tokens = st_get_drawpix_shader(variant->tgsi.tokens, + tokens = st_get_drawpix_shader(tgsi.tokens, st->needs_texcoord_semantic, key->scaleAndBias, scale_const, bias_const, key->pixelMaps, @@ -932,24 +942,27 @@ st_translate_fragment_program(struct st_context *st, texcoord_const); if (tokens) { - tgsi_free_tokens(variant->tgsi.tokens); - variant->tgsi.tokens = tokens; + if (tgsi.tokens != stfp->tgsi.tokens) + tgsi_free_tokens(tgsi.tokens); + tgsi.tokens = tokens; } else fprintf(stderr, "mesa: cannot create a shader for glDrawPixels\n"); } if (ST_DEBUG & 
DEBUG_TGSI) { - tgsi_dump(variant->tgsi.tokens, 0/*TGSI_DUMP_VERBOSE*/); + tgsi_dump(tgsi.tokens, 0); debug_printf("\n"); } /* fill in variant */ - variant->driver_shader = pipe->create_fs_state(pipe, &variant->tgsi); + variant->driver_shader = pipe->create_fs_state(pipe, &tgsi); variant->key = *key; + + if (tgsi.tokens != stfp->tgsi.tokens) + tgsi_free_tokens(tgsi.tokens); return variant; } - /** * Translate fragment program if needed. */ @@ -969,7 +982,7 @@ st_get_fp_variant(struct st_context *st, if (!fpv) { /* create new */ - fpv = st_translate_fragment_program(st, stfp, key); + fpv = st_create_fp_variant(st, stfp, key); if (fpv) { /* insert into list */ fpv->next = stfp->variants; diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index 6f4a6a1b802..d4b5c1f427a 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -76,8 +76,6 @@ struct st_fp_variant /** Parameters which generated this version of fragment program */ struct st_fp_variant_key key; - struct pipe_shader_state tgsi; - /** Driver's compiled shader */ void *driver_shader; @@ -100,6 +98,7 @@ struct st_fp_variant struct st_fragment_program { struct gl_fragment_program Base; + struct pipe_shader_state tgsi; struct glsl_to_tgsi_visitor* glsl_to_tgsi; struct st_fp_variant *variants; @@ -439,6 +438,10 @@ extern bool st_translate_vertex_program(struct st_context *st, struct st_vertex_program *stvp); +extern bool +st_translate_fragment_program(struct st_context *st, + struct st_fragment_program *stfp); + extern void st_print_current_vertex_program(void); From 897177020bb702cd18eafcc1d8c4e7f502a8a65d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 03:26:48 +0200 Subject: [PATCH 045/270] st/mesa: translate geometry shaders into TGSI when we get them Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_cb_program.c | 2 ++ 
src/mesa/state_tracker/st_program.c | 38 ++++++++++++++++---------- src/mesa/state_tracker/st_program.h | 5 ++++ 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c index 40eeb0f703e..dff06dd2608 100644 --- a/src/mesa/state_tracker/st_cb_program.c +++ b/src/mesa/state_tracker/st_cb_program.c @@ -244,6 +244,8 @@ st_program_string_notify( struct gl_context *ctx, struct st_geometry_program *stgp = (struct st_geometry_program *) prog; st_release_gp_variants(st, stgp); + if (!st_translate_geometry_program(st, stgp)) + return false; if (st->gp == stgp) st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM; diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 5eded93650c..37e7a09daf7 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -170,6 +170,11 @@ st_release_gp_variants(struct st_context *st, struct st_geometry_program *stgp) } stgp->variants = NULL; + + if (stgp->tgsi.tokens) { + ureg_free_tokens(stgp->tgsi.tokens); + stgp->tgsi.tokens = NULL; + } } @@ -1276,19 +1281,15 @@ st_translate_program_common(struct st_context *st, /** * Translate a geometry program to create a new variant. 
*/ -static struct st_gp_variant * +bool st_translate_geometry_program(struct st_context *st, - struct st_geometry_program *stgp, - const struct st_gp_variant_key *key) + struct st_geometry_program *stgp) { - struct pipe_context *pipe = st->pipe; struct ureg_program *ureg; - struct st_gp_variant *gpv; - struct pipe_shader_state state; ureg = ureg_create_with_screen(TGSI_PROCESSOR_GEOMETRY, st->pipe->screen); if (ureg == NULL) - return NULL; + return false; ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, stgp->Base.InputType); ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, stgp->Base.OutputType); @@ -1297,19 +1298,26 @@ st_translate_geometry_program(struct st_context *st, ureg_property(ureg, TGSI_PROPERTY_GS_INVOCATIONS, stgp->Base.Invocations); st_translate_program_common(st, &stgp->Base.Base, stgp->glsl_to_tgsi, ureg, - TGSI_PROCESSOR_GEOMETRY, &state); + TGSI_PROCESSOR_GEOMETRY, &stgp->tgsi); + return true; +} + + +static struct st_gp_variant * +st_create_gp_variant(struct st_context *st, + struct st_geometry_program *stgp, + const struct st_gp_variant_key *key) +{ + struct pipe_context *pipe = st->pipe; + struct st_gp_variant *gpv; gpv = CALLOC_STRUCT(st_gp_variant); - if (!gpv) { - ureg_free_tokens(state.tokens); + if (!gpv) return NULL; - } /* fill in new variant */ - gpv->driver_shader = pipe->create_gs_state(pipe, &state); + gpv->driver_shader = pipe->create_gs_state(pipe, &stgp->tgsi); gpv->key = *key; - - ureg_free_tokens(state.tokens); return gpv; } @@ -1333,7 +1341,7 @@ st_get_gp_variant(struct st_context *st, if (!gpv) { /* create new */ - gpv = st_translate_geometry_program(st, stgp, key); + gpv = st_create_gp_variant(st, stgp, key); if (gpv) { /* insert into list */ gpv->next = stgp->variants; diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index d4b5c1f427a..3a4c2604812 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -200,6 +200,7 @@ struct st_gp_variant struct 
st_geometry_program { struct gl_geometry_program Base; /**< The Mesa geometry program */ + struct pipe_shader_state tgsi; struct glsl_to_tgsi_visitor* glsl_to_tgsi; struct st_gp_variant *variants; @@ -442,6 +443,10 @@ extern bool st_translate_fragment_program(struct st_context *st, struct st_fragment_program *stfp); +extern bool +st_translate_geometry_program(struct st_context *st, + struct st_geometry_program *stgp); + extern void st_print_current_vertex_program(void); From e5073e8d0c1ea98c7e5cfff8fe69cd779bc129d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 03:47:44 +0200 Subject: [PATCH 046/270] st/mesa: translate tessellation shaders into TGSI when we get them Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_cb_program.c | 4 ++ src/mesa/state_tracker/st_program.c | 86 +++++++++++++++----------- src/mesa/state_tracker/st_program.h | 10 +++ 3 files changed, 64 insertions(+), 36 deletions(-) diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c index dff06dd2608..003ce336fd4 100644 --- a/src/mesa/state_tracker/st_cb_program.c +++ b/src/mesa/state_tracker/st_cb_program.c @@ -265,6 +265,8 @@ st_program_string_notify( struct gl_context *ctx, (struct st_tessctrl_program *) prog; st_release_tcp_variants(st, sttcp); + if (!st_translate_tessctrl_program(st, sttcp)) + return false; if (st->tcp == sttcp) st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM; @@ -274,6 +276,8 @@ st_program_string_notify( struct gl_context *ctx, (struct st_tesseval_program *) prog; st_release_tep_variants(st, sttep); + if (!st_translate_tesseval_program(st, sttep)) + return false; if (st->tep == sttep) st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM; diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 37e7a09daf7..3317071ed59 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -207,6 +207,11 @@ 
st_release_tcp_variants(struct st_context *st, struct st_tessctrl_program *sttcp } sttcp->variants = NULL; + + if (sttcp->tgsi.tokens) { + ureg_free_tokens(sttcp->tgsi.tokens); + sttcp->tgsi.tokens = NULL; + } } @@ -239,6 +244,11 @@ st_release_tep_variants(struct st_context *st, struct st_tesseval_program *sttep } sttep->variants = NULL; + + if (sttep->tgsi.tokens) { + ureg_free_tokens(sttep->tgsi.tokens); + sttep->tgsi.tokens = NULL; + } } @@ -1356,38 +1366,40 @@ st_get_gp_variant(struct st_context *st, /** * Translate a tessellation control program to create a new variant. */ -static struct st_tcp_variant * +bool st_translate_tessctrl_program(struct st_context *st, - struct st_tessctrl_program *sttcp, - const struct st_tcp_variant_key *key) + struct st_tessctrl_program *sttcp) { - struct pipe_context *pipe = st->pipe; struct ureg_program *ureg; - struct st_tcp_variant *tcpv; - struct pipe_shader_state state; - ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_CTRL, pipe->screen); - if (ureg == NULL) { - return NULL; - } + ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_CTRL, st->pipe->screen); + if (ureg == NULL) + return false; ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, sttcp->Base.VerticesOut); st_translate_program_common(st, &sttcp->Base.Base, sttcp->glsl_to_tgsi, - ureg, TGSI_PROCESSOR_TESS_CTRL, &state); + ureg, TGSI_PROCESSOR_TESS_CTRL, &sttcp->tgsi); + return true; +} + + +static struct st_tcp_variant * +st_create_tcp_variant(struct st_context *st, + struct st_tessctrl_program *sttcp, + const struct st_tcp_variant_key *key) +{ + struct pipe_context *pipe = st->pipe; + struct st_tcp_variant *tcpv; tcpv = CALLOC_STRUCT(st_tcp_variant); - if (!tcpv) { - ureg_free_tokens(state.tokens); + if (!tcpv) return NULL; - } /* fill in new variant */ - tcpv->driver_shader = pipe->create_tcs_state(pipe, &state); + tcpv->driver_shader = pipe->create_tcs_state(pipe, &sttcp->tgsi); tcpv->key = *key; - - ureg_free_tokens(state.tokens); return tcpv; } @@ -1411,7 
+1423,7 @@ st_get_tcp_variant(struct st_context *st, if (!tcpv) { /* create new */ - tcpv = st_translate_tessctrl_program(st, sttcp, key); + tcpv = st_create_tcp_variant(st, sttcp, key); if (tcpv) { /* insert into list */ tcpv->next = sttcp->variants; @@ -1426,20 +1438,15 @@ st_get_tcp_variant(struct st_context *st, /** * Translate a tessellation evaluation program to create a new variant. */ -static struct st_tep_variant * +bool st_translate_tesseval_program(struct st_context *st, - struct st_tesseval_program *sttep, - const struct st_tep_variant_key *key) + struct st_tesseval_program *sttep) { - struct pipe_context *pipe = st->pipe; struct ureg_program *ureg; - struct st_tep_variant *tepv; - struct pipe_shader_state state; - ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_EVAL, pipe->screen); - if (ureg == NULL) { - return NULL; - } + ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_EVAL, st->pipe->screen); + if (ureg == NULL) + return false; if (sttep->Base.PrimitiveMode == GL_ISOLINES) ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, GL_LINES); @@ -1467,19 +1474,26 @@ st_translate_tesseval_program(struct st_context *st, ureg_property(ureg, TGSI_PROPERTY_TES_POINT_MODE, sttep->Base.PointMode); st_translate_program_common(st, &sttep->Base.Base, sttep->glsl_to_tgsi, - ureg, TGSI_PROCESSOR_TESS_EVAL, &state); + ureg, TGSI_PROCESSOR_TESS_EVAL, &sttep->tgsi); + return true; +} + + +static struct st_tep_variant * +st_create_tep_variant(struct st_context *st, + struct st_tesseval_program *sttep, + const struct st_tep_variant_key *key) +{ + struct pipe_context *pipe = st->pipe; + struct st_tep_variant *tepv; tepv = CALLOC_STRUCT(st_tep_variant); - if (!tepv) { - ureg_free_tokens(state.tokens); + if (!tepv) return NULL; - } /* fill in new variant */ - tepv->driver_shader = pipe->create_tes_state(pipe, &state); + tepv->driver_shader = pipe->create_tes_state(pipe, &sttep->tgsi); tepv->key = *key; - - ureg_free_tokens(state.tokens); return tepv; } @@ -1503,7 +1517,7 @@ 
st_get_tep_variant(struct st_context *st, if (!tepv) { /* create new */ - tepv = st_translate_tesseval_program(st, sttep, key); + tepv = st_create_tep_variant(st, sttep, key); if (tepv) { /* insert into list */ tepv->next = sttep->variants; diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index 3a4c2604812..d9b53ac008c 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -236,6 +236,7 @@ struct st_tcp_variant struct st_tessctrl_program { struct gl_tess_ctrl_program Base; /**< The Mesa tess ctrl program */ + struct pipe_shader_state tgsi; struct glsl_to_tgsi_visitor* glsl_to_tgsi; struct st_tcp_variant *variants; @@ -271,6 +272,7 @@ struct st_tep_variant struct st_tesseval_program { struct gl_tess_eval_program Base; /**< The Mesa tess eval program */ + struct pipe_shader_state tgsi; struct glsl_to_tgsi_visitor* glsl_to_tgsi; struct st_tep_variant *variants; @@ -447,6 +449,14 @@ extern bool st_translate_geometry_program(struct st_context *st, struct st_geometry_program *stgp); +extern bool +st_translate_tessctrl_program(struct st_context *st, + struct st_tessctrl_program *sttcp); + +extern bool +st_translate_tesseval_program(struct st_context *st, + struct st_tesseval_program *sttep); + extern void st_print_current_vertex_program(void); From ee01942eb595b05aff47b4fdcd358508f2d6f14c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 22:46:44 +0200 Subject: [PATCH 047/270] st/mesa: release the glsl_to_tgsi visitor after translation Reviewed-by: Dave Airlie Reviewed-by: Brian Paul Tested-by: Brian Paul --- src/mesa/state_tracker/st_program.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 3317071ed59..6a69ba7aa26 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -425,6 +425,9 @@ 
st_translate_vertex_program(struct st_context *st, st_translate_stream_output_info(stvp->glsl_to_tgsi, stvp->result_to_output, &stvp->tgsi.stream_output); + + free_glsl_to_tgsi_visitor(stvp->glsl_to_tgsi); + stvp->glsl_to_tgsi = NULL; } else error = st_translate_mesa_program(st->ctx, TGSI_PROCESSOR_VERTEX, @@ -815,7 +818,7 @@ st_translate_fragment_program(struct st_context *st, } } - if (stfp->glsl_to_tgsi) + if (stfp->glsl_to_tgsi) { st_translate_program(st->ctx, TGSI_PROCESSOR_FRAGMENT, ureg, @@ -835,7 +838,10 @@ st_translate_fragment_program(struct st_context *st, NULL, fs_output_semantic_name, fs_output_semantic_index); - else + + free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi); + stfp->glsl_to_tgsi = NULL; + } else st_translate_mesa_program(st->ctx, TGSI_PROCESSOR_FRAGMENT, ureg, @@ -1309,6 +1315,9 @@ st_translate_geometry_program(struct st_context *st, st_translate_program_common(st, &stgp->Base.Base, stgp->glsl_to_tgsi, ureg, TGSI_PROCESSOR_GEOMETRY, &stgp->tgsi); + + free_glsl_to_tgsi_visitor(stgp->glsl_to_tgsi); + stgp->glsl_to_tgsi = NULL; return true; } @@ -1381,6 +1390,9 @@ st_translate_tessctrl_program(struct st_context *st, st_translate_program_common(st, &sttcp->Base.Base, sttcp->glsl_to_tgsi, ureg, TGSI_PROCESSOR_TESS_CTRL, &sttcp->tgsi); + + free_glsl_to_tgsi_visitor(sttcp->glsl_to_tgsi); + sttcp->glsl_to_tgsi = NULL; return true; } @@ -1475,6 +1487,9 @@ st_translate_tesseval_program(struct st_context *st, st_translate_program_common(st, &sttep->Base.Base, sttep->glsl_to_tgsi, ureg, TGSI_PROCESSOR_TESS_EVAL, &sttep->tgsi); + + free_glsl_to_tgsi_visitor(sttep->glsl_to_tgsi); + sttep->glsl_to_tgsi = NULL; return true; } From c947a3a4c4fbd210fd19a78d7b07ccfdd6f03812 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 21:39:17 +0200 Subject: [PATCH 048/270] program: remove unused function _mesa_find_line_column Reviewed-by: Emil Velikov Reviewed-by: Brian Paul --- src/mesa/program/program.c | 43 
-------------------------------------- src/mesa/program/program.h | 5 ----- 2 files changed, 48 deletions(-) diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c index e94c1021258..23d8be821f8 100644 --- a/src/mesa/program/program.c +++ b/src/mesa/program/program.c @@ -172,49 +172,6 @@ _mesa_set_program_error(struct gl_context *ctx, GLint pos, const char *string) } -/** - * Find the line number and column for 'pos' within 'string'. - * Return a copy of the line which contains 'pos'. Free the line with - * free(). - * \param string the program string - * \param pos the position within the string - * \param line returns the line number corresponding to 'pos'. - * \param col returns the column number corresponding to 'pos'. - * \return copy of the line containing 'pos'. - */ -const GLubyte * -_mesa_find_line_column(const GLubyte *string, const GLubyte *pos, - GLint *line, GLint *col) -{ - const GLubyte *lineStart = string; - const GLubyte *p = string; - GLubyte *s; - int len; - - *line = 1; - - while (p != pos) { - if (*p == (GLubyte) '\n') { - (*line)++; - lineStart = p + 1; - } - p++; - } - - *col = (pos - lineStart) + 1; - - /* return copy of this line */ - while (*p != 0 && *p != '\n') - p++; - len = p - lineStart; - s = malloc(len + 1); - memcpy(s, lineStart, len); - s[len] = 0; - - return s; -} - - /** * Initialize a new gl_program object. 
*/ diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h index a894147cafd..aad81de0f35 100644 --- a/src/mesa/program/program.h +++ b/src/mesa/program/program.h @@ -63,11 +63,6 @@ _mesa_update_default_objects_program(struct gl_context *ctx); extern void _mesa_set_program_error(struct gl_context *ctx, GLint pos, const char *string); -extern const GLubyte * -_mesa_find_line_column(const GLubyte *string, const GLubyte *pos, - GLint *line, GLint *col); - - extern struct gl_program * _mesa_init_vertex_program(struct gl_context *ctx, struct gl_vertex_program *prog, From 5042a3eef8d4e4314d0463b83d267875fd3dd910 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 21:41:03 +0200 Subject: [PATCH 049/270] program: remove unused cloning and combining functions Reviewed-by: Emil Velikov Reviewed-by: Brian Paul --- src/mesa/program/program.c | 250 ------------------------------------- src/mesa/program/program.h | 44 ------- 2 files changed, 294 deletions(-) diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c index 23d8be821f8..eb1f8bec220 100644 --- a/src/mesa/program/program.c +++ b/src/mesa/program/program.c @@ -450,123 +450,6 @@ _mesa_reference_program_(struct gl_context *ctx, } -/** - * Return a copy of a program. - * XXX Problem here if the program object is actually OO-derivation - * made by a device driver. 
- */ -struct gl_program * -_mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog) -{ - struct gl_program *clone; - - clone = ctx->Driver.NewProgram(ctx, prog->Target, prog->Id); - if (!clone) - return NULL; - - assert(clone->Target == prog->Target); - assert(clone->RefCount == 1); - - clone->String = (GLubyte *) strdup((char *) prog->String); - clone->Format = prog->Format; - clone->Instructions = _mesa_alloc_instructions(prog->NumInstructions); - if (!clone->Instructions) { - _mesa_reference_program(ctx, &clone, NULL); - return NULL; - } - _mesa_copy_instructions(clone->Instructions, prog->Instructions, - prog->NumInstructions); - clone->InputsRead = prog->InputsRead; - clone->OutputsWritten = prog->OutputsWritten; - clone->SamplersUsed = prog->SamplersUsed; - clone->ShadowSamplers = prog->ShadowSamplers; - memcpy(clone->TexturesUsed, prog->TexturesUsed, sizeof(prog->TexturesUsed)); - - if (prog->Parameters) - clone->Parameters = _mesa_clone_parameter_list(prog->Parameters); - if (prog->LocalParams) { - clone->LocalParams = malloc(MAX_PROGRAM_LOCAL_PARAMS * - sizeof(float[4])); - if (!clone->LocalParams) { - _mesa_reference_program(ctx, &clone, NULL); - return NULL; - } - memcpy(clone->LocalParams, prog->LocalParams, - MAX_PROGRAM_LOCAL_PARAMS * sizeof(float[4])); - } - clone->IndirectRegisterFiles = prog->IndirectRegisterFiles; - clone->NumInstructions = prog->NumInstructions; - clone->NumTemporaries = prog->NumTemporaries; - clone->NumParameters = prog->NumParameters; - clone->NumAttributes = prog->NumAttributes; - clone->NumAddressRegs = prog->NumAddressRegs; - clone->NumNativeInstructions = prog->NumNativeInstructions; - clone->NumNativeTemporaries = prog->NumNativeTemporaries; - clone->NumNativeParameters = prog->NumNativeParameters; - clone->NumNativeAttributes = prog->NumNativeAttributes; - clone->NumNativeAddressRegs = prog->NumNativeAddressRegs; - clone->NumAluInstructions = prog->NumAluInstructions; - clone->NumTexInstructions = 
prog->NumTexInstructions; - clone->NumTexIndirections = prog->NumTexIndirections; - clone->NumNativeAluInstructions = prog->NumNativeAluInstructions; - clone->NumNativeTexInstructions = prog->NumNativeTexInstructions; - clone->NumNativeTexIndirections = prog->NumNativeTexIndirections; - - switch (prog->Target) { - case GL_VERTEX_PROGRAM_ARB: - { - const struct gl_vertex_program *vp = gl_vertex_program_const(prog); - struct gl_vertex_program *vpc = gl_vertex_program(clone); - vpc->IsPositionInvariant = vp->IsPositionInvariant; - } - break; - case GL_FRAGMENT_PROGRAM_ARB: - { - const struct gl_fragment_program *fp = gl_fragment_program_const(prog); - struct gl_fragment_program *fpc = gl_fragment_program(clone); - fpc->UsesKill = fp->UsesKill; - fpc->UsesDFdy = fp->UsesDFdy; - fpc->OriginUpperLeft = fp->OriginUpperLeft; - fpc->PixelCenterInteger = fp->PixelCenterInteger; - } - break; - case GL_GEOMETRY_PROGRAM_NV: - { - const struct gl_geometry_program *gp = gl_geometry_program_const(prog); - struct gl_geometry_program *gpc = gl_geometry_program(clone); - gpc->VerticesOut = gp->VerticesOut; - gpc->InputType = gp->InputType; - gpc->Invocations = gp->Invocations; - gpc->OutputType = gp->OutputType; - gpc->UsesEndPrimitive = gp->UsesEndPrimitive; - gpc->UsesStreams = gp->UsesStreams; - } - break; - case GL_TESS_CONTROL_PROGRAM_NV: - { - const struct gl_tess_ctrl_program *tcp = gl_tess_ctrl_program_const(prog); - struct gl_tess_ctrl_program *tcpc = gl_tess_ctrl_program(clone); - tcpc->VerticesOut = tcp->VerticesOut; - } - break; - case GL_TESS_EVALUATION_PROGRAM_NV: - { - const struct gl_tess_eval_program *tep = gl_tess_eval_program_const(prog); - struct gl_tess_eval_program *tepc = gl_tess_eval_program(clone); - tepc->PrimitiveMode = tep->PrimitiveMode; - tepc->Spacing = tep->Spacing; - tepc->VertexOrder = tep->VertexOrder; - tepc->PointMode = tep->PointMode; - } - break; - default: - _mesa_problem(NULL, "Unexpected target in _mesa_clone_program"); - } - - return clone; 
-} - - /** * Insert 'count' NOP instructions at 'start' in the given program. * Adjust branch targets accordingly. @@ -714,139 +597,6 @@ adjust_param_indexes(struct prog_instruction *inst, GLuint numInst, } -/** - * Combine two programs into one. Fix instructions so the outputs of - * the first program go to the inputs of the second program. - */ -struct gl_program * -_mesa_combine_programs(struct gl_context *ctx, - const struct gl_program *progA, - const struct gl_program *progB) -{ - struct prog_instruction *newInst; - struct gl_program *newProg; - const GLuint lenA = progA->NumInstructions - 1; /* omit END instr */ - const GLuint lenB = progB->NumInstructions; - const GLuint numParamsA = _mesa_num_parameters(progA->Parameters); - const GLuint newLength = lenA + lenB; - GLboolean usedTemps[MAX_PROGRAM_TEMPS]; - GLuint firstTemp = 0; - GLbitfield64 inputsB; - GLuint i; - - assert(progA->Target == progB->Target); - - newInst = _mesa_alloc_instructions(newLength); - if (!newInst) - return GL_FALSE; - - _mesa_copy_instructions(newInst, progA->Instructions, lenA); - _mesa_copy_instructions(newInst + lenA, progB->Instructions, lenB); - - /* adjust branch / instruction addresses for B's instructions */ - for (i = 0; i < lenB; i++) { - newInst[lenA + i].BranchTarget += lenA; - } - - newProg = ctx->Driver.NewProgram(ctx, progA->Target, 0); - newProg->Instructions = newInst; - newProg->NumInstructions = newLength; - - /* find used temp regs (we may need new temps below) */ - _mesa_find_used_registers(newProg, PROGRAM_TEMPORARY, - usedTemps, MAX_PROGRAM_TEMPS); - - if (newProg->Target == GL_FRAGMENT_PROGRAM_ARB) { - const struct gl_fragment_program *fprogA, *fprogB; - struct gl_fragment_program *newFprog; - GLbitfield64 progB_inputsRead = progB->InputsRead; - GLint progB_colorFile, progB_colorIndex; - - fprogA = gl_fragment_program_const(progA); - fprogB = gl_fragment_program_const(progB); - newFprog = gl_fragment_program(newProg); - - newFprog->UsesKill = fprogA->UsesKill 
|| fprogB->UsesKill; - newFprog->UsesDFdy = fprogA->UsesDFdy || fprogB->UsesDFdy; - - /* We'll do a search and replace for instances - * of progB_colorFile/progB_colorIndex below... - */ - progB_colorFile = PROGRAM_INPUT; - progB_colorIndex = VARYING_SLOT_COL0; - - /* - * The fragment program may get color from a state var rather than - * a fragment input (vertex output) if it's constant. - * See the texenvprogram.c code. - * So, search the program's parameter list now to see if the program - * gets color from a state var instead of a conventional fragment - * input register. - */ - for (i = 0; i < progB->Parameters->NumParameters; i++) { - struct gl_program_parameter *p = &progB->Parameters->Parameters[i]; - if (p->Type == PROGRAM_STATE_VAR && - p->StateIndexes[0] == STATE_INTERNAL && - p->StateIndexes[1] == STATE_CURRENT_ATTRIB && - (int) p->StateIndexes[2] == (int) VERT_ATTRIB_COLOR0) { - progB_inputsRead |= VARYING_BIT_COL0; - progB_colorFile = PROGRAM_STATE_VAR; - progB_colorIndex = i; - break; - } - } - - /* Connect color outputs of fprogA to color inputs of fprogB, via a - * new temporary register. 
- */ - if ((progA->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_COLOR)) && - (progB_inputsRead & VARYING_BIT_COL0)) { - GLint tempReg = _mesa_find_free_register(usedTemps, MAX_PROGRAM_TEMPS, - firstTemp); - if (tempReg < 0) { - _mesa_problem(ctx, "No free temp regs found in " - "_mesa_combine_programs(), using 31"); - tempReg = 31; - } - firstTemp = tempReg + 1; - - /* replace writes to result.color[0] with tempReg */ - replace_registers(newInst, lenA, - PROGRAM_OUTPUT, FRAG_RESULT_COLOR, - PROGRAM_TEMPORARY, tempReg); - /* replace reads from the input color with tempReg */ - replace_registers(newInst + lenA, lenB, - progB_colorFile, progB_colorIndex, /* search for */ - PROGRAM_TEMPORARY, tempReg /* replace with */ ); - } - - /* compute combined program's InputsRead */ - inputsB = progB_inputsRead; - if (progA->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_COLOR)) { - inputsB &= ~(1 << VARYING_SLOT_COL0); - } - newProg->InputsRead = progA->InputsRead | inputsB; - newProg->OutputsWritten = progB->OutputsWritten; - newProg->SamplersUsed = progA->SamplersUsed | progB->SamplersUsed; - } - else { - /* vertex program */ - assert(0); /* XXX todo */ - } - - /* - * Merge parameters (uniforms, constants, etc) - */ - newProg->Parameters = _mesa_combine_parameter_lists(progA->Parameters, - progB->Parameters); - - adjust_param_indexes(newInst + lenA, lenB, numParamsA); - - - return newProg; -} - - /** * Populate the 'used' array with flags indicating which registers (TEMPs, * INPUTs, OUTPUTs, etc, are used by the given program. 
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h index aad81de0f35..f17b2f8f294 100644 --- a/src/mesa/program/program.h +++ b/src/mesa/program/program.h @@ -171,56 +171,12 @@ _mesa_reference_tesseprog(struct gl_context *ctx, (struct gl_program *) prog); } -extern struct gl_program * -_mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog); - -static inline struct gl_vertex_program * -_mesa_clone_vertex_program(struct gl_context *ctx, - const struct gl_vertex_program *prog) -{ - return (struct gl_vertex_program *) _mesa_clone_program(ctx, &prog->Base); -} - -static inline struct gl_tess_ctrl_program * -_mesa_clone_tess_ctrl_program(struct gl_context *ctx, - const struct gl_tess_ctrl_program *prog) -{ - return (struct gl_tess_ctrl_program *) _mesa_clone_program(ctx, &prog->Base); -} - -static inline struct gl_tess_eval_program * -_mesa_clone_tess_eval_program(struct gl_context *ctx, - const struct gl_tess_eval_program *prog) -{ - return (struct gl_tess_eval_program *) _mesa_clone_program(ctx, &prog->Base); -} - -static inline struct gl_geometry_program * -_mesa_clone_geometry_program(struct gl_context *ctx, - const struct gl_geometry_program *prog) -{ - return (struct gl_geometry_program *) _mesa_clone_program(ctx, &prog->Base); -} - -static inline struct gl_fragment_program * -_mesa_clone_fragment_program(struct gl_context *ctx, - const struct gl_fragment_program *prog) -{ - return (struct gl_fragment_program *) _mesa_clone_program(ctx, &prog->Base); -} - - extern GLboolean _mesa_insert_instructions(struct gl_program *prog, GLuint start, GLuint count); extern GLboolean _mesa_delete_instructions(struct gl_program *prog, GLuint start, GLuint count); -extern struct gl_program * -_mesa_combine_programs(struct gl_context *ctx, - const struct gl_program *progA, - const struct gl_program *progB); - extern void _mesa_find_used_registers(const struct gl_program *prog, gl_register_file file, From 092f0427dcc15e36666f24c817957727632ed377 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 21:42:42 +0200 Subject: [PATCH 050/270] program: remove other unused functions Reviewed-by: Emil Velikov Reviewed-by: Brian Paul --- src/mesa/program/program.c | 134 ------------------------------------- src/mesa/program/program.h | 9 --- 2 files changed, 143 deletions(-) diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c index eb1f8bec220..1fcb8e06ea3 100644 --- a/src/mesa/program/program.c +++ b/src/mesa/program/program.c @@ -659,140 +659,6 @@ _mesa_find_free_register(const GLboolean used[], } - -/** - * Check if the given register index is valid (doesn't exceed implementation- - * dependent limits). - * \return GL_TRUE if OK, GL_FALSE if bad index - */ -GLboolean -_mesa_valid_register_index(const struct gl_context *ctx, - gl_shader_stage shaderType, - gl_register_file file, GLint index) -{ - const struct gl_program_constants *c; - - assert(0 <= shaderType && shaderType < MESA_SHADER_STAGES); - c = &ctx->Const.Program[shaderType]; - - switch (file) { - case PROGRAM_UNDEFINED: - return GL_TRUE; /* XXX or maybe false? 
*/ - - case PROGRAM_TEMPORARY: - return index >= 0 && index < (GLint) c->MaxTemps; - - case PROGRAM_UNIFORM: - case PROGRAM_STATE_VAR: - /* aka constant buffer */ - return index >= 0 && index < (GLint) c->MaxUniformComponents / 4; - - case PROGRAM_CONSTANT: - /* constant buffer w/ possible relative negative addressing */ - return (index > (int) c->MaxUniformComponents / -4 && - index < (int) c->MaxUniformComponents / 4); - - case PROGRAM_INPUT: - if (index < 0) - return GL_FALSE; - - switch (shaderType) { - case MESA_SHADER_VERTEX: - return index < VERT_ATTRIB_GENERIC0 + (GLint) c->MaxAttribs; - case MESA_SHADER_FRAGMENT: - return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying; - case MESA_SHADER_GEOMETRY: - return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying; - default: - return GL_FALSE; - } - - case PROGRAM_OUTPUT: - if (index < 0) - return GL_FALSE; - - switch (shaderType) { - case MESA_SHADER_VERTEX: - return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying; - case MESA_SHADER_FRAGMENT: - return index < FRAG_RESULT_DATA0 + (GLint) ctx->Const.MaxDrawBuffers; - case MESA_SHADER_GEOMETRY: - return index < VARYING_SLOT_VAR0 + (GLint) ctx->Const.MaxVarying; - default: - return GL_FALSE; - } - - case PROGRAM_ADDRESS: - return index >= 0 && index < (GLint) c->MaxAddressRegs; - - default: - _mesa_problem(ctx, - "unexpected register file in _mesa_valid_register_index()"); - return GL_FALSE; - } -} - - - -/** - * "Post-process" a GPU program. This is intended to be used for debugging. - * Example actions include no-op'ing instructions or changing instruction - * behaviour. 
- */ -void -_mesa_postprocess_program(struct gl_context *ctx, struct gl_program *prog) -{ - static const GLfloat white[4] = { 0.5, 0.5, 0.5, 0.5 }; - GLuint i; - GLuint whiteSwizzle; - GLint whiteIndex = _mesa_add_unnamed_constant(prog->Parameters, - (gl_constant_value *) white, - 4, &whiteSwizzle); - - (void) whiteIndex; - - for (i = 0; i < prog->NumInstructions; i++) { - struct prog_instruction *inst = prog->Instructions + i; - const GLuint n = _mesa_num_inst_src_regs(inst->Opcode); - - (void) n; - - if (_mesa_is_tex_instruction(inst->Opcode)) { -#if 0 - /* replace TEX/TXP/TXB with MOV */ - inst->Opcode = OPCODE_MOV; - inst->DstReg.WriteMask = WRITEMASK_XYZW; - inst->SrcReg[0].Swizzle = SWIZZLE_XYZW; - inst->SrcReg[0].Negate = NEGATE_NONE; -#endif - -#if 0 - /* disable shadow texture mode */ - inst->TexShadow = 0; -#endif - } - - if (inst->Opcode == OPCODE_TXP) { -#if 0 - inst->Opcode = OPCODE_MOV; - inst->DstReg.WriteMask = WRITEMASK_XYZW; - inst->SrcReg[0].File = PROGRAM_CONSTANT; - inst->SrcReg[0].Index = whiteIndex; - inst->SrcReg[0].Swizzle = SWIZZLE_XYZW; - inst->SrcReg[0].Negate = NEGATE_NONE; -#endif -#if 0 - inst->TexShadow = 0; -#endif -#if 0 - inst->Opcode = OPCODE_TEX; - inst->TexShadow = 0; -#endif - } - - } -} - /* Gets the minimum number of shader invocations per fragment. * This function is useful to determine if we need to do per * sample shading or per fragment shading. 
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h index f17b2f8f294..6f54fac4f32 100644 --- a/src/mesa/program/program.h +++ b/src/mesa/program/program.h @@ -186,15 +186,6 @@ extern GLint _mesa_find_free_register(const GLboolean used[], GLuint maxRegs, GLuint firstReg); - -extern GLboolean -_mesa_valid_register_index(const struct gl_context *ctx, - gl_shader_stage shaderType, - gl_register_file file, GLint index); - -extern void -_mesa_postprocess_program(struct gl_context *ctx, struct gl_program *prog); - extern GLint _mesa_get_min_invocations_per_fragment(struct gl_context *ctx, const struct gl_fragment_program *prog, From d695c676ea61f48f121969462ece708b739a02c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2015 22:13:34 +0200 Subject: [PATCH 051/270] program: remove _mesa_init_*_program wrappers They didn't do anything useful. Reviewed-by: Emil Velikov Reviewed-by: Brian Paul --- src/mesa/drivers/dri/i915/i915_fragprog.c | 7 +- src/mesa/drivers/dri/i965/brw_program.c | 10 +- .../dri/i965/test_fs_cmod_propagation.cpp | 2 +- .../dri/i965/test_fs_saturate_propagation.cpp | 2 +- .../dri/i965/test_vec4_copy_propagation.cpp | 2 +- .../dri/i965/test_vec4_register_coalesce.cpp | 2 +- src/mesa/drivers/dri/r200/r200_vertprog.c | 4 +- src/mesa/program/program.c | 133 +++--------------- src/mesa/program/program.h | 29 +--- src/mesa/state_tracker/st_cb_program.c | 43 +++--- 10 files changed, 50 insertions(+), 184 deletions(-) diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c index 1a5943c87fb..237d219289b 100644 --- a/src/mesa/drivers/dri/i915/i915_fragprog.c +++ b/src/mesa/drivers/dri/i915/i915_fragprog.c @@ -1316,8 +1316,8 @@ i915NewProgram(struct gl_context * ctx, GLenum target, GLuint id) { switch (target) { case GL_VERTEX_PROGRAM_ARB: - return _mesa_init_vertex_program(ctx, CALLOC_STRUCT(gl_vertex_program), - target, id); + return 
_mesa_init_gl_program(CALLOC_STRUCT(gl_vertex_program), + target, id); case GL_FRAGMENT_PROGRAM_ARB:{ struct i915_fragment_program *prog = @@ -1325,8 +1325,7 @@ i915NewProgram(struct gl_context * ctx, GLenum target, GLuint id) if (prog) { i915_init_program(I915_CONTEXT(ctx), prog); - return _mesa_init_fragment_program(ctx, &prog->FragProg, - target, id); + return _mesa_init_gl_program(prog, target, id); } else return NULL; diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 69ecc36f2e7..164c3d76c99 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -69,8 +69,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx, if (prog) { prog->id = get_new_program_id(brw->intelScreen); - return _mesa_init_vertex_program( ctx, &prog->program, - target, id ); + return _mesa_init_gl_program(&prog->program, target, id); } else return NULL; @@ -81,8 +80,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx, if (prog) { prog->id = get_new_program_id(brw->intelScreen); - return _mesa_init_fragment_program( ctx, &prog->program, - target, id ); + return _mesa_init_gl_program(&prog->program, target, id); } else return NULL; @@ -93,7 +91,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx, if (prog) { prog->id = get_new_program_id(brw->intelScreen); - return _mesa_init_geometry_program(ctx, &prog->program, target, id); + return _mesa_init_gl_program(&prog->program, target, id); } else { return NULL; } @@ -104,7 +102,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx, if (prog) { prog->id = get_new_program_id(brw->intelScreen); - return _mesa_init_compute_program(ctx, &prog->program, target, id); + return _mesa_init_gl_program(&prog->program, target, id); } else { return NULL; } diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp index 
8adb626d420..7eee42630a6 100644 --- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp @@ -66,7 +66,7 @@ void cmod_propagation_test::SetUp() v = new cmod_propagation_fs_visitor(compiler, prog_data, shader); - _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0); + _mesa_init_gl_program(&fp->program, GL_FRAGMENT_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp index f77b18e7db8..fefde4bb7bf 100644 --- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp @@ -66,7 +66,7 @@ void saturate_propagation_test::SetUp() v = new saturate_propagation_fs_visitor(compiler, prog_data, shader); - _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0); + _mesa_init_gl_program(&fp->program, GL_FRAGMENT_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp index 40253961a65..4a87e6eff96 100644 --- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp @@ -98,7 +98,7 @@ void copy_propagation_test::SetUp() v = new copy_propagation_vec4_visitor(compiler, shader); - _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0); + _mesa_init_gl_program(&vp->program, GL_VERTEX_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp index 76028d36311..92d75e79837 100644 --- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp +++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp @@ -101,7 +101,7 @@ void register_coalesce_test::SetUp() v = new register_coalesce_vec4_visitor(compiler, shader); - 
_mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0); + _mesa_init_gl_program(&vp->program, GL_VERTEX_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c index d43eaf977fc..d173605d110 100644 --- a/src/mesa/drivers/dri/r200/r200_vertprog.c +++ b/src/mesa/drivers/dri/r200/r200_vertprog.c @@ -1205,9 +1205,9 @@ r200NewProgram(struct gl_context *ctx, GLenum target, GLuint id) switch(target){ case GL_VERTEX_PROGRAM_ARB: vp = CALLOC_STRUCT(r200_vertex_program); - return _mesa_init_vertex_program(ctx, &vp->mesa_program, target, id); + return _mesa_init_gl_program(&vp->mesa_program, target, id); case GL_FRAGMENT_PROGRAM_ARB: - return _mesa_init_fragment_program( ctx, CALLOC_STRUCT(gl_fragment_program), target, id ); + return _mesa_init_gl_program(CALLOC_STRUCT(gl_fragment_program), target, id); default: _mesa_problem(ctx, "Bad target in r200NewProgram"); } diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c index 1fcb8e06ea3..c35a89b5983 100644 --- a/src/mesa/program/program.c +++ b/src/mesa/program/program.c @@ -175,12 +175,14 @@ _mesa_set_program_error(struct gl_context *ctx, GLint pos, const char *string) /** * Initialize a new gl_program object. */ -static void -init_program_struct(struct gl_program *prog, GLenum target, GLuint id) +struct gl_program * +_mesa_init_gl_program(void *_prog, GLenum target, GLuint id) { + struct gl_program *prog = (struct gl_program*)_prog; GLuint i; - assert(prog); + if (!prog) + return NULL; memset(prog, 0, sizeof(*prog)); mtx_init(&prog->Mutex, mtx_plain); @@ -192,102 +194,8 @@ init_program_struct(struct gl_program *prog, GLenum target, GLuint id) /* default mapping from samplers to texture units */ for (i = 0; i < MAX_SAMPLERS; i++) prog->SamplerUnits[i] = i; -} - -/** - * Initialize a new fragment program object. 
- */ -struct gl_program * -_mesa_init_fragment_program(struct gl_context *ctx, - struct gl_fragment_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; -} - - -/** - * Initialize a new vertex program object. - */ -struct gl_program * -_mesa_init_vertex_program(struct gl_context *ctx, - struct gl_vertex_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; -} - - -/** - * Initialize a new compute program object. - */ -struct gl_program * -_mesa_init_compute_program(struct gl_context *ctx, - struct gl_compute_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; -} - - -/** - * Initialize a new tessellation control program object. - */ -struct gl_program * -_mesa_init_tess_ctrl_program(struct gl_context *ctx, - struct gl_tess_ctrl_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; -} - - -/** - * Initialize a new tessellation evaluation program object. - */ -struct gl_program * -_mesa_init_tess_eval_program(struct gl_context *ctx, - struct gl_tess_eval_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; -} - - -/** - * Initialize a new geometry program object. 
- */ -struct gl_program * -_mesa_init_geometry_program(struct gl_context *ctx, - struct gl_geometry_program *prog, - GLenum target, GLuint id) -{ - if (prog) { - init_program_struct(&prog->Base, target, id); - return &prog->Base; - } - return NULL; + return prog; } @@ -309,34 +217,29 @@ _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id) struct gl_program *prog; switch (target) { case GL_VERTEX_PROGRAM_ARB: /* == GL_VERTEX_PROGRAM_NV */ - prog = _mesa_init_vertex_program(ctx, CALLOC_STRUCT(gl_vertex_program), - target, id ); + prog = _mesa_init_gl_program(CALLOC_STRUCT(gl_vertex_program), + target, id); break; case GL_FRAGMENT_PROGRAM_NV: case GL_FRAGMENT_PROGRAM_ARB: - prog =_mesa_init_fragment_program(ctx, - CALLOC_STRUCT(gl_fragment_program), - target, id ); + prog =_mesa_init_gl_program(CALLOC_STRUCT(gl_fragment_program), + target, id); break; case GL_GEOMETRY_PROGRAM_NV: - prog = _mesa_init_geometry_program(ctx, - CALLOC_STRUCT(gl_geometry_program), - target, id); + prog = _mesa_init_gl_program(CALLOC_STRUCT(gl_geometry_program), + target, id); break; case GL_TESS_CONTROL_PROGRAM_NV: - prog = _mesa_init_tess_ctrl_program(ctx, - CALLOC_STRUCT(gl_tess_ctrl_program), - target, id); + prog = _mesa_init_gl_program(CALLOC_STRUCT(gl_tess_ctrl_program), + target, id); break; case GL_TESS_EVALUATION_PROGRAM_NV: - prog = _mesa_init_tess_eval_program(ctx, - CALLOC_STRUCT(gl_tess_eval_program), - target, id); + prog = _mesa_init_gl_program(CALLOC_STRUCT(gl_tess_eval_program), + target, id); break; case GL_COMPUTE_PROGRAM_NV: - prog = _mesa_init_compute_program(ctx, - CALLOC_STRUCT(gl_compute_program), - target, id); + prog = _mesa_init_gl_program(CALLOC_STRUCT(gl_compute_program), + target, id); break; default: _mesa_problem(ctx, "bad target in _mesa_new_program"); diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h index 6f54fac4f32..51e10a1708b 100644 --- a/src/mesa/program/program.h +++ b/src/mesa/program/program.h @@ -64,34 +64,7 @@ 
extern void _mesa_set_program_error(struct gl_context *ctx, GLint pos, const char *string); extern struct gl_program * -_mesa_init_vertex_program(struct gl_context *ctx, - struct gl_vertex_program *prog, - GLenum target, GLuint id); - -extern struct gl_program * -_mesa_init_fragment_program(struct gl_context *ctx, - struct gl_fragment_program *prog, - GLenum target, GLuint id); - -extern struct gl_program * -_mesa_init_tess_ctrl_program(struct gl_context *ctx, - struct gl_tess_ctrl_program *prog, - GLenum target, GLuint id); - -extern struct gl_program * -_mesa_init_tess_eval_program(struct gl_context *ctx, - struct gl_tess_eval_program *prog, - GLenum target, GLuint id); - -extern struct gl_program * -_mesa_init_geometry_program(struct gl_context *ctx, - struct gl_geometry_program *prog, - GLenum target, GLuint id); - -extern struct gl_program * -_mesa_init_compute_program(struct gl_context *ctx, - struct gl_compute_program *prog, - GLenum target, GLuint id); +_mesa_init_gl_program(void *prog, GLenum target, GLuint id); extern struct gl_program * _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id); diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c index 003ce336fd4..40f2af0e550 100644 --- a/src/mesa/state_tracker/st_cb_program.c +++ b/src/mesa/state_tracker/st_cb_program.c @@ -102,36 +102,29 @@ st_use_program(struct gl_context *ctx, struct gl_shader_program *shProg) static struct gl_program * st_new_program(struct gl_context *ctx, GLenum target, GLuint id) { + struct gl_program *prog; + switch (target) { - case GL_VERTEX_PROGRAM_ARB: { - struct st_vertex_program *prog = ST_CALLOC_STRUCT(st_vertex_program); - return _mesa_init_vertex_program(ctx, &prog->Base, target, id); - } - - case GL_FRAGMENT_PROGRAM_ARB: { - struct st_fragment_program *prog = ST_CALLOC_STRUCT(st_fragment_program); - return _mesa_init_fragment_program(ctx, &prog->Base, target, id); - } - - case GL_GEOMETRY_PROGRAM_NV: { - struct 
st_geometry_program *prog = ST_CALLOC_STRUCT(st_geometry_program); - return _mesa_init_geometry_program(ctx, &prog->Base, target, id); - } - - case GL_TESS_CONTROL_PROGRAM_NV: { - struct st_tessctrl_program *prog = ST_CALLOC_STRUCT(st_tessctrl_program); - return _mesa_init_tess_ctrl_program(ctx, &prog->Base, target, id); - } - - case GL_TESS_EVALUATION_PROGRAM_NV: { - struct st_tesseval_program *prog = ST_CALLOC_STRUCT(st_tesseval_program); - return _mesa_init_tess_eval_program(ctx, &prog->Base, target, id); - } - + case GL_VERTEX_PROGRAM_ARB: + prog = (struct gl_program*)ST_CALLOC_STRUCT(st_vertex_program); + break; + case GL_FRAGMENT_PROGRAM_ARB: + prog = (struct gl_program*)ST_CALLOC_STRUCT(st_fragment_program); + break; + case GL_GEOMETRY_PROGRAM_NV: + prog = (struct gl_program*)ST_CALLOC_STRUCT(st_geometry_program); + break; + case GL_TESS_CONTROL_PROGRAM_NV: + prog = (struct gl_program*)ST_CALLOC_STRUCT(st_tessctrl_program); + break; + case GL_TESS_EVALUATION_PROGRAM_NV: + prog = (struct gl_program*)ST_CALLOC_STRUCT(st_tesseval_program); + break; default: assert(0); return NULL; } + return _mesa_init_gl_program(prog, target, id); } From 4c4ba5a8c32c0a58b5874bdb4b42cb12e6b1c2f5 Mon Sep 17 00:00:00 2001 From: Roland Scheidegger Date: Fri, 9 Oct 2015 23:12:14 +0200 Subject: [PATCH 052/270] tgsi: (trivial) kill c99-ism. 
--- src/gallium/auxiliary/tgsi/tgsi_emulate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gallium/auxiliary/tgsi/tgsi_emulate.c b/src/gallium/auxiliary/tgsi/tgsi_emulate.c index 819087261b3..59d2e4c95b1 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_emulate.c +++ b/src/gallium/auxiliary/tgsi/tgsi_emulate.c @@ -102,7 +102,7 @@ passthrough_edgeflag(struct tgsi_transform_context *tctx) static void transform_instr(struct tgsi_transform_context *tctx, - struct tgsi_full_instruction *inst) + struct tgsi_full_instruction *inst) { struct tgsi_emulation_context *ctx = tgsi_emulation_context(tctx); @@ -116,7 +116,8 @@ transform_instr(struct tgsi_transform_context *tctx, /* Clamp color outputs. */ if (ctx->flags & TGSI_EMU_CLAMP_COLOR_OUTPUTS) { - for (int i = 0; i < inst->Instruction.NumDstRegs; i++) { + int i; + for (i = 0; i < inst->Instruction.NumDstRegs; i++) { unsigned semantic; if (inst->Dst[i].Register.File != TGSI_FILE_OUTPUT || From dcd59a9e322edeea74187bcad65a8e56c0bfaaa2 Mon Sep 17 00:00:00 2001 From: Chad Versace Date: Thu, 8 Oct 2015 11:53:08 -0700 Subject: [PATCH 053/270] i965/gen9: Disable MCS for 1x color surfaces Fast color clears are disabled for gen9 (see the checks in brw_meta_fast_clear), so there is no reason to allocate the MCS and track its clear/resolve state. Reviewed-by: Neil Roberts --- src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c index a169c41790e..b6e35205727 100644 --- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c +++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c @@ -201,6 +201,14 @@ intel_miptree_supports_non_msrt_fast_clear(struct brw_context *brw, if (brw->gen < 7) return false; + if (brw->gen >= 9) { + /* FINISHME: Enable singlesample fast MCS clears on SKL after all GPU + * FINISHME: hangs are resolved. 
+ */ + perf_debug("singlesample fast MCS clears disabled on gen9"); + return false; + } + if (mt->disable_aux_buffers) return false; From 8a0c85b25853decb4a110b6d36d79c4f095d437b Mon Sep 17 00:00:00 2001 From: Chad Versace Date: Thu, 8 Oct 2015 12:06:24 -0700 Subject: [PATCH 054/270] i965/gen9: Enable rep clears on gen9 The (gen < 9) check in brw_clear() was too broad. It disabled all types of fast color clears: a. singlesample rep clears b. singlesample MCS fast clears c. multisample MCS fast clears The MCS clears are still buggy, but the rep clear works well. So let's enable it. Reviewed-by: Neil Roberts --- src/mesa/drivers/dri/i965/brw_clear.c | 2 +- src/mesa/drivers/dri/i965/brw_meta_fast_clear.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c index 17a745d0373..b0119558c3a 100644 --- a/src/mesa/drivers/dri/i965/brw_clear.c +++ b/src/mesa/drivers/dri/i965/brw_clear.c @@ -241,7 +241,7 @@ brw_clear(struct gl_context *ctx, GLbitfield mask) } /* Clear color buffers with fast clear or at least rep16 writes. 
*/ - if (brw->gen >= 6 && brw->gen < 9 && (mask & BUFFER_BITS_COLOR)) { + if (brw->gen >= 6 && (mask & BUFFER_BITS_COLOR)) { if (brw_meta_fast_clear(brw, fb, mask, partial_clear)) { debug_mask("blorp color", mask & BUFFER_BITS_COLOR); mask &= ~BUFFER_BITS_COLOR; diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c index eb201736c6e..fbde3f04204 100644 --- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c +++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c @@ -451,6 +451,11 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb, if (irb->mt->fast_clear_state == INTEL_FAST_CLEAR_STATE_NO_MCS) clear_type = REP_CLEAR; + if (brw->gen >= 9 && clear_type == FAST_CLEAR) { + perf_debug("fast MCS clears are disabled on gen9"); + clear_type = REP_CLEAR; + } + /* We can't do scissored fast clears because of the restrictions on the * fast clear rectangle size. */ From 82b324c24bc28ae99a6110706c85460b71d26077 Mon Sep 17 00:00:00 2001 From: Chad Versace Date: Thu, 8 Oct 2015 12:21:19 -0700 Subject: [PATCH 055/270] i965/gen8: Remove gen<8 checks in gen8 code Some assertions in gen8_surface_state.c checked for gen < 8. Reviewed-by: Topi Pohjolainen Reviewed-by: Anuj Phogat --- src/mesa/drivers/dri/i965/gen8_surface_state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c index e1e7704655d..18b86652fd2 100644 --- a/src/mesa/drivers/dri/i965/gen8_surface_state.c +++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c @@ -221,8 +221,8 @@ gen8_emit_texture_surface_state(struct brw_context *brw, * "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E, HALIGN * 16 must be used." 
*/ - assert(brw->gen < 9 || mt->halign == 16); - assert(brw->gen < 8 || mt->num_samples > 1 || mt->halign == 16); + if (brw->gen >= 9 || mt->num_samples == 1) + assert(mt->halign == 16); } const uint32_t surf_type = translate_tex_target(target); @@ -470,8 +470,8 @@ gen8_update_renderbuffer_surface(struct brw_context *brw, * "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E, HALIGN * 16 must be used." */ - assert(brw->gen < 9 || mt->halign == 16); - assert(brw->gen < 8 || mt->num_samples > 1 || mt->halign == 16); + if (brw->gen >= 9 || mt->num_samples == 1) + assert(mt->halign == 16); } uint32_t *surf = allocate_surface_state(brw, &offset, surf_index); From 8337a31bcc2865dfa1c4b0e0cf16294e0f7e4bf5 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Sat, 10 Oct 2015 16:21:58 +0100 Subject: [PATCH 056/270] docs: add release notes for 11.0.3 Signed-off-by: Emil Velikov (cherry picked from commit 914966befcd57764941405707d8f57d3e7e7f768) --- docs/relnotes/11.0.3.html | 184 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 docs/relnotes/11.0.3.html diff --git a/docs/relnotes/11.0.3.html b/docs/relnotes/11.0.3.html new file mode 100644 index 00000000000..25fad25e1a8 --- /dev/null +++ b/docs/relnotes/11.0.3.html @@ -0,0 +1,184 @@ + + + + + Mesa Release Notes + + + + +
+

The Mesa 3D Graphics Library

+
+ + +
+ +

Mesa 11.0.3 Release Notes / October 10, 2015

+ +

+Mesa 11.0.3 is a bug fix release which fixes bugs found since the 11.0.2 release. +

+

+Mesa 11.0.3 implements the OpenGL 4.1 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.1. OpenGL +4.1 is only available if requested at context creation +because compatibility contexts are not supported. +

+ + +

SHA256 checksums

+
+TBD
+
+ + +

New features

+

None

+ +

Bug fixes

+ +

This list is likely incomplete.

+ +
    + +
  • Bug 55552 - Compile errors with --enable-mangling
  • + +
  • Bug 71789 - [r300g] Visuals not found in (default) depth = 24
  • + +
  • Bug 91044 - piglit spec/egl_khr_create_context/valid debug flag gles* fail
  • + +
  • Bug 91342 - Very dark textures on some objects in indoors environments in Postal 2
  • + +
  • Bug 91596 - EGL_KHR_gl_colorspace (v2) causes problem with Android-x86 GUI
  • + +
  • Bug 91718 - piglit.spec.arb_shader_image_load_store.invalid causes intermittent GPU HANG
  • + +
  • Bug 92072 - Wine breakage since d082c5324 (st/mesa: don't call st_validate_state in BlitFramebuffer)
  • + +
  • Bug 92265 - Black windows in weston after update mesa to 11.0.2-1
  • + +
+ + +

Changes

+ +

Brian Paul (1):

+
    +
  • st/mesa: try PIPE_BIND_RENDER_TARGET when choosing float texture formats
  • +
+ +

Daniel Scharrer (1):

+
    +
  • mesa: Add abs input modifier to base for POW in ffvertex_prog
  • +
+ +

Emil Velikov (3):

+
    +
  • docs: add sha256 checksums for 11.0.2
  • +
  • Revert "nouveau: make sure there's always room to emit a fence"
  • +
  • Update version to 11.0.3
  • +
+ +

Francisco Jerez (1):

+
    +
  • i965/fs: Fix hang on IVB and VLV with image format mismatch.
  • +
+ +

Ian Romanick (1):

+
    +
  • meta: Handle array textures in scaled MSAA blits
  • +
+ +

Ilia Mirkin (6):

+
    +
  • nouveau: be more careful about freeing temporary transfer buffers
  • +
  • nouveau: delay deleting buffer with unflushed fence
  • +
  • nouveau: wait to unref the transfer's bo until it's no longer used
  • +
  • nv30: pretend to have packed texture/surface formats
  • +
  • nv30: always go through translate module on big-endian
  • +
  • nouveau: make sure there's always room to emit a fence
  • +
+ +

Jason Ekstrand (1):

+
    +
  • mesa: Correctly handle GL_BGRA_EXT in ES3 format_and_type checks
  • +
+ +

Kyle Brenneman (3):

+
    +
  • glx: Fix build errors with --enable-mangling (v2)
  • +
  • mapi: Make _glapi_get_stub work with "gl" or "mgl" prefix.
  • +
  • glx: Don't hard-code the name "libGL.so.1" in driOpenDriver (v3)
  • +
+ +

Leo Liu (1):

+
    +
  • radeon/vce: fix vui time_scale zero error
  • +
+ +

Marek Olšák (21):

+
    +
  • st/mesa: fix front buffer regression after dropping st_validate_state in Blit
  • +
  • radeonsi: handle index buffer alloc failures
  • +
  • radeonsi: handle constant buffer alloc failures
  • +
  • gallium/radeon: handle buffer_map staging buffer failures better
  • +
  • gallium/radeon: handle buffer alloc failures in r600_draw_rectangle
  • +
  • gallium/radeon: add a fail path for depth MSAA texture readback
  • +
  • radeonsi: report alloc failure from si_shader_binary_read
  • +
  • radeonsi: add malloc fail paths to si_create_shader_state
  • +
  • radeonsi: skip drawing if the tess factor ring allocation fails
  • +
  • radeonsi: skip drawing if GS ring allocations fail
  • +
  • radeonsi: handle shader precompile failures
  • +
  • radeonsi: handle fixed-func TCS shader create failure
  • +
  • radeonsi: skip drawing if VS, TCS, TES, GS fail to compile or upload
  • +
  • radeonsi: skip drawing if PS fails to compile or upload
  • +
  • radeonsi: skip drawing if updating the scratch buffer fails
  • +
  • radeonsi: don't forget to update scratch relocations for LS, HS, ES shaders
  • +
  • radeonsi: handle dummy constant buffer allocation failure
  • +
  • gallium/u_blitter: handle allocation failures
  • +
  • radeonsi: add scratch buffer to the buffer list when it's re-allocated
  • +
  • st/dri: don't use _ctx in client_wait_sync
  • +
  • egl/dri2: don't require a context for ClientWaitSync (v2)
  • +
+ +

Matthew Waters (1):

+
    +
  • egl: rework handling EGL_CONTEXT_FLAGS
  • +
+ +

Michel Dänzer (1):

+
    +
  • st/dri: Use packed RGB formats
  • +
+ +

Roland Scheidegger (1):

+
    +
  • mesa: fix mipmap generation for immutable, compressed textures
  • +
+ +

Tom Stellard (3):

+
    +
  • gallium/radeon: Use call_once() when initializing LLVM targets
  • +
  • gallivm: Allow drivers and state trackers to initialize gallivm LLVM targets v2
  • +
  • radeon/llvm: Initialize gallivm targets when initializing the AMDGPU target v2
  • +
+ +

Varad Gautam (1):

+
    +
  • egl: restore surface type before linking config to its display
  • +
+ +

Ville Syrjälä (3):

+
    +
  • i830: Fix collision between I830_UPLOAD_RASTER_RULES and I830_UPLOAD_TEX(0)
  • +
  • i915: Fix texcoord vs. varying collision in fragment programs
  • +
  • i915: Remember to call intel_prepare_render() before blitting
  • +
+ + +
+ + From 55a8f072ead8fe948347c90e28deba00953b1dad Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Sat, 10 Oct 2015 17:02:43 +0100 Subject: [PATCH 057/270] docs: add sha256 checksums for 11.0.3 Signed-off-by: Emil Velikov (cherry picked from commit b4bfea0094d0037b1f66f3437e44e333f2f0c3f6) --- docs/relnotes/11.0.3.html | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/relnotes/11.0.3.html b/docs/relnotes/11.0.3.html index 25fad25e1a8..e839c2121e5 100644 --- a/docs/relnotes/11.0.3.html +++ b/docs/relnotes/11.0.3.html @@ -31,7 +31,8 @@ because compatibility contexts are not supported.

SHA256 checksums

-TBD
+c2210e3daecc10ed9fdcea500327652ed6effc2f47c4b9cee63fb08f560d7117  mesa-11.0.3.tar.gz
+ab2992eece21adc23c398720ef8c6933cb69ea42e1b2611dc09d031e17e033d6  mesa-11.0.3.tar.xz
 
From 2496cfd771cff250bce5c53ca9d79dbf64d7cbcf Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Sat, 10 Oct 2015 17:09:00 +0100 Subject: [PATCH 058/270] docs: add news item and link release notes for 11.0.3 Signed-off-by: Emil Velikov --- docs/index.html | 6 ++++++ docs/relnotes.html | 1 + 2 files changed, 7 insertions(+) diff --git a/docs/index.html b/docs/index.html index 9aa2821dcfe..138447fc500 100644 --- a/docs/index.html +++ b/docs/index.html @@ -16,6 +16,12 @@

News

+

October 10, 2015

+

+Mesa 11.0.3 is released. +This is a bug-fix release. +

+

October 3, 2015

Mesa 10.6.9 is released. diff --git a/docs/relnotes.html b/docs/relnotes.html index 1c47853d81e..074c3b6a612 100644 --- a/docs/relnotes.html +++ b/docs/relnotes.html @@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.

    +
  • 11.0.3 release notes
  • 10.6.9 release notes
  • 11.0.2 release notes
  • 11.0.1 release notes From a4e988f4814d80b27102c48020c4338a6d86c6da Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Mon, 5 Oct 2015 16:21:10 -0700 Subject: [PATCH 059/270] i965/cfg: Fix cfg_t::dump() when a block has no immediate dominator. Switch statements introduce a bogus loop with an unconditional break at the end of the loop, just before the while...so the while is unreachable and has no immediate dominator. v2: With less exuberance Signed-off-by: Kenneth Graunke Reviewed-by: Matt Turner --- src/mesa/drivers/dri/i965/brw_cfg.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp index 531fa16b387..10bcd4bafd4 100644 --- a/src/mesa/drivers/dri/i965/brw_cfg.cpp +++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp @@ -426,7 +426,11 @@ cfg_t::dump(backend_shader *s) calculate_idom(); foreach_block (block, this) { - fprintf(stderr, "START B%d IDOM(B%d)", block->num, block->idom->num); + if (block->idom) + fprintf(stderr, "START B%d IDOM(B%d)", block->num, block->idom->num); + else + fprintf(stderr, "START B%d IDOM(none)", block->num); + foreach_list_typed(bblock_link, link, link, &block->parents) { fprintf(stderr, " <-B%d", link->block->num); From 031d3501322aee0a1474c7f2a9b79f9fa9947430 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Tue, 25 Aug 2015 16:59:12 -0700 Subject: [PATCH 060/270] i965/vs: Unify URB entry size/read length calculations between backends. Both the vec4 and scalar VS backends had virtually identical URB entry size and read length calculations. We can move those up a level to backend-agnostic code and reuse it for both. Unfortunately, the backends need to know nr_attributes to compute first_non_payload_grf, so I had to store that in prog_data. We could use urb_read_length, but that's nr_attributes rounded up to a multiple of two, so doing so would waste a register in some cases. 
There's more code to be removed in the vec4 backend, but that will come in a follow-on patch. Signed-off-by: Kenneth Graunke Reviewed-by: Matt Turner --- src/mesa/drivers/dri/i965/brw_context.h | 2 ++ src/mesa/drivers/dri/i965/brw_fs.cpp | 15 +++--------- src/mesa/drivers/dri/i965/brw_vec4.cpp | 19 +-------------- src/mesa/drivers/dri/i965/brw_vs.c | 32 +++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 30 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index aa1284db3ce..9ad6b4d90cf 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -699,6 +699,8 @@ struct brw_vs_prog_data { GLbitfield64 inputs_read; + unsigned nr_attributes; + bool uses_vertexid; bool uses_instanceid; }; diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index ba62fdd4b86..65f2e68e621 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1508,21 +1508,12 @@ void fs_visitor::assign_vs_urb_setup() { brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; - int grf, count, slot, channel, attr; + int grf, slot, channel, attr; assert(stage == MESA_SHADER_VERTEX); - count = _mesa_bitcount_64(vs_prog_data->inputs_read); - if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) - count++; /* Each attribute is 4 regs. 
*/ - this->first_non_payload_grf += count * 4; - - unsigned vue_entries = - MAX2(count, vs_prog_data->base.vue_map.num_slots); - - vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4; - vs_prog_data->base.urb_read_length = (count + 1) / 2; + this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes; assert(vs_prog_data->base.urb_read_length <= 15); @@ -1532,7 +1523,7 @@ fs_visitor::assign_vs_urb_setup() if (inst->src[i].file == ATTR) { if (inst->src[i].reg == VERT_ATTRIB_MAX) { - slot = count - 1; + slot = vs_prog_data->nr_attributes - 1; } else { /* Attributes come in in a contiguous block, ordered by their * gl_vert_attrib value. That means we can compute the slot diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index e966b96a5ca..08f3e9188ed 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1632,28 +1632,11 @@ vec4_vs_visitor::setup_attributes(int payload_reg) */ if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) { attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes; - nr_attributes++; } lower_attributes_to_hw_regs(attribute_map, false /* interleaved */); - /* The BSpec says we always have to read at least one thing from - * the VF, and it appears that the hardware wedges otherwise. 
- */ - if (nr_attributes == 0) - nr_attributes = 1; - - prog_data->urb_read_length = (nr_attributes + 1) / 2; - - unsigned vue_entries = - MAX2(nr_attributes, prog_data->vue_map.num_slots); - - if (devinfo->gen == 6) - prog_data->urb_entry_size = ALIGN(vue_entries, 8) / 8; - else - prog_data->urb_entry_size = ALIGN(vue_entries, 4) / 4; - - return payload_reg + nr_attributes; + return payload_reg + vs_prog_data->nr_attributes; } int diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 38de98fab86..17d3bc49580 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -159,6 +159,38 @@ brw_codegen_vs_prog(struct brw_context *brw, &prog_data.base.vue_map, outputs_written, prog ? prog->SeparateShader : false); + unsigned nr_attributes = _mesa_bitcount_64(prog_data.inputs_read); + + /* gl_VertexID and gl_InstanceID are system values, but arrive via an + * incoming vertex attribute. So, add an extra slot. + */ + if (vp->program.Base.SystemValuesRead & + (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | + BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) { + nr_attributes++; + } + + /* The BSpec says we always have to read at least one thing from the VF, + * and it appears that the hardware wedges otherwise. + */ + if (nr_attributes == 0 && !brw->intelScreen->compiler->scalar_vs) + nr_attributes = 1; + + prog_data.nr_attributes = nr_attributes; + prog_data.base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2); + + /* Since vertex shaders reuse the same VUE entry for inputs and outputs + * (overwriting the original contents), we need to make sure the size is + * the larger of the two. 
+ */ + const unsigned vue_entries = + MAX2(nr_attributes, prog_data.base.vue_map.num_slots); + + if (brw->gen == 6) + prog_data.base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8); + else + prog_data.base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); + if (0) { _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG, true); From 6842ad79125371e7e61baac8e6b8a77583f79065 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Tue, 25 Aug 2015 17:09:40 -0700 Subject: [PATCH 061/270] i965/vs: Fix a subtlety in the nr_attributes == 0 workaround. nr_attributes is used to compute first_non_payload_grf, which is the first register we're allowed to use for ordinary register allocation. The hardware requires us to read at least one pair of values, but we're completely free to overwrite that garbage register with whatever we like. Instead of altering nr_attributes, we should alter urb_read_length, which only affects the amount we ask the VF to read. This should save us a register in trivial cases (which admittedly isn't very useful). While we're at it, improve the explanation in the comments. v2: Actually do what I said (caught by Ilia). Signed-off-by: Kenneth Graunke Reviewed-by: Iago Toral Quiroga --- src/mesa/drivers/dri/i965/brw_vs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 17d3bc49580..0dc2bdccae8 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -170,14 +170,16 @@ brw_codegen_vs_prog(struct brw_context *brw, nr_attributes++; } - /* The BSpec says we always have to read at least one thing from the VF, - * and it appears that the hardware wedges otherwise. + /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry + * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in + * vec4 mode, the hardware appears to wedge unless we read something. 
*/ - if (nr_attributes == 0 && !brw->intelScreen->compiler->scalar_vs) - nr_attributes = 1; + if (brw->intelScreen->compiler->scalar_vs) + prog_data.base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2); + else + prog_data.base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2); prog_data.nr_attributes = nr_attributes; - prog_data.base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2); /* Since vertex shaders reuse the same VUE entry for inputs and outputs * (overwriting the original contents), we need to make sure the size is From 2953c3d76178d7589947e6ea1dbd902b7b02b3d4 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Fri, 14 Aug 2015 15:15:11 -0700 Subject: [PATCH 062/270] i965/vs: Map scalar VS input locations properly; avoid tons of MOVs. Previously, we used nir_lower_io with the scalar type_size function, which mapped VERT_ATTRIB_* locations to...some numbers. Then, in fs_visitor::nir_setup_inputs(), we created temporaries indexed by those numbers, and emitted MOVs from the actual ATTR registers to those temporaries. Virtually all of these were copy propagated away, but it's still ugly. This patch reworks our input lowering to produce NIR lower_input intrinsics that properly index into the ATTR file, so we can access it directly. No changes in shader-db. v2: Fix unreachable() message (Ken), update commit message (Matt). 
Signed-off-by: Kenneth Graunke Reviewed-by: Matt Turner --- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 71 +++++++----------------- src/mesa/drivers/dri/i965/brw_nir.c | 23 +++++++- 2 files changed, 42 insertions(+), 52 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index bc0df6850c4..51189a2d263 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -56,61 +56,25 @@ fs_visitor::emit_nir_code() void fs_visitor::nir_setup_inputs() { + if (stage != MESA_SHADER_FRAGMENT) + return; + nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs); nir_foreach_variable(var, &nir->inputs) { - enum brw_reg_type type = brw_type_for_base_type(var->type); fs_reg input = offset(nir_inputs, bld, var->data.driver_location); fs_reg reg; - switch (stage) { - case MESA_SHADER_VERTEX: { - /* Our ATTR file is indexed by VERT_ATTRIB_*, which is the value - * stored in nir_variable::location. - * - * However, NIR's load_input intrinsics use a different index - an - * offset into a single contiguous array containing all inputs. - * This index corresponds to the nir_variable::driver_location field. - * - * So, we need to copy from fs_reg(ATTR, var->location) to - * offset(nir_inputs, var->data.driver_location). - */ - const glsl_type *const t = var->type->without_array(); - const unsigned components = t->components(); - const unsigned cols = t->matrix_columns; - const unsigned elts = t->vector_elements; - unsigned array_length = var->type->is_array() ? 
var->type->length : 1; - for (unsigned i = 0; i < array_length; i++) { - for (unsigned j = 0; j < cols; j++) { - for (unsigned k = 0; k < elts; k++) { - bld.MOV(offset(retype(input, type), bld, - components * i + elts * j + k), - offset(fs_reg(ATTR, var->data.location + i, type), - bld, 4 * j + k)); - } - } - } - break; - } - case MESA_SHADER_GEOMETRY: - case MESA_SHADER_COMPUTE: - case MESA_SHADER_TESS_CTRL: - case MESA_SHADER_TESS_EVAL: - unreachable("fs_visitor not used for these stages yet."); - break; - case MESA_SHADER_FRAGMENT: - if (var->data.location == VARYING_SLOT_POS) { - reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer, - var->data.origin_upper_left); - emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(), - input, reg), 0xF); - } else { - emit_general_interpolation(input, var->name, var->type, - (glsl_interp_qualifier) var->data.interpolation, - var->data.location, var->data.centroid, - var->data.sample); - } - break; + if (var->data.location == VARYING_SLOT_POS) { + reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer, + var->data.origin_upper_left); + emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(), + input, reg), 0xF); + } else { + emit_general_interpolation(input, var->name, var->type, + (glsl_interp_qualifier) var->data.interpolation, + var->data.location, var->data.centroid, + var->data.sample); } } } @@ -1575,8 +1539,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_load_input: { unsigned index = 0; for (unsigned j = 0; j < instr->num_components; j++) { - fs_reg src = offset(retype(nir_inputs, dest.type), bld, - instr->const_index[0] + index); + fs_reg src; + if (stage == MESA_SHADER_VERTEX) { + src = offset(fs_reg(ATTR, instr->const_index[0], dest.type), bld, index); + } else { + src = offset(retype(nir_inputs, dest.type), bld, + instr->const_index[0] + index); + } if (has_indirect) src.reladdr = new(mem_ctx) 
fs_reg(get_nir_src(instr->src[0])); index++; diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 80f36dc2399..15c1b1984a1 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -31,15 +31,36 @@ static void brw_nir_lower_inputs(nir_shader *nir, bool is_scalar) { switch (nir->stage) { + case MESA_SHADER_VERTEX: + /* For now, leave the vec4 backend doing the old method. */ + if (!is_scalar) { + nir_assign_var_locations(&nir->inputs, &nir->num_inputs, + type_size_vec4); + break; + } + + /* Start with the location of the variable's base. */ + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + var->data.driver_location = var->data.location; + } + + /* Now use nir_lower_io to walk dereference chains. Attribute arrays + * are loaded as one vec4 per element (or matrix column), so we use + * type_size_vec4 here. + */ + nir_lower_io(nir, nir_var_shader_in, type_size_vec4); + break; case MESA_SHADER_GEOMETRY: foreach_list_typed(nir_variable, var, node, &nir->inputs) { var->data.driver_location = var->data.location; } break; - default: + case MESA_SHADER_FRAGMENT: nir_assign_var_locations(&nir->inputs, &nir->num_inputs, is_scalar ? type_size_scalar : type_size_vec4); break; + default: + unreachable("unsupported shader stage"); } } From a23bdd1fae196e91ebfbb4b0c7730652c63a91ea Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Thu, 12 Mar 2015 05:26:40 -0700 Subject: [PATCH 063/270] i965/gs: Make MAX_GS_INPUT_VERTICES a #define in brw_context.h. For scalar VS, I'll need this in brw_fs.cpp as well. It seems silly to redeclare it in three places. 
Signed-off-by: Kenneth Graunke Reviewed-by: Matt Turner --- src/mesa/drivers/dri/i965/brw_context.h | 2 ++ src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 2 -- src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 9ad6b4d90cf..e59478a448a 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -955,6 +955,8 @@ struct intel_batchbuffer { } saved; }; +#define MAX_GS_INPUT_VERTICES 6 + #define BRW_MAX_XFB_STREAMS 4 struct brw_transform_feedback_object { diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index 4ce471e0669..775f64d96bc 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -30,8 +30,6 @@ #include "brw_vec4_gs_visitor.h" #include "gen6_gs_visitor.h" -const unsigned MAX_GS_INPUT_VERTICES = 6; - namespace brw { vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler, diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp index 59a76559103..671a535a5bd 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp @@ -31,8 +31,6 @@ #include "gen6_gs_visitor.h" -const unsigned MAX_GS_INPUT_VERTICES = 6; - namespace brw { void From 06abd1a25e6388858b7f3a9f3ae245dc39b5ed15 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Sat, 10 Oct 2015 21:59:27 +0200 Subject: [PATCH 064/270] nvc0: make use of NVC0_COMPUTE_CLASS for GF110 In theory, GF110+ should also support NVC8_COMPUTE_CLASS but, in practice, a ILLEGAL_CLASS dmesg fail appears when using it. This fixes compute support and MP performance counters on GF110. 
Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_compute.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index 47bd123621b..96d753c79f3 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -37,12 +37,9 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen, switch (dev->chipset & ~0xf) { case 0xc0: - if (dev->chipset == 0xc8) - obj_class = NVC8_COMPUTE_CLASS; - else - obj_class = NVC0_COMPUTE_CLASS; - break; case 0xd0: + /* In theory, GF110+ should also support NVC8_COMPUTE_CLASS but, + * in practice, a ILLEGAL_CLASS dmesg fail appears when using it. */ obj_class = NVC0_COMPUTE_CLASS; break; default: From 8053c9208f30964d89dc4e262fdf2148f0664696 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sat, 10 Oct 2015 01:56:09 -0400 Subject: [PATCH 065/270] nouveau: avoid emitting new fences unnecessarily Right now we emit on every kick, but this is only necessary if something will ever be able to observe that the fence completed. If there are no refs, leave the fence alone and emit it another day. This also happens to work around an issue for the kick handler -- a kick can be a result of e.g. nouveau_bo_wait or explicit kick, or it can be due to lack of space in the pushbuf. We want the emit to happen in the current batch, so we want there to always be enough space. However an explicit kick could take the reserved space for the implicitly-triggered kick's fence emission if it happened right after. With the new mechanism, hopefully there's no way to cause two fences to be emitted into the same reserved space. 
Signed-off-by: Ilia Mirkin Reviewed-by: Samuel Pitoiset Fixes: 47d11990b (nouveau: make sure there's always room to emit a fence) Cc: mesa-stable@lists.freedesktop.org --- src/gallium/drivers/nouveau/nouveau_fence.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c index ee4e08dd520..18b15920185 100644 --- a/src/gallium/drivers/nouveau/nouveau_fence.c +++ b/src/gallium/drivers/nouveau/nouveau_fence.c @@ -190,8 +190,10 @@ nouveau_fence_wait(struct nouveau_fence *fence) /* wtf, someone is waiting on a fence in flush_notify handler? */ assert(fence->state != NOUVEAU_FENCE_STATE_EMITTING); - if (fence->state < NOUVEAU_FENCE_STATE_EMITTED) + if (fence->state < NOUVEAU_FENCE_STATE_EMITTED) { + PUSH_SPACE(screen->pushbuf, 8); nouveau_fence_emit(fence); + } if (fence->state < NOUVEAU_FENCE_STATE_FLUSHED) if (nouveau_pushbuf_kick(screen->pushbuf, screen->pushbuf->channel)) @@ -224,8 +226,12 @@ nouveau_fence_wait(struct nouveau_fence *fence) void nouveau_fence_next(struct nouveau_screen *screen) { - if (screen->fence.current->state < NOUVEAU_FENCE_STATE_EMITTING) - nouveau_fence_emit(screen->fence.current); + if (screen->fence.current->state < NOUVEAU_FENCE_STATE_EMITTING) { + if (screen->fence.current->ref > 1) + nouveau_fence_emit(screen->fence.current); + else + return; + } nouveau_fence_ref(NULL, &screen->fence.current); From 9fe458335ffd35366ef0f4b741aad0cdb3503783 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sat, 10 Oct 2015 04:29:39 -0400 Subject: [PATCH 066/270] nv50,nvc0: don't base decisions on available pushbuf space We still have to push everything out, might as well kick earlier and flip pushbufs when we know we'll need it. This resolves some issues with the new policy of making sure that we always leave a bit of room at the end for fences. 
Signed-off-by: Ilia Mirkin Reviewed-by: Samuel Pitoiset Fixes: 47d11990b (nouveau: make sure there's always room to emit a fence) Cc: mesa-stable@lists.freedesktop.org --- .../drivers/nouveau/nv50/nv50_shader_state.c | 9 ++------- .../drivers/nouveau/nv50/nv50_transfer.c | 16 +++------------ .../drivers/nouveau/nvc0/nvc0_transfer.c | 20 +++++-------------- 3 files changed, 10 insertions(+), 35 deletions(-) diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c index fdde11f4cd5..941555ffbf8 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c @@ -65,14 +65,9 @@ nv50_constbufs_validate(struct nv50_context *nv50) PUSH_DATA (push, (b << 12) | (i << 8) | p | 1); } while (words) { - unsigned nr; - - if (!PUSH_SPACE(push, 16)) - break; - nr = PUSH_AVAIL(push); - assert(nr >= 16); - nr = MIN2(MIN2(nr - 3, words), NV04_PFIFO_MAX_PACKET_LEN); + unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN); + PUSH_SPACE(push, nr + 3); BEGIN_NV04(push, NV50_3D(CB_ADDR), 1); PUSH_DATA (push, (start << 8) | b); BEGIN_NI04(push, NV50_3D(CB_DATA(0)), nr); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_transfer.c b/src/gallium/drivers/nouveau/nv50/nv50_transfer.c index be514077d32..9a3fd1e705f 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_transfer.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_transfer.c @@ -187,14 +187,7 @@ nv50_sifc_linear_u8(struct nouveau_context *nv, PUSH_DATA (push, 0); while (count) { - unsigned nr; - - if (!PUSH_SPACE(push, 16)) - break; - nr = PUSH_AVAIL(push); - assert(nr >= 16); - nr = MIN2(count, nr - 1); - nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN); + unsigned nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN); BEGIN_NI04(push, NV50_2D(SIFC_DATA), nr); PUSH_DATAp(push, src, nr); @@ -395,12 +388,9 @@ nv50_cb_push(struct nouveau_context *nv, nouveau_pushbuf_validate(push); while (words) { - unsigned nr; - - nr = 
PUSH_AVAIL(push); - nr = MIN2(nr - 7, words); - nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN - 1); + unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN); + PUSH_SPACE(push, nr + 7); BEGIN_NV04(push, NV50_3D(CB_DEF_ADDRESS_HIGH), 3); PUSH_DATAh(push, bo->offset + base); PUSH_DATA (push, bo->offset + base); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c index aaec60a5ac2..d459dd61c19 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c @@ -188,14 +188,10 @@ nvc0_m2mf_push_linear(struct nouveau_context *nv, nouveau_pushbuf_validate(push); while (count) { - unsigned nr; + unsigned nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN); - if (!PUSH_SPACE(push, 16)) + if (!PUSH_SPACE(push, nr + 9)) break; - nr = PUSH_AVAIL(push); - assert(nr >= 16); - nr = MIN2(count, nr - 9); - nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN); BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2); PUSH_DATAh(push, dst->offset + offset); @@ -234,14 +230,10 @@ nve4_p2mf_push_linear(struct nouveau_context *nv, nouveau_pushbuf_validate(push); while (count) { - unsigned nr; + unsigned nr = MIN2(count, (NV04_PFIFO_MAX_PACKET_LEN - 1)); - if (!PUSH_SPACE(push, 16)) + if (!PUSH_SPACE(push, nr + 10)) break; - nr = PUSH_AVAIL(push); - assert(nr >= 16); - nr = MIN2(count, nr - 8); - nr = MIN2(nr, (NV04_PFIFO_MAX_PACKET_LEN - 1)); BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_DST_ADDRESS_HIGH), 2); PUSH_DATAh(push, dst->offset + offset); @@ -571,9 +563,7 @@ nvc0_cb_bo_push(struct nouveau_context *nv, PUSH_DATA (push, bo->offset + base); while (words) { - unsigned nr = PUSH_AVAIL(push); - nr = MIN2(nr, words); - nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN - 1); + unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN - 1); PUSH_SPACE(push, nr + 2); PUSH_REFN (push, bo, NOUVEAU_BO_WR | domain); From 4642d53a03122e6d3214ed12cb327898917eb84e Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Fri, 9 Oct 2015 12:27:04 -0700 
Subject: [PATCH 067/270] i965/vec4: Implement b2f and b2i using negation. Curro added this in commit 3ee2daf23d (before the vec4/NIR backend was added) but it was missed in the new NIR backend. Add it there as well. instructions in affected programs: 1857 -> 1810 (-2.53%) helped: 15 Reviewed-by: Francisco Jerez --- src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 41bd80df377..fdf767ded64 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -1237,14 +1237,8 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) break; case nir_op_b2i: - emit(AND(dst, op[0], src_reg(1))); - break; - case nir_op_b2f: - op[0].type = BRW_REGISTER_TYPE_D; - dst.type = BRW_REGISTER_TYPE_D; - emit(AND(dst, op[0], src_reg(0x3f800000u))); - dst.type = BRW_REGISTER_TYPE_F; + emit(MOV(dst, negate(op[0]))); break; case nir_op_f2b: From c8083b1adc79073c0d6fc3bb87d6a18e41c779c4 Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Thu, 8 Oct 2015 16:51:50 +0800 Subject: [PATCH 068/270] ilo: improve Gen8 defines based on its PRMs --- src/gallium/drivers/ilo/core/ilo_state_cc.c | 12 +- .../drivers/ilo/core/ilo_state_raster.c | 14 +- src/gallium/drivers/ilo/core/ilo_state_sbe.c | 12 +- .../drivers/ilo/core/ilo_state_surface.c | 4 - src/gallium/drivers/ilo/core/ilo_state_vf.c | 8 +- .../drivers/ilo/genhw/gen_eu_message.xml.h | 3 + src/gallium/drivers/ilo/genhw/gen_mi.xml.h | 96 +++++- src/gallium/drivers/ilo/genhw/gen_regs.xml.h | 17 +- .../drivers/ilo/genhw/gen_render.xml.h | 16 + .../drivers/ilo/genhw/gen_render_3d.xml.h | 278 ++++++++++++++---- .../ilo/genhw/gen_render_dynamic.xml.h | 18 +- .../drivers/ilo/genhw/gen_render_media.xml.h | 6 +- .../ilo/genhw/gen_render_surface.xml.h | 36 ++- 13 files changed, 399 insertions(+), 121 deletions(-) diff --git 
a/src/gallium/drivers/ilo/core/ilo_state_cc.c b/src/gallium/drivers/ilo/core/ilo_state_cc.c index 83ee8de979c..1f2456e19ea 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_cc.c +++ b/src/gallium/drivers/ilo/core/ilo_state_cc.c @@ -694,10 +694,10 @@ cc_set_gen8_3DSTATE_PS_BLEND(struct ilo_state_cc *cc, cc_get_gen6_effective_rt(dev, info, 0, &rt0); /* 0x0 is reserved for blend factors and we have to set them all */ - dw1 |= rt0.a_src << GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__SHIFT | - rt0.a_dst << GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__SHIFT | - rt0.rgb_src << GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__SHIFT | - rt0.rgb_dst << GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__SHIFT; + dw1 |= rt0.a_src << GEN8_PS_BLEND_DW1_RT0_SRC_ALPHA_FACTOR__SHIFT | + rt0.a_dst << GEN8_PS_BLEND_DW1_RT0_DST_ALPHA_FACTOR__SHIFT | + rt0.rgb_src << GEN8_PS_BLEND_DW1_RT0_SRC_COLOR_FACTOR__SHIFT | + rt0.rgb_dst << GEN8_PS_BLEND_DW1_RT0_DST_COLOR_FACTOR__SHIFT; for (i = 0; i < blend->rt_count; i++) { if (blend->rt[i].argb_write_disables != 0xf) { @@ -707,10 +707,10 @@ cc_set_gen8_3DSTATE_PS_BLEND(struct ilo_state_cc *cc, } if (rt0.blend_enable) { - dw1 |= GEN8_PS_BLEND_DW1_BLEND_ENABLE; + dw1 |= GEN8_PS_BLEND_DW1_RT0_BLEND_ENABLE; if (rt0.a_src != rt0.rgb_src || rt0.a_dst != rt0.rgb_dst) - dw1 |= GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE; + dw1 |= GEN8_PS_BLEND_DW1_RT0_INDEPENDENT_ALPHA_ENABLE; } } diff --git a/src/gallium/drivers/ilo/core/ilo_state_raster.c b/src/gallium/drivers/ilo/core/ilo_state_raster.c index ed64a1f0d3c..a694f71bbbf 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_raster.c +++ b/src/gallium/drivers/ilo/core/ilo_state_raster.c @@ -512,7 +512,7 @@ raster_set_gen8_3DSTATE_RASTER(struct ilo_state_raster *rs, /* where should line_msaa_enable be set? 
*/ if (setup->msaa_enable) - dw1 |= GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE; + dw1 |= GEN8_RASTER_DW1_DX_MULTISAMPLE_ENABLE; if (tri->depth_offset_solid) dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID; @@ -574,10 +574,6 @@ get_gen6_sample_count(const struct ilo_dev *dev, uint8_t sample_count) c = GEN7_NUMSAMPLES_8; min_gen = ILO_GEN(7); break; - case 16: - c = GEN8_NUMSAMPLES_16; - min_gen = ILO_GEN(8); - break; default: assert(!"unexpected sample count"); c = GEN6_NUMSAMPLES_1; @@ -792,17 +788,17 @@ raster_set_gen8_3DSTATE_WM(struct ilo_state_raster *rs, if (ilo_dev_gen(dev) < ILO_GEN(8)) { switch (scan->earlyz_op) { case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR: - dw1 |= GEN7_WM_DW1_DEPTH_CLEAR; + dw1 |= GEN7_WM_DW1_LEGACY_DEPTH_CLEAR; break; case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE: - dw1 |= GEN7_WM_DW1_DEPTH_RESOLVE; + dw1 |= GEN7_WM_DW1_LEGACY_DEPTH_RESOLVE; break; case ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE: - dw1 |= GEN7_WM_DW1_HIZ_RESOLVE; + dw1 |= GEN7_WM_DW1_LEGACY_HIZ_RESOLVE; break; default: if (scan->earlyz_stencil_clear) - dw1 |= GEN7_WM_DW1_DEPTH_CLEAR; + dw1 |= GEN7_WM_DW1_LEGACY_DEPTH_CLEAR; break; } } diff --git a/src/gallium/drivers/ilo/core/ilo_state_sbe.c b/src/gallium/drivers/ilo/core/ilo_state_sbe.c index 5d1d400acdd..1b4ca0683c9 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_sbe.c +++ b/src/gallium/drivers/ilo/core/ilo_state_sbe.c @@ -239,8 +239,8 @@ sbe_set_gen8_3DSTATE_SBE(struct ilo_state_sbe *sbe, vue_read_len << GEN7_SBE_DW1_URB_READ_LEN__SHIFT; if (ilo_dev_gen(dev) >= ILO_GEN(8)) { - dw1 |= GEN8_SBE_DW1_USE_URB_READ_LEN | - GEN8_SBE_DW1_USE_URB_READ_OFFSET | + dw1 |= GEN8_SBE_DW1_FORCE_URB_READ_LEN | + GEN8_SBE_DW1_FORCE_URB_READ_OFFSET | vue_read_offset << GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT; } else { dw1 |= vue_read_offset << GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT; @@ -286,10 +286,10 @@ sbe_set_gen8_3DSTATE_SBE_SWIZ(struct ilo_state_sbe *sbe, swizzle->attr << GEN8_SBE_SWIZ_SRC_ATTR__SHIFT; if (swizzle->force_zeros) { - swiz[i] |= 
GEN8_SBE_SWIZ_OVERRIDE_W | - GEN8_SBE_SWIZ_OVERRIDE_Z | - GEN8_SBE_SWIZ_OVERRIDE_Y | - GEN8_SBE_SWIZ_OVERRIDE_X | + swiz[i] |= GEN8_SBE_SWIZ_CONST_OVERRIDE_W | + GEN8_SBE_SWIZ_CONST_OVERRIDE_Z | + GEN8_SBE_SWIZ_CONST_OVERRIDE_Y | + GEN8_SBE_SWIZ_CONST_OVERRIDE_X | GEN8_SBE_SWIZ_CONST_0000; } } diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c index 40fe15f316f..27c37535fc8 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_surface.c +++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c @@ -814,10 +814,6 @@ surface_get_gen6_image_sample_count(const struct ilo_dev *dev, *sample_count = GEN7_NUMSAMPLES_8; min_gen = ILO_GEN(7); break; - case 16: - *sample_count = GEN8_NUMSAMPLES_16; - min_gen = ILO_GEN(8); - break; default: assert(!"invalid sample count"); *sample_count = GEN6_NUMSAMPLES_1; diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.c b/src/gallium/drivers/ilo/core/ilo_state_vf.c index 9faf835fef2..8f091e21a27 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_vf.c +++ b/src/gallium/drivers/ilo/core/ilo_state_vf.c @@ -369,14 +369,14 @@ vf_params_set_gen8_3DSTATE_VF_SGVS(struct ilo_state_vf *vf, if (params->prepend_instanceid) { dw1 |= GEN8_SGVS_DW1_IID_ENABLE | - 1 << GEN8_SGVS_DW1_IID_VE_COMP__SHIFT | - attr << GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT; + 1 << GEN8_SGVS_DW1_IID_COMP__SHIFT | + attr << GEN8_SGVS_DW1_IID_OFFSET__SHIFT; } if (params->prepend_vertexid) { dw1 |= GEN8_SGVS_DW1_VID_ENABLE | - 0 << GEN8_SGVS_DW1_VID_VE_COMP__SHIFT | - attr << GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT; + 0 << GEN8_SGVS_DW1_VID_COMP__SHIFT | + attr << GEN8_SGVS_DW1_VID_OFFSET__SHIFT; } STATIC_ASSERT(ARRAY_SIZE(vf->sgvs) >= 1); diff --git a/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h b/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h index fe8b26908c0..96cf543d27e 100644 --- a/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_eu_message.xml.h @@ -41,7 +41,9 @@ enum 
gen_eu_urb_op { GEN7_MSG_URB_READ_OWORD = 0x3, GEN7_MSG_URB_ATOMIC_MOV = 0x4, GEN7_MSG_URB_ATOMIC_INC = 0x5, + GEN75_MSG_URB_ATOMIC_ADD = 0x6, GEN8_MSG_URB_SIMD8_WRITE = 0x7, + GEN8_MSG_URB_SIMD8_READ = 0x8, }; enum gen_eu_pi_simd { @@ -137,6 +139,7 @@ enum gen_eu_dp_op { GEN75_MSG_DP_RC_MEMORY_FENCE = 0x7, GEN75_MSG_DP_RC_MEDIA_BLOCK_WRITE = 0xa, GEN75_MSG_DP_RC_RT_WRITE = 0xc, + GEN8_MSG_DP_RC_RT_READ = 0xd, GEN75_MSG_DP_CC_OWORD_BLOCK_READ = 0x0, GEN75_MSG_DP_CC_UNALIGNED_OWORD_BLOCK_READ = 0x1, GEN75_MSG_DP_CC_OWORD_DUAL_BLOCK_READ = 0x2, diff --git a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h index 5a0bb4f8d77..36f9618eb2d 100644 --- a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h @@ -84,6 +84,8 @@ enum gen_mi_alu_operand { #define GEN7_MI_OPCODE_MI_PREDICATE (0xc << 23) #define GEN7_MI_OPCODE_MI_URB_CLEAR (0x19 << 23) #define GEN75_MI_OPCODE_MI_MATH (0x1a << 23) +#define GEN8_MI_OPCODE_MI_SEMAPHORE_SIGNAL (0x1b << 23) +#define GEN8_MI_OPCODE_MI_SEMAPHORE_WAIT (0x1c << 23) #define GEN6_MI_OPCODE_MI_STORE_DATA_IMM (0x20 << 23) #define GEN6_MI_OPCODE_MI_LOAD_REGISTER_IMM (0x22 << 23) #define GEN6_MI_OPCODE_MI_STORE_REGISTER_MEM (0x24 << 23) @@ -91,8 +93,11 @@ enum gen_mi_alu_operand { #define GEN6_MI_OPCODE_MI_REPORT_PERF_COUNT (0x28 << 23) #define GEN7_MI_OPCODE_MI_LOAD_REGISTER_MEM (0x29 << 23) #define GEN75_MI_OPCODE_MI_LOAD_REGISTER_REG (0x2a << 23) +#define GEN75_MI_OPCODE_MI_RS_STORE_DATA_IMM (0x2b << 23) #define GEN75_MI_OPCODE_MI_LOAD_URB_MEM (0x2c << 23) #define GEN75_MI_OPCODE_MI_STORE_URB_MEM (0x2d << 23) +#define GEN8_MI_OPCODE_MI_COPY_MEM_MEM (0x2e << 23) +#define GEN8_MI_OPCODE_MI_ATOMIC (0x2f << 23) #define GEN6_MI_OPCODE_MI_BATCH_BUFFER_START (0x31 << 23) #define GEN6_MI_LENGTH__MASK 0x0000003f #define GEN6_MI_LENGTH__SHIFT 0 @@ -155,8 +160,41 @@ enum gen_mi_alu_operand { #define GEN75_MI_MATH_DW_SRC2__MASK 0x000007ff #define GEN75_MI_MATH_DW_SRC2__SHIFT 0 
+#define GEN8_MI_SEMAPHORE_SIGNAL__SIZE 2 +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_POST_SYNC_OP (0x1 << 21) +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE__MASK 0x00038000 +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE__SHIFT 15 +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_RCS (0x0 << 15) +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_VCS0 (0x1 << 15) +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_BCS (0x2 << 15) +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_VECS (0x3 << 15) +#define GEN8_MI_SEMAPHORE_SIGNAL_DW0_ENGINE_VCS1 (0x4 << 15) + + +#define GEN8_MI_SEMAPHORE_WAIT__SIZE 4 +#define GEN8_MI_SEMAPHORE_WAIT_DW0_USE_GGTT (0x1 << 22) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE__MASK 0x00008000 +#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE__SHIFT 15 +#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE_SIGNAL (0x0 << 15) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_WAIT_MODE_POLL (0x1 << 15) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP__MASK 0x00007000 +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP__SHIFT 12 +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_GREATER_THAN_SDD (0x0 << 12) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_GREATER_THAN_OR_EQUAL_SDD (0x1 << 12) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_LESS_THAN_SDD (0x2 << 12) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_LESS_THAN_OR_EQUAL_SDD (0x3 << 12) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_EQUAL_SDD (0x4 << 12) +#define GEN8_MI_SEMAPHORE_WAIT_DW0_OP_SAD_NO_EQUAL_SDD (0x5 << 12) + + +#define GEN8_MI_SEMAPHORE_WAIT_DW2_ADDR_ADDR__MASK 0xfffffffc +#define GEN8_MI_SEMAPHORE_WAIT_DW2_ADDR_ADDR__SHIFT 2 +#define GEN8_MI_SEMAPHORE_WAIT_DW2_ADDR_ADDR__SHR 2 + + #define GEN6_MI_STORE_DATA_IMM__SIZE 6 #define GEN6_MI_STORE_DATA_IMM_DW0_USE_GGTT (0x1 << 22) +#define GEN8_MI_STORE_DATA_IMM_DW0_STORE_QWORD (0x1 << 21) #define GEN6_MI_STORE_DATA_IMM_DW2_ADDR__MASK 0xfffffffc @@ -188,7 +226,17 @@ enum gen_mi_alu_operand { #define GEN6_MI_STORE_REGISTER_MEM_DW2_ADDR__SHR 2 -#define GEN6_MI_FLUSH_DW__SIZE 4 +#define GEN6_MI_FLUSH_DW__SIZE 5 +#define 
GEN6_MI_FLUSH_DW_DW0_WRITE__MASK 0x0000c000 +#define GEN6_MI_FLUSH_DW_DW0_WRITE__SHIFT 14 +#define GEN6_MI_FLUSH_DW_DW0_WRITE_NONE (0x0 << 14) +#define GEN6_MI_FLUSH_DW_DW0_WRITE_IMM (0x1 << 14) +#define GEN6_MI_FLUSH_DW_DW0_WRITE_TIMESTAMP (0x3 << 14) + +#define GEN6_MI_FLUSH_DW_DW1_USE_GGTT (0x1 << 2) +#define GEN6_MI_FLUSH_DW_DW1_ADDR__MASK 0xfffffff8 +#define GEN6_MI_FLUSH_DW_DW1_ADDR__SHIFT 3 +#define GEN6_MI_FLUSH_DW_DW1_ADDR__SHR 3 @@ -225,6 +273,17 @@ enum gen_mi_alu_operand { #define GEN75_MI_LOAD_REGISTER_REG_DW2_DST_REG__SHIFT 2 #define GEN75_MI_LOAD_REGISTER_REG_DW2_DST_REG__SHR 2 +#define GEN75_MI_RS_STORE_DATA_IMM__SIZE 6 +#define GEN75_MI_RS_STORE_DATA_IMM_DW0_USE_GGTT (0x1 << 22) + + +#define GEN75_MI_RS_STORE_DATA_IMM_DW2_ADDR__MASK 0xfffffffc +#define GEN75_MI_RS_STORE_DATA_IMM_DW2_ADDR__SHIFT 2 +#define GEN75_MI_RS_STORE_DATA_IMM_DW2_ADDR__SHR 2 + + + + #define GEN75_MI_LOAD_URB_MEM__SIZE 4 #define GEN75_MI_LOAD_URB_MEM_DW1_ADDR__MASK 0x00007ffc @@ -247,12 +306,47 @@ enum gen_mi_alu_operand { #define GEN75_MI_STORE_URB_MEM_DW2_ADDR__SHR 6 +#define GEN8_MI_COPY_MEM_MEM__SIZE 5 +#define GEN8_MI_COPY_MEM_MEM_DW0_USE_GGTT_SRC (0x1 << 22) +#define GEN8_MI_COPY_MEM_MEM_DW0_USE_GGTT_DST (0x1 << 21) + +#define GEN8_MI_COPY_MEM_MEM_DW1_DST_ADDR__MASK 0xfffffffc +#define GEN8_MI_COPY_MEM_MEM_DW1_DST_ADDR__SHIFT 2 +#define GEN8_MI_COPY_MEM_MEM_DW1_DST_ADDR__SHR 2 + + +#define GEN8_MI_COPY_MEM_MEM_DW3_SRC_ADDR__MASK 0xfffffffc +#define GEN8_MI_COPY_MEM_MEM_DW3_SRC_ADDR__SHIFT 2 +#define GEN8_MI_COPY_MEM_MEM_DW3_SRC_ADDR__SHR 2 + + +#define GEN8_MI_ATOMIC__SIZE 11 +#define GEN8_MI_ATOMIC_DW0_USE_GGTT (0x1 << 22) +#define GEN8_MI_ATOMIC_DW0_POST_SYNC_OP (0x1 << 21) +#define GEN8_MI_ATOMIC_DW0_SIZE__MASK 0x00180000 +#define GEN8_MI_ATOMIC_DW0_SIZE__SHIFT 19 +#define GEN8_MI_ATOMIC_DW0_SIZE_DWORD (0x0 << 19) +#define GEN8_MI_ATOMIC_DW0_SIZE_QWORD (0x1 << 19) +#define GEN8_MI_ATOMIC_DW0_SIZE_OWORD (0x2 << 19) +#define GEN8_MI_ATOMIC_DW0_INLINE_DATA (0x1 << 18) 
+#define GEN8_MI_ATOMIC_DW0_CS_STALL (0x1 << 17) +#define GEN8_MI_ATOMIC_DW0_RETURN_DATA_CONTROL (0x1 << 16) +#define GEN8_MI_ATOMIC_DW0_OP__MASK 0x0000ff00 +#define GEN8_MI_ATOMIC_DW0_OP__SHIFT 8 + +#define GEN8_MI_ATOMIC_DW1_ADDR__MASK 0xfffffffc +#define GEN8_MI_ATOMIC_DW1_ADDR__SHIFT 2 +#define GEN8_MI_ATOMIC_DW1_ADDR__SHR 2 + + + #define GEN6_MI_BATCH_BUFFER_START__SIZE 3 #define GEN75_MI_BATCH_BUFFER_START_DW0_SECOND_LEVEL (0x1 << 22) #define GEN75_MI_BATCH_BUFFER_START_DW0_ADD_OFFSET_ENABLE (0x1 << 16) #define GEN75_MI_BATCH_BUFFER_START_DW0_PREDICATION_ENABLE (0x1 << 15) #define GEN75_MI_BATCH_BUFFER_START_DW0_NON_PRIVILEGED (0x1 << 13) #define GEN6_MI_BATCH_BUFFER_START_DW0_CLEAR_COMMAND_BUFFER (0x1 << 11) +#define GEN75_MI_BATCH_BUFFER_START_DW0_RS_ENABLE (0x1 << 10) #define GEN6_MI_BATCH_BUFFER_START_DW0_USE_PPGTT (0x1 << 8) #define GEN6_MI_BATCH_BUFFER_START_DW1_ADDR__MASK 0xfffffffc diff --git a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h index c51e4f78bc0..54ec13eaafa 100644 --- a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h @@ -37,6 +37,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define GEN6_REG__SIZE 0x400000 #define GEN6_REG_NOPID 0x2094 + +#define GEN6_REG_SO_PRIM_STORAGE_NEEDED 0x2280 + +#define GEN6_REG_SO_NUM_PRIMS_WRITTEN 0x2288 + + +#define GEN7_REG_TS_GPGPU_THREADS_DISPATCHED 0x2290 + #define GEN7_REG_HS_INVOCATION_COUNT 0x2300 #define GEN7_REG_DS_INVOCATION_COUNT 0x2308 @@ -95,10 +103,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define GEN75_REG_CS_GPR__ESIZE 0x8 #define GEN75_REG_CS_GPR__LEN 0x10 +#define GEN7_REG_GPGPU_DISPATCHDIMX 0x2500 -#define GEN6_REG_SO_PRIM_STORAGE_NEEDED 0x2280 +#define GEN7_REG_GPGPU_DISPATCHDIMY 0x2504 -#define GEN6_REG_SO_NUM_PRIMS_WRITTEN 0x2288 +#define GEN7_REG_GPGPU_DISPATCHDIMZ 0x2508 #define GEN7_REG_SO_NUM_PRIMS_WRITTEN(i0) (0x5200 + 0x8*(i0)) @@ -118,8 +127,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define GEN7_REG_CACHE_MODE_0_HIZ_RAW_STALL_OPT_DISABLE (0x1 << 2) #define GEN7_REG_CACHE_MODE_1 0x7004 -#define GEN8_REG_CACHE_MODE_1_HIZ_NP_EARLY_Z_FAILS_DISABLE (0x1 << 13) -#define GEN8_REG_CACHE_MODE_1_HIZ_NP_PMA_FIX_ENABLE (0x1 << 11) +#define GEN8_REG_CACHE_MODE_1_NP_EARLY_Z_FAILS_DISABLE (0x1 << 13) +#define GEN8_REG_CACHE_MODE_1_NP_PMA_FIX_ENABLE (0x1 << 11) #define GEN8_REG_L3CNTLREG 0x7034 diff --git a/src/gallium/drivers/ilo/genhw/gen_render.xml.h b/src/gallium/drivers/ilo/genhw/gen_render.xml.h index 2e86ba96ae2..43d271d838a 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render.xml.h @@ -102,6 +102,16 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define GEN7_RENDER_OPCODE_3DSTATE_URB_HS (0x31 << 16) #define GEN7_RENDER_OPCODE_3DSTATE_URB_DS (0x32 << 16) #define GEN7_RENDER_OPCODE_3DSTATE_URB_GS (0x33 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_VS (0x34 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_GS (0x35 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_HS (0x36 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_DS (0x37 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_CONSTANT_PS (0x38 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_VS (0x43 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_GS (0x44 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_HS (0x45 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_DS (0x46 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_EDIT_PS (0x47 << 16) #define GEN8_RENDER_OPCODE_3DSTATE_VF_INSTANCING (0x49 << 16) #define GEN8_RENDER_OPCODE_3DSTATE_VF_SGVS (0x4a << 16) #define GEN8_RENDER_OPCODE_3DSTATE_VF_TOPOLOGY (0x4b << 16) @@ -130,6 +140,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_PS (0x116 << 16) #define GEN7_RENDER_OPCODE_3DSTATE_SO_DECL_LIST (0x117 << 16) #define GEN7_RENDER_OPCODE_3DSTATE_SO_BUFFER (0x118 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_BINDING_TABLE_POOL_ALLOC (0x119 << 16) +#define GEN75_RENDER_OPCODE_3DSTATE_GATHER_POOL_ALLOC (0x11a << 16) #define GEN8_RENDER_OPCODE_3DSTATE_SAMPLE_PATTERN (0x11c << 16) #define GEN6_RENDER_OPCODE_PIPE_CONTROL (0x200 << 16) #define GEN6_RENDER_OPCODE_3DPRIMITIVE (0x300 << 16) @@ -178,6 +190,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define GEN8_SBA_MOCS__MASK 0x000007f0 #define GEN8_SBA_MOCS__SHIFT 4 #define GEN6_SBA_ADDR_MODIFIED (0x1 << 0) +#define GEN8_SBA_SIZE__MASK 0xfffff000 +#define GEN8_SBA_SIZE__SHIFT 12 +#define GEN8_SBA_SIZE__SHR 12 +#define GEN8_SBA_SIZE_MODIFIED (0x1 << 0) #define GEN6_BINDING_TABLE_ADDR__MASK 0x0000ffe0 #define GEN6_BINDING_TABLE_ADDR__SHIFT 5 #define GEN6_BINDING_TABLE_ADDR__SHR 5 diff --git a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h index 52173fe5d07..c79a4f3a830 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h @@ -168,7 +168,6 @@ enum gen_sample_count { GEN8_NUMSAMPLES_2 = 0x1, GEN6_NUMSAMPLES_4 = 0x2, GEN7_NUMSAMPLES_8 = 0x3, - GEN8_NUMSAMPLES_16 = 0x4, }; enum gen_inputattr_select { @@ -297,11 +296,58 @@ enum gen_msrast_mode { #define GEN7_URB_DW1_OFFSET__MASK 0x3e000000 #define GEN7_URB_DW1_OFFSET__SHIFT 25 +#define GEN75_URB_DW1_OFFSET__MASK 0x7e000000 +#define GEN75_URB_DW1_OFFSET__SHIFT 25 +#define GEN8_URB_DW1_OFFSET__MASK 0xfe000000 +#define GEN8_URB_DW1_OFFSET__SHIFT 25 #define GEN7_URB_DW1_ENTRY_SIZE__MASK 0x01ff0000 #define GEN7_URB_DW1_ENTRY_SIZE__SHIFT 16 #define GEN7_URB_DW1_ENTRY_COUNT__MASK 0x0000ffff #define GEN7_URB_DW1_ENTRY_COUNT__SHIFT 0 +#define GEN75_3DSTATE_GATHER_CONSTANT_ANY__SIZE 130 + + +#define GEN75_GATHER_CONST_DW1_BT_VALID__MASK 0xffff0000 +#define GEN75_GATHER_CONST_DW1_BT_VALID__SHIFT 16 +#define GEN75_GATHER_CONST_DW1_BT_BLOCK__MASK 0x0000f000 +#define GEN75_GATHER_CONST_DW1_BT_BLOCK__SHIFT 12 + +#define GEN75_GATHER_CONST_DW2_GATHER_BUFFER_OFFSET__MASK 0x007fffc0 +#define GEN75_GATHER_CONST_DW2_GATHER_BUFFER_OFFSET__SHIFT 6 +#define GEN75_GATHER_CONST_DW2_GATHER_BUFFER_OFFSET__SHR 6 +#define GEN8_GATHER_CONST_DW2_DX9_STALL (0x1 << 5) +#define GEN75_GATHER_CONST_DW2_DX9_ENABLE (0x1 << 4) + +#define GEN75_GATHER_CONST_DW_ENTRY_HIGH__MASK 0xffff0000 +#define GEN75_GATHER_CONST_DW_ENTRY_HIGH__SHIFT 
16 +#define GEN75_GATHER_CONST_DW_ENTRY_OFFSET__MASK 0x0000ff00 +#define GEN75_GATHER_CONST_DW_ENTRY_OFFSET__SHIFT 8 +#define GEN75_GATHER_CONST_DW_ENTRY_CHANNEL_MASK__MASK 0x000000f0 +#define GEN75_GATHER_CONST_DW_ENTRY_CHANNEL_MASK__SHIFT 4 +#define GEN75_GATHER_CONST_DW_ENTRY_BT_INDEX__MASK 0x0000001f +#define GEN75_GATHER_CONST_DW_ENTRY_BT_INDEX__SHIFT 0 + +#define GEN75_3DSTATE_BINDING_TABLE_EDIT_ANY__SIZE 258 + + +#define GEN75_BT_EDIT_DW1_BT_BLOCK_CLEAR__MASK 0xffff0000 +#define GEN75_BT_EDIT_DW1_BT_BLOCK_CLEAR__SHIFT 16 +#define GEN75_BT_EDIT_DW1_TARGET__MASK 0x00000003 +#define GEN75_BT_EDIT_DW1_TARGET__SHIFT 0 +#define GEN75_BT_EDIT_DW1_TARGET_CORE0 0x1 +#define GEN75_BT_EDIT_DW1_TARGET_CORE1 0x2 +#define GEN75_BT_EDIT_DW1_TARGET_ALL 0x3 + +#define GEN75_BT_EDIT_DW_ENTRY_BT_INDEX__MASK 0x00ff0000 +#define GEN75_BT_EDIT_DW_ENTRY_BT_INDEX__SHIFT 16 +#define GEN75_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__MASK 0x0000ffff +#define GEN75_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHIFT 0 +#define GEN75_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHR 5 +#define GEN8_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__MASK 0x0000ffff +#define GEN8_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHIFT 0 +#define GEN8_BT_EDIT_DW_ENTRY_SURFACE_STATE_ADDR__SHR 6 + #define GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_ANY__SIZE 2 @@ -315,6 +361,48 @@ enum gen_msrast_mode { #define GEN75_PCB_ALLOC_DW1_SIZE__MASK 0x0000003f #define GEN75_PCB_ALLOC_DW1_SIZE__SHIFT 0 +#define GEN75_3DSTATE_BINDING_TABLE_POOL_ALLOC__SIZE 3 + + +#define GEN75_BT_POOL_ALLOC_DW1_ADDR__MASK 0xfffff000 +#define GEN75_BT_POOL_ALLOC_DW1_ADDR__SHIFT 12 +#define GEN75_BT_POOL_ALLOC_DW1_ADDR__SHR 12 +#define GEN75_BT_POOL_ALLOC_DW1_ENABLE (0x1 << 11) +#define GEN75_BT_POOL_ALLOC_DW1_MOCS__MASK 0x00000780 +#define GEN75_BT_POOL_ALLOC_DW1_MOCS__SHIFT 7 +#define GEN8_BT_POOL_ALLOC_DW1_MOCS__MASK 0x0000007f +#define GEN8_BT_POOL_ALLOC_DW1_MOCS__SHIFT 0 + +#define GEN75_BT_POOL_ALLOC_DW2_END_ADDR__MASK 0xfffff000 +#define 
GEN75_BT_POOL_ALLOC_DW2_END_ADDR__SHIFT 12 +#define GEN75_BT_POOL_ALLOC_DW2_END_ADDR__SHR 12 + + +#define GEN8_BT_POOL_ALLOC_DW3_SIZE__MASK 0xfffff000 +#define GEN8_BT_POOL_ALLOC_DW3_SIZE__SHIFT 12 +#define GEN8_BT_POOL_ALLOC_DW3_SIZE__SHR 12 + +#define GEN75_3DSTATE_GATHER_POOL_ALLOC__SIZE 3 + + +#define GEN75_GATHER_POOL_ALLOC_DW1_ADDR__MASK 0xfffff000 +#define GEN75_GATHER_POOL_ALLOC_DW1_ADDR__SHIFT 12 +#define GEN75_GATHER_POOL_ALLOC_DW1_ADDR__SHR 12 +#define GEN75_GATHER_POOL_ALLOC_DW1_ENABLE (0x1 << 11) +#define GEN75_GATHER_POOL_ALLOC_DW1_MOCS__MASK 0x0000000f +#define GEN75_GATHER_POOL_ALLOC_DW1_MOCS__SHIFT 0 +#define GEN8_GATHER_POOL_ALLOC_DW1_MOCS__MASK 0x0000007f +#define GEN8_GATHER_POOL_ALLOC_DW1_MOCS__SHIFT 0 + +#define GEN75_GATHER_POOL_ALLOC_DW2_END_ADDR__MASK 0xfffff000 +#define GEN75_GATHER_POOL_ALLOC_DW2_END_ADDR__SHIFT 12 +#define GEN75_GATHER_POOL_ALLOC_DW2_END_ADDR__SHR 12 + + +#define GEN8_GATHER_POOL_ALLOC_DW3_SIZE__MASK 0xfffff000 +#define GEN8_GATHER_POOL_ALLOC_DW3_SIZE__SHIFT 12 +#define GEN8_GATHER_POOL_ALLOC_DW3_SIZE__SHR 12 + #define GEN6_3DSTATE_VERTEX_BUFFERS__SIZE 133 @@ -402,15 +490,15 @@ enum gen_msrast_mode { #define GEN8_SGVS_DW1_IID_ENABLE (0x1 << 31) -#define GEN8_SGVS_DW1_IID_VE_COMP__MASK 0x60000000 -#define GEN8_SGVS_DW1_IID_VE_COMP__SHIFT 29 -#define GEN8_SGVS_DW1_IID_VE_INDEX__MASK 0x003f0000 -#define GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT 16 +#define GEN8_SGVS_DW1_IID_COMP__MASK 0x60000000 +#define GEN8_SGVS_DW1_IID_COMP__SHIFT 29 +#define GEN8_SGVS_DW1_IID_OFFSET__MASK 0x003f0000 +#define GEN8_SGVS_DW1_IID_OFFSET__SHIFT 16 #define GEN8_SGVS_DW1_VID_ENABLE (0x1 << 15) -#define GEN8_SGVS_DW1_VID_VE_COMP__MASK 0x00006000 -#define GEN8_SGVS_DW1_VID_VE_COMP__SHIFT 13 -#define GEN8_SGVS_DW1_VID_VE_INDEX__MASK 0x0000003f -#define GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT 0 +#define GEN8_SGVS_DW1_VID_COMP__MASK 0x00006000 +#define GEN8_SGVS_DW1_VID_COMP__SHIFT 13 +#define GEN8_SGVS_DW1_VID_OFFSET__MASK 0x0000003f +#define 
GEN8_SGVS_DW1_VID_OFFSET__SHIFT 0 #define GEN8_3DSTATE_VF_TOPOLOGY__SIZE 2 @@ -464,6 +552,10 @@ enum gen_msrast_mode { #define GEN7_3DSTATE_POINTERS_ANY__SIZE 2 +#define GEN7_PTR_DW1_ADDR__MASK 0xffffffe0 +#define GEN7_PTR_DW1_ADDR__SHIFT 5 +#define GEN7_PTR_DW1_ADDR__SHR 5 +#define GEN8_PTR_DW1_CHANGED (0x1 << 0) #define GEN6_3DSTATE_VS__SIZE 9 @@ -513,12 +605,14 @@ enum gen_msrast_mode { #define GEN8_VS_DW7_CACHE_DISABLE (0x1 << 1) #define GEN8_VS_DW7_VS_ENABLE (0x1 << 0) -#define GEN8_VS_DW8_URB_WRITE_OFFSET__MASK 0x03e00000 -#define GEN8_VS_DW8_URB_WRITE_OFFSET__SHIFT 21 -#define GEN8_VS_DW8_URB_WRITE_LEN__MASK 0x001f0000 -#define GEN8_VS_DW8_URB_WRITE_LEN__SHIFT 16 +#define GEN8_VS_DW8_VUE_OUT_READ_OFFSET__MASK 0x07e00000 +#define GEN8_VS_DW8_VUE_OUT_READ_OFFSET__SHIFT 21 +#define GEN8_VS_DW8_VUE_OUT_LEN__MASK 0x001f0000 +#define GEN8_VS_DW8_VUE_OUT_LEN__SHIFT 16 #define GEN8_VS_DW8_UCP_CLIP_ENABLES__MASK 0x0000ff00 #define GEN8_VS_DW8_UCP_CLIP_ENABLES__SHIFT 8 +#define GEN8_VS_DW8_UCP_CULL_ENABLES__MASK 0x000000ff +#define GEN8_VS_DW8_UCP_CULL_ENABLES__SHIFT 0 #define GEN7_3DSTATE_HS__SIZE 9 @@ -558,11 +652,11 @@ enum gen_msrast_mode { -#define GEN8_HS_DW1_DISPATCH_MAX_THREADS__MASK 0x000000ff -#define GEN8_HS_DW1_DISPATCH_MAX_THREADS__SHIFT 0 #define GEN8_HS_DW2_HS_ENABLE (0x1 << 31) #define GEN8_HS_DW2_STATISTICS (0x1 << 29) +#define GEN8_HS_DW2_MAX_THREADS__MASK 0x0001ff00 +#define GEN8_HS_DW2_MAX_THREADS__SHIFT 8 #define GEN8_HS_DW2_INSTANCE_COUNT__MASK 0x0000000f #define GEN8_HS_DW2_INSTANCE_COUNT__SHIFT 0 @@ -584,9 +678,6 @@ enum gen_msrast_mode { #define GEN8_HS_DW7_URB_READ_OFFSET__MASK 0x000003f0 #define GEN8_HS_DW7_URB_READ_OFFSET__SHIFT 4 -#define GEN8_HS_DW8_URB_SEMAPHORE_ADDR__MASK 0x00001fff -#define GEN8_HS_DW8_URB_SEMAPHORE_ADDR__SHIFT 0 -#define GEN8_HS_DW8_URB_SEMAPHORE_ADDR__SHR 6 #define GEN7_3DSTATE_TE__SIZE 4 @@ -660,16 +751,19 @@ enum gen_msrast_mode { #define GEN8_DS_DW7_MAX_THREADS__MASK 0x3fe00000 #define 
GEN8_DS_DW7_MAX_THREADS__SHIFT 21 #define GEN8_DS_DW7_STATISTICS (0x1 << 10) +#define GEN8_DS_DW7_SIMD8_ENABLE (0x1 << 3) #define GEN8_DS_DW7_COMPUTE_W (0x1 << 2) #define GEN8_DS_DW7_CACHE_DISABLE (0x1 << 1) #define GEN8_DS_DW7_DS_ENABLE (0x1 << 0) -#define GEN8_DS_DW8_URB_WRITE_OFFSET__MASK 0x03e00000 -#define GEN8_DS_DW8_URB_WRITE_OFFSET__SHIFT 21 -#define GEN8_DS_DW8_URB_WRITE_LEN__MASK 0x001f0000 -#define GEN8_DS_DW8_URB_WRITE_LEN__SHIFT 16 +#define GEN8_DS_DW8_VUE_OUT_READ_OFFSET__MASK 0x07e00000 +#define GEN8_DS_DW8_VUE_OUT_READ_OFFSET__SHIFT 21 +#define GEN8_DS_DW8_VUE_OUT_LEN__MASK 0x001f0000 +#define GEN8_DS_DW8_VUE_OUT_LEN__SHIFT 16 #define GEN8_DS_DW8_UCP_CLIP_ENABLES__MASK 0x0000ff00 #define GEN8_DS_DW8_UCP_CLIP_ENABLES__SHIFT 8 +#define GEN8_DS_DW8_UCP_CULL_ENABLES__MASK 0x000000ff +#define GEN8_DS_DW8_UCP_CULL_ENABLES__SHIFT 0 @@ -771,7 +865,7 @@ enum gen_msrast_mode { #define GEN8_GS_DW1_KERNEL_ADDR__SHR 6 -#define GEN8_GS_DW3_EXPECTED_VERTEX_COUNT__MASK 0x0000007f +#define GEN8_GS_DW3_EXPECTED_VERTEX_COUNT__MASK 0x0000003f #define GEN8_GS_DW3_EXPECTED_VERTEX_COUNT__SHIFT 0 @@ -815,18 +909,20 @@ enum gen_msrast_mode { #define GEN8_GS_DW8_GSCTRL__SHIFT 31 #define GEN8_GS_DW8_GSCTRL_CUT (0x0 << 31) #define GEN8_GS_DW8_GSCTRL_SID (0x1 << 31) -#define GEN8_GS_DW8_URB_SEMAPHORE_ADDR__MASK 0x00001fff -#define GEN8_GS_DW8_URB_SEMAPHORE_ADDR__SHIFT 0 -#define GEN8_GS_DW8_URB_SEMAPHORE_ADDR__SHR 6 -#define GEN9_GS_DW8_MAX_THREADS__MASK 0x00001fff +#define GEN8_GS_DW8_STATIC_OUTPUT (0x1 << 30) +#define GEN8_GS_DW8_STATIC_OUTPUT_VERTEX_COUNT__MASK 0x07ff0000 +#define GEN8_GS_DW8_STATIC_OUTPUT_VERTEX_COUNT__SHIFT 16 +#define GEN9_GS_DW8_MAX_THREADS__MASK 0x000001ff #define GEN9_GS_DW8_MAX_THREADS__SHIFT 0 -#define GEN8_GS_DW9_URB_WRITE_OFFSET__MASK 0x03e00000 -#define GEN8_GS_DW9_URB_WRITE_OFFSET__SHIFT 21 -#define GEN8_GS_DW9_URB_WRITE_LEN__MASK 0x001f0000 -#define GEN8_GS_DW9_URB_WRITE_LEN__SHIFT 16 +#define GEN8_GS_DW9_VUE_OUT_READ_OFFSET__MASK 0x07e00000 
+#define GEN8_GS_DW9_VUE_OUT_READ_OFFSET__SHIFT 21 +#define GEN8_GS_DW9_VUE_OUT_LEN__MASK 0x001f0000 +#define GEN8_GS_DW9_VUE_OUT_LEN__SHIFT 16 #define GEN8_GS_DW9_UCP_CLIP_ENABLES__MASK 0x0000ff00 #define GEN8_GS_DW9_UCP_CLIP_ENABLES__SHIFT 8 +#define GEN8_GS_DW9_UCP_CULL_ENABLES__MASK 0x000000ff +#define GEN8_GS_DW9_UCP_CULL_ENABLES__SHIFT 0 #define GEN7_3DSTATE_STREAMOUT__SIZE 5 @@ -838,6 +934,11 @@ enum gen_msrast_mode { #define GEN7_SO_DW1_REORDER_MODE__MASK 0x04000000 #define GEN7_SO_DW1_REORDER_MODE__SHIFT 26 #define GEN7_SO_DW1_STATISTICS (0x1 << 25) +#define GEN8_SO_DW1_FORCE_RENDERING__MASK 0x01800000 +#define GEN8_SO_DW1_FORCE_RENDERING__SHIFT 23 +#define GEN8_SO_DW1_FORCE_RENDERING_NORMAL (0x0 << 23) +#define GEN8_SO_DW1_FORCE_RENDERING_OFF (0x2 << 23) +#define GEN8_SO_DW1_FORCE_RENDERING_ON (0x3 << 23) #define GEN7_SO_DW1_BUFFER_ENABLES__MASK 0x00000f00 #define GEN7_SO_DW1_BUFFER_ENABLES__SHIFT 8 @@ -928,9 +1029,9 @@ enum gen_msrast_mode { -#define GEN8_SO_BUF_DW5_OFFSET_ADDR__MASK 0xfffffffc -#define GEN8_SO_BUF_DW5_OFFSET_ADDR__SHIFT 2 -#define GEN8_SO_BUF_DW5_OFFSET_ADDR__SHR 2 +#define GEN8_SO_BUF_DW5_OFFSET_ADDR_ADDR__MASK 0xfffffffc +#define GEN8_SO_BUF_DW5_OFFSET_ADDR_ADDR__SHIFT 2 +#define GEN8_SO_BUF_DW5_OFFSET_ADDR_ADDR__SHR 2 @@ -939,6 +1040,7 @@ enum gen_msrast_mode { #define GEN7_CLIP_DW1_FRONT_WINDING__MASK 0x00100000 #define GEN7_CLIP_DW1_FRONT_WINDING__SHIFT 20 +#define GEN8_CLIP_DW1_FORCE_UCP_CULL_ENABLES (0x1 << 20) #define GEN7_CLIP_DW1_SUBPIXEL__MASK 0x00080000 #define GEN7_CLIP_DW1_SUBPIXEL__SHIFT 19 #define GEN7_CLIP_DW1_SUBPIXEL_8BITS (0x0 << 19) @@ -946,6 +1048,8 @@ enum gen_msrast_mode { #define GEN7_CLIP_DW1_EARLY_CULL_ENABLE (0x1 << 18) #define GEN7_CLIP_DW1_CULL_MODE__MASK 0x00030000 #define GEN7_CLIP_DW1_CULL_MODE__SHIFT 16 +#define GEN8_CLIP_DW1_FORCE_UCP_CLIP_ENABLES (0x1 << 17) +#define GEN8_CLIP_DW1_FORCE_CLIP_MODE (0x1 << 16) #define GEN6_CLIP_DW1_STATISTICS (0x1 << 10) #define GEN6_CLIP_DW1_UCP_CULL_ENABLES__MASK 
0x000000ff #define GEN6_CLIP_DW1_UCP_CULL_ENABLES__SHIFT 0 @@ -1026,6 +1130,7 @@ enum gen_msrast_mode { #define GEN7_SF_DW3_TRIFAN_PROVOKE__MASK 0x06000000 #define GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT 25 #define GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE (0x1 << 14) +#define GEN8_SF_DW3_SMOOTH_POINT_ENABLE (0x1 << 13) #define GEN7_SF_DW3_SUBPIXEL__MASK 0x00001000 #define GEN7_SF_DW3_SUBPIXEL__SHIFT 12 #define GEN7_SF_DW3_SUBPIXEL_8BITS (0x0 << 12) @@ -1037,8 +1142,8 @@ enum gen_msrast_mode { #define GEN7_3DSTATE_SBE_DW1__SIZE 13 -#define GEN8_SBE_DW1_USE_URB_READ_LEN (0x1 << 29) -#define GEN8_SBE_DW1_USE_URB_READ_OFFSET (0x1 << 28) +#define GEN8_SBE_DW1_FORCE_URB_READ_LEN (0x1 << 29) +#define GEN8_SBE_DW1_FORCE_URB_READ_OFFSET (0x1 << 28) #define GEN7_SBE_DW1_ATTR_SWIZZLE__MASK 0x10000000 #define GEN7_SBE_DW1_ATTR_SWIZZLE__SHIFT 28 #define GEN7_SBE_DW1_ATTR_SWIZZLE_0_15 (0x0 << 28) @@ -1050,21 +1155,28 @@ enum gen_msrast_mode { #define GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD__SHIFT 20 #define GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_UPPERLEFT (0x0 << 20) #define GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_LOWERLEFT (0x1 << 20) +#define GEN8_SBE_DW1_PID_OVERRIDE_W (0x1 << 19) +#define GEN8_SBE_DW1_PID_OVERRIDE_Z (0x1 << 18) +#define GEN8_SBE_DW1_PID_OVERRIDE_Y (0x1 << 17) +#define GEN8_SBE_DW1_PID_OVERRIDE_X (0x1 << 16) #define GEN7_SBE_DW1_URB_READ_LEN__MASK 0x0000f800 #define GEN7_SBE_DW1_URB_READ_LEN__SHIFT 11 #define GEN7_SBE_DW1_URB_READ_OFFSET__MASK 0x000003f0 #define GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT 4 #define GEN8_SBE_DW1_URB_READ_OFFSET__MASK 0x000007e0 #define GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT 5 +#define GEN8_SBE_DW1_PID_OVERRIDE_ATTR__MASK 0x0000001f +#define GEN8_SBE_DW1_PID_OVERRIDE_ATTR__SHIFT 0 #define GEN8_3DSTATE_SBE_SWIZ_DW1_DW8__SIZE 8 #define GEN8_SBE_SWIZ_HIGH__MASK 0xffff0000 #define GEN8_SBE_SWIZ_HIGH__SHIFT 16 -#define GEN8_SBE_SWIZ_OVERRIDE_W (0x1 << 15) -#define GEN8_SBE_SWIZ_OVERRIDE_Z (0x1 << 14) -#define GEN8_SBE_SWIZ_OVERRIDE_Y (0x1 << 13) -#define 
GEN8_SBE_SWIZ_OVERRIDE_X (0x1 << 12) +#define GEN8_SBE_SWIZ_CONST_OVERRIDE_W (0x1 << 15) +#define GEN8_SBE_SWIZ_CONST_OVERRIDE_Z (0x1 << 14) +#define GEN8_SBE_SWIZ_CONST_OVERRIDE_Y (0x1 << 13) +#define GEN8_SBE_SWIZ_CONST_OVERRIDE_X (0x1 << 12) +#define GEN8_SBE_SWIZ_SWIZZLE_CONTROL (0x1 << 11) #define GEN8_SBE_SWIZ_CONST__MASK 0x00000600 #define GEN8_SBE_SWIZ_CONST__SHIFT 9 #define GEN8_SBE_SWIZ_CONST_0000 (0x0 << 9) @@ -1126,12 +1238,28 @@ enum gen_msrast_mode { #define GEN9_RASTER_DW1_Z_TEST_FAR_ENABLE (0x1 << 26) +#define GEN8_RASTER_DW1_API__MASK 0x00c00000 +#define GEN8_RASTER_DW1_API__SHIFT 22 +#define GEN8_RASTER_DW1_API_DX9_OGL (0x0 << 22) +#define GEN8_RASTER_DW1_API_DX10 (0x1 << 22) +#define GEN8_RASTER_DW1_API_DX10_1 (0x2 << 22) #define GEN8_RASTER_DW1_FRONT_WINDING__MASK 0x00200000 #define GEN8_RASTER_DW1_FRONT_WINDING__SHIFT 21 +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT__MASK 0x001c0000 +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT__SHIFT 18 +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_0 (0x0 << 18) +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_1 (0x1 << 18) +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_2 (0x2 << 18) +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_4 (0x3 << 18) +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_8 (0x4 << 18) +#define GEN8_RASTER_DW1_FORCED_SAMPLE_COUNT_NUMRASTSAMPLES_16 (0x5 << 18) #define GEN8_RASTER_DW1_CULL_MODE__MASK 0x00030000 #define GEN8_RASTER_DW1_CULL_MODE__SHIFT 16 +#define GEN8_RASTER_DW1_FORCE_MULTISAMPLE_ENABLE (0x1 << 14) #define GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE (0x1 << 13) -#define GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE (0x1 << 12) +#define GEN8_RASTER_DW1_DX_MULTISAMPLE_ENABLE (0x1 << 12) +#define GEN8_RASTER_DW1_DX_MSRASTMODE__MASK 0x00000c00 +#define GEN8_RASTER_DW1_DX_MSRASTMODE__SHIFT 10 #define GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID (0x1 << 9) #define GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME (0x1 << 8) #define 
GEN8_RASTER_DW1_DEPTH_OFFSET_POINT (0x1 << 7) @@ -1223,10 +1351,10 @@ enum gen_msrast_mode { #define GEN7_WM_DW1_STATISTICS (0x1 << 31) -#define GEN7_WM_DW1_DEPTH_CLEAR (0x1 << 30) +#define GEN7_WM_DW1_LEGACY_DEPTH_CLEAR (0x1 << 30) #define GEN7_WM_DW1_PS_DISPATCH_ENABLE (0x1 << 29) -#define GEN7_WM_DW1_DEPTH_RESOLVE (0x1 << 28) -#define GEN7_WM_DW1_HIZ_RESOLVE (0x1 << 27) +#define GEN7_WM_DW1_LEGACY_DEPTH_RESOLVE (0x1 << 28) +#define GEN7_WM_DW1_LEGACY_HIZ_RESOLVE (0x1 << 27) #define GEN7_WM_DW1_LEGACY_LINE_RAST (0x1 << 26) #define GEN7_WM_DW1_PS_KILL_PIXEL (0x1 << 25) #define GEN7_WM_DW1_PSCDEPTH__MASK 0x01800000 @@ -1235,6 +1363,11 @@ enum gen_msrast_mode { #define GEN7_WM_DW1_EDSC__SHIFT 21 #define GEN7_WM_DW1_PS_USE_DEPTH (0x1 << 20) #define GEN7_WM_DW1_PS_USE_W (0x1 << 19) +#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE__MASK 0x00180000 +#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE__SHIFT 19 +#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE_NORMAL (0x0 << 19) +#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE_OFF (0x1 << 19) +#define GEN8_WM_DW1_FORCE_DISPATCH_ENABLE_ON (0x2 << 19) #define GEN7_WM_DW1_ZW_INTERP__MASK 0x00060000 #define GEN7_WM_DW1_ZW_INTERP__SHIFT 17 #define GEN7_WM_DW1_BARYCENTRIC_INTERP__MASK 0x0001f800 @@ -1261,6 +1394,11 @@ enum gen_msrast_mode { #define GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT (0x1 << 2) #define GEN7_WM_DW1_MSRASTMODE__MASK 0x00000003 #define GEN7_WM_DW1_MSRASTMODE__SHIFT 0 +#define GEN8_WM_DW1_FORCE_KILL_PIXEL__MASK 0x00000003 +#define GEN8_WM_DW1_FORCE_KILL_PIXEL__SHIFT 0 +#define GEN8_WM_DW1_FORCE_KILL_PIXEL_NORMAL 0x0 +#define GEN8_WM_DW1_FORCE_KILL_PIXEL_OFF 0x1 +#define GEN8_WM_DW1_FORCE_KILL_PIXEL_ON 0x2 #define GEN7_WM_DW2_MSDISPMODE__MASK 0x80000000 #define GEN7_WM_DW2_MSDISPMODE__SHIFT 31 @@ -1271,6 +1409,7 @@ enum gen_msrast_mode { #define GEN8_3DSTATE_WM_CHROMAKEY__SIZE 2 +#define GEN8_CHROMAKEY_DW1_KILL_ENABLE (0x1 << 31) #define GEN8_3DSTATE_WM_DEPTH_STENCIL__SIZE 4 @@ -1318,6 +1457,7 @@ enum gen_msrast_mode { #define 
GEN8_WM_HZ_DW1_STENCIL_CLEAR (0x1 << 31) #define GEN8_WM_HZ_DW1_DEPTH_CLEAR (0x1 << 30) +#define GEN8_WM_HZ_DW1_SCISSOR_ENABLE (0x1 << 29) #define GEN8_WM_HZ_DW1_DEPTH_RESOLVE (0x1 << 28) #define GEN8_WM_HZ_DW1_HIZ_RESOLVE (0x1 << 27) #define GEN8_WM_HZ_DW1_PIXEL_OFFSET_ENABLE (0x1 << 26) @@ -1443,17 +1583,17 @@ enum gen_msrast_mode { #define GEN8_PS_BLEND_DW1_ALPHA_TO_COVERAGE (0x1 << 31) #define GEN8_PS_BLEND_DW1_WRITABLE_RT (0x1 << 30) -#define GEN8_PS_BLEND_DW1_BLEND_ENABLE (0x1 << 29) -#define GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__MASK 0x1f000000 -#define GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__SHIFT 24 -#define GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__MASK 0x00f80000 -#define GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__SHIFT 19 -#define GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__MASK 0x0007c000 -#define GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__SHIFT 14 -#define GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__MASK 0x00003e00 -#define GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__SHIFT 9 +#define GEN8_PS_BLEND_DW1_RT0_BLEND_ENABLE (0x1 << 29) +#define GEN8_PS_BLEND_DW1_RT0_SRC_ALPHA_FACTOR__MASK 0x1f000000 +#define GEN8_PS_BLEND_DW1_RT0_SRC_ALPHA_FACTOR__SHIFT 24 +#define GEN8_PS_BLEND_DW1_RT0_DST_ALPHA_FACTOR__MASK 0x00f80000 +#define GEN8_PS_BLEND_DW1_RT0_DST_ALPHA_FACTOR__SHIFT 19 +#define GEN8_PS_BLEND_DW1_RT0_SRC_COLOR_FACTOR__MASK 0x0007c000 +#define GEN8_PS_BLEND_DW1_RT0_SRC_COLOR_FACTOR__SHIFT 14 +#define GEN8_PS_BLEND_DW1_RT0_DST_COLOR_FACTOR__MASK 0x00003e00 +#define GEN8_PS_BLEND_DW1_RT0_DST_COLOR_FACTOR__SHIFT 9 #define GEN8_PS_BLEND_DW1_ALPHA_TEST_ENABLE (0x1 << 8) -#define GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE (0x1 << 7) +#define GEN8_PS_BLEND_DW1_RT0_INDEPENDENT_ALPHA_ENABLE (0x1 << 7) #define GEN6_3DSTATE_CONSTANT_ANY__SIZE 11 @@ -1469,6 +1609,8 @@ enum gen_msrast_mode { #define GEN6_CONSTANT_DW_ADDR_ADDR__SHR 5 +#define GEN8_CONSTANT_DW0_MOCS__MASK 0x00007f00 +#define GEN8_CONSTANT_DW0_MOCS__SHIFT 8 #define GEN7_CONSTANT_DW1_BUFFER1_READ_LEN__MASK 0xffff0000 #define 
GEN7_CONSTANT_DW1_BUFFER1_READ_LEN__SHIFT 16 @@ -1502,6 +1644,8 @@ enum gen_msrast_mode { #define GEN6_3DSTATE_DRAWING_RECTANGLE__SIZE 4 +#define GEN8_DRAWING_RECTANGLE_DW0_CORE_MODE_SELECT__MASK 0x0000c000 +#define GEN8_DRAWING_RECTANGLE_DW0_CORE_MODE_SELECT__SHIFT 14 #define GEN6_DRAWING_RECTANGLE_DW1_MIN_Y__MASK 0xffff0000 #define GEN6_DRAWING_RECTANGLE_DW1_MIN_Y__SHIFT 16 @@ -1624,15 +1768,12 @@ enum gen_msrast_mode { #define GEN8_DEPTH_DW5_MOCS__MASK 0x0000007f #define GEN8_DEPTH_DW5_MOCS__SHIFT 0 -#define GEN8_DEPTH_DW6_OFFSET_Y__MASK 0xffff0000 -#define GEN8_DEPTH_DW6_OFFSET_Y__SHIFT 16 -#define GEN8_DEPTH_DW6_OFFSET_X__MASK 0x0000ffff -#define GEN8_DEPTH_DW6_OFFSET_X__SHIFT 0 #define GEN8_DEPTH_DW7_RT_VIEW_EXTENT__MASK 0xffe00000 #define GEN8_DEPTH_DW7_RT_VIEW_EXTENT__SHIFT 21 #define GEN8_DEPTH_DW7_QPITCH__MASK 0x00007fff #define GEN8_DEPTH_DW7_QPITCH__SHIFT 0 +#define GEN8_DEPTH_DW7_QPITCH__SHR 2 #define GEN6_3DSTATE_POLY_STIPPLE_OFFSET__SIZE 2 @@ -1649,6 +1790,11 @@ enum gen_msrast_mode { #define GEN6_3DSTATE_LINE_STIPPLE__SIZE 3 +#define GEN6_LINE_STIPPLE_DW1_CURRENT_MODIFY_ENABLE (0x1 << 31) +#define GEN6_LINE_STIPPLE_DW1_CURRENT_REPEAT_COUNTER__MASK 0x3fe00000 +#define GEN6_LINE_STIPPLE_DW1_CURRENT_REPEAT_COUNTER__SHIFT 21 +#define GEN6_LINE_STIPPLE_DW1_CURRENT_STIPPLE_INDEX__MASK 0x000f0000 +#define GEN6_LINE_STIPPLE_DW1_CURRENT_STIPPLE_INDEX__SHIFT 16 #define GEN6_LINE_STIPPLE_DW1_PATTERN__MASK 0x0000ffff #define GEN6_LINE_STIPPLE_DW1_PATTERN__SHIFT 0 @@ -1664,16 +1810,28 @@ enum gen_msrast_mode { #define GEN6_3DSTATE_AA_LINE_PARAMETERS__SIZE 3 +#define GEN8_AA_LINE_DW1_POINT_BIAS__MASK 0xff000000 +#define GEN8_AA_LINE_DW1_POINT_BIAS__SHIFT 24 +#define GEN8_AA_LINE_DW1_POINT_BIAS__RADIX 8 #define GEN6_AA_LINE_DW1_BIAS__MASK 0x00ff0000 #define GEN6_AA_LINE_DW1_BIAS__SHIFT 16 #define GEN6_AA_LINE_DW1_BIAS__RADIX 8 +#define GEN8_AA_LINE_DW1_POINT_SLOPE__MASK 0x0000ff00 +#define GEN8_AA_LINE_DW1_POINT_SLOPE__SHIFT 8 +#define 
GEN8_AA_LINE_DW1_POINT_SLOPE__RADIX 8 #define GEN6_AA_LINE_DW1_SLOPE__MASK 0x000000ff #define GEN6_AA_LINE_DW1_SLOPE__SHIFT 0 #define GEN6_AA_LINE_DW1_SLOPE__RADIX 8 +#define GEN8_AA_LINE_DW2_POINT_CAP_BIAS__MASK 0xff000000 +#define GEN8_AA_LINE_DW2_POINT_CAP_BIAS__SHIFT 24 +#define GEN8_AA_LINE_DW2_POINT_CAP_BIAS__RADIX 8 #define GEN6_AA_LINE_DW2_CAP_BIAS__MASK 0x00ff0000 #define GEN6_AA_LINE_DW2_CAP_BIAS__SHIFT 16 #define GEN6_AA_LINE_DW2_CAP_BIAS__RADIX 8 +#define GEN8_AA_LINE_DW2_POINT_CAP_SLOPE__MASK 0x0000ff00 +#define GEN8_AA_LINE_DW2_POINT_CAP_SLOPE__SHIFT 8 +#define GEN8_AA_LINE_DW2_POINT_CAP_SLOPE__RADIX 8 #define GEN6_AA_LINE_DW2_CAP_SLOPE__MASK 0x000000ff #define GEN6_AA_LINE_DW2_CAP_SLOPE__SHIFT 0 #define GEN6_AA_LINE_DW2_CAP_SLOPE__RADIX 8 @@ -1690,7 +1848,7 @@ enum gen_msrast_mode { #define GEN6_3DSTATE_MULTISAMPLE__SIZE 4 -#define GEN75_MULTISAMPLE_DW1_DX9_MULTISAMPLE_ENABLE (0x1 << 5) +#define GEN75_MULTISAMPLE_DW1_PIXEL_OFFSET_ENABLE (0x1 << 5) #define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__MASK 0x00000010 #define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__SHIFT 4 #define GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__MASK 0x0000000e @@ -1724,6 +1882,7 @@ enum gen_msrast_mode { #define GEN8_STENCIL_DW4_QPITCH__MASK 0x00007fff #define GEN8_STENCIL_DW4_QPITCH__SHIFT 0 +#define GEN8_STENCIL_DW4_QPITCH__SHR 2 #define GEN6_3DSTATE_HIER_DEPTH_BUFFER__SIZE 5 @@ -1739,6 +1898,7 @@ enum gen_msrast_mode { #define GEN8_HIZ_DW4_QPITCH__MASK 0x00007fff #define GEN8_HIZ_DW4_QPITCH__SHIFT 0 +#define GEN8_HIZ_DW4_QPITCH__SHR 2 #define GEN6_3DSTATE_CLEAR_PARAMS__SIZE 3 diff --git a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h index b65b704adc6..b2c2142af78 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h @@ -430,8 +430,10 @@ enum gen_key_filter { #define GEN7_SAMPLER_DW0_BORDER_COLOR_MODE_DX9 (0x1 << 29) #define 
GEN6_SAMPLER_DW0_LOD_PRECLAMP_ENABLE (0x1 << 28) #define GEN6_SAMPLER_DW0_MIN_MAG_NOT_EQUAL (0x1 << 27) -#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_ENABLE__MASK 0x18000000 -#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_ENABLE__SHIFT 27 +#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE__MASK 0x18000000 +#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE__SHIFT 27 +#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE_NONE (0x0 << 27) +#define GEN8_SAMPLER_DW0_LOD_PRECLAMP_MODE_OGL (0x2 << 27) #define GEN6_SAMPLER_DW0_BASE_LOD__MASK 0x07c00000 #define GEN6_SAMPLER_DW0_BASE_LOD__SHIFT 22 #define GEN6_SAMPLER_DW0_BASE_LOD__RADIX 1 @@ -493,23 +495,11 @@ enum gen_key_filter { #define GEN6_SAMPLER_DW2_BORDER_COLOR_ADDR__SHIFT 5 #define GEN6_SAMPLER_DW2_BORDER_COLOR_ADDR__SHR 5 -#define GEN8_SAMPLER_DW2_SEP_FILTER_COEFF_TABLE_SIZE__MASK 0xc0000000 -#define GEN8_SAMPLER_DW2_SEP_FILTER_COEFF_TABLE_SIZE__SHIFT 30 -#define GEN8_SAMPLER_DW2_SEP_FILTER_WIDTH__MASK 0x30000000 -#define GEN8_SAMPLER_DW2_SEP_FILTER_WIDTH__SHIFT 28 -#define GEN8_SAMPLER_DW2_SEP_FILTER_HEIGHT__MASK 0x0c000000 -#define GEN8_SAMPLER_DW2_SEP_FILTER_HEIGHT__SHIFT 26 #define GEN8_SAMPLER_DW2_INDIRECT_STATE_ADDR__MASK 0x00ffffc0 #define GEN8_SAMPLER_DW2_INDIRECT_STATE_ADDR__SHIFT 6 #define GEN8_SAMPLER_DW2_INDIRECT_STATE_ADDR__SHR 6 -#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_MODE (0x1 << 4) -#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_COEFF_SIZE (0x1 << 3) -#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_HALIGN (0x1 << 2) -#define GEN8_SAMPLER_DW2_FLEXIBLE_FILTER_VALIGN (0x1 << 1) #define GEN8_SAMPLER_DW2_LOD_CLAMP_MAG_MODE (0x1 << 0) -#define GEN8_SAMPLER_DW3_NON_SEP_FILTER_FOOTPRINT_MASK__MASK 0xff000000 -#define GEN8_SAMPLER_DW3_NON_SEP_FILTER_FOOTPRINT_MASK__SHIFT 24 #define GEN6_SAMPLER_DW3_CHROMAKEY_ENABLE (0x1 << 25) #define GEN6_SAMPLER_DW3_CHROMAKEY_INDEX__MASK 0x01800000 #define GEN6_SAMPLER_DW3_CHROMAKEY_INDEX__SHIFT 23 diff --git a/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h index 
55d830bad32..2476002ec91 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render_media.xml.h @@ -111,6 +111,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define GEN8_IDRT_DW5_CURBE_READ_LEN__MASK 0xffff0000 #define GEN8_IDRT_DW5_CURBE_READ_LEN__SHIFT 16 +#define GEN8_IDRT_DW5_CURBE_READ_OFFSET__MASK 0x0000ffff +#define GEN8_IDRT_DW5_CURBE_READ_OFFSET__SHIFT 0 #define GEN8_IDRT_DW6_ROUNDING_MODE__MASK 0x00c00000 #define GEN8_IDRT_DW6_ROUNDING_MODE__SHIFT 22 @@ -121,7 +123,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define GEN8_IDRT_DW6_BARRIER_ENABLE (0x1 << 21) #define GEN8_IDRT_DW6_SLM_SIZE__MASK 0x001f0000 #define GEN8_IDRT_DW6_SLM_SIZE__SHIFT 16 -#define GEN8_IDRT_DW6_THREAD_GROUP_SIZE__MASK 0x000000ff +#define GEN8_IDRT_DW6_THREAD_GROUP_SIZE__MASK 0x000003ff #define GEN8_IDRT_DW6_THREAD_GROUP_SIZE__SHIFT 0 #define GEN8_IDRT_DW7_CROSS_THREAD_CURBE_READ_LEN__MASK 0x000000ff @@ -280,6 +282,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define GEN8_GPGPU_DW1_IDRT_OFFSET__MASK 0x0000003f #define GEN8_GPGPU_DW1_IDRT_OFFSET__SHIFT 0 +#define GEN8_GPGPU_DW2_INDIRECT_LEN__MASK 0x0001ffff +#define GEN8_GPGPU_DW2_INDIRECT_LEN__SHIFT 0 #define GEN8_GPGPU_DW3_INDIRECT_ADDR__MASK 0xffffffe0 #define GEN8_GPGPU_DW3_INDIRECT_ADDR__SHIFT 5 diff --git a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h index b5d09f64429..c180450ce27 100644 --- a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h +++ b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h @@ -388,7 +388,7 @@ enum gen_surface_scs { #define GEN8_SURFACE_DW0_TILING__SHIFT 12 #define GEN8_SURFACE_DW0_VSTRIDE (0x1 << 11) #define GEN8_SURFACE_DW0_VSTRIDE_OFFSET (0x1 << 10) -#define GEN8_SURFACE_DW0_SAMPLER_L2_BYPASS_MODE (0x1 << 9) +#define GEN8_SURFACE_DW0_SAMPLER_L2_BYPASS_DISABLE (0x1 << 9) #define GEN7_SURFACE_DW0_RENDER_CACHE_RW (0x1 << 8) #define GEN7_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__MASK 0x000000c0 #define GEN7_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__SHIFT 6 @@ -402,6 +402,7 @@ enum gen_surface_scs { #define GEN8_SURFACE_DW1_BASE_LOD__SHIFT 19 #define GEN8_SURFACE_DW1_QPITCH__MASK 0x00007fff #define GEN8_SURFACE_DW1_QPITCH__SHIFT 0 +#define GEN8_SURFACE_DW1_QPITCH__SHR 2 #define GEN7_SURFACE_DW2_HEIGHT__MASK 0x3fff0000 #define GEN7_SURFACE_DW2_HEIGHT__SHIFT 16 @@ -434,7 +435,6 @@ enum gen_surface_scs { #define GEN8_SURFACE_DW4_MULTISAMPLECOUNT_2 (0x1 << 3) #define GEN7_SURFACE_DW4_MULTISAMPLECOUNT_4 (0x2 << 3) #define GEN7_SURFACE_DW4_MULTISAMPLECOUNT_8 (0x3 << 3) -#define GEN8_SURFACE_DW4_MULTISAMPLECOUNT_16 (0x4 << 3) #define GEN7_SURFACE_DW4_MSPOS_INDEX__MASK 0x00000007 #define GEN7_SURFACE_DW4_MSPOS_INDEX__SHIFT 0 #define GEN7_SURFACE_DW4_MIN_ARRAY_ELEMENT_STRBUF__MASK 0x07ffffff @@ -451,8 +451,11 @@ enum gen_surface_scs { #define GEN8_SURFACE_DW5_Y_OFFSET__MASK 0x00e00000 #define GEN8_SURFACE_DW5_Y_OFFSET__SHIFT 21 #define GEN8_SURFACE_DW5_Y_OFFSET__SHR 1 -#define 
GEN8_SURFACE_DW5_CUBE_EWA (0x1 << 20) -#define GEN8_SURFACE_DW5_COHERENCY_TYPE (0x1 << 14) +#define GEN8_SURFACE_DW5_CUBE_EWA_DISABLE (0x1 << 20) +#define GEN8_SURFACE_DW5_COHERENCY_TYPE__MASK 0x00004000 +#define GEN8_SURFACE_DW5_COHERENCY_TYPE__SHIFT 14 +#define GEN8_SURFACE_DW5_COHERENCY_TYPE_GPU (0x0 << 14) +#define GEN8_SURFACE_DW5_COHERENCY_TYPE_IA (0x1 << 14) #define GEN7_SURFACE_DW5_MIN_LOD__MASK 0x000000f0 #define GEN7_SURFACE_DW5_MIN_LOD__SHIFT 4 #define GEN7_SURFACE_DW5_MIP_COUNT_LOD__MASK 0x0000000f @@ -463,22 +466,23 @@ enum gen_surface_scs { #define GEN7_SURFACE_DW6_UV_X_OFFSET__SHIFT 16 #define GEN7_SURFACE_DW6_UV_Y_OFFSET__MASK 0x00003fff #define GEN7_SURFACE_DW6_UV_Y_OFFSET__SHIFT 0 +#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__MASK 0xffffffc0 +#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHIFT 6 +#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHR 6 #define GEN7_SURFACE_DW6_MCS_ADDR__MASK 0xfffff000 #define GEN7_SURFACE_DW6_MCS_ADDR__SHIFT 12 #define GEN7_SURFACE_DW6_MCS_ADDR__SHR 12 #define GEN8_SURFACE_DW6_AUX_QPITCH__MASK 0x7fff0000 #define GEN8_SURFACE_DW6_AUX_QPITCH__SHIFT 16 +#define GEN8_SURFACE_DW6_AUX_QPITCH__SHR 2 #define GEN7_SURFACE_DW6_AUX_PITCH__MASK 0x00000ff8 #define GEN7_SURFACE_DW6_AUX_PITCH__SHIFT 3 -#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__MASK 0xffffffc0 -#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHIFT 6 -#define GEN7_SURFACE_DW6_APPEND_COUNTER_ADDR__SHR 6 -#define GEN7_SURFACE_DW6_AUX_MODE__MASK 0x00000007 -#define GEN7_SURFACE_DW6_AUX_MODE__SHIFT 0 -#define GEN7_SURFACE_DW6_AUX_MODE_NONE 0x0 -#define GEN7_SURFACE_DW6_AUX_MODE_MCS 0x1 -#define GEN7_SURFACE_DW6_AUX_MODE_APPEND 0x2 -#define GEN8_SURFACE_DW6_AUX_MODE_HIZ 0x3 +#define GEN7_SURFACE_DW6_AUX__MASK 0x00000007 +#define GEN7_SURFACE_DW6_AUX__SHIFT 0 +#define GEN7_SURFACE_DW6_AUX_NONE 0x0 +#define GEN7_SURFACE_DW6_AUX_MCS 0x1 +#define GEN7_SURFACE_DW6_AUX_APPEND 0x2 +#define GEN8_SURFACE_DW6_AUX_HIZ 0x3 #define GEN7_SURFACE_DW7_CC_R__MASK 0x80000000 #define 
GEN7_SURFACE_DW7_CC_R__SHIFT 31 @@ -504,6 +508,12 @@ enum gen_surface_scs { +#define GEN8_SURFACE_DW11_V_X_OFFSET__MASK 0x3fff0000 +#define GEN8_SURFACE_DW11_V_X_OFFSET__SHIFT 16 +#define GEN8_SURFACE_DW11_V_Y_OFFSET__MASK 0x00003fff +#define GEN8_SURFACE_DW11_V_Y_OFFSET__SHIFT 0 +#define GEN8_SURFACE_DW11_AUX_ADDR_HI__MASK 0xffffffff +#define GEN8_SURFACE_DW11_AUX_ADDR_HI__SHIFT 0 From bcfaab38858fdcfbd8ffeaf6b0e3da8a726f02e6 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 30 Sep 2015 17:48:38 +1000 Subject: [PATCH 069/270] mesa/uniforms: fix get_uniform for doubles (v2) The initial glGetUniformdv support didn't cover all the casting cases that are apparently legal, and cts seems to test for them. I've updated the piglit test to cover these cases now. v2: fix indentation - it's all broken in this file (Ilia) fix src/dst index tracking in light of fp64 support (Ilia) cc: "11.0" Reviewed-by: Ilia Mirkin Signed-off-by: Dave Airlie --- src/mesa/main/uniform_query.cpp | 53 +++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp index d48729778ae..083087d6baa 100644 --- a/src/mesa/main/uniform_query.cpp +++ b/src/mesa/main/uniform_query.cpp @@ -318,19 +318,12 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location, return; } - if ((uni->type->base_type == GLSL_TYPE_DOUBLE && - returnType != GLSL_TYPE_DOUBLE) || - (uni->type->base_type != GLSL_TYPE_DOUBLE && - returnType == GLSL_TYPE_DOUBLE)) { - _mesa_error( ctx, GL_INVALID_OPERATION, - "glGetnUniform*vARB(incompatible uniform types)"); - return; - } { unsigned elements = (uni->type->is_sampler()) ? 1 : uni->type->components(); const int dmul = uni->type->base_type == GLSL_TYPE_DOUBLE ? 2 : 1; + const int rmul = returnType == GLSL_TYPE_DOUBLE ? 2 : 1; /* Calculate the source base address *BEFORE* modifying elements to * account for the size of the user's buffer. 
@@ -342,7 +335,7 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location, returnType == GLSL_TYPE_UINT || returnType == GLSL_TYPE_DOUBLE); /* doubles have a different size than the other 3 types */ - unsigned bytes = sizeof(src[0]) * elements * dmul; + unsigned bytes = sizeof(src[0]) * elements * rmul; if (bufSize < 0 || bytes > (unsigned) bufSize) { _mesa_error( ctx, GL_INVALID_OPERATION, "glGetnUniform*vARB(out of bounds: bufSize is %d," @@ -366,32 +359,57 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location, } else { union gl_constant_value *const dst = (union gl_constant_value *) paramsOut; - /* This code could be optimized by putting the loop inside the switch * statements. However, this is not expected to be * performance-critical code. */ for (unsigned i = 0; i < elements; i++) { + int sidx = i * dmul; + int didx = i * rmul; + switch (returnType) { case GLSL_TYPE_FLOAT: switch (uni->type->base_type) { case GLSL_TYPE_UINT: - dst[i].f = (float) src[i].u; + dst[didx].f = (float) src[sidx].u; break; case GLSL_TYPE_INT: case GLSL_TYPE_SAMPLER: case GLSL_TYPE_IMAGE: - dst[i].f = (float) src[i].i; + dst[didx].f = (float) src[sidx].i; break; case GLSL_TYPE_BOOL: - dst[i].f = src[i].i ? 1.0f : 0.0f; + dst[didx].f = src[sidx].i ? 1.0f : 0.0f; + break; + case GLSL_TYPE_DOUBLE: + dst[didx].f = *(double *)&src[sidx].f; + break; + default: + assert(!"Should not get here."); + break; + } + break; + case GLSL_TYPE_DOUBLE: + switch (uni->type->base_type) { + case GLSL_TYPE_UINT: + *(double *)&dst[didx].f = (double) src[sidx].u; + break; + case GLSL_TYPE_INT: + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: + *(double *)&dst[didx].f = (double) src[sidx].i; + break; + case GLSL_TYPE_BOOL: + *(double *)&dst[didx].f = src[sidx].i ? 
1.0f : 0.0f; + break; + case GLSL_TYPE_FLOAT: + *(double *)&dst[didx].f = (double) src[sidx].f; break; default: assert(!"Should not get here."); break; } break; - case GLSL_TYPE_INT: case GLSL_TYPE_UINT: switch (uni->type->base_type) { @@ -413,10 +431,13 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location, * a floating-point value is rounded to the * nearest integer..." */ - dst[i].i = IROUND(src[i].f); + dst[didx].i = IROUND(src[sidx].f); break; case GLSL_TYPE_BOOL: - dst[i].i = src[i].i ? 1 : 0; + dst[didx].i = src[sidx].i ? 1 : 0; + break; + case GLSL_TYPE_DOUBLE: + dst[didx].i = *(double *)&src[sidx].f; break; default: assert(!"Should not get here."); From 8281a7c5333d9b78aabf9ce3e9cc7077ccca9413 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Fri, 9 Oct 2015 07:57:19 +0200 Subject: [PATCH 070/270] i965: Fix unsafe pointer when dumping VS/FS IR For the VS and FS stages that use ARB_vertex_program or ARB_fragment_program we don't have a shader program, however, when debugging is enabled, we call brw_dump_ir like this: brw_dump_ir("vertex", prog, &vs->base, &vp->program.Base); where vs will be NULL (since prog is NULL). As pointed out by Chris, this &vs->base is not really a dereference, it simply computes a new address that just happens to be 0x0 because the offset of base in brw_shader is 0. Then brw_dump_ir will see a NULL pointer and not do anything. This is why this does not crash at the moment. However, this does not look very safe (it would crash for any location of base that is not the first in brw_shader), so patch it to prevent a potential (even if unlikely) problem in the future. 
Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/brw_vs.c | 2 +- src/mesa/drivers/dri/i965/brw_wm.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 0dc2bdccae8..de9a8677599 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -205,7 +205,7 @@ brw_codegen_vs_prog(struct brw_context *brw, } if (unlikely(INTEL_DEBUG & DEBUG_VS)) - brw_dump_ir("vertex", prog, &vs->base, &vp->program.Base); + brw_dump_ir("vertex", prog, vs ? &vs->base : NULL, &vp->program.Base); int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 4d5e7f67bd6..65de54335e8 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -222,7 +222,7 @@ brw_codegen_wm_prog(struct brw_context *brw, } if (unlikely(INTEL_DEBUG & DEBUG_WM)) - brw_dump_ir("fragment", prog, &fs->base, &fp->program.Base); + brw_dump_ir("fragment", prog, fs ? &fs->base : NULL, &fp->program.Base); int st_index8 = -1, st_index16 = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) { From f09c229cc6db838ae595fb57f5e6386a035bdf42 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Wed, 7 Oct 2015 09:21:36 +0200 Subject: [PATCH 071/270] glsl: shader outputs cannot have initializers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GLSL Spec 4.20.8, 4.3 Storage Qualifiers: "Initializers in global declarations may only be used in declarations of global variables with no storage qualifier, with a const qualifier or with a uniform qualifier." We do this for input variables, but not for output variables. AMD and NVIDIA proprietary drivers don't allow this either. 
Reviewed-by: Samuel Iglesias Gonsálvez Reviewed-by: Matt Turner --- src/glsl/ast_to_hir.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index 9511440ba3a..2aea5aef9c0 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -3201,6 +3201,12 @@ process_initializer(ir_variable *var, ast_declaration *decl, ? "attribute" : "varying"); } + if (var->data.mode == ir_var_shader_out && state->current_function == NULL) { + _mesa_glsl_error(&initializer_loc, state, + "cannot initialize %s shader output", + _mesa_shader_stage_to_string(state->stage)); + } + /* If the initializer is an ast_aggregate_initializer, recursively store * type information from the LHS into it, so that its hir() function can do * type checking. From 7a1143f29e477601f2b34b23d154edd5699352b1 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Wed, 7 Oct 2015 09:28:43 +0200 Subject: [PATCH 072/270] glsl: include variable name in error messages about initializers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also fix style / wrong indentation along the way and make the messages more uniform. Reviewed-by: Samuel Iglesias Gonsálvez Reviewed-by: Matt Turner --- src/glsl/ast_to_hir.cpp | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index 2aea5aef9c0..fdded1e5819 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -3170,7 +3170,8 @@ process_initializer(ir_variable *var, ast_declaration *decl, */ if (var->data.mode == ir_var_uniform) { state->check_version(120, 0, &initializer_loc, - "cannot initialize uniforms"); + "cannot initialize uniform %s", + var->name); } /* Section 4.3.7 "Buffer Variables" of the GLSL 4.30 spec: @@ -3178,8 +3179,9 @@ process_initializer(ir_variable *var, ast_declaration *decl, * "Buffer variables cannot have initializers." 
*/ if (var->data.mode == ir_var_shader_storage) { - _mesa_glsl_error(& initializer_loc, state, - "SSBO variables cannot have initializers"); + _mesa_glsl_error(&initializer_loc, state, + "cannot initialize buffer variable %s", + var->name); } /* From section 4.1.7 of the GLSL 4.40 spec: @@ -3189,22 +3191,25 @@ process_initializer(ir_variable *var, ast_declaration *decl, * shader." */ if (var->type->contains_opaque()) { - _mesa_glsl_error(& initializer_loc, state, - "cannot initialize opaque variable"); + _mesa_glsl_error(&initializer_loc, state, + "cannot initialize opaque variable %s", + var->name); } if ((var->data.mode == ir_var_shader_in) && (state->current_function == NULL)) { - _mesa_glsl_error(& initializer_loc, state, - "cannot initialize %s shader input / %s", - _mesa_shader_stage_to_string(state->stage), - (state->stage == MESA_SHADER_VERTEX) - ? "attribute" : "varying"); + _mesa_glsl_error(&initializer_loc, state, + "cannot initialize %s shader input / %s %s", + _mesa_shader_stage_to_string(state->stage), + (state->stage == MESA_SHADER_VERTEX) + ? 
"attribute" : "varying", + var->name); } if (var->data.mode == ir_var_shader_out && state->current_function == NULL) { _mesa_glsl_error(&initializer_loc, state, - "cannot initialize %s shader output", - _mesa_shader_stage_to_string(state->stage)); + "cannot initialize %s shader output %s", + _mesa_shader_stage_to_string(state->stage), + var->name); } /* If the initializer is an ast_aggregate_initializer, recursively store From 45ed627d894aa4d51682e8b07e7234bbc6e7c02d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Sun, 4 Oct 2015 00:44:00 +0200 Subject: [PATCH 073/270] u_vbuf: fix vb slot assignment for translated buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vertex attributes of different categories (constant/per-instance/ per-vertex) go into different buffers for translation, and this is now properly reflected in the vertex buffers passed to the driver. Fixes e.g. piglit's point-vertex-id divisor test. Cc: mesa-stable@lists.freedesktop.org Reviewed-by: Marek Olšák --- src/gallium/auxiliary/util/u_vbuf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c index 3d2193c3bf5..b31ada138b8 100644 --- a/src/gallium/auxiliary/util/u_vbuf.c +++ b/src/gallium/auxiliary/util/u_vbuf.c @@ -544,6 +544,7 @@ u_vbuf_translate_find_free_vb_slots(struct u_vbuf *mgr, index = ffs(unused_vb_mask) - 1; fallback_vbs[type] = index; + unused_vb_mask &= ~(1 << index); /*printf("found slot=%i for type=%i\n", index, type);*/ } } From 43b07eb60faba1c65fc6f7a99087d051b00e9c0f Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Fri, 9 Oct 2015 14:17:32 -0700 Subject: [PATCH 074/270] glsl: Allow built-in functions as constant expressions in OpenGL ES 1.00 In d4a24745 (August 2012), Paul made functions calls not be constant expressions in GLSL ES 1.00. Since this feature was added in desktop GLSL 1.20, we believed that it was added in GLSL ES 3.00. 
That turns out to be completely wrong. Built-in functions have always been allowed as constant expressions in GLSL ES, and the patch adds the (many) spec quotations to prove it. While we never previously encountered this, a later patch enforces a GLSL ES 1.00 rule that global variable initializers must be constant expressions. Without this fix, several dEQP tests fail. Fixes: tests/spec/glsl-es-1.00/compiler/const-initializer/from-function.frag tests/spec/glsl-es-1.00/compiler/const-initializer/from-function.vert tests/spec/glsl-es-1.00/compiler/const-initializer/from-sequence-in-function.frag tests/spec/glsl-es-1.00/compiler/const-initializer/from-sequence-in-function.vert Signed-off-by: Ian Romanick Reviewed-by: Matt Turner Cc: "10.0 10.1 10.2 10.3 10.4 10.5 10.6 11.0" Yes, I know we don't maintain stable branches that far back, but that *is* how far back this bug goes! --- src/glsl/ast_function.cpp | 51 +++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp index 26d4c62ce36..6538992ae0e 100644 --- a/src/glsl/ast_function.cpp +++ b/src/glsl/ast_function.cpp @@ -437,13 +437,54 @@ generate_call(exec_list *instructions, ir_function_signature *sig, } } - /* If the function call is a constant expression, don't generate any - * instructions; just generate an ir_constant. + /* Section 4.3.2 (Const) of the GLSL 1.10.59 spec says: * - * Function calls were first allowed to be constant expressions in GLSL - * 1.20 and GLSL ES 3.00. + * "Initializers for const declarations must be formed from literal + * values, other const variables (not including function call + * paramaters), or expressions of these. + * + * Constructors may be used in such expressions, but function calls may + * not." + * + * Section 4.3.3 (Constant Expressions) of the GLSL 1.20.8 spec says: + * + * "A constant expression is one of + * + * ... 
+ * + * - a built-in function call whose arguments are all constant + * expressions, with the exception of the texture lookup + * functions, the noise functions, and ftransform. The built-in + * functions dFdx, dFdy, and fwidth must return 0 when evaluated + * inside an initializer with an argument that is a constant + * expression." + * + * Section 5.10 (Constant Expressions) of the GLSL ES 1.00.17 spec says: + * + * "A constant expression is one of + * + * ... + * + * - a built-in function call whose arguments are all constant + * expressions, with the exception of the texture lookup + * functions." + * + * Section 4.3.3 (Constant Expressions) of the GLSL ES 3.00.4 spec says: + * + * "A constant expression is one of + * + * ... + * + * - a built-in function call whose arguments are all constant + * expressions, with the exception of the texture lookup + * functions. The built-in functions dFdx, dFdy, and fwidth must + * return 0 when evaluated inside an initializer with an argument + * that is a constant expression." + * + * If the function call is a constant expression, don't generate any + * instructions; just generate an ir_constant. */ - if (state->is_version(120, 300)) { + if (state->is_version(120, 100)) { ir_constant *value = sig->constant_expression_value(actual_parameters, NULL); if (value != NULL) { return value; From 8acce5d53af44a3d1d05a26e69559fd35f835de4 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Thu, 8 Oct 2015 11:13:00 -0700 Subject: [PATCH 075/270] ff_fragment_shader: Use binding to set the sampler unit This is the way layout(binding=xxx) works from GLSL. The old method just happened to work (and significantly predated support for layout(binding=xxx)), but future changes will break this. v2: Remove some stale comments. Suggested by Matt and Chris Forbes. 
Signed-off-by: Ian Romanick Reviewed-by: Matt Turner Cc: "10.6 11.0" --- src/mesa/main/ff_fragment_shader.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/mesa/main/ff_fragment_shader.cpp b/src/mesa/main/ff_fragment_shader.cpp index e4e2a18c1da..aad726689cc 100644 --- a/src/mesa/main/ff_fragment_shader.cpp +++ b/src/mesa/main/ff_fragment_shader.cpp @@ -975,13 +975,11 @@ static void load_texture( texenv_fragment_program *p, GLuint unit ) ir_var_uniform); p->top_instructions->push_head(sampler); - /* Set the texture unit for this sampler. The linker will pick this value - * up and do-the-right-thing. - * - * NOTE: The cast to int is important. Without it, the constant will have - * type uint, and things later on may get confused. + /* Set the texture unit for this sampler in the same way that + * layout(binding=X) would. */ - sampler->constant_value = new(p->mem_ctx) ir_constant(int(unit)); + sampler->data.explicit_binding = true; + sampler->data.binding = unit; deref = new(p->mem_ctx) ir_dereference_variable(sampler); tex->set_sampler(deref, glsl_type::vec4_type); From 313372cae8e10e4b9a3de093d65c0a0d8954bb0d Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Thu, 8 Oct 2015 14:24:25 -0700 Subject: [PATCH 076/270] glsl/linker: Use constant_initializer instead of constant_value to initialize uniforms Signed-off-by: Ian Romanick Reviewed-by: Matt Turner Cc: "10.6 11.0" --- src/glsl/link_uniform_initializers.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp index 0918d2af9b8..065257b5a0e 100644 --- a/src/glsl/link_uniform_initializers.cpp +++ b/src/glsl/link_uniform_initializers.cpp @@ -326,9 +326,9 @@ link_set_uniform_initializers(struct gl_shader_program *prog, } else { assert(!"Explicit binding not on a sampler, UBO or atomic."); } - } else if (var->constant_value) { + } else if (var->constant_initializer) { 
linker::set_uniform_initializer(mem_ctx, prog, var->name, - var->type, var->constant_value, + var->type, var->constant_initializer, boolean_true); } } From 5bc68f0f2b80b21997435742af74c49eb72891f7 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Thu, 8 Oct 2015 17:32:41 -0700 Subject: [PATCH 077/270] glsl: Use constant_initializer instead of constant_value to determine whether to keep an unused uniform This even matches the comment "uniform initializers are precious, and could get used by another stage." Signed-off-by: Ian Romanick Reviewed-by: Matt Turner Cc: "10.6 11.0" --- src/glsl/opt_dead_code.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glsl/opt_dead_code.cpp b/src/glsl/opt_dead_code.cpp index 2cb7f41adef..071485ad31b 100644 --- a/src/glsl/opt_dead_code.cpp +++ b/src/glsl/opt_dead_code.cpp @@ -103,7 +103,7 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned) */ if (entry->var->data.mode == ir_var_uniform || entry->var->data.mode == ir_var_shader_storage) { - if (uniform_locations_assigned || entry->var->constant_value) + if (uniform_locations_assigned || entry->var->constant_initializer) continue; /* Section 2.11.6 (Uniform Variables) of the OpenGL ES 3.0.3 spec From 3524d6df33b1e3716992f9a555ffb0f7b1ae2f4f Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 7 Oct 2015 12:52:58 -0700 Subject: [PATCH 078/270] glsl: Only set ir_variable::constant_value for const-decorated variables Right now we're also setting for uniforms, and that doesn't seem to hurt things. The next patch will make general global variables in GLSL ES, and those definitely should not have constant_value set! 
Signed-off-by: Ian Romanick Reviewed-by: Matt Turner Cc: "10.6 11.0" --- src/glsl/ast_to_hir.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index fdded1e5819..0f05cea5e28 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -3249,17 +3249,20 @@ process_initializer(ir_variable *var, ast_declaration *decl, decl->identifier); if (var->type->is_numeric()) { /* Reduce cascading errors. */ - var->constant_value = ir_constant::zero(state, var->type); + var->constant_value = type->qualifier.flags.q.constant + ? ir_constant::zero(state, var->type) : NULL; } } } else { rhs = constant_value; - var->constant_value = constant_value; + var->constant_value = type->qualifier.flags.q.constant + ? constant_value : NULL; } } else { if (var->type->is_numeric()) { /* Reduce cascading errors. */ - var->constant_value = ir_constant::zero(state, var->type); + var->constant_value = type->qualifier.flags.q.constant + ? ir_constant::zero(state, var->type) : NULL; } } } From bb329f2ff6e8bf8910a467b09f69a4d843689617 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Tue, 6 Oct 2015 17:05:55 -0700 Subject: [PATCH 079/270] glsl: Restrict initializers for global variables to constant expression in ES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v2: Combine this check with the existing const and uniform checks. This change depends on the previous patch (glsl: Only set ir_variable::constant_value for const-decorated variables). 
Fixes: ES2-CTS.shaders.negative.initialize ES3-CTS.shaders.negative.initialize spec/glsl-es-1.00/compiler/global-initializer/from-attribute.vert spec/glsl-es-1.00/compiler/global-initializer/from-uniform.vert spec/glsl-es-1.00/compiler/global-initializer/from-uniform.frag spec/glsl-es-1.00/compiler/global-initializer/from-global.vert spec/glsl-es-1.00/compiler/global-initializer/from-global.frag spec/glsl-es-1.00/compiler/global-initializer/from-varying.frag spec/glsl-es-3.00/compiler/global-initializer/from-uniform.vert spec/glsl-es-3.00/compiler/global-initializer/from-uniform.frag spec/glsl-es-3.00/compiler/global-initializer/from-in.vert spec/glsl-es-3.00/compiler/global-initializer/from-in.frag spec/glsl-es-3.00/compiler/global-initializer/from-global.vert spec/glsl-es-3.00/compiler/global-initializer/from-global.frag Note: spec/glsl-es-3.00/compiler/global-initializer/from-sequence.* still fail because the result of a sequence operator is still considered to be a constant expression. Signed-off-by: Ian Romanick Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=92304 Reviewed-by: Tapani Pälli [v1] Reviewed-by: Iago Toral Quiroga [v1] Reviewed-by: Matt Turner Cc: "10.6 11.0" --- src/glsl/ast_to_hir.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index 0f05cea5e28..0d83d02aa32 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -3224,9 +3224,19 @@ process_initializer(ir_variable *var, ast_declaration *decl, /* Calculate the constant value if this is a const or uniform * declaration. + * + * Section 4.3 (Storage Qualifiers) of the GLSL ES 1.00.17 spec says: + * + * "Declarations of globals without a storage qualifier, or with + * just the const qualifier, may include initializers, in which case + * they will be initialized before the first line of main() is + * executed. Such initializers must be a constant expression." 
+ * + * The same section of the GLSL ES 3.00.4 spec has similar language. */ if (type->qualifier.flags.q.constant - || type->qualifier.flags.q.uniform) { + || type->qualifier.flags.q.uniform + || (state->es_shader && state->current_function == NULL)) { ir_rvalue *new_rhs = validate_assignment(state, initializer_loc, lhs, rhs, true); if (new_rhs != NULL) { @@ -3234,6 +3244,11 @@ process_initializer(ir_variable *var, ast_declaration *decl, ir_constant *constant_value = rhs->constant_expression_value(); if (!constant_value) { + const char *const variable_mode = + (type->qualifier.flags.q.constant) + ? "const" + : ((type->qualifier.flags.q.uniform) ? "uniform" : "global"); + /* If ARB_shading_language_420pack is enabled, initializers of * const-qualified local variables do not have to be constant * expressions. Const-qualified global variables must still be @@ -3244,8 +3259,7 @@ process_initializer(ir_variable *var, ast_declaration *decl, _mesa_glsl_error(& initializer_loc, state, "initializer of %s variable `%s' must be a " "constant expression", - (type->qualifier.flags.q.constant) - ? "const" : "uniform", + variable_mode, decl->identifier); if (var->type->is_numeric()) { /* Reduce cascading errors. */ From 05e4601c6b9ce456cc4a4c395677a22125d889d2 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 7 Oct 2015 13:03:53 -0700 Subject: [PATCH 080/270] glsl: Add method to determine whether an expression contains the sequence operator This will be used in the next patch to enforce some language semantics. v2: Fix inverted logic in ast_function_expression::has_sequence_subexpression. The method originally had a different name and a different meaning. I fixed the logic in ast_to_hir.cpp, but I only changed the names in ast_function.cpp.
Signed-off-by: Ian Romanick Reviewed-by: Marta Lofstedt [v1] Reviewed-by: Matt Turner Cc: "10.6 11.0" --- src/glsl/ast.h | 6 +++ src/glsl/ast_function.cpp | 11 ++++++ src/glsl/ast_to_hir.cpp | 80 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+) diff --git a/src/glsl/ast.h b/src/glsl/ast.h index 4c314366133..67faacd0ef8 100644 --- a/src/glsl/ast.h +++ b/src/glsl/ast.h @@ -62,6 +62,8 @@ public: virtual ir_rvalue *hir(exec_list *instructions, struct _mesa_glsl_parse_state *state); + virtual bool has_sequence_subexpression() const; + /** * Retrieve the source location of an AST node * @@ -221,6 +223,8 @@ public: virtual void hir_no_rvalue(exec_list *instructions, struct _mesa_glsl_parse_state *state); + virtual bool has_sequence_subexpression() const; + ir_rvalue *do_hir(exec_list *instructions, struct _mesa_glsl_parse_state *state, bool needs_rvalue); @@ -299,6 +303,8 @@ public: virtual void hir_no_rvalue(exec_list *instructions, struct _mesa_glsl_parse_state *state); + virtual bool has_sequence_subexpression() const; + private: /** * Is this function call actually a constructor? 
diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp index 6538992ae0e..b72eb3ffb9e 100644 --- a/src/glsl/ast_function.cpp +++ b/src/glsl/ast_function.cpp @@ -1999,6 +1999,17 @@ ast_function_expression::hir(exec_list *instructions, unreachable("not reached"); } +bool +ast_function_expression::has_sequence_subexpression() const +{ + foreach_list_typed(const ast_node, ast, link, &this->expressions) { + if (ast->has_sequence_subexpression()) + return true; + } + + return false; +} + ir_rvalue * ast_aggregate_initializer::hir(exec_list *instructions, struct _mesa_glsl_parse_state *state) diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index 0d83d02aa32..a5a9cc078f6 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -1004,6 +1004,12 @@ ast_node::hir(exec_list *instructions, struct _mesa_glsl_parse_state *state) return NULL; } +bool +ast_node::has_sequence_subexpression() const +{ + return false; +} + void ast_function_expression::hir_no_rvalue(exec_list *instructions, struct _mesa_glsl_parse_state *state) @@ -1915,6 +1921,80 @@ ast_expression::do_hir(exec_list *instructions, return result; } +bool +ast_expression::has_sequence_subexpression() const +{ + switch (this->oper) { + case ast_plus: + case ast_neg: + case ast_bit_not: + case ast_logic_not: + case ast_pre_inc: + case ast_pre_dec: + case ast_post_inc: + case ast_post_dec: + return this->subexpressions[0]->has_sequence_subexpression(); + + case ast_assign: + case ast_add: + case ast_sub: + case ast_mul: + case ast_div: + case ast_mod: + case ast_lshift: + case ast_rshift: + case ast_less: + case ast_greater: + case ast_lequal: + case ast_gequal: + case ast_nequal: + case ast_equal: + case ast_bit_and: + case ast_bit_xor: + case ast_bit_or: + case ast_logic_and: + case ast_logic_or: + case ast_logic_xor: + case ast_array_index: + case ast_mul_assign: + case ast_div_assign: + case ast_add_assign: + case ast_sub_assign: + case ast_mod_assign: + case ast_ls_assign: + 
case ast_rs_assign: + case ast_and_assign: + case ast_xor_assign: + case ast_or_assign: + return this->subexpressions[0]->has_sequence_subexpression() || + this->subexpressions[1]->has_sequence_subexpression(); + + case ast_conditional: + return this->subexpressions[0]->has_sequence_subexpression() || + this->subexpressions[1]->has_sequence_subexpression() || + this->subexpressions[2]->has_sequence_subexpression(); + + case ast_sequence: + return true; + + case ast_field_selection: + case ast_identifier: + case ast_int_constant: + case ast_uint_constant: + case ast_float_constant: + case ast_bool_constant: + case ast_double_constant: + return false; + + case ast_aggregate: + unreachable("ast_aggregate: Should never get here."); + + case ast_function_call: + unreachable("should be handled by ast_function_expression::hir"); + } + + return false; +} ir_rvalue * ast_expression_statement::hir(exec_list *instructions, From 92635a84a7f464b827baa406578420dd6109e1a4 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 7 Oct 2015 14:26:29 -0700 Subject: [PATCH 081/270] glsl: In later GLSL versions, sequence operator cannot be a constant expression Fixes: ES3-CTS.shaders.negative.constant_sequence spec/glsl-es-3.00/compiler/global-initializer/from-sequence.vert spec/glsl-es-3.00/compiler/global-initializer/from-sequence.frag v2: Fix a couple of copy-and-paste mistakes in the spec quotations. Suggested by Matt.
Signed-off-by: Ian Romanick Reviewed-by: Matt Turner Cc: "10.6 11.0" --- src/glsl/ast_to_hir.cpp | 43 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index a5a9cc078f6..23ded46f26c 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -3322,8 +3322,49 @@ process_initializer(ir_variable *var, ast_declaration *decl, if (new_rhs != NULL) { rhs = new_rhs; + /* Section 4.3.3 (Constant Expressions) of the GLSL ES 3.00.4 spec + * says: + * + * "A constant expression is one of + * + * ... + * + * - an expression formed by an operator on operands that are + * all constant expressions, including getting an element of + * a constant array, or a field of a constant structure, or + * components of a constant vector. However, the sequence + * operator ( , ) and the assignment operators ( =, +=, ...) + * are not included in the operators that can create a + * constant expression." + * + * Section 12.43 (Sequence operator and constant expressions) says: + * + * "Should the following construct be allowed? + * + * float a[2,3]; + * + * The expression within the brackets uses the sequence operator + * (',') and returns the integer 3 so the construct is declaring + * a single-dimensional array of size 3. In some languages, the + * construct declares a two-dimensional array. It would be + * preferable to make this construct illegal to avoid confusion. + * + * One possibility is to change the definition of the sequence + * operator so that it does not return a constant-expression and + * hence cannot be used to declare an array size. + * + * RESOLUTION: The result of a sequence operator is not a + * constant-expression." + * + * Section 4.3.3 (Constant Expressions) of the GLSL 4.30.9 spec + * contains language almost identical to the section 4.3.3 in the + * GLSL ES 3.00.4 spec. This is a new limitation for these GLSL + * versions. 
+ */ ir_constant *constant_value = rhs->constant_expression_value(); - if (!constant_value) { + if (!constant_value || + (state->is_version(430, 300) && + decl->initializer->has_sequence_subexpression())) { const char *const variable_mode = (type->qualifier.flags.q.constant) ? "const" From eeb444bc995c25224ce661c49dd5df6266e370d1 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Fri, 9 Oct 2015 15:26:20 -0700 Subject: [PATCH 082/270] glsl: Never allow the sequence operator anywhere in an array size Fixes: spec/glsl-1.20/compiler/structure-and-array-operations/array-size-sequence-in-parenthesis.vert spec/glsl-es-1.00/compiler/array-sized-by-sequence-in-parenthesis.vert spec/glsl-es-3.00/compiler/array-sized-by-sequence-in-parenthesis.vert Signed-off-by: Ian Romanick Reviewed-by: Matt Turner --- src/glsl/ast_to_hir.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index 23ded46f26c..c04db3505c1 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -2069,7 +2069,7 @@ process_array_size(exec_node *node, } ir_constant *const size = ir->constant_expression_value(); - if (size == NULL) { + if (size == NULL || array_size->has_sequence_subexpression()) { _mesa_glsl_error(& loc, state, "array size must be a " "constant valued expression"); return 0; From bf97f8d467ad1d485c2327da3f4fe1f9e1dc7379 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Mon, 12 Oct 2015 17:15:32 -0400 Subject: [PATCH 083/270] nouveau: avoid double-emitting fence The act of ensuring that there is space can cause a flush to happen, which will emit the current screen fence. If that is the fence we're trying to wait on, then it will have been emitted as a result of doing the PUSH_SPACE. Don't attempt to emit it a second time. 
Signed-off-by: Ilia Mirkin Fixes: 8053c9208f (nouveau: avoid emitting new fences unnecessarily) Cc: mesa-stable@lists.freedesktop.org --- src/gallium/drivers/nouveau/nouveau_fence.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c index 18b15920185..21cf2b9ae5e 100644 --- a/src/gallium/drivers/nouveau/nouveau_fence.c +++ b/src/gallium/drivers/nouveau/nouveau_fence.c @@ -192,7 +192,11 @@ nouveau_fence_wait(struct nouveau_fence *fence) if (fence->state < NOUVEAU_FENCE_STATE_EMITTED) { PUSH_SPACE(screen->pushbuf, 8); - nouveau_fence_emit(fence); + /* The space allocation might trigger a flush, which could emit the + * current fence. So check again. + */ + if (fence->state < NOUVEAU_FENCE_STATE_EMITTED) + nouveau_fence_emit(fence); } if (fence->state < NOUVEAU_FENCE_STATE_FLUSHED) From bd198b9f0a292a9ff4ffffec3a29bad23d62caba Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Fri, 14 Aug 2015 16:01:33 -0700 Subject: [PATCH 084/270] i965/vs: Simplify fs_visitor's ATTR file. Previously, ATTR was indexed by VERT_ATTRIB_* slots; at the end of compilation, assign_vs_urb_setup() translated those into GRF units, and converted ATTR to HW_REGs. This patch moves the translation earlier, making ATTR work in terms of GRF units from the beginning. assign_vs_urb_setup() simply has to add the number of payload registers and push constants to obtain the final hardware GRF number. (We can't do this earlier as those values aren't known.) ATTR still supports reg_offset; however, it's simply added to reg. It's not clear whether this is valuable or not.
Signed-off-by: Kenneth Graunke Reviewed-by: Matt Turner --- src/mesa/drivers/dri/i965/brw_fs.cpp | 27 ++++--------- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 +- src/mesa/drivers/dri/i965/brw_nir.c | 40 ++++++++++++++++++++ 3 files changed, 49 insertions(+), 21 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 65f2e68e621..d000f16f49a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1508,9 +1508,11 @@ void fs_visitor::assign_vs_urb_setup() { brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; - int grf, slot, channel, attr; assert(stage == MESA_SHADER_VERTEX); + int count = _mesa_bitcount_64(vs_prog_data->inputs_read); + if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) + count++; /* Each attribute is 4 regs. */ this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes; @@ -1521,25 +1523,10 @@ fs_visitor::assign_vs_urb_setup() foreach_block_and_inst(block, fs_inst, inst, cfg) { for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file == ATTR) { - - if (inst->src[i].reg == VERT_ATTRIB_MAX) { - slot = vs_prog_data->nr_attributes - 1; - } else { - /* Attributes come in in a contiguous block, ordered by their - * gl_vert_attrib value. That means we can compute the slot - * number for an attribute by masking out the enabled - * attributes before it and counting the bits. 
- */ - attr = inst->src[i].reg + inst->src[i].reg_offset / 4; - slot = _mesa_bitcount_64(vs_prog_data->inputs_read & - BITFIELD64_MASK(attr)); - } - - channel = inst->src[i].reg_offset & 3; - - grf = payload.num_regs + - prog_data->curb_read_length + - slot * 4 + channel; + int grf = payload.num_regs + + prog_data->curb_read_length + + inst->src[i].reg + + inst->src[i].reg_offset; inst->src[i].file = HW_REG; inst->src[i].fixed_hw_reg = diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index df1a7ed9b59..8aee2c087f7 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -53,7 +53,8 @@ fs_reg * fs_visitor::emit_vs_system_value(int location) { fs_reg *reg = new(this->mem_ctx) - fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D); + fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info.inputs_read), + BRW_REGISTER_TYPE_D); brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; switch (location) { diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 15c1b1984a1..4f35d81fc7e 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -27,6 +27,34 @@ #include "glsl/nir/glsl_to_nir.h" #include "program/prog_to_nir.h" +static bool +remap_vs_attrs(nir_block *block, void *closure) +{ + GLbitfield64 inputs_read = *((GLbitfield64 *) closure); + + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + /* We set EmitNoIndirect for VS inputs, so there are no indirects. */ + assert(intrin->intrinsic != nir_intrinsic_load_input_indirect); + + if (intrin->intrinsic == nir_intrinsic_load_input) { + /* Attributes come in a contiguous block, ordered by their + * gl_vert_attrib value. 
That means we can compute the slot + * number for an attribute by masking out the enabled attributes + * before it and counting the bits. + */ + int attr = intrin->const_index[0]; + int slot = _mesa_bitcount_64(inputs_read & BITFIELD64_MASK(attr)); + intrin->const_index[0] = 4 * slot; + } + } + return true; +} + static void brw_nir_lower_inputs(nir_shader *nir, bool is_scalar) { @@ -49,6 +77,18 @@ brw_nir_lower_inputs(nir_shader *nir, bool is_scalar) * type_size_vec4 here. */ nir_lower_io(nir, nir_var_shader_in, type_size_vec4); + + /* Finally, translate VERT_ATTRIB_* values into the actual registers. + * + * Note that we can use nir->info.inputs_read instead of key->inputs_read + * since the two are identical aside from Gen4-5 edge flag differences. + */ + GLbitfield64 inputs_read = nir->info.inputs_read; + nir_foreach_overload(nir, overload) { + if (overload->impl) { + nir_foreach_block(overload->impl, remap_vs_attrs, &inputs_read); + } + } break; case MESA_SHADER_GEOMETRY: foreach_list_typed(nir_variable, var, node, &nir->inputs) { From 80c5062abfdef28e23615f44b214760449f6a582 Mon Sep 17 00:00:00 2001 From: Glenn Kennard Date: Mon, 21 Sep 2015 16:21:37 +0200 Subject: [PATCH 085/270] r600g/sb: Support gs5 sampler indexing (v2) [airlied: v2 cayman fixups] Signed-off-by: Glenn Kennard Signed-off-by: Dave Airlie --- src/gallium/drivers/r600/r600_shader.c | 12 +- src/gallium/drivers/r600/r600_shader.h | 4 +- src/gallium/drivers/r600/sb/sb_bc.h | 10 +- src/gallium/drivers/r600/sb/sb_bc_dump.cpp | 17 ++- .../drivers/r600/sb/sb_bc_finalize.cpp | 3 +- src/gallium/drivers/r600/sb/sb_bc_parser.cpp | 40 +++++- src/gallium/drivers/r600/sb/sb_gcm.cpp | 11 +- src/gallium/drivers/r600/sb/sb_sched.cpp | 118 +++++++++++++++++- src/gallium/drivers/r600/sb/sb_sched.h | 5 +- 9 files changed, 195 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 1d905822cde..24c3d43b0fa 100644 --- 
a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -166,8 +166,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx, if (rctx->b.chip_class <= R700) { use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY); } - /* disable SB for shaders using CF_INDEX_0/1 (sampler/ubo array indexing) as it doesn't handle those currently */ - use_sb &= !shader->shader.uses_index_registers; + /* disable SB for shaders using ubo array indexing as it doesn't handle those currently */ + use_sb &= !shader->shader.uses_ubo_indexing; /* disable SB for shaders using doubles */ use_sb &= !shader->shader.uses_doubles; @@ -1251,7 +1251,7 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx) } if (ctx->src[i].kc_rel) - ctx->shader->uses_index_registers = true; + ctx->shader->uses_ubo_indexing = true; if (ctx->src[i].rel) { int chan = inst->Src[i].Indirect.Swizzle; @@ -1912,7 +1912,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, shader->uses_doubles = ctx.info.uses_doubles; - indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT); + indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER)); tgsi_parse_init(&ctx.parse, tokens); ctx.type = ctx.info.processor; shader->processor_type = ctx.type; @@ -1936,7 +1936,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.gs_next_vertex = 0; ctx.gs_stream_output_info = &so; - shader->uses_index_registers = false; + shader->uses_ubo_indexing = false; ctx.face_gpr = -1; ctx.fixed_pt_position_gpr = -1; ctx.fragcoord_input = -1; @@ -5703,8 +5703,6 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) sampler_src_reg = 3; sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE - if (sampler_index_mode) - ctx->shader->uses_index_registers = true; src_gpr = tgsi_tex_get_src_gpr(ctx, 0); diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h index 48de9cdb156..8ba32ae4999 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -75,8 +75,8 @@ struct r600_shader { boolean has_txq_cube_array_z_comp; boolean uses_tex_buffers; boolean gs_prim_id_input; - /* Temporarily workaround SB not handling CF_INDEX_[01] index registers */ - boolean uses_index_registers; + /* Temporarily workaround SB not handling ubo indexing */ + boolean uses_ubo_indexing; /* Size in bytes of a data item in the ring(s) (single vertex data). Stages with only one ring items 123 will be set to 0. */ diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h index ab988f8716d..126750d5c7e 100644 --- a/src/gallium/drivers/r600/sb/sb_bc.h +++ b/src/gallium/drivers/r600/sb/sb_bc.h @@ -48,6 +48,7 @@ class fetch_node; class alu_group_node; class region_node; class shader; +class value; class sb_ostream { public: @@ -818,13 +819,16 @@ class bc_parser { bool gpr_reladdr; + // Note: currently relies on input emitting SET_CF in same basic block as uses + value *cf_index_value[2]; + alu_node *mova; public: bc_parser(sb_context &sctx, r600_bytecode *bc, r600_shader* pshader) : ctx(sctx), dec(), bc(bc), pshader(pshader), dw(), bc_ndw(), max_cf(), sh(), error(), slots(), cgroup(), - cf_map(), loop_stack(), gpr_reladdr() { } + cf_map(), loop_stack(), gpr_reladdr(), cf_index_value(), mova() { } int decode(); int prepare(); @@ -852,6 +856,10 @@ private: int prepare_loop(cf_node *c); int prepare_if(cf_node *c); + void save_set_cf_index(value *val, unsigned idx); + value *get_cf_index_value(unsigned idx); + void save_mova(alu_node *mova); + alu_node *get_mova(); }; diff --git a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp 
b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp index 0fc73c419a6..3c70ea7cd3d 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp @@ -27,6 +27,7 @@ #include "sb_bc.h" #include "sb_shader.h" #include "sb_pass.h" +#include "eg_sq.h" // V_SQ_CF_INDEX_0/1 namespace r600_sb { @@ -354,6 +355,14 @@ void bc_dump::dump(alu_node& n) { s << " " << vec_bs[n.bc.bank_swizzle]; } + if (ctx.is_cayman()) { + if (n.bc.op == ALU_OP1_MOVA_INT) { + static const char *mova_str[] = { " AR_X", " PC", " CF_IDX0", " CF_IDX1", + " Unknown MOVA_INT dest" }; + s << mova_str[std::min(n.bc.dst_gpr, 4u)]; // CM_V_SQ_MOVA_DST_AR_* + } + } + sblog << s.str() << "\n"; } @@ -450,9 +459,9 @@ void bc_dump::dump(fetch_node& n) { if (n.bc.fetch_whole_quad) s << " FWQ"; if (ctx.is_egcm() && n.bc.resource_index_mode) - s << " RIM:SQ_CF_INDEX_" << n.bc.resource_index_mode; + s << " RIM:SQ_CF_INDEX_" << (n.bc.resource_index_mode - V_SQ_CF_INDEX_0); if (ctx.is_egcm() && n.bc.sampler_index_mode) - s << " SID:SQ_CF_INDEX_" << n.bc.sampler_index_mode; + s << " SID:SQ_CF_INDEX_" << (n.bc.sampler_index_mode - V_SQ_CF_INDEX_0); s << " UCF:" << n.bc.use_const_fields << " FMT(DTA:" << n.bc.data_format @@ -470,9 +479,9 @@ void bc_dump::dump(fetch_node& n) { if (n.bc.offset[k]) s << " O" << chans[k] << ":" << n.bc.offset[k]; if (ctx.is_egcm() && n.bc.resource_index_mode) - s << " RIM:SQ_CF_INDEX_" << n.bc.resource_index_mode; + s << " RIM:SQ_CF_INDEX_" << (n.bc.resource_index_mode - V_SQ_CF_INDEX_0); if (ctx.is_egcm() && n.bc.sampler_index_mode) - s << " SID:SQ_CF_INDEX_" << n.bc.sampler_index_mode; + s << " SID:SQ_CF_INDEX_" << (n.bc.sampler_index_mode - V_SQ_CF_INDEX_0); } sblog << s.str() << "\n"; diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp index 522ff9d956e..193ade8a661 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp @@ -303,7 +303,8 @@ 
void bc_finalizer::finalize_alu_group(alu_group_node* g, node *prev_node) { assert(fdst.chan() == slot || slot == SLOT_TRANS); } - n->bc.dst_gpr = fdst.sel(); + if (!(n->bc.op_ptr->flags & AF_MOVA && ctx.is_cayman())) + n->bc.dst_gpr = fdst.sel(); n->bc.dst_chan = d ? fdst.chan() : slot < SLOT_TRANS ? slot : 0; diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp index 19bd0784a61..7f712b451c9 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp @@ -34,6 +34,7 @@ #include "r600_pipe.h" #include "r600_shader.h" +#include "eg_sq.h" // CM_V_SQ_MOVA_DST_CF_IDX0/1 #include @@ -121,7 +122,7 @@ int bc_parser::parse_decls() { return 0; } - if (pshader->indirect_files & ~(1 << TGSI_FILE_CONSTANT)) { + if (pshader->indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER))) { assert(pshader->num_arrays); @@ -328,6 +329,28 @@ int bc_parser::prepare_alu_clause(cf_node* cf) { return 0; } +void bc_parser::save_set_cf_index(value *val, unsigned idx) +{ + assert(idx <= 1); + assert(val); + cf_index_value[idx] = val; +} +value *bc_parser::get_cf_index_value(unsigned idx) +{ + assert(idx <= 1); + return cf_index_value[idx]; +} +void bc_parser::save_mova(alu_node *mova) +{ + assert(mova); + this->mova = mova; +} +alu_node *bc_parser::get_mova() +{ + assert(mova); + return mova; +} + int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { alu_node *n; @@ -375,9 +398,14 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { n->dst.resize(1); } - if (flags & AF_MOVA) { + if (n->bc.op == ALU_OP0_SET_CF_IDX0 || n->bc.op == ALU_OP0_SET_CF_IDX1) { + // Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX + // DCE will kill this op + save_set_cf_index(get_mova()->src[0], n->bc.op == ALU_OP0_SET_CF_IDX1); + } else if (flags & AF_MOVA) { n->dst[0] = sh->get_special_value(SV_AR_INDEX); + save_mova(n); 
n->flags |= NF_DONT_HOIST; @@ -469,6 +497,10 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { } } } + if ((n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 || n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) && + ctx.is_cayman()) + // Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX + save_set_cf_index(n->src[0], n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1); } // pack multislot instructions into alu_packed_node @@ -608,6 +640,10 @@ int bc_parser::prepare_fetch_clause(cf_node *cf) { n->bc.src_sel[s], false); } + // Scheduler will emit the appropriate instructions to set CF_IDX0/1 + if (n->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) { + n->src.push_back(get_cf_index_value(n->bc.sampler_index_mode == V_SQ_CF_INDEX_1)); + } } } diff --git a/src/gallium/drivers/r600/sb/sb_gcm.cpp b/src/gallium/drivers/r600/sb/sb_gcm.cpp index bccb6713967..236b2ea0031 100644 --- a/src/gallium/drivers/r600/sb/sb_gcm.cpp +++ b/src/gallium/drivers/r600/sb/sb_gcm.cpp @@ -37,6 +37,7 @@ #include "sb_bc.h" #include "sb_shader.h" #include "sb_pass.h" +#include "eg_sq.h" // V_SQ_CF_INDEX_NONE namespace r600_sb { @@ -406,6 +407,14 @@ void gcm::bu_sched_bb(bb_node* bb) { ncnt = 3; } + bool sampler_indexing = false; + if (n->is_fetch_inst() && + static_cast(n)->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) + { + sampler_indexing = true; // Give sampler indexed ops their own clause + ncnt = sh.get_ctx().is_cayman() ? 2 : 3; // MOVA + SET_CF_IDX0/1 + } + if ((sq == SQ_TEX || sq == SQ_VTX) && ((last_count >= ctx.max_fetch/2 && check_alu_ready_count(24)) || @@ -418,7 +427,7 @@ void gcm::bu_sched_bb(bb_node* bb) { bu_ready[sq].pop_front(); if (sq != SQ_CF) { - if (!clause) { + if (!clause || sampler_indexing) { clause = sh.create_clause(sq == SQ_ALU ? NST_ALU_CLAUSE : sq == SQ_TEX ? 
NST_TEX_CLAUSE : diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp index c98b8fff764..601445f7dc3 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.cpp +++ b/src/gallium/drivers/r600/sb/sb_sched.cpp @@ -36,6 +36,7 @@ #include "sb_shader.h" #include "sb_pass.h" #include "sb_sched.h" +#include "eg_sq.h" // V_SQ_CF_INDEX_NONE/0/1 namespace r600_sb { @@ -781,7 +782,14 @@ void post_scheduler::schedule_bb(bb_node* bb) { sblog << "\n"; ); - if (n->subtype == NST_ALU_CLAUSE) { + // May require emitting ALU ops to load index registers + if (n->is_fetch_clause()) { + n->remove(); + process_fetch(static_cast(n)); + continue; + } + + if (n->is_alu_clause()) { n->remove(); process_alu(static_cast(n)); continue; @@ -823,6 +831,102 @@ void post_scheduler::init_regmap() { } } +static alu_node *create_set_idx(shader &sh, unsigned ar_idx) { + alu_node *a = sh.create_alu(); + + assert(ar_idx == V_SQ_CF_INDEX_0 || ar_idx == V_SQ_CF_INDEX_1); + if (ar_idx == V_SQ_CF_INDEX_0) + a->bc.set_op(ALU_OP0_SET_CF_IDX0); + else + a->bc.set_op(ALU_OP0_SET_CF_IDX1); + a->bc.slot = SLOT_X; + a->dst.resize(1); // Dummy needed for recolor + + PSC_DUMP( + sblog << "created IDX load: " + dump::dump_op(a); + sblog << "\n"; + ); + + return a; +} + +void post_scheduler::load_index_register(value *v, unsigned ar_idx) +{ + alu.reset(); + + if (!sh.get_ctx().is_cayman()) { + // Evergreen has to first load address register, then use CF_SET_IDX0/1 + alu_group_tracker &rt = alu.grp(); + alu_node *set_idx = create_set_idx(sh, ar_idx); + if (!rt.try_reserve(set_idx)) { + sblog << "can't emit SET_CF_IDX"; + dump::dump_op(set_idx); + sblog << "\n"; + } + process_group(); + + if (!alu.check_clause_limits()) { + // Can't happen since clause only contains MOVA/CF_SET_IDX0/1 + } + alu.emit_group(); + } + + alu_group_tracker &rt = alu.grp(); + alu_node *a = alu.create_ar_load(v, ar_idx == V_SQ_CF_INDEX_1 ? 
SEL_Z : SEL_Y); + + if (!rt.try_reserve(a)) { + sblog << "can't emit AR load : "; + dump::dump_op(a); + sblog << "\n"; + } + + process_group(); + + if (!alu.check_clause_limits()) { + // Can't happen since clause only contains MOVA/CF_SET_IDX0/1 + } + + alu.emit_group(); + alu.emit_clause(cur_bb); +} + +void post_scheduler::process_fetch(container_node *c) { + if (c->empty()) + return; + + for (node_iterator N, I = c->begin(), E = c->end(); I != E; I = N) { + N = I; + ++N; + + node *n = *I; + + fetch_node *f = static_cast(n); + + PSC_DUMP( + sblog << "process_tex "; + dump::dump_op(n); + sblog << " "; + ); + + if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) { + // Currently require prior opt passes to use one TEX per indexed op + assert(f->parent->count() == 1); + + value *v = f->src.back(); // Last src is index offset + + cur_bb->push_front(c); + + load_index_register(v, f->bc.sampler_index_mode); + f->src.pop_back(); // Don't need index value any more + + return; + } + } + + cur_bb->push_front(c); +} + void post_scheduler::process_alu(container_node *c) { if (c->empty()) @@ -1180,7 +1284,7 @@ void post_scheduler::emit_load_ar() { alu.discard_current_group(); alu_group_tracker &rt = alu.grp(); - alu_node *a = alu.create_ar_load(); + alu_node *a = alu.create_ar_load(alu.current_ar, SEL_X); if (!rt.try_reserve(a)) { sblog << "can't emit AR load : "; @@ -1936,11 +2040,9 @@ bool alu_kcache_tracker::update_kc() { return true; } -alu_node* alu_clause_tracker::create_ar_load() { +alu_node* alu_clause_tracker::create_ar_load(value *v, chan_select ar_channel) { alu_node *a = sh.create_alu(); - // FIXME use MOVA_GPR on R6xx - if (sh.get_ctx().uses_mova_gpr) { a->bc.set_op(ALU_OP1_MOVA_GPR_INT); a->bc.slot = SLOT_TRANS; @@ -1948,9 +2050,13 @@ alu_node* alu_clause_tracker::create_ar_load() { a->bc.set_op(ALU_OP1_MOVA_INT); a->bc.slot = SLOT_X; } + a->bc.dst_chan = ar_channel; + if (ar_channel != SEL_X && sh.get_ctx().is_cayman()) { + a->bc.dst_gpr = ar_channel == SEL_Y ? 
CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1; + } a->dst.resize(1); - a->src.push_back(current_ar); + a->src.push_back(v); PSC_DUMP( sblog << "created AR load: "; diff --git a/src/gallium/drivers/r600/sb/sb_sched.h b/src/gallium/drivers/r600/sb/sb_sched.h index 87c45867e16..2ca714665a7 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.h +++ b/src/gallium/drivers/r600/sb/sb_sched.h @@ -235,7 +235,7 @@ public: void new_group(); bool is_empty(); - alu_node* create_ar_load(); + alu_node* create_ar_load(value *v, chan_select ar_channel); void discard_current_group(); @@ -266,6 +266,9 @@ public: void run_on(container_node *n); void schedule_bb(bb_node *bb); + void load_index_register(value *v, unsigned idx); + void process_fetch(container_node *c); + void process_alu(container_node *c); void schedule_alu(container_node *c); bool prepare_alu_group(); From 1befb7ed9856381cbfe874f361fae73b8e331bb4 Mon Sep 17 00:00:00 2001 From: Glenn Kennard Date: Wed, 7 Oct 2015 17:17:33 +0200 Subject: [PATCH 086/270] r600g/sb: SB support for UBO indexing Signed-off-by: Glenn Kennard Signed-off-by: Dave Airlie --- src/gallium/drivers/r600/r600_shader.c | 6 - src/gallium/drivers/r600/r600_shader.h | 2 - src/gallium/drivers/r600/sb/sb_bc.h | 4 +- .../drivers/r600/sb/sb_bc_finalize.cpp | 6 +- src/gallium/drivers/r600/sb/sb_bc_parser.cpp | 21 +++- src/gallium/drivers/r600/sb/sb_expr.cpp | 3 +- src/gallium/drivers/r600/sb/sb_ir.h | 7 ++ src/gallium/drivers/r600/sb/sb_sched.cpp | 108 ++++++++++++++++-- src/gallium/drivers/r600/sb/sb_sched.h | 4 + src/gallium/drivers/r600/sb/sb_shader.cpp | 4 +- src/gallium/drivers/r600/sb/sb_shader.h | 2 +- 11 files changed, 140 insertions(+), 27 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 24c3d43b0fa..8efe902a329 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -166,8 +166,6 @@ int r600_pipe_shader_create(struct pipe_context *ctx, if 
(rctx->b.chip_class <= R700) { use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY); } - /* disable SB for shaders using ubo array indexing as it doesn't handle those currently */ - use_sb &= !shader->shader.uses_ubo_indexing; /* disable SB for shaders using doubles */ use_sb &= !shader->shader.uses_doubles; @@ -1250,9 +1248,6 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx) continue; } - if (ctx->src[i].kc_rel) - ctx->shader->uses_ubo_indexing = true; - if (ctx->src[i].rel) { int chan = inst->Src[i].Indirect.Swizzle; int treg = r600_get_temp(ctx); @@ -1936,7 +1931,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.gs_next_vertex = 0; ctx.gs_stream_output_info = &so; - shader->uses_ubo_indexing = false; ctx.face_gpr = -1; ctx.fixed_pt_position_gpr = -1; ctx.fragcoord_input = -1; diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h index 8ba32ae4999..c240e7110c1 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -75,8 +75,6 @@ struct r600_shader { boolean has_txq_cube_array_z_comp; boolean uses_tex_buffers; boolean gs_prim_id_input; - /* Temporarily workaround SB not handling ubo indexing */ - boolean uses_ubo_indexing; /* Size in bytes of a data item in the ring(s) (single vertex data). Stages with only one ring items 123 will be set to 0. 
*/ diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h index 126750d5c7e..9c2a9170436 100644 --- a/src/gallium/drivers/r600/sb/sb_bc.h +++ b/src/gallium/drivers/r600/sb/sb_bc.h @@ -478,7 +478,9 @@ struct bc_cf { bool is_alu_extended() { assert(op_ptr->flags & CF_ALU); - return kc[2].mode != KC_LOCK_NONE || kc[3].mode != KC_LOCK_NONE; + return kc[2].mode != KC_LOCK_NONE || kc[3].mode != KC_LOCK_NONE || + kc[0].index_mode != KC_INDEX_NONE || kc[1].index_mode != KC_INDEX_NONE || + kc[2].index_mode != KC_INDEX_NONE || kc[3].index_mode != KC_INDEX_NONE; } }; diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp index 193ade8a661..82826a90921 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp @@ -515,7 +515,7 @@ void bc_finalizer::copy_fetch_src(fetch_node &dst, fetch_node &src, unsigned arg void bc_finalizer::emit_set_grad(fetch_node* f) { - assert(f->src.size() == 12); + assert(f->src.size() == 12 || f->src.size() == 13); unsigned ops[2] = { FETCH_OP_SET_GRADIENTS_V, FETCH_OP_SET_GRADIENTS_H }; unsigned arg_start = 0; @@ -810,8 +810,8 @@ void bc_finalizer::finalize_cf(cf_node* c) { } sel_chan bc_finalizer::translate_kcache(cf_node* alu, value* v) { - unsigned sel = v->select.sel(); - unsigned bank = sel >> 12; + unsigned sel = v->select.kcache_sel(); + unsigned bank = v->select.kcache_bank(); unsigned chan = v->select.chan(); static const unsigned kc_base[] = {128, 160, 256, 288}; diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp index 7f712b451c9..28ebfa2ce62 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp @@ -338,6 +338,7 @@ void bc_parser::save_set_cf_index(value *val, unsigned idx) value *bc_parser::get_cf_index_value(unsigned idx) { assert(idx <= 1); + assert(cf_index_value[idx]); return 
cf_index_value[idx]; } void bc_parser::save_mova(alu_node *mova) @@ -361,6 +362,7 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { for (node_iterator I = g->begin(), E = g->end(); I != E; ++I) { n = static_cast(*I); + bool ubo_indexing[2] = {}; if (!sh->assign_slot(n, slots[cgroup])) { assert(!"alu slot assignment failed"); @@ -460,7 +462,12 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { bc_kcache &kc = cf->bc.kc[kc_set]; kc_addr = (kc.addr << 4) + (sel & 0x1F); - n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan); + n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan, (alu_kcache_index_mode)kc.index_mode); + + if (kc.index_mode != KC_INDEX_NONE) { + assert(kc.index_mode != KC_LOCK_LOOP); + ubo_indexing[kc.index_mode - KC_INDEX_0] = true; + } } else if (src.sel < MAX_GPR) { value *v = sh->get_gpr_value(true, src.sel, src.chan, src.rel); @@ -497,6 +504,15 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { } } } + + // add UBO index values if any as dependencies + if (ubo_indexing[0]) { + n->src.push_back(get_cf_index_value(0)); + } + if (ubo_indexing[1]) { + n->src.push_back(get_cf_index_value(1)); + } + if ((n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 || n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) && ctx.is_cayman()) // Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX @@ -644,6 +660,9 @@ int bc_parser::prepare_fetch_clause(cf_node *cf) { if (n->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) { n->src.push_back(get_cf_index_value(n->bc.sampler_index_mode == V_SQ_CF_INDEX_1)); } + if (n->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) { + n->src.push_back(get_cf_index_value(n->bc.resource_index_mode == V_SQ_CF_INDEX_1)); + } } } diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp index 9c2274e65a3..556a05da395 100644 --- a/src/gallium/drivers/r600/sb/sb_expr.cpp +++ b/src/gallium/drivers/r600/sb/sb_expr.cpp 
@@ -403,7 +403,8 @@ bool expr_handler::fold_alu_op1(alu_node& n) { if ((n.bc.op == ALU_OP1_MOV || n.bc.op == ALU_OP1_MOVA_INT || n.bc.op == ALU_OP1_MOVA_GPR_INT) && n.bc.clamp == 0 && n.bc.omod == 0 - && n.bc.src[0].abs == 0 && n.bc.src[0].neg == 0) { + && n.bc.src[0].abs == 0 && n.bc.src[0].neg == 0 && + n.src.size() == 1 /* RIM/SIM can be appended as additional values */) { assign_source(n.dst[0], v0); return true; } diff --git a/src/gallium/drivers/r600/sb/sb_ir.h b/src/gallium/drivers/r600/sb/sb_ir.h index 560a4a9b284..c612e6c4ec6 100644 --- a/src/gallium/drivers/r600/sb/sb_ir.h +++ b/src/gallium/drivers/r600/sb/sb_ir.h @@ -62,6 +62,13 @@ struct sel_chan static unsigned sel(unsigned idx) { return (idx-1) >> 2; } static unsigned chan(unsigned idx) { return (idx-1) & 3; } + + sel_chan(unsigned bank, unsigned index, + unsigned chan, alu_kcache_index_mode index_mode) + : id(sel_chan((bank << 12) | index | ((unsigned)index_mode << 28), chan).id) {} + unsigned kcache_index_mode() const { return sel() >> 28; } + unsigned kcache_sel() const { return sel() & 0x0fffffffu; } + unsigned kcache_bank() const { return kcache_sel() >> 12; } }; inline sb_ostream& operator <<(sb_ostream& o, sel_chan r) { diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp index 601445f7dc3..5113b756847 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.cpp +++ b/src/gallium/drivers/r600/sb/sb_sched.cpp @@ -843,7 +843,7 @@ static alu_node *create_set_idx(shader &sh, unsigned ar_idx) { a->dst.resize(1); // Dummy needed for recolor PSC_DUMP( - sblog << "created IDX load: " + sblog << "created IDX load: "; dump::dump_op(a); sblog << "\n"; ); @@ -909,15 +909,21 @@ void post_scheduler::process_fetch(container_node *c) { sblog << " "; ); - if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) { + // TODO: If same values used can avoid reloading index register + if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE || + f->bc.resource_index_mode != 
V_SQ_CF_INDEX_NONE) { + unsigned index_mode = f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ? + f->bc.sampler_index_mode : f->bc.resource_index_mode; + // Currently require prior opt passes to use one TEX per indexed op assert(f->parent->count() == 1); value *v = f->src.back(); // Last src is index offset + assert(v); cur_bb->push_front(c); - load_index_register(v, f->bc.sampler_index_mode); + load_index_register(v, index_mode); f->src.pop_back(); // Don't need index value any more return; @@ -959,6 +965,7 @@ void post_scheduler::process_alu(container_node *c) { if (uc) { n->remove(); + pending.push_back(n); PSC_DUMP( sblog << "pending\n"; ); } else { @@ -1101,6 +1108,18 @@ void post_scheduler::init_globals(val_set &s, bool prealloc) { } } +void post_scheduler::emit_index_registers() { + for (unsigned i = 0; i < 2; i++) { + if (alu.current_idx[i]) { + regmap = prev_regmap; + alu.discard_current_group(); + + load_index_register(alu.current_idx[i], KC_INDEX_0 + i); + alu.current_idx[i] = NULL; + } + } +} + void post_scheduler::emit_clause() { if (alu.current_ar) { @@ -1109,7 +1128,11 @@ void post_scheduler::emit_clause() { alu.emit_group(); } - alu.emit_clause(cur_bb); + if (!alu.is_empty()) { + alu.emit_clause(cur_bb); + } + + emit_index_registers(); } void post_scheduler::schedule_alu(container_node *c) { @@ -1121,6 +1144,14 @@ void post_scheduler::schedule_alu(container_node *c) { prev_regmap = regmap; if (!prepare_alu_group()) { + if (alu.current_idx[0] || alu.current_idx[1]) { + regmap = prev_regmap; + emit_clause(); + init_globals(live, false); + + continue; + } + if (alu.current_ar) { emit_load_ar(); continue; @@ -1132,6 +1163,7 @@ void post_scheduler::schedule_alu(container_node *c) { regmap = prev_regmap; emit_clause(); init_globals(live, false); + continue; } @@ -1391,6 +1423,42 @@ bool post_scheduler::map_src_val(value *v) { } bool post_scheduler::map_src_vec(vvec &vv, bool src) { + if (src) { + // Handle possible UBO indexing + bool ubo_indexing[2] = { 
false, false }; + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v) + continue; + + if (v->is_kcache()) { + unsigned index_mode = v->select.kcache_index_mode(); + if (index_mode == KC_INDEX_0 || index_mode == KC_INDEX_1) { + ubo_indexing[index_mode - KC_INDEX_0] = true; + } + } + } + + // idx values stored at end of src vec, see bc_parser::prepare_alu_group + for (unsigned i = 2; i != 0; i--) { + if (ubo_indexing[i-1]) { + // TODO: skip adding value to kcache reservation somehow, causes + // unnecessary group breaks and cache line locks + value *v = vv.back(); + if (alu.current_idx[i-1] && alu.current_idx[i-1] != v) { + PSC_DUMP( + sblog << "IDX" << i-1 << " already set to " << + *alu.current_idx[i-1] << ", trying to set " << *v << "\n"; + ); + return false; + } + + alu.current_idx[i-1] = v; + PSC_DUMP(sblog << "IDX" << i-1 << " set to " << *v << "\n";); + } + } + } + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { value *v = *I; if (!v) @@ -1456,6 +1524,10 @@ void post_scheduler::dump_regmap() { sblog << " current_AR: " << *alu.current_ar << "\n"; if (alu.current_pr) sblog << " current_PR: " << *alu.current_pr << "\n"; + if (alu.current_idx[0]) + sblog << " current IDX0: " << *alu.current_idx[0] << "\n"; + if (alu.current_idx[1]) + sblog << " current IDX1: " << *alu.current_idx[1] << "\n"; } void post_scheduler::recolor_locals() { @@ -1545,6 +1617,13 @@ unsigned post_scheduler::try_add_instruction(node *n) { unsigned avail_slots = rt.avail_slots(); + // Cannot schedule in same clause as instructions using this index value + if (!n->dst.empty() && n->dst[0] && + (n->dst[0] == alu.current_idx[0] || n->dst[0] == alu.current_idx[1])) { + PSC_DUMP(sblog << " CF_IDX source: " << *n->dst[0] << "\n";); + return 0; + } + if (n->is_alu_packed()) { alu_packed_node *p = static_cast(n); unsigned slots = p->get_slot_mask(); @@ -1874,7 +1953,7 @@ alu_clause_tracker::alu_clause_tracker(shader &sh) grp0(sh), grp1(sh), 
group(), clause(), push_exec_mask(), - current_ar(), current_pr() {} + current_ar(), current_pr(), current_idx() {} void alu_clause_tracker::emit_group() { @@ -1931,6 +2010,8 @@ bool alu_clause_tracker::check_clause_limits() { // reserving slots to load AR and PR values unsigned reserve_slots = (current_ar ? 1 : 0) + (current_pr ? 1 : 0); + // ...and index registers + reserve_slots += (current_idx[0] != NULL) + (current_idx[1] != NULL); if (slot_count + slots > MAX_ALU_SLOTS - reserve_slots) return false; @@ -1996,13 +2077,15 @@ unsigned rp_kcache_tracker::get_lines(kc_lines& lines) { unsigned cnt = 0; for (unsigned i = 0; i < sel_count; ++i) { - unsigned line = rp[i]; + unsigned line = rp[i] & 0x1fffffffu; + unsigned index_mode = rp[i] >> 29; if (!line) return cnt; --line; line = (sel_count == 2) ? line >> 5 : line >> 6; + line |= index_mode << 29; if (lines.insert(line).second) ++cnt; @@ -2017,14 +2100,18 @@ bool alu_kcache_tracker::update_kc() { memcpy(old_kc, kc, sizeof(kc)); for (kc_lines::iterator I = lines.begin(), E = lines.end(); I != E; ++I) { - unsigned line = *I; + unsigned index_mode = *I >> 29; + unsigned line = *I & 0x1fffffffu; unsigned bank = line >> 8; + assert(index_mode <= KC_INDEX_INVALID); line &= 0xFF; - if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line)) - ++kc[c-1].mode; - else { + if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line) && + kc[c-1].index_mode == index_mode) + { + kc[c-1].mode = KC_LOCK_2; + } else { if (c == max_kcs) { memcpy(kc, old_kc, sizeof(kc)); return false; @@ -2034,6 +2121,7 @@ bool alu_kcache_tracker::update_kc() { kc[c].bank = bank; kc[c].addr = line; + kc[c].index_mode = index_mode; ++c; } } diff --git a/src/gallium/drivers/r600/sb/sb_sched.h b/src/gallium/drivers/r600/sb/sb_sched.h index 2ca714665a7..05b428ca884 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.h +++ b/src/gallium/drivers/r600/sb/sb_sched.h @@ -66,6 +66,7 @@ public: class literal_tracker { literal lt[4]; unsigned uc[4]; + 
public: literal_tracker() : lt(), uc() {} @@ -219,6 +220,8 @@ public: // bottom-up) value *current_ar; value *current_pr; + // current values of CF_IDX registers that need preloading + value *current_idx[2]; alu_clause_tracker(shader &sh); @@ -256,6 +259,7 @@ class post_scheduler : public pass { val_set cleared_interf; + void emit_index_registers(); public: post_scheduler(shader &sh) : pass(sh), diff --git a/src/gallium/drivers/r600/sb/sb_shader.cpp b/src/gallium/drivers/r600/sb/sb_shader.cpp index f996c0786d1..87e28e98157 100644 --- a/src/gallium/drivers/r600/sb/sb_shader.cpp +++ b/src/gallium/drivers/r600/sb/sb_shader.cpp @@ -188,9 +188,9 @@ value* shader::create_temp_value() { return get_value(VLK_TEMP, id, 0); } -value* shader::get_kcache_value(unsigned bank, unsigned index, unsigned chan) { +value* shader::get_kcache_value(unsigned bank, unsigned index, unsigned chan, alu_kcache_index_mode index_mode) { return get_ro_value(kcache_values, VLK_KCACHE, - sel_chan((bank << 12) | index, chan)); + sel_chan(bank, index, chan, index_mode)); } void shader::add_input(unsigned gpr, bool preloaded, unsigned comp_mask) { diff --git a/src/gallium/drivers/r600/sb/sb_shader.h b/src/gallium/drivers/r600/sb/sb_shader.h index 7955bba9b67..70bea891b76 100644 --- a/src/gallium/drivers/r600/sb/sb_shader.h +++ b/src/gallium/drivers/r600/sb/sb_shader.h @@ -323,7 +323,7 @@ public: value* get_special_ro_value(unsigned sel); - value* get_kcache_value(unsigned bank, unsigned index, unsigned chan); + value* get_kcache_value(unsigned bank, unsigned index, unsigned chan, alu_kcache_index_mode index_mode); value* get_value_version(value* v, unsigned ver); From 24a1a157a694e961aaad611a8bf9d47ce8cf47f6 Mon Sep 17 00:00:00 2001 From: Glenn Kennard Date: Wed, 7 Oct 2015 17:17:34 +0200 Subject: [PATCH 087/270] r600g: Enable GL_ARB_gpu_shader5 extension Signed-off-by: Glenn Kennard Signed-off-by: Dave Airlie --- docs/GL3.txt | 16 ++++++++-------- docs/relnotes/11.1.0.html | 1 + 
src/gallium/drivers/r600/r600_pipe.c | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/GL3.txt b/docs/GL3.txt index e17e783d331..6503e2ab1da 100644 --- a/docs/GL3.txt +++ b/docs/GL3.txt @@ -96,18 +96,18 @@ GL 4.0, GLSL 4.00 --- all DONE: nvc0, radeonsi GL_ARB_draw_buffers_blend DONE (i965, nv50, r600, llvmpipe, softpipe) GL_ARB_draw_indirect DONE (i965, r600, llvmpipe, softpipe) - GL_ARB_gpu_shader5 DONE (i965) + GL_ARB_gpu_shader5 DONE (i965, r600) - 'precise' qualifier DONE - - Dynamically uniform sampler array indices DONE (r600, softpipe) - - Dynamically uniform UBO array indices DONE (r600) + - Dynamically uniform sampler array indices DONE (softpipe) + - Dynamically uniform UBO array indices DONE () - Implicit signed -> unsigned conversions DONE - Fused multiply-add DONE () - - Packing/bitfield/conversion functions DONE (r600, softpipe) - - Enhanced textureGather DONE (r600, softpipe) - - Geometry shader instancing DONE (r600, llvmpipe, softpipe) + - Packing/bitfield/conversion functions DONE (softpipe) + - Enhanced textureGather DONE (softpipe) + - Geometry shader instancing DONE (llvmpipe, softpipe) - Geometry shader multiple streams DONE () - - Enhanced per-sample shading DONE (r600) - - Interpolation functions DONE (r600) + - Enhanced per-sample shading DONE () + - Interpolation functions DONE () - New overload resolution rules DONE GL_ARB_gpu_shader_fp64 DONE (r600, llvmpipe, softpipe) GL_ARB_sample_shading DONE (i965, nv50, r600) diff --git a/docs/relnotes/11.1.0.html b/docs/relnotes/11.1.0.html index d4f30d0da62..dcf425e4c68 100644 --- a/docs/relnotes/11.1.0.html +++ b/docs/relnotes/11.1.0.html @@ -46,6 +46,7 @@ Note: some of the new features are only available with certain drivers.
    • GL_ARB_blend_func_extended on freedreno (a3xx)
    • GL_ARB_gpu_shader_fp64 on r600 for Cypress/Cayman/Aruba chips
    • +
    • GL_ARB_gpu_shader5 on r600 for Evergreen and later chips
    • GL_ARB_shader_storage_buffer_object on i965
    • GL_ARB_shader_texture_image_samples on i965, nv50, nvc0, r600, radeonsi
    • GL_ARB_texture_barrier / GL_NV_texture_barrier on i965
    • diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index efb4889e562..32ce76a9e07 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -305,7 +305,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_GLSL_FEATURE_LEVEL: if (family >= CHIP_CEDAR) - return 330; + return 410; /* pre-evergreen geom shaders need newer kernel */ if (rscreen->b.info.drm_minor >= 37) return 330; From 83de93309e38ce3af0c8f92ef54446db70b2cb38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Sun, 11 Oct 2015 20:09:52 +0200 Subject: [PATCH 088/270] r600/uvd: disable UVD tiling by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It has only minimal advantages for post processing and doesn't work with VCE. Signed-off-by: Christian König Reviewed-by: Alex Deucher --- src/gallium/drivers/r600/r600_uvd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/r600/r600_uvd.c b/src/gallium/drivers/r600/r600_uvd.c index 357e9017a65..3a94a5a95ec 100644 --- a/src/gallium/drivers/r600/r600_uvd.c +++ b/src/gallium/drivers/r600/r600_uvd.c @@ -49,6 +49,8 @@ #include "radeon/radeon_uvd.h" #include "r600d.h" +#define R600_UVD_ENABLE_TILING 0 + /** * creates an video buffer with an UVD compatible memory layout */ @@ -77,7 +79,7 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe, template.height = align(tmpl->height / array_size, VL_MACROBLOCK_HEIGHT); vl_video_buffer_template(&templ, &template, resource_formats[0], 1, array_size, PIPE_USAGE_DEFAULT, 0); - if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced) + if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced || !R600_UVD_ENABLE_TILING) templ.bind = PIPE_BIND_LINEAR; resources[0] = (struct r600_texture *) pipe->screen->resource_create(pipe->screen, &templ); @@ -86,7 +88,7 @@ struct 
pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe, if (resource_formats[1] != PIPE_FORMAT_NONE) { vl_video_buffer_template(&templ, &template, resource_formats[1], 1, array_size, PIPE_USAGE_DEFAULT, 1); - if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced) + if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced || !R600_UVD_ENABLE_TILING) templ.bind = PIPE_BIND_LINEAR; resources[1] = (struct r600_texture *) pipe->screen->resource_create(pipe->screen, &templ); @@ -96,7 +98,7 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe, if (resource_formats[2] != PIPE_FORMAT_NONE) { vl_video_buffer_template(&templ, &template, resource_formats[2], 1, array_size, PIPE_USAGE_DEFAULT, 2); - if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced) + if (ctx->b.chip_class < EVERGREEN || tmpl->interlaced || !R600_UVD_ENABLE_TILING) templ.bind = PIPE_BIND_LINEAR; resources[2] = (struct r600_texture *) pipe->screen->resource_create(pipe->screen, &templ); From 685335639a982b398d305b8f314fc3857fcdbead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Sun, 11 Oct 2015 20:13:25 +0200 Subject: [PATCH 089/270] r600/vce: enable VCE for trinity/richland MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Christian König Reviewed-by: Alex Deucher --- src/gallium/drivers/r600/r600_uvd.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_uvd.c b/src/gallium/drivers/r600/r600_uvd.c index 3a94a5a95ec..e2e9033ea2c 100644 --- a/src/gallium/drivers/r600/r600_uvd.c +++ b/src/gallium/drivers/r600/r600_uvd.c @@ -47,6 +47,7 @@ #include "r600_pipe.h" #include "radeon/radeon_video.h" #include "radeon/radeon_uvd.h" +#include "radeon/radeon_vce.h" #include "r600d.h" #define R600_UVD_ENABLE_TILING 0 @@ -168,9 +169,28 @@ static struct radeon_winsys_cs_handle* r600_uvd_set_dtb(struct ruvd_msg *msg, st return 
luma->resource.cs_buf; } +/* get the radeon resources for VCE */ +static void r600_vce_get_buffer(struct pipe_resource *resource, + struct radeon_winsys_cs_handle **handle, + struct radeon_surf **surface) +{ + struct r600_texture *res = (struct r600_texture *)resource; + + if (handle) + *handle = res->resource.cs_buf; + + if (surface) + *surface = &res->surface; +} + /* create decoder */ struct pipe_video_codec *r600_uvd_create_decoder(struct pipe_context *context, - const struct pipe_video_codec *templat) + const struct pipe_video_codec *templat) { + struct r600_context *ctx = (struct r600_context *)context; + + if (templat->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) + return rvce_create_encoder(context, templat, ctx->b.ws, r600_vce_get_buffer); + return ruvd_create_decoder(context, templat, r600_uvd_set_dtb); } From 6a506689db287ea41b3374bd3174a5da78b56d16 Mon Sep 17 00:00:00 2001 From: Samuel Iglesias Gonsalvez Date: Thu, 1 Oct 2015 16:46:55 +0200 Subject: [PATCH 090/270] glsl: fix matrix stride calculation for std430's row_major matrices with two columns This is the result of applying several rules: From OpenGL 4.3 spec, section 7.6.2.2 "Standard Uniform Block Layout": "2. If the member is a two- or four-component vector with components consuming N basic machine units, the base alignment is 2N or 4N, respectively." [...] "4. If the member is an array of scalars or vectors, the base alignment and array stride are set to match the base alignment of a single array element, according to rules (1), (2), and (3), and rounded up to the base alignment of a vec4." [...] "7. If the member is a row-major matrix with C columns and R rows, the matrix is stored identically to an array of R row vectors with C components each, according to rule (4)." [...] 
"When using the std430 storage layout, shader storage blocks will be laid out in buffer storage identically to uniform and shader storage blocks using the std140 layout, except that the base alignment and stride of arrays of scalars and vectors in rule 4 and of structures in rule 9 are not rounded up a multiple of the base alignment of a vec4." In summary: vec2 has a base alignment of 2*N, a row-major mat2xY is stored like an array of Y row vectors with 2 components each. Because of std430 storage layout, the base alignment of the array of vectors is not rounded up to vec4, so it is still 2*N. Fixes 15 dEQP tests: dEQP-GLES31.functional.ssbo.layout.single_basic_type.std430.row_major_lowp_mat2 dEQP-GLES31.functional.ssbo.layout.single_basic_type.std430.row_major_mediump_mat2 dEQP-GLES31.functional.ssbo.layout.single_basic_type.std430.row_major_highp_mat2 dEQP-GLES31.functional.ssbo.layout.single_basic_type.std430.row_major_lowp_mat2x3 dEQP-GLES31.functional.ssbo.layout.single_basic_type.std430.row_major_mediump_mat2x3 dEQP-GLES31.functional.ssbo.layout.single_basic_type.std430.row_major_highp_mat2x3 dEQP-GLES31.functional.ssbo.layout.single_basic_type.std430.row_major_lowp_mat2x4 dEQP-GLES31.functional.ssbo.layout.single_basic_type.std430.row_major_mediump_mat2x4 dEQP-GLES31.functional.ssbo.layout.single_basic_type.std430.row_major_highp_mat2x4 dEQP-GLES31.functional.ssbo.layout.single_basic_array.std430.row_major_mat2 dEQP-GLES31.functional.ssbo.layout.single_basic_array.std430.row_major_mat2x3 dEQP-GLES31.functional.ssbo.layout.single_basic_array.std430.row_major_mat2x4 dEQP-GLES31.functional.ssbo.layout.instance_array_basic_type.std430.row_major_mat2 dEQP-GLES31.functional.ssbo.layout.instance_array_basic_type.std430.row_major_mat2x3 dEQP-GLES31.functional.ssbo.layout.instance_array_basic_type.std430.row_major_mat2x4 v2: - Add spec quote in both commit log and code (Timothy) Signed-off-by: Samuel Iglesias Gonsalvez Reviewed-by: Marta Lofstedt --- 
src/glsl/lower_ubo_reference.cpp | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp index 247620e6148..c8ec5c19f41 100644 --- a/src/glsl/lower_ubo_reference.cpp +++ b/src/glsl/lower_ubo_reference.cpp @@ -744,7 +744,31 @@ lower_ubo_reference_visitor::emit_access(bool is_write, * or 32 depending on the number of columns. */ assert(matrix_columns <= 4); - unsigned matrix_stride = glsl_align(matrix_columns * N, 16); + unsigned matrix_stride = 0; + /* Matrix stride for std430 mat2xY matrices is not rounded up to + * vec4 size. From OpenGL 4.3 spec, section 7.6.2.2 "Standard Uniform + * Block Layout": + * + * "2. If the member is a two- or four-component vector with components + * consuming N basic machine units, the base alignment is 2N or 4N, + * respectively." [...] + * "4. If the member is an array of scalars or vectors, the base alignment + * and array stride are set to match the base alignment of a single array + * element, according to rules (1), (2), and (3), and rounded up to the + * base alignment of a vec4." [...] + * "7. If the member is a row-major matrix with C columns and R rows, the + * matrix is stored identically to an array of R row vectors with C + * components each, according to rule (4)." [...] + * "When using the std430 storage layout, shader storage blocks will be + * laid out in buffer storage identically to uniform and shader storage + * blocks using the std140 layout, except that the base alignment and + * stride of arrays of scalars and vectors in rule 4 and of structures in + * rule 9 are not rounded up a multiple of the base alignment of a vec4." + */ + if (packing == GLSL_INTERFACE_PACKING_STD430 && matrix_columns == 2) + matrix_stride = 2 * N; + else + matrix_stride = glsl_align(matrix_columns * N, 16); const glsl_type *deref_type = deref->type->base_type == GLSL_TYPE_FLOAT ?
glsl_type::float_type : glsl_type::double_type; From c73c481c4a8fbe454cfd6a0014d25beb175c0a7f Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 8 Oct 2015 14:45:28 -0600 Subject: [PATCH 091/270] mesa: pass caller name to create_textures() Simpler than the dsa flag approach. --- src/mesa/main/texobj.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c index 60c55aeb206..b571b1b2ff6 100644 --- a/src/mesa/main/texobj.c +++ b/src/mesa/main/texobj.c @@ -1205,17 +1205,16 @@ invalidate_tex_image_error_check(struct gl_context *ctx, GLuint texture, */ static void create_textures(struct gl_context *ctx, GLenum target, - GLsizei n, GLuint *textures, bool dsa) + GLsizei n, GLuint *textures, const char *caller) { GLuint first; GLint i; - const char *func = dsa ? "Create" : "Gen"; if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) - _mesa_debug(ctx, "gl%sTextures %d\n", func, n); + _mesa_debug(ctx, "%s %d\n", caller, n); if (n < 0) { - _mesa_error( ctx, GL_INVALID_VALUE, "gl%sTextures(n < 0)", func ); + _mesa_error(ctx, GL_INVALID_VALUE, "%s(n < 0)", caller); return; } @@ -1236,7 +1235,7 @@ create_textures(struct gl_context *ctx, GLenum target, texObj = ctx->Driver.NewTextureObject(ctx, name, target); if (!texObj) { mtx_unlock(&ctx->Shared->Mutex); - _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sTextures", func); + _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s", caller); return; } @@ -1273,7 +1272,7 @@ void GLAPIENTRY _mesa_GenTextures(GLsizei n, GLuint *textures) { GET_CURRENT_CONTEXT(ctx); - create_textures(ctx, 0, n, textures, false); + create_textures(ctx, 0, n, textures, "glGenTextures"); } /** @@ -1306,7 +1305,7 @@ _mesa_CreateTextures(GLenum target, GLsizei n, GLuint *textures) return; } - create_textures(ctx, target, n, textures, true); + create_textures(ctx, target, n, textures, "glCreateTextures"); } /** From dd293d8aae324ac7b9d5297e33a1e732e1f3f4d3 Mon Sep 17 00:00:00 2001 From: Brian Paul
Date: Mon, 12 Oct 2015 11:32:35 -0600 Subject: [PATCH 092/270] vbo: fix incorrect switch statement in init_mat_currval() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable 'i' is a value in [0, MAT_ATTRIB_MAX-1] so subtracting VERT_ATTRIB_GENERIC0 gave a bogus value and we executed the default switch clause for all loop iterations. This doesn't fix any known issues but was clearly incorrect. Cc: mesa-stable@lists.freedesktop.org Reviewed-by: Marek Olšák --- src/mesa/vbo/vbo_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c index e3eb286e482..802955da28e 100644 --- a/src/mesa/vbo/vbo_context.c +++ b/src/mesa/vbo/vbo_context.c @@ -121,7 +121,7 @@ static void init_mat_currval(struct gl_context *ctx) /* Size is fixed for the material attributes, for others will * be determined at runtime: */ - switch (i - VERT_ATTRIB_GENERIC0) { + switch (i) { case MAT_ATTRIB_FRONT_SHININESS: case MAT_ATTRIB_BACK_SHININESS: cl->Size = 1; From 20f31ae37c42b4c98e0a55ba6b2e57ad9d1c277b Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 11:26:16 -0600 Subject: [PATCH 093/270] vbo: get rid of needless NR_MAT_ATTRIBS constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák --- src/mesa/vbo/vbo_context.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c index 802955da28e..b5dc45cb603 100644 --- a/src/mesa/vbo/vbo_context.c +++ b/src/mesa/vbo/vbo_context.c @@ -33,7 +33,6 @@ #include "vbo.h" #include "vbo_context.h" -#define NR_MAT_ATTRIBS 12 static GLuint check_size( const GLfloat *attr ) { @@ -108,14 +107,12 @@ static void init_mat_currval(struct gl_context *ctx) &vbo->currval[VBO_ATTRIB_MAT_FRONT_AMBIENT]; GLuint i; - assert(NR_MAT_ATTRIBS == MAT_ATTRIB_MAX); - - memset(arrays, 0, sizeof(*arrays) * 
NR_MAT_ATTRIBS); + memset(arrays, 0, sizeof(*arrays) * MAT_ATTRIB_MAX); /* Set up a constant (StrideB == 0) array for each current * attribute: */ - for (i = 0; i < NR_MAT_ATTRIBS; i++) { + for (i = 0; i < MAT_ATTRIB_MAX; i++) { struct gl_client_array *cl = &arrays[i]; /* Size is fixed for the material attributes, for others will @@ -175,7 +172,7 @@ GLboolean _vbo_CreateContext( struct gl_context *ctx ) for (i = 0; i < ARRAY_SIZE(vbo->map_vp_none); i++) vbo->map_vp_none[i] = i; /* map material attribs to generic slots */ - for (i = 0; i < NR_MAT_ATTRIBS; i++) + for (i = 0; i < MAT_ATTRIB_MAX; i++) vbo->map_vp_none[VERT_ATTRIB_GENERIC(i)] = VBO_ATTRIB_MAT_FRONT_AMBIENT + i; From a639bbf0987873a0214f0ef562755fa0130b1236 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 09:55:57 -0600 Subject: [PATCH 094/270] vbo: simplify vertex array initializations in vbo_context.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák --- src/mesa/vbo/vbo_context.c | 95 +++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 52 deletions(-) diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c index b5dc45cb603..5e1a760eb2c 100644 --- a/src/mesa/vbo/vbo_context.c +++ b/src/mesa/vbo/vbo_context.c @@ -43,32 +43,47 @@ static GLuint check_size( const GLfloat *attr ) } +/** + * Helper for initializing a vertex array. + */ +static void +init_array(struct gl_context *ctx, struct gl_client_array *cl, + unsigned size, const void *pointer) +{ + memset(cl, 0, sizeof(*cl)); + + cl->Size = size; + cl->Type = GL_FLOAT; + cl->Format = GL_RGBA; + cl->Stride = 0; + cl->StrideB = 0; + cl->_ElementSize = cl->Size * sizeof(GLfloat); + cl->Ptr = pointer; + cl->Enabled = 1; + + _mesa_reference_buffer_object(ctx, &cl->BufferObj, + ctx->Shared->NullBufferObj); +} + + +/** + * Set up the vbo->currval arrays to point at the context's current + * vertex attributes (with strides = 0). 
+ */ static void init_legacy_currval(struct gl_context *ctx) { struct vbo_context *vbo = vbo_context(ctx); - struct gl_client_array *arrays = &vbo->currval[VBO_ATTRIB_POS]; GLuint i; - memset(arrays, 0, sizeof(*arrays) * VERT_ATTRIB_FF_MAX); - /* Set up a constant (StrideB == 0) array for each current * attribute: */ for (i = 0; i < VERT_ATTRIB_FF_MAX; i++) { - struct gl_client_array *cl = &arrays[i]; + struct gl_client_array *cl = &vbo->currval[VERT_ATTRIB_FF(i)]; - /* Size will have to be determined at runtime: - */ - cl->Size = check_size(ctx->Current.Attrib[i]); - cl->Stride = 0; - cl->StrideB = 0; - cl->Enabled = 1; - cl->Type = GL_FLOAT; - cl->Format = GL_RGBA; - cl->Ptr = (const void *)ctx->Current.Attrib[i]; - cl->_ElementSize = cl->Size * sizeof(GLfloat); - _mesa_reference_buffer_object(ctx, &cl->BufferObj, - ctx->Shared->NullBufferObj); + init_array(ctx, cl, + check_size(ctx->Current.Attrib[i]), + ctx->Current.Attrib[i]); } } @@ -76,26 +91,12 @@ static void init_legacy_currval(struct gl_context *ctx) static void init_generic_currval(struct gl_context *ctx) { struct vbo_context *vbo = vbo_context(ctx); - struct gl_client_array *arrays = &vbo->currval[VBO_ATTRIB_GENERIC0]; GLuint i; - memset(arrays, 0, sizeof(*arrays) * VERT_ATTRIB_GENERIC_MAX); - for (i = 0; i < VERT_ATTRIB_GENERIC_MAX; i++) { - struct gl_client_array *cl = &arrays[i]; + struct gl_client_array *cl = &vbo->currval[VBO_ATTRIB_GENERIC0 + i]; - /* This will have to be determined at runtime: - */ - cl->Size = 1; - cl->Type = GL_FLOAT; - cl->Format = GL_RGBA; - cl->Ptr = (const void *)ctx->Current.Attrib[VERT_ATTRIB_GENERIC0 + i]; - cl->Stride = 0; - cl->StrideB = 0; - cl->Enabled = 1; - cl->_ElementSize = cl->Size * sizeof(GLfloat); - _mesa_reference_buffer_object(ctx, &cl->BufferObj, - ctx->Shared->NullBufferObj); + init_array(ctx, cl, 1, ctx->Current.Attrib[VERT_ATTRIB_GENERIC0 + i]); } } @@ -103,17 +104,15 @@ static void init_generic_currval(struct gl_context *ctx) static void 
init_mat_currval(struct gl_context *ctx) { struct vbo_context *vbo = vbo_context(ctx); - struct gl_client_array *arrays = - &vbo->currval[VBO_ATTRIB_MAT_FRONT_AMBIENT]; GLuint i; - memset(arrays, 0, sizeof(*arrays) * MAT_ATTRIB_MAX); - /* Set up a constant (StrideB == 0) array for each current * attribute: */ for (i = 0; i < MAT_ATTRIB_MAX; i++) { - struct gl_client_array *cl = &arrays[i]; + struct gl_client_array *cl = + &vbo->currval[VBO_ATTRIB_MAT_FRONT_AMBIENT + i]; + unsigned size; /* Size is fixed for the material attributes, for others will * be determined at runtime: @@ -121,26 +120,18 @@ static void init_mat_currval(struct gl_context *ctx) switch (i) { case MAT_ATTRIB_FRONT_SHININESS: case MAT_ATTRIB_BACK_SHININESS: - cl->Size = 1; - break; + size = 1; + break; case MAT_ATTRIB_FRONT_INDEXES: case MAT_ATTRIB_BACK_INDEXES: - cl->Size = 3; - break; + size = 3; + break; default: - cl->Size = 4; - break; + size = 4; + break; } - cl->Ptr = (const void *)ctx->Light.Material.Attrib[i]; - cl->Type = GL_FLOAT; - cl->Format = GL_RGBA; - cl->Stride = 0; - cl->StrideB = 0; - cl->Enabled = 1; - cl->_ElementSize = cl->Size * sizeof(GLfloat); - _mesa_reference_buffer_object(ctx, &cl->BufferObj, - ctx->Shared->NullBufferObj); + init_array(ctx, cl, size, ctx->Light.Material.Attrib[i]); } } From a1cbf85de0a0c06a95086ea52d2260343e1783c4 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 10:43:11 -0600 Subject: [PATCH 095/270] vbo: improve fprintf() formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák --- src/mesa/vbo/vbo_save_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c index 1a70d168c55..02cd03c1dc3 100644 --- a/src/mesa/vbo/vbo_save_api.c +++ b/src/mesa/vbo/vbo_save_api.c @@ -1543,7 +1543,7 @@ vbo_print_vertex_list(struct gl_context *ctx, void *data, FILE *f) node->vertex_store->bufferobj : NULL; (void) 
ctx; - fprintf(f, "VBO-VERTEX-LIST, %u vertices %d primitives, %d vertsize " + fprintf(f, "VBO-VERTEX-LIST, %u vertices, %d primitives, %d vertsize, " "buffer %p\n", node->count, node->prim_count, node->vertex_size, buffer); From 8fbb72c2977f652b0790b0fcf5f87286549e39f0 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 10:43:36 -0600 Subject: [PATCH 096/270] vbo: move 'tmp' var initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improve readability a bit. Reviewed-by: Marek Olšák --- src/mesa/vbo/vbo_save_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c index 02cd03c1dc3..fdc677f9a07 100644 --- a/src/mesa/vbo/vbo_save_api.c +++ b/src/mesa/vbo/vbo_save_api.c @@ -648,7 +648,8 @@ _save_upgrade_vertex(struct gl_context *ctx, GLuint attr, GLuint newsz) /* Recalculate all the attrptr[] values: */ - for (i = 0, tmp = save->vertex; i < VBO_ATTRIB_MAX; i++) { + tmp = save->vertex; + for (i = 0; i < VBO_ATTRIB_MAX; i++) { if (save->attrsz[i]) { save->attrptr[i] = tmp; tmp += save->attrsz[i]; From e729f36c0904de24978c11f4eac3d5b64de3deec Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 10:47:26 -0600 Subject: [PATCH 097/270] vbo: fix whitespace in vbo_exec_draw.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák --- src/mesa/vbo/vbo_exec_draw.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c index 2bfb0c32b73..174cbc37c26 100644 --- a/src/mesa/vbo/vbo_exec_draw.c +++ b/src/mesa/vbo/vbo_exec_draw.c @@ -53,10 +53,10 @@ vbo_exec_debug_verts( struct vbo_exec_context *exec ) for (i = 0 ; i < exec->vtx.prim_count ; i++) { struct _mesa_prim *prim = &exec->vtx.prim[i]; printf(" prim %d: %s%s %d..%d %s %s\n", - i, + i, 
_mesa_lookup_prim_by_nr(prim->mode), prim->weak ? " (weak)" : "", - prim->start, + prim->start, prim->start + prim->count, prim->begin ? "BEGIN" : "(wrap)", prim->end ? "END" : "(wrap)"); @@ -79,7 +79,6 @@ vbo_copy_vertices( struct vbo_exec_context *exec ) exec->vtx.prim[exec->vtx.prim_count-1].start * exec->vtx.vertex_size); - switch (exec->ctx->Driver.CurrentExecPrimitive) { case GL_POINTS: return 0; @@ -219,7 +218,7 @@ vbo_exec_bind_arrays( struct gl_context *ctx ) exec->vtx.inputs[attr] = &arrays[attr]; if (_mesa_is_bufferobj(exec->vtx.bufferobj)) { - /* a real buffer obj: Ptr is an offset, not a pointer*/ + /* a real buffer obj: Ptr is an offset, not a pointer */ assert(exec->vtx.bufferobj->Mappings[MAP_INTERNAL].Pointer); assert(offset >= 0); arrays[attr].Ptr = (GLubyte *) @@ -259,7 +258,7 @@ vbo_exec_vtx_unmap( struct vbo_exec_context *exec ) { if (_mesa_is_bufferobj(exec->vtx.bufferobj)) { struct gl_context *ctx = exec->ctx; - + if (ctx->Driver.FlushMappedBufferRange) { GLintptr offset = exec->vtx.buffer_used - exec->vtx.bufferobj->Mappings[MAP_INTERNAL].Offset; @@ -277,7 +276,7 @@ vbo_exec_vtx_unmap( struct vbo_exec_context *exec ) assert(exec->vtx.buffer_used <= VBO_VERT_BUFFER_SIZE); assert(exec->vtx.buffer_ptr != NULL); - + ctx->Driver.UnmapBuffer(ctx, exec->vtx.bufferobj, MAP_INTERNAL); exec->vtx.buffer_map = NULL; exec->vtx.buffer_ptr = NULL; @@ -299,7 +298,7 @@ vbo_exec_vtx_map( struct vbo_exec_context *exec ) GL_MAP_FLUSH_EXPLICIT_BIT | MESA_MAP_NOWAIT_BIT; const GLenum usage = GL_STREAM_DRAW_ARB; - + if (!_mesa_is_bufferobj(exec->vtx.bufferobj)) return; @@ -323,7 +322,7 @@ vbo_exec_vtx_map( struct vbo_exec_context *exec ) exec->vtx.buffer_ptr = exec->vtx.buffer_map = NULL; } } - + if (!exec->vtx.buffer_map) { /* Need to allocate a new VBO */ exec->vtx.buffer_used = 0; @@ -381,14 +380,14 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped) if (0) vbo_exec_debug_verts( exec ); - if (exec->vtx.prim_count && + if 
(exec->vtx.prim_count && exec->vtx.vert_count) { - exec->vtx.copied.nr = vbo_copy_vertices( exec ); + exec->vtx.copied.nr = vbo_copy_vertices( exec ); if (exec->vtx.copied.nr != exec->vtx.vert_count) { struct gl_context *ctx = exec->ctx; - + /* Before the update_state() as this may raise _NEW_VARYING_VP_INPUTS * from _mesa_set_varying_vp_inputs(). */ @@ -405,7 +404,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped) printf("%s %d %d\n", __func__, exec->vtx.prim_count, exec->vtx.vert_count); - vbo_context(ctx)->draw_prims( ctx, + vbo_context(ctx)->draw_prims( ctx, exec->vtx.prim, exec->vtx.prim_count, NULL, @@ -433,7 +432,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped) if (keepUnmapped || exec->vtx.vertex_size == 0) exec->vtx.max_vert = 0; else - exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) / + exec->vtx.max_vert = ((VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) / (exec->vtx.vertex_size * sizeof(GLfloat))); exec->vtx.buffer_ptr = exec->vtx.buffer_map; From 3491ec5930b15d5417bdf17b2a70fedaada969f1 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 10:52:58 -0600 Subject: [PATCH 098/270] vbo: add comments, braces in ATTR_UNION() in vbo_exec_api.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák --- src/mesa/vbo/vbo_exec_api.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c index 583a2f9b79f..9de2886c499 100644 --- a/src/mesa/vbo/vbo_exec_api.c +++ b/src/mesa/vbo/vbo_exec_api.c @@ -413,18 +413,27 @@ vbo_exec_fixup_vertex(struct gl_context *ctx, GLuint attr, GLuint newSize, GLenu /** * This macro is used to implement all the glVertex, glColor, glTexCoord, * glVertexAttrib, etc functions. 
+ * \param A attribute index + * \param N attribute size (1..4) + * \param T type (GL_FLOAT, GL_DOUBLE, GL_INT, GL_UNSIGNED_INT) + * \param C cast type (fi_type or double) + * \param V0, V1, V2, V3 attribute value */ #define ATTR_UNION( A, N, T, C, V0, V1, V2, V3 ) \ do { \ struct vbo_exec_context *exec = &vbo_context(ctx)->exec; \ int sz = (sizeof(C) / sizeof(GLfloat)); \ - if (unlikely(!(ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT))) \ + if (unlikely(!(ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT))) { \ vbo_exec_BeginVertices(ctx); \ + } \ \ + /* check if attribute size or type is changing */ \ if (unlikely(exec->vtx.active_sz[A] != N * sz) || \ - unlikely(exec->vtx.attrtype[A] != T)) \ + unlikely(exec->vtx.attrtype[A] != T)) { \ vbo_exec_fixup_vertex(ctx, A, N * sz, T); \ + } \ \ + /* store vertex attribute in vertex buffer */ \ { \ C *dest = (C *)exec->vtx.attrptr[A]; \ if (N>0) dest[0] = V0; \ @@ -438,6 +447,7 @@ do { \ /* This is a glVertex call */ \ GLuint i; \ \ + /* copy 32-bit words */ \ for (i = 0; i < exec->vtx.vertex_size; i++) \ exec->vtx.buffer_ptr[i] = exec->vtx.vertex[i]; \ \ From 7f67bfaa7471ac297ec86be122f251b271cea2ca Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 10:53:59 -0600 Subject: [PATCH 099/270] vbo: add assertion in ATTR_UNION macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák --- src/mesa/vbo/vbo_exec_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c index 9de2886c499..3943523b702 100644 --- a/src/mesa/vbo/vbo_exec_api.c +++ b/src/mesa/vbo/vbo_exec_api.c @@ -423,6 +423,9 @@ vbo_exec_fixup_vertex(struct gl_context *ctx, GLuint attr, GLuint newSize, GLenu do { \ struct vbo_exec_context *exec = &vbo_context(ctx)->exec; \ int sz = (sizeof(C) / sizeof(GLfloat)); \ + \ + assert(sz == 1 || sz == 2); \ + \ if (unlikely(!(ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT))) { \
vbo_exec_BeginVertices(ctx); \ } \ From d65b029dc20a1680c1e0203c493720f7a03a803c Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 10:58:49 -0600 Subject: [PATCH 100/270] vbo: minor clean-ups for vbo_exec_fixup_vertex() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák --- src/mesa/vbo/vbo_exec_api.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c index 3943523b702..a99887a4885 100644 --- a/src/mesa/vbo/vbo_exec_api.c +++ b/src/mesa/vbo/vbo_exec_api.c @@ -375,13 +375,16 @@ vbo_exec_wrap_upgrade_vertex(struct vbo_exec_context *exec, * This is when a vertex attribute transitions to a different size. * For example, we saw a bunch of glTexCoord2f() calls and now we got a * glTexCoord4f() call. We promote the array from size=2 to size=4. + * \param newSize size of new vertex (number of 32-bit words). */ static void -vbo_exec_fixup_vertex(struct gl_context *ctx, GLuint attr, GLuint newSize, GLenum newType) +vbo_exec_fixup_vertex(struct gl_context *ctx, GLuint attr, + GLuint newSize, GLenum newType) { struct vbo_exec_context *exec = &vbo_context(ctx)->exec; - if (newSize > exec->vtx.attrsz[attr] || newType != exec->vtx.attrtype[attr]) { + if (newSize > exec->vtx.attrsz[attr] || + newType != exec->vtx.attrtype[attr]) { /* New size is larger. Need to flush existing vertices and get * an enlarged vertex format. 
*/ From 84719ad9df5a48ef8c92461956abda1b20cdbefc Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 11:01:23 -0600 Subject: [PATCH 101/270] vbo: document vbo_exec_context fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák --- src/mesa/vbo/vbo_exec.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/mesa/vbo/vbo_exec.h b/src/mesa/vbo/vbo_exec.h index 80f3015925d..0f894aab740 100644 --- a/src/mesa/vbo/vbo_exec.h +++ b/src/mesa/vbo/vbo_exec.h @@ -79,7 +79,7 @@ struct vbo_exec_copied_vtx { struct vbo_exec_context { - struct gl_context *ctx; + struct gl_context *ctx; GLvertexformat vtxfmt; GLvertexformat vtxfmt_noop; GLboolean validating; /**< if we're in the middle of state validation */ @@ -97,15 +97,17 @@ struct vbo_exec_context GLuint buffer_used; /* in bytes */ fi_type vertex[VBO_ATTRIB_MAX*4]; /* current vertex */ - GLuint vert_count; - GLuint max_vert; + GLuint vert_count; /**< Number of vertices currently in buffer */ + GLuint max_vert; /**< Max number of vertices allowed in buffer */ struct vbo_exec_copied_vtx copied; - GLubyte attrsz[VBO_ATTRIB_MAX]; - GLenum attrtype[VBO_ATTRIB_MAX]; - GLubyte active_sz[VBO_ATTRIB_MAX]; + GLubyte attrsz[VBO_ATTRIB_MAX]; /**< nr. of attrib components (1..4) */ + GLenum attrtype[VBO_ATTRIB_MAX]; /**< GL_FLOAT, GL_DOUBLE, GL_INT, etc */ + GLubyte active_sz[VBO_ATTRIB_MAX]; /**< attrib size (nr. 
32-bit words) */ + /** pointers into the current 'vertex' array, declared above */ fi_type *attrptr[VBO_ATTRIB_MAX]; + struct gl_client_array arrays[VERT_ATTRIB_MAX]; /* According to program mode, the values above plus current @@ -115,7 +117,6 @@ struct vbo_exec_context const struct gl_client_array *inputs[VERT_ATTRIB_MAX]; } vtx; - struct { GLboolean recalculate_maps; struct vbo_exec_eval1_map map1[VERT_ATTRIB_MAX]; From a7b6e6192a2cb36a36aaf12153de2e4b2c6e1cef Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 14:16:37 -0600 Subject: [PATCH 102/270] vbo: make void vbo_exec_BeginVertices() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not called from any other file. Rename and move before use. Reviewed-by: Marek Olšák --- src/mesa/vbo/vbo_exec.h | 4 +--- src/mesa/vbo/vbo_exec_api.c | 37 +++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/mesa/vbo/vbo_exec.h b/src/mesa/vbo/vbo_exec.h index 0f894aab740..00378eb7984 100644 --- a/src/mesa/vbo/vbo_exec.h +++ b/src/mesa/vbo/vbo_exec.h @@ -132,7 +132,7 @@ struct vbo_exec_context GLboolean recalculate_inputs; } array; - /* Which flags to set in vbo_exec_BeginVertices() */ + /* Which flags to set in vbo_exec_begin_vertices() */ GLbitfield begin_vertices_flags; #ifdef DEBUG @@ -148,8 +148,6 @@ void vbo_exec_init( struct gl_context *ctx ); void vbo_exec_destroy( struct gl_context *ctx ); void vbo_exec_invalidate_state( struct gl_context *ctx, GLuint new_state ); -void vbo_exec_BeginVertices( struct gl_context *ctx ); - /* Internal functions: */ diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c index a99887a4885..7ae08fe3062 100644 --- a/src/mesa/vbo/vbo_exec_api.c +++ b/src/mesa/vbo/vbo_exec_api.c @@ -413,6 +413,23 @@ vbo_exec_fixup_vertex(struct gl_context *ctx, GLuint attr, } +/** + * Called upon first glVertex, glColor, glTexCoord, etc. 
+ */ +static void +vbo_exec_begin_vertices(struct gl_context *ctx) +{ + struct vbo_exec_context *exec = &vbo_context(ctx)->exec; + + vbo_exec_vtx_map( exec ); + + assert((ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT) == 0); + assert(exec->begin_vertices_flags); + + ctx->Driver.NeedFlush |= exec->begin_vertices_flags; +} + + /** * This macro is used to implement all the glVertex, glColor, glTexCoord, * glVertexAttrib, etc functions. @@ -430,7 +447,7 @@ do { \ assert(sz == 1 || sz == 2); \ \ if (unlikely(!(ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT))) { \ - vbo_exec_BeginVertices(ctx); \ + vbo_exec_begin_vertices(ctx); \ } \ \ /* check if attribute size or type is changing */ \ @@ -1164,22 +1181,6 @@ void vbo_exec_vtx_destroy( struct vbo_exec_context *exec ) } -/** - * Called upon first glVertex, glColor, glTexCoord, etc. - */ -void vbo_exec_BeginVertices( struct gl_context *ctx ) -{ - struct vbo_exec_context *exec = &vbo_context(ctx)->exec; - - vbo_exec_vtx_map( exec ); - - assert((ctx->Driver.NeedFlush & FLUSH_UPDATE_CURRENT) == 0); - assert(exec->begin_vertices_flags); - - ctx->Driver.NeedFlush |= exec->begin_vertices_flags; -} - - /** * If inside glBegin()/glEnd(), it should assert(0). 
Otherwise, if * FLUSH_STORED_VERTICES bit in \p flags is set flushes any buffered @@ -1213,7 +1214,7 @@ void vbo_exec_FlushVertices( struct gl_context *ctx, GLuint flags ) /* Flush (draw), and make sure VBO is left unmapped when done */ vbo_exec_FlushVertices_internal(exec, GL_TRUE); - /* Need to do this to ensure vbo_exec_BeginVertices gets called again: + /* Need to do this to ensure vbo_exec_begin_vertices gets called again: */ ctx->Driver.NeedFlush &= ~(FLUSH_UPDATE_CURRENT | flags); From 4a168ad797af26b31c64e408fb5f84838bf37b4e Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 14:50:17 -0600 Subject: [PATCH 103/270] mesa: clean up comments for gl_current_attrib struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák --- src/mesa/main/mtypes.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 0a54b2073e2..62eb5927637 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -487,26 +487,24 @@ struct gl_colorbuffer_attrib struct gl_current_attrib { /** - * \name Current vertex attributes. + * \name Current vertex attributes (color, texcoords, etc). * \note Values are valid only after FLUSH_VERTICES has been called. * \note Index and Edgeflag current values are stored as floats in the * SIX and SEVEN attribute slots. + * \note We need double storage for 64-bit vertex attributes */ - /* we need double storage for this for vertex attrib 64bit */ - GLfloat Attrib[VERT_ATTRIB_MAX][4*2]; /**< Position, color, texcoords, etc */ + GLfloat Attrib[VERT_ATTRIB_MAX][4*2]; /** - * \name Current raster position attributes (always valid). - * \note This set of attributes is very similar to the SWvertex struct. + * \name Current raster position attributes (always up to date after a + * glRasterPos call). 
*/ - /*@{*/ GLfloat RasterPos[4]; GLfloat RasterDistance; GLfloat RasterColor[4]; GLfloat RasterSecondaryColor[4]; GLfloat RasterTexCoords[MAX_TEXTURE_COORD_UNITS][4]; GLboolean RasterPosValid; - /*@}*/ }; From 9d2bbca98d2712f7bafe66cb3bc08859ff14133e Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Tue, 13 Oct 2015 11:26:21 +0200 Subject: [PATCH 104/270] i965/fs: Fix indentation in fs_live_variables::compute_start_end Reviewed-by: Francisco Jerez --- .../drivers/dri/i965/brw_fs_live_variables.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp index 19aec92fad1..ce066a9778e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp @@ -259,16 +259,15 @@ fs_live_variables::compute_start_end() struct block_data *bd = &block_data[block->num]; for (int i = 0; i < num_vars; i++) { - if (BITSET_TEST(bd->livein, i)) { - start[i] = MIN2(start[i], block->start_ip); - end[i] = MAX2(end[i], block->start_ip); - } - - if (BITSET_TEST(bd->liveout, i)) { - start[i] = MIN2(start[i], block->end_ip); - end[i] = MAX2(end[i], block->end_ip); - } + if (BITSET_TEST(bd->livein, i)) { + start[i] = MIN2(start[i], block->start_ip); + end[i] = MAX2(end[i], block->start_ip); + } + if (BITSET_TEST(bd->liveout, i)) { + start[i] = MIN2(start[i], block->end_ip); + end[i] = MAX2(end[i], block->end_ip); + } } } } From be800ef6d80fc43279780e652e611253428d7a78 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Tue, 13 Oct 2015 11:59:59 +0200 Subject: [PATCH 105/270] i965/vec4: fix indentation in vec4_visitor::calculate_live_intervals Reviewed-by: Francisco Jerez --- .../drivers/dri/i965/brw_vec4_live_variables.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp index cc688ef8083..678237901f2 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp @@ -291,15 +291,15 @@ vec4_visitor::calculate_live_intervals() struct block_data *bd = &live_intervals->block_data[block->num]; for (int i = 0; i < live_intervals->num_vars; i++) { - if (BITSET_TEST(bd->livein, i)) { - start[i] = MIN2(start[i], block->start_ip); - end[i] = MAX2(end[i], block->start_ip); - } + if (BITSET_TEST(bd->livein, i)) { + start[i] = MIN2(start[i], block->start_ip); + end[i] = MAX2(end[i], block->start_ip); + } - if (BITSET_TEST(bd->liveout, i)) { - start[i] = MIN2(start[i], block->end_ip); - end[i] = MAX2(end[i], block->end_ip); - } + if (BITSET_TEST(bd->liveout, i)) { + start[i] = MIN2(start[i], block->end_ip); + end[i] = MAX2(end[i], block->end_ip); + } } } } From baee16bf02eedc6a32381d79da6c7ac942f782ae Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Mon, 28 Sep 2015 10:47:22 +0200 Subject: [PATCH 106/270] nir: split SSBO min/max atomic intrinsics into signed/unsigned versions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NIR is typeless so this is the only way to keep track of the type to select the proper atomic to use.
v2: - Use imin,imax,umin,umax for the intrinsic names (Connor Abbott) - Change message for unreachable paths (Michael Schellenberger) Tested-by: Markus Wick Reviewed-by: Kristian Høgsberg Reviewed-by: Kenneth Graunke --- src/glsl/nir/glsl_to_nir.cpp | 22 ++++++++++++++++++---- src/glsl/nir/nir_intrinsics.h | 6 ++++-- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 20 ++++++++++---------- src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 20 ++++++++++---------- 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index efaa73e12f1..4b9201e2f60 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -691,9 +691,21 @@ nir_visitor::visit(ir_call *ir) } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_xor_internal") == 0) { op = nir_intrinsic_ssbo_atomic_xor; } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_min_internal") == 0) { - op = nir_intrinsic_ssbo_atomic_min; + assert(ir->return_deref); + if (ir->return_deref->type == glsl_type::int_type) + op = nir_intrinsic_ssbo_atomic_imin; + else if (ir->return_deref->type == glsl_type::uint_type) + op = nir_intrinsic_ssbo_atomic_umin; + else + unreachable("Invalid type"); } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_max_internal") == 0) { - op = nir_intrinsic_ssbo_atomic_max; + assert(ir->return_deref); + if (ir->return_deref->type == glsl_type::int_type) + op = nir_intrinsic_ssbo_atomic_imax; + else if (ir->return_deref->type == glsl_type::uint_type) + op = nir_intrinsic_ssbo_atomic_umax; + else + unreachable("Invalid type"); } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_exchange_internal") == 0) { op = nir_intrinsic_ssbo_atomic_exchange; } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_comp_swap_internal") == 0) { @@ -902,8 +914,10 @@ nir_visitor::visit(ir_call *ir) break; } case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_min: - case 
nir_intrinsic_ssbo_atomic_max: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: case nir_intrinsic_ssbo_atomic_and: case nir_intrinsic_ssbo_atomic_or: case nir_intrinsic_ssbo_atomic_xor: diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h index 263d8c14f4a..49bf3b22aed 100644 --- a/src/glsl/nir/nir_intrinsics.h +++ b/src/glsl/nir/nir_intrinsics.h @@ -174,8 +174,10 @@ INTRINSIC(image_samples, 0, ARR(), true, 1, 1, 0, * 3: For CompSwap only: the second data parameter. */ INTRINSIC(ssbo_atomic_add, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) -INTRINSIC(ssbo_atomic_min, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) -INTRINSIC(ssbo_atomic_max, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) +INTRINSIC(ssbo_atomic_imin, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) +INTRINSIC(ssbo_atomic_umin, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) +INTRINSIC(ssbo_atomic_imax, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) +INTRINSIC(ssbo_atomic_umax, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) INTRINSIC(ssbo_atomic_and, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) INTRINSIC(ssbo_atomic_or, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) INTRINSIC(ssbo_atomic_xor, 3, ARR(1, 1, 1), true, 1, 0, 0, 0) diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 51189a2d263..21d2967935a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -1834,17 +1834,17 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_ssbo_atomic_add: nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr); break; - case nir_intrinsic_ssbo_atomic_min: - if (dest.type == BRW_REGISTER_TYPE_D) - nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr); - else - nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr); + case nir_intrinsic_ssbo_atomic_imin: + nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr); break; - case nir_intrinsic_ssbo_atomic_max: - if (dest.type == 
BRW_REGISTER_TYPE_D) - nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr); - else - nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr); + case nir_intrinsic_ssbo_atomic_umin: + nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr); + break; + case nir_intrinsic_ssbo_atomic_imax: + nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr); + break; + case nir_intrinsic_ssbo_atomic_umax: + nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr); break; case nir_intrinsic_ssbo_atomic_and: nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index fdf767ded64..9e095fb52c8 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -645,17 +645,17 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) case nir_intrinsic_ssbo_atomic_add: nir_emit_ssbo_atomic(BRW_AOP_ADD, instr); break; - case nir_intrinsic_ssbo_atomic_min: - if (dest.type == BRW_REGISTER_TYPE_D) - nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr); - else - nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr); + case nir_intrinsic_ssbo_atomic_imin: + nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr); break; - case nir_intrinsic_ssbo_atomic_max: - if (dest.type == BRW_REGISTER_TYPE_D) - nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr); - else - nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr); + case nir_intrinsic_ssbo_atomic_umin: + nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr); + break; + case nir_intrinsic_ssbo_atomic_imax: + nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr); + break; + case nir_intrinsic_ssbo_atomic_umax: + nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr); break; case nir_intrinsic_ssbo_atomic_and: nir_emit_ssbo_atomic(BRW_AOP_AND, instr); From 9de651b261286f15ae000e4a698587b805b95d2b Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Mon, 5 Oct 2015 11:42:43 +0200 Subject: [PATCH 107/270] glsl: Fix variable_referenced() for vector_{extract,insert} expressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit We get these when we operate on vector variables with array accessors (i.e. things like a[0] where 'a' is a vec4). When we call variable_referenced() on these expressions we want to return a reference to 'a' instead of NULL. This fixes a problem where we pass a[0] as the first argument to an atomic SSBO function that expects a buffer variable. In order to check this, we use variable_referenced(), but that is currently returning NULL in this case, since the underlying rvalue is a vector_extract expression. Tested-by: Markus Wick Reviewed-by: Kristian Høgsberg --- src/glsl/ir.cpp | 16 ++++++++++++++++ src/glsl/ir.h | 2 ++ 2 files changed, 18 insertions(+) diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp index 2c45b9edc0f..4c228437d15 100644 --- a/src/glsl/ir.cpp +++ b/src/glsl/ir.cpp @@ -662,6 +662,22 @@ ir_expression::get_operator(const char *str) return (ir_expression_operation) -1; } +ir_variable * +ir_expression::variable_referenced() const +{ + switch (operation) { + case ir_binop_vector_extract: + case ir_triop_vector_insert: + /* We get these for things like a[0] where a is a vector type. In these + * cases we want variable_referenced() to return the actual vector + * variable this is wrapping. 
+ */ + return operands[0]->variable_referenced(); + default: + return ir_rvalue::variable_referenced(); + } +} + ir_constant::ir_constant() : ir_rvalue(ir_type_constant) { diff --git a/src/glsl/ir.h b/src/glsl/ir.h index 43a2bf0ae1c..9c9f22d018b 100644 --- a/src/glsl/ir.h +++ b/src/glsl/ir.h @@ -1731,6 +1731,8 @@ public: virtual ir_visitor_status accept(ir_hierarchical_visitor *); + virtual ir_variable *variable_referenced() const; + ir_expression_operation operation; ir_rvalue *operands[4]; }; From 27dccf097d053b085c498a7bcab47197a5e83525 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Thu, 1 Oct 2015 09:08:20 +0200 Subject: [PATCH 108/270] mesa: Rename {Num}UniformBlocks to {Num}BufferInterfaceBlocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, these arrays in gl_shader and gl_shader_program hold both UBOs and SSBOs, so this looks like a better name. We were already using NumBufferInterfaceBlocks in gl_shader_program, so this makes things more consistent as well. In a later patch we will add {Num}UniformBlocks and {Num}ShaderStorageBlocks which will contain only references to UBOs and SSBOs respectively that will provide backends with a separate index space for both types of objects. 
Reviewed-by: Kristian Høgsberg --- src/glsl/link_uniform_initializers.cpp | 4 +- src/glsl/link_uniforms.cpp | 22 +++++----- src/glsl/linker.cpp | 44 +++++++++---------- src/glsl/lower_ubo_reference.cpp | 8 ++-- src/glsl/nir/glsl_to_nir.cpp | 2 +- src/glsl/standalone_scaffolding.cpp | 4 +- src/mesa/drivers/dri/i965/brw_shader.cpp | 4 +- .../drivers/dri/i965/brw_wm_surface_state.c | 10 ++--- src/mesa/main/mtypes.h | 6 +-- src/mesa/main/shader_query.cpp | 4 +- src/mesa/main/shaderapi.c | 4 +- src/mesa/main/shaderobj.c | 4 +- src/mesa/main/uniforms.c | 12 ++--- src/mesa/state_tracker/st_atom_constbuf.c | 4 +- src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 4 +- 15 files changed, 68 insertions(+), 68 deletions(-) diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp index 065257b5a0e..c48ca69c641 100644 --- a/src/glsl/link_uniform_initializers.cpp +++ b/src/glsl/link_uniform_initializers.cpp @@ -49,7 +49,7 @@ get_uniform_block_index(const gl_shader_program *shProg, const char *uniformBlockName) { for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) { - if (!strcmp(shProg->UniformBlocks[i].Name, uniformBlockName)) + if (!strcmp(shProg->BufferInterfaceBlocks[i].Name, uniformBlockName)) return i; } @@ -169,7 +169,7 @@ set_block_binding(gl_shader_program *prog, const char *block_name, int binding) if (stage_index != -1) { struct gl_shader *sh = prog->_LinkedShaders[i]; - sh->UniformBlocks[stage_index].Binding = binding; + sh->BufferInterfaceBlocks[stage_index].Binding = binding; } } } diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp index 0ccd9c8c865..5465687a954 100644 --- a/src/glsl/link_uniforms.cpp +++ b/src/glsl/link_uniforms.cpp @@ -502,9 +502,9 @@ public: for (unsigned i = 0; i < prog->NumBufferInterfaceBlocks; i++) { if (strncmp(var->get_interface_type()->name, - prog->UniformBlocks[i].Name, + prog->BufferInterfaceBlocks[i].Name, l) == 0 - && prog->UniformBlocks[i].Name[l] == '[') { + && 
prog->BufferInterfaceBlocks[i].Name[l] == '[') { ubo_block_index = i; break; } @@ -512,7 +512,7 @@ public: } else { for (unsigned i = 0; i < prog->NumBufferInterfaceBlocks; i++) { if (strcmp(var->get_interface_type()->name, - prog->UniformBlocks[i].Name) == 0) { + prog->BufferInterfaceBlocks[i].Name) == 0) { ubo_block_index = i; break; } @@ -530,7 +530,7 @@ public: ubo_byte_offset = 0; } else { const struct gl_uniform_block *const block = - &prog->UniformBlocks[ubo_block_index]; + &prog->BufferInterfaceBlocks[ubo_block_index]; assert(var->data.location != -1); @@ -971,10 +971,10 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) } const unsigned l = strlen(var->name); - for (unsigned i = 0; i < shader->NumUniformBlocks; i++) { - for (unsigned j = 0; j < shader->UniformBlocks[i].NumUniforms; j++) { + for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) { + for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) { if (sentinel) { - const char *begin = shader->UniformBlocks[i].Uniforms[j].Name; + const char *begin = shader->BufferInterfaceBlocks[i].Uniforms[j].Name; const char *end = strchr(begin, sentinel); if (end == NULL) @@ -989,7 +989,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) break; } } else if (!strcmp(var->name, - shader->UniformBlocks[i].Uniforms[j].Name)) { + shader->BufferInterfaceBlocks[i].Uniforms[j].Name)) { found = true; var->data.location = j; break; @@ -1115,10 +1115,10 @@ link_assign_uniform_locations(struct gl_shader_program *prog, sh->num_uniform_components = uniform_size.num_shader_uniform_components; sh->num_combined_uniform_components = sh->num_uniform_components; - for (unsigned i = 0; i < sh->NumUniformBlocks; i++) { - if (!sh->UniformBlocks[i].IsShaderStorage) { + for (unsigned i = 0; i < sh->NumBufferInterfaceBlocks; i++) { + if (!sh->BufferInterfaceBlocks[i].IsShaderStorage) { sh->num_combined_uniform_components += - sh->UniformBlocks[i].UniformBufferSize / 4; + 
sh->BufferInterfaceBlocks[i].UniformBufferSize / 4; } } } diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index a97b4ef0a32..8d30bea8cf0 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -1161,7 +1161,7 @@ cross_validate_uniforms(struct gl_shader_program *prog) } /** - * Accumulates the array of prog->UniformBlocks and checks that all + * Accumulates the array of prog->BufferInterfaceBlocks and checks that all * definitons of blocks agree on their contents. */ static bool @@ -1170,7 +1170,7 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog) unsigned max_num_uniform_blocks = 0; for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { if (prog->_LinkedShaders[i]) - max_num_uniform_blocks += prog->_LinkedShaders[i]->NumUniformBlocks; + max_num_uniform_blocks += prog->_LinkedShaders[i]->NumBufferInterfaceBlocks; } for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { @@ -1184,15 +1184,15 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog) if (sh == NULL) continue; - for (unsigned int j = 0; j < sh->NumUniformBlocks; j++) { + for (unsigned int j = 0; j < sh->NumBufferInterfaceBlocks; j++) { int index = link_cross_validate_uniform_block(prog, - &prog->UniformBlocks, + &prog->BufferInterfaceBlocks, &prog->NumBufferInterfaceBlocks, - &sh->UniformBlocks[j]); + &sh->BufferInterfaceBlocks[j]); if (index == -1) { linker_error(prog, "uniform block `%s' has mismatching definitions\n", - sh->UniformBlocks[j].Name); + sh->BufferInterfaceBlocks[j].Name); return false; } @@ -2064,9 +2064,9 @@ link_intrastage_shaders(void *mem_ctx, linked->ir = new(linked) exec_list; clone_ir_list(mem_ctx, linked->ir, main->ir); - linked->UniformBlocks = uniform_blocks; - linked->NumUniformBlocks = num_uniform_blocks; - ralloc_steal(linked, linked->UniformBlocks); + linked->BufferInterfaceBlocks = uniform_blocks; + linked->NumBufferInterfaceBlocks = num_uniform_blocks; + ralloc_steal(linked, linked->BufferInterfaceBlocks); 
link_fs_input_layout_qualifiers(prog, linked, shader_list, num_shaders); link_tcs_out_layout_qualifiers(prog, linked, shader_list, num_shaders); @@ -2804,19 +2804,19 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog) for (unsigned i = 0; i < prog->NumBufferInterfaceBlocks; i++) { /* Don't check SSBOs for Uniform Block Size */ - if (!prog->UniformBlocks[i].IsShaderStorage && - prog->UniformBlocks[i].UniformBufferSize > ctx->Const.MaxUniformBlockSize) { + if (!prog->BufferInterfaceBlocks[i].IsShaderStorage && + prog->BufferInterfaceBlocks[i].UniformBufferSize > ctx->Const.MaxUniformBlockSize) { linker_error(prog, "Uniform block %s too big (%d/%d)\n", - prog->UniformBlocks[i].Name, - prog->UniformBlocks[i].UniformBufferSize, + prog->BufferInterfaceBlocks[i].Name, + prog->BufferInterfaceBlocks[i].UniformBufferSize, ctx->Const.MaxUniformBlockSize); } - if (prog->UniformBlocks[i].IsShaderStorage && - prog->UniformBlocks[i].UniformBufferSize > ctx->Const.MaxShaderStorageBlockSize) { + if (prog->BufferInterfaceBlocks[i].IsShaderStorage && + prog->BufferInterfaceBlocks[i].UniformBufferSize > ctx->Const.MaxShaderStorageBlockSize) { linker_error(prog, "Shader storage block %s too big (%d/%d)\n", - prog->UniformBlocks[i].Name, - prog->UniformBlocks[i].UniformBufferSize, + prog->BufferInterfaceBlocks[i].Name, + prog->BufferInterfaceBlocks[i].UniformBufferSize, ctx->Const.MaxShaderStorageBlockSize); } @@ -2824,7 +2824,7 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog) if (prog->UniformBlockStageIndex[j][i] != -1) { struct gl_shader *sh = prog->_LinkedShaders[j]; int stage_index = prog->UniformBlockStageIndex[j][i]; - if (sh && sh->UniformBlocks[stage_index].IsShaderStorage) { + if (sh && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) { shader_blocks[j]++; total_shader_storage_blocks++; } else { @@ -2941,7 +2941,7 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog) for (unsigned j = 0; j < 
prog->NumBufferInterfaceBlocks; j++) { int stage_index = prog->UniformBlockStageIndex[i][j]; - if (stage_index != -1 && sh->UniformBlocks[stage_index].IsShaderStorage) + if (stage_index != -1 && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) total_shader_storage_blocks++; } @@ -3147,7 +3147,7 @@ should_add_buffer_variable(struct gl_shader_program *shProg, return true; for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) { - block_name = shProg->UniformBlocks[i].Name; + block_name = shProg->BufferInterfaceBlocks[i].Name; if (strncmp(block_name, name, strlen(block_name)) == 0) { found_interface = true; break; @@ -3480,10 +3480,10 @@ build_program_resource_list(struct gl_shader_program *shProg) /* Add program uniform blocks and shader storage blocks. */ for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) { - bool is_shader_storage = shProg->UniformBlocks[i].IsShaderStorage; + bool is_shader_storage = shProg->BufferInterfaceBlocks[i].IsShaderStorage; GLenum type = is_shader_storage ? 
GL_SHADER_STORAGE_BLOCK : GL_UNIFORM_BLOCK; if (!add_program_resource(shProg, type, - &shProg->UniformBlocks[i], 0)) + &shProg->BufferInterfaceBlocks[i], 0)) return; } diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp index c8ec5c19f41..6886f1439ec 100644 --- a/src/glsl/lower_ubo_reference.cpp +++ b/src/glsl/lower_ubo_reference.cpp @@ -279,8 +279,8 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var, /* Locate the ubo block by interface name */ this->uniform_block = NULL; - for (unsigned i = 0; i < shader->NumUniformBlocks; i++) { - if (strcmp(field_name, shader->UniformBlocks[i].Name) == 0) { + for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) { + if (strcmp(field_name, shader->BufferInterfaceBlocks[i].Name) == 0) { ir_constant *index = new(mem_ctx) ir_constant(i); @@ -292,9 +292,9 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var, this->uniform_block = index; } - this->is_shader_storage = shader->UniformBlocks[i].IsShaderStorage; + this->is_shader_storage = shader->BufferInterfaceBlocks[i].IsShaderStorage; - struct gl_uniform_block *block = &shader->UniformBlocks[i]; + struct gl_uniform_block *block = &shader->BufferInterfaceBlocks[i]; this->ubo_var = var->is_interface_instance() ? 
&block->Uniforms[0] : &block->Uniforms[var->data.location]; diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index 4b9201e2f60..6f67b1dae5b 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -152,7 +152,7 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, shader->info.name = ralloc_asprintf(shader, "GLSL%d", sh->Name); shader->info.num_textures = num_textures; - shader->info.num_ubos = sh->NumUniformBlocks; + shader->info.num_ubos = sh->NumBufferInterfaceBlocks; shader->info.num_abos = shader_prog->NumAtomicBuffers; shader->info.num_ssbos = shader_prog->NumBufferInterfaceBlocks; shader->info.num_images = sh->NumImages; diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp index f08e2d53506..59527927776 100644 --- a/src/glsl/standalone_scaffolding.cpp +++ b/src/glsl/standalone_scaffolding.cpp @@ -107,8 +107,8 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg) ralloc_free(shProg->InfoLog); shProg->InfoLog = ralloc_strdup(shProg, ""); - ralloc_free(shProg->UniformBlocks); - shProg->UniformBlocks = NULL; + ralloc_free(shProg->BufferInterfaceBlocks); + shProg->BufferInterfaceBlocks = NULL; shProg->NumBufferInterfaceBlocks = 0; for (i = 0; i < MESA_SHADER_STAGES; i++) { ralloc_free(shProg->UniformBlockStageIndex[i]); diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 0f743fb43c1..b41e842005a 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -1128,9 +1128,9 @@ brw_assign_common_binding_table_offsets(gl_shader_stage stage, next_binding_table_offset += num_textures; if (shader) { - assert(shader->NumUniformBlocks <= BRW_MAX_COMBINED_UBO_SSBO); + assert(shader->NumBufferInterfaceBlocks <= BRW_MAX_COMBINED_UBO_SSBO); stage_prog_data->binding_table.ubo_start = next_binding_table_offset; - next_binding_table_offset += shader->NumUniformBlocks; + 
next_binding_table_offset += shader->NumBufferInterfaceBlocks; } else { stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0; } diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index d73f657edc7..3c019771603 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -929,17 +929,17 @@ brw_upload_ubo_surfaces(struct brw_context *brw, uint32_t *surf_offsets = &stage_state->surf_offset[prog_data->binding_table.ubo_start]; - for (int i = 0; i < shader->NumUniformBlocks; i++) { + for (int i = 0; i < shader->NumBufferInterfaceBlocks; i++) { struct intel_buffer_object *intel_bo; /* Because behavior for referencing outside of the binding's size in the * glBindBufferRange case is undefined, we can just bind the whole buffer * glBindBufferBase wants and be a correct implementation. */ - if (!shader->UniformBlocks[i].IsShaderStorage) { + if (!shader->BufferInterfaceBlocks[i].IsShaderStorage) { struct gl_uniform_buffer_binding *binding; binding = - &ctx->UniformBufferBindings[shader->UniformBlocks[i].Binding]; + &ctx->UniformBufferBindings[shader->BufferInterfaceBlocks[i].Binding]; if (binding->BufferObject == ctx->Shared->NullBufferObj) { brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &surf_offsets[i]); } else { @@ -956,7 +956,7 @@ brw_upload_ubo_surfaces(struct brw_context *brw, } else { struct gl_shader_storage_buffer_binding *binding; binding = - &ctx->ShaderStorageBufferBindings[shader->UniformBlocks[i].Binding]; + &ctx->ShaderStorageBufferBindings[shader->BufferInterfaceBlocks[i].Binding]; if (binding->BufferObject == ctx->Shared->NullBufferObj) { brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &surf_offsets[i]); } else { @@ -973,7 +973,7 @@ brw_upload_ubo_surfaces(struct brw_context *brw, } } - if (shader->NumUniformBlocks) + if (shader->NumBufferInterfaceBlocks) brw->ctx.NewDriverState |= BRW_NEW_SURFACES; } diff --git a/src/mesa/main/mtypes.h 
b/src/mesa/main/mtypes.h index 62eb5927637..f62ad3416ea 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -2288,8 +2288,8 @@ struct gl_shader * * These fields are only set post-linking. */ - unsigned NumUniformBlocks; - struct gl_uniform_block *UniformBlocks; + unsigned NumBufferInterfaceBlocks; + struct gl_uniform_block *BufferInterfaceBlocks; struct exec_list *ir; struct exec_list *packed_varyings; @@ -2688,7 +2688,7 @@ struct gl_shader_program unsigned LastClipDistanceArraySize; unsigned NumBufferInterfaceBlocks; - struct gl_uniform_block *UniformBlocks; + struct gl_uniform_block *BufferInterfaceBlocks; /** * Indices into the _LinkedShaders's UniformBlocks[] array for each stage diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index 324e1a68fa4..50b44fa9e16 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -927,7 +927,7 @@ program_resource_top_level_array_size(struct gl_shader_program *shProg, int array_size = -1; char *var_name = get_top_level_name(name); char *interface_name = - get_top_level_name(shProg->UniformBlocks[block_index].Name); + get_top_level_name(shProg->BufferInterfaceBlocks[block_index].Name); if (strcmp(var_name, interface_name) == 0) { /* Deal with instanced array of SSBOs */ @@ -997,7 +997,7 @@ program_resource_top_level_array_stride(struct gl_shader_program *shProg, int array_stride = -1; char *var_name = get_top_level_name(name); char *interface_name = - get_top_level_name(shProg->UniformBlocks[block_index].Name); + get_top_level_name(shProg->BufferInterfaceBlocks[block_index].Name); if (strcmp(var_name, interface_name) == 0) { /* Deal with instanced array of SSBOs */ diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c index 9dd1054c8ee..6a2f60db77e 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -716,7 +716,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname, for (i = 0; i < 
shProg->NumBufferInterfaceBlocks; i++) { /* Add one for the terminating NUL character. */ - const GLint len = strlen(shProg->UniformBlocks[i].Name) + 1; + const GLint len = strlen(shProg->BufferInterfaceBlocks[i].Name) + 1; if (len > max_len) max_len = len; @@ -731,7 +731,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname, *params = 0; for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) { - if (!shProg->UniformBlocks[i].IsShaderStorage) + if (!shProg->BufferInterfaceBlocks[i].IsShaderStorage) (*params)++; } return; diff --git a/src/mesa/main/shaderobj.c b/src/mesa/main/shaderobj.c index 4e85fda24b4..ffc71931fec 100644 --- a/src/mesa/main/shaderobj.c +++ b/src/mesa/main/shaderobj.c @@ -290,8 +290,8 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg) ralloc_free(shProg->InfoLog); shProg->InfoLog = ralloc_strdup(shProg, ""); - ralloc_free(shProg->UniformBlocks); - shProg->UniformBlocks = NULL; + ralloc_free(shProg->BufferInterfaceBlocks); + shProg->BufferInterfaceBlocks = NULL; shProg->NumBufferInterfaceBlocks = 0; for (i = 0; i < MESA_SHADER_STAGES; i++) { ralloc_free(shProg->UniformBlockStageIndex[i]); diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c index 04cc81f9809..bc235380d97 100644 --- a/src/mesa/main/uniforms.c +++ b/src/mesa/main/uniforms.c @@ -1016,21 +1016,21 @@ _mesa_UniformBlockBinding(GLuint program, return; } - if (shProg->UniformBlocks[uniformBlockIndex].Binding != + if (shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding != uniformBlockBinding) { int i; FLUSH_VERTICES(ctx, 0); ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer; - shProg->UniformBlocks[uniformBlockIndex].Binding = uniformBlockBinding; + shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding = uniformBlockBinding; for (i = 0; i < MESA_SHADER_STAGES; i++) { int stage_index = shProg->UniformBlockStageIndex[i][uniformBlockIndex]; if (stage_index != -1) { struct gl_shader *sh = shProg->_LinkedShaders[i]; - 
sh->UniformBlocks[stage_index].Binding = uniformBlockBinding; + sh->BufferInterfaceBlocks[stage_index].Binding = uniformBlockBinding; } } } @@ -1069,21 +1069,21 @@ _mesa_ShaderStorageBlockBinding(GLuint program, return; } - if (shProg->UniformBlocks[shaderStorageBlockIndex].Binding != + if (shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding != shaderStorageBlockBinding) { int i; FLUSH_VERTICES(ctx, 0); ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer; - shProg->UniformBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding; + shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding; for (i = 0; i < MESA_SHADER_STAGES; i++) { int stage_index = shProg->UniformBlockStageIndex[i][shaderStorageBlockIndex]; if (stage_index != -1) { struct gl_shader *sh = shProg->_LinkedShaders[i]; - sh->UniformBlocks[stage_index].Binding = shaderStorageBlockBinding; + sh->BufferInterfaceBlocks[stage_index].Binding = shaderStorageBlockBinding; } } } diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c index 6affb4d84d5..69e26cb6c26 100644 --- a/src/mesa/state_tracker/st_atom_constbuf.c +++ b/src/mesa/state_tracker/st_atom_constbuf.c @@ -234,11 +234,11 @@ static void st_bind_ubos(struct st_context *st, if (!shader) return; - for (i = 0; i < shader->NumUniformBlocks; i++) { + for (i = 0; i < shader->NumBufferInterfaceBlocks; i++) { struct gl_uniform_buffer_binding *binding; struct st_buffer_object *st_obj; - binding = &st->ctx->UniformBufferBindings[shader->UniformBlocks[i].Binding]; + binding = &st->ctx->UniformBufferBindings[shader->BufferInterfaceBlocks[i].Binding]; st_obj = st_buffer_object(binding->BufferObject); cb.buffer = st_obj->buffer; diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index cdd80f167d0..06f510db536 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ 
b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -5388,10 +5388,10 @@ st_translate_program( } if (program->shader) { - unsigned num_ubos = program->shader->NumUniformBlocks; + unsigned num_ubos = program->shader->NumBufferInterfaceBlocks; for (i = 0; i < num_ubos; i++) { - unsigned size = program->shader->UniformBlocks[i].UniformBufferSize; + unsigned size = program->shader->BufferInterfaceBlocks[i].UniformBufferSize; unsigned num_const_vecs = (size + 15) / 16; unsigned first, last; assert(num_const_vecs > 0); From d31f98a272e429d5782192919b7628494ad1adf3 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Thu, 1 Oct 2015 10:17:30 +0200 Subject: [PATCH 109/270] mesa: Add {Num}UniformBlocks and {Num}ShaderStorageBlocks to gl_shader{_program} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These arrays provide backends with separate index spaces for UBOS and SSBOs. Reviewed-by: Kristian Høgsberg --- src/glsl/linker.cpp | 61 +++++++++++++++++++++++++++++ src/glsl/standalone_scaffolding.cpp | 9 +++++ src/mesa/main/mtypes.h | 49 ++++++++++++++++++++++- 3 files changed, 118 insertions(+), 1 deletion(-) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index 8d30bea8cf0..972bd40fa9f 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -3599,6 +3599,42 @@ link_assign_subroutine_types(struct gl_shader_program *prog) } } +static void +split_ubos_and_ssbos(void *mem_ctx, + struct gl_uniform_block *blocks, + unsigned num_blocks, + struct gl_uniform_block ***ubos, + unsigned *num_ubos, + struct gl_uniform_block ***ssbos, + unsigned *num_ssbos) +{ + unsigned num_ubo_blocks = 0; + unsigned num_ssbo_blocks = 0; + + for (unsigned i = 0; i < num_blocks; i++) { + if (blocks[i].IsShaderStorage) + num_ssbo_blocks++; + else + num_ubo_blocks++; + } + + *ubos = ralloc_array(mem_ctx, gl_uniform_block *, num_ubo_blocks); + *num_ubos = 0; + + *ssbos = ralloc_array(mem_ctx, gl_uniform_block *, num_ssbo_blocks); + *num_ssbos = 0; + + 
for (unsigned i = 0; i < num_blocks; i++) { + if (blocks[i].IsShaderStorage) { + (*ssbos)[(*num_ssbos)++] = &blocks[i]; + } else { + (*ubos)[(*num_ubos)++] = &blocks[i]; + } + } + + assert(*num_ubos + *num_ssbos == num_blocks); +} + void link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) { @@ -4110,6 +4146,31 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) } } + /* Split BufferInterfaceBlocks into UniformBlocks and ShaderStorageBlocks + * for gl_shader_program and gl_shader, so that drivers that need separate + * index spaces for each set can have that. + */ + for (unsigned i = MESA_SHADER_VERTEX; i <= MESA_SHADER_FRAGMENT; i++) { + if (prog->_LinkedShaders[i] != NULL) { + gl_shader *sh = prog->_LinkedShaders[i]; + split_ubos_and_ssbos(sh, + sh->BufferInterfaceBlocks, + sh->NumBufferInterfaceBlocks, + &sh->UniformBlocks, + &sh->NumUniformBlocks, + &sh->ShaderStorageBlocks, + &sh->NumShaderStorageBlocks); + } + } + + split_ubos_and_ssbos(prog, + prog->BufferInterfaceBlocks, + prog->NumBufferInterfaceBlocks, + &prog->UniformBlocks, + &prog->NumUniformBlocks, + &prog->ShaderStorageBlocks, + &prog->NumShaderStorageBlocks); + /* FINISHME: Assign fragment shader output locations. 
*/ done: diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp index 59527927776..eccf094b5cd 100644 --- a/src/glsl/standalone_scaffolding.cpp +++ b/src/glsl/standalone_scaffolding.cpp @@ -110,6 +110,15 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg) ralloc_free(shProg->BufferInterfaceBlocks); shProg->BufferInterfaceBlocks = NULL; shProg->NumBufferInterfaceBlocks = 0; + + ralloc_free(shProg->UniformBlocks); + shProg->UniformBlocks = NULL; + shProg->NumUniformBlocks = 0; + + ralloc_free(shProg->ShaderStorageBlocks); + shProg->ShaderStorageBlocks = NULL; + shProg->NumShaderStorageBlocks = 0; + for (i = 0; i < MESA_SHADER_STAGES; i++) { ralloc_free(shProg->UniformBlockStageIndex[i]); shProg->UniformBlockStageIndex[i] = NULL; diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index f62ad3416ea..f7118c1e7a6 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -2284,13 +2284,35 @@ struct gl_shader unsigned num_combined_uniform_components; /** - * This shader's uniform block information. + * This shader's uniform/ssbo block information. * * These fields are only set post-linking. + * + * BufferInterfaceBlocks is a list containing both UBOs and SSBOs. This is + * useful during the linking process so that we don't have to handle SSBOs + * specifically. + * + * UniformBlocks is a list of UBOs. This is useful for backends that need + * or prefer to see separate index spaces for UBOS and SSBOs like the GL + * API specifies. + * + * ShaderStorageBlocks is a list of SSBOs. This is useful for backends that + * need or prefer to see separate index spaces for UBOS and SSBOs like the + * GL API specifies. + * + * UniformBlocks and ShaderStorageBlocks only have pointers into + * BufferInterfaceBlocks so the actual resource information is not + * duplicated. 
*/ unsigned NumBufferInterfaceBlocks; struct gl_uniform_block *BufferInterfaceBlocks; + unsigned NumUniformBlocks; + struct gl_uniform_block **UniformBlocks; + + unsigned NumShaderStorageBlocks; + struct gl_uniform_block **ShaderStorageBlocks; + struct exec_list *ir; struct exec_list *packed_varyings; struct glsl_symbol_table *symbols; @@ -2687,9 +2709,34 @@ struct gl_shader_program */ unsigned LastClipDistanceArraySize; + /** + * This shader's uniform/ssbo block information. + * + * BufferInterfaceBlocks is a list containing both UBOs and SSBOs. This is + * useful during the linking process so that we don't have to handle SSBOs + * specifically. + * + * UniformBlocks is a list of UBOs. This is useful for backends that need + * or prefer to see separate index spaces for UBOS and SSBOs like the GL + * API specifies. + * + * ShaderStorageBlocks is a list of SSBOs. This is useful for backends that + * need or prefer to see separate index spaces for UBOS and SSBOs like the + * GL API specifies. + * + * UniformBlocks and ShaderStorageBlocks only have pointers into + * BufferInterfaceBlocks so the actual resource information is not + * duplicated and are only set after linking. + */ unsigned NumBufferInterfaceBlocks; struct gl_uniform_block *BufferInterfaceBlocks; + unsigned NumUniformBlocks; + struct gl_uniform_block **UniformBlocks; + + unsigned NumShaderStorageBlocks; + struct gl_uniform_block **ShaderStorageBlocks; + /** * Indices into the _LinkedShaders's UniformBlocks[] array for each stage * they're used in, or -1. 
From 56e2bdbca36a20f2601d32830a7b4ef556803ebe Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Fri, 9 Oct 2015 13:54:41 +0200 Subject: [PATCH 110/270] glsl/lower_ubo_reference: lower UBOs and SSBOs to separate index spaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Kristian Høgsberg --- src/glsl/lower_ubo_reference.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp index 6886f1439ec..da2713e4ab5 100644 --- a/src/glsl/lower_ubo_reference.cpp +++ b/src/glsl/lower_ubo_reference.cpp @@ -277,10 +277,20 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var, interface_field_name(mem_ctx, (char *) var->get_interface_type()->name, deref, &nonconst_block_index); - /* Locate the ubo block by interface name */ + /* Locate the block by interface name */ + this->is_shader_storage = var->is_in_shader_storage_block(); + unsigned num_blocks; + struct gl_uniform_block **blocks; + if (this->is_shader_storage) { + num_blocks = shader->NumShaderStorageBlocks; + blocks = shader->ShaderStorageBlocks; + } else { + num_blocks = shader->NumUniformBlocks; + blocks = shader->UniformBlocks; + } this->uniform_block = NULL; - for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) { - if (strcmp(field_name, shader->BufferInterfaceBlocks[i].Name) == 0) { + for (unsigned i = 0; i < num_blocks; i++) { + if (strcmp(field_name, blocks[i]->Name) == 0) { ir_constant *index = new(mem_ctx) ir_constant(i); @@ -292,12 +302,8 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var, this->uniform_block = index; } - this->is_shader_storage = shader->BufferInterfaceBlocks[i].IsShaderStorage; - - struct gl_uniform_block *block = &shader->BufferInterfaceBlocks[i]; - this->ubo_var = var->is_interface_instance() - ? &block->Uniforms[0] : &block->Uniforms[var->data.location]; + ? 
&blocks[i]->Uniforms[0] : &blocks[i]->Uniforms[var->data.location]; break; } From d3f45888045c84b2bc382a34d169a0ede4774a24 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Fri, 9 Oct 2015 14:41:21 +0200 Subject: [PATCH 111/270] i965: Adapt SSBOs to work with their own separate index space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Kristian Høgsberg --- src/mesa/drivers/dri/i965/brw_context.h | 4 +- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 24 +++--- src/mesa/drivers/dri/i965/brw_shader.cpp | 9 ++- src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 24 +++--- .../drivers/dri/i965/brw_wm_surface_state.c | 79 +++++++++---------- 5 files changed, 71 insertions(+), 69 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index e59478a448a..4aba7b814c6 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -352,6 +352,7 @@ struct brw_stage_prog_data { uint32_t texture_start; uint32_t gather_texture_start; uint32_t ubo_start; + uint32_t ssbo_start; uint32_t abo_start; uint32_t image_start; uint32_t shader_time_start; @@ -717,9 +718,6 @@ struct brw_vs_prog_data { /** Max number of SSBOs in a shader */ #define BRW_MAX_SSBO 12 -/** Max number of combined UBOs and SSBOs in a shader */ -#define BRW_MAX_COMBINED_UBO_SSBO (BRW_MAX_UBO + BRW_MAX_SSBO) - /** Max number of atomic counter buffer objects in a shader */ #define BRW_MAX_ABO 16 diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 21d2967935a..05f3f63204b 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -1482,21 +1482,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg surf_index; if (const_uniform_block) { - unsigned index = stage_prog_data->binding_table.ubo_start + + unsigned index = 
stage_prog_data->binding_table.ssbo_start + const_uniform_block->u[0]; surf_index = fs_reg(index); brw_mark_surface_used(prog_data, index); } else { surf_index = vgrf(glsl_type::uint_type); bld.ADD(surf_index, get_nir_src(instr->src[0]), - fs_reg(stage_prog_data->binding_table.ubo_start)); + fs_reg(stage_prog_data->binding_table.ssbo_start)); surf_index = bld.emit_uniformize(surf_index); /* Assume this may touch any UBO. It would be nice to provide * a tighter bound, but the array information is already lowered away. */ brw_mark_surface_used(prog_data, - stage_prog_data->binding_table.ubo_start + + stage_prog_data->binding_table.ssbo_start + nir->info.num_ssbos - 1); } @@ -1738,18 +1738,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[1]); if (const_uniform_block) { - unsigned index = stage_prog_data->binding_table.ubo_start + + unsigned index = stage_prog_data->binding_table.ssbo_start + const_uniform_block->u[0]; surf_index = fs_reg(index); brw_mark_surface_used(prog_data, index); } else { surf_index = vgrf(glsl_type::uint_type); bld.ADD(surf_index, get_nir_src(instr->src[1]), - fs_reg(stage_prog_data->binding_table.ubo_start)); + fs_reg(stage_prog_data->binding_table.ssbo_start)); surf_index = bld.emit_uniformize(surf_index); brw_mark_surface_used(prog_data, - stage_prog_data->binding_table.ubo_start + + stage_prog_data->binding_table.ssbo_start + nir->info.num_ssbos - 1); } @@ -1864,7 +1864,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_get_buffer_size: { nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]); - unsigned ubo_index = const_uniform_block ? const_uniform_block->u[0] : 0; + unsigned ssbo_index = const_uniform_block ? 
const_uniform_block->u[0] : 0; int reg_width = dispatch_width / 8; /* Set LOD = 0 */ @@ -1875,7 +1875,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr BRW_REGISTER_TYPE_UD); bld.LOAD_PAYLOAD(src_payload, &source, 1, 0); - fs_reg surf_index = fs_reg(prog_data->binding_table.ubo_start + ubo_index); + fs_reg surf_index = fs_reg(prog_data->binding_table.ssbo_start + ssbo_index); fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, dest, src_payload, surf_index); inst->header_size = 0; @@ -1928,20 +1928,20 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, fs_reg surface; nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]); if (const_surface) { - unsigned surf_index = stage_prog_data->binding_table.ubo_start + + unsigned surf_index = stage_prog_data->binding_table.ssbo_start + const_surface->u[0]; surface = fs_reg(surf_index); brw_mark_surface_used(prog_data, surf_index); } else { surface = vgrf(glsl_type::uint_type); bld.ADD(surface, get_nir_src(instr->src[0]), - fs_reg(stage_prog_data->binding_table.ubo_start)); + fs_reg(stage_prog_data->binding_table.ssbo_start)); - /* Assume this may touch any UBO. This is the same we do for other + /* Assume this may touch any SSBO. This is the same we do for other * UBO/SSBO accesses with non-constant surface. 
*/ brw_mark_surface_used(prog_data, - stage_prog_data->binding_table.ubo_start + + stage_prog_data->binding_table.ssbo_start + nir->info.num_ssbos - 1); } diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index b41e842005a..7ee0c66468c 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -1128,11 +1128,16 @@ brw_assign_common_binding_table_offsets(gl_shader_stage stage, next_binding_table_offset += num_textures; if (shader) { - assert(shader->NumBufferInterfaceBlocks <= BRW_MAX_COMBINED_UBO_SSBO); + assert(shader->NumUniformBlocks <= BRW_MAX_UBO); stage_prog_data->binding_table.ubo_start = next_binding_table_offset; - next_binding_table_offset += shader->NumBufferInterfaceBlocks; + next_binding_table_offset += shader->NumUniformBlocks; + + assert(shader->NumShaderStorageBlocks <= BRW_MAX_SSBO); + stage_prog_data->binding_table.ssbo_start = next_binding_table_offset; + next_binding_table_offset += shader->NumShaderStorageBlocks; } else { stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0; + stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0; } if (INTEL_DEBUG & DEBUG_SHADER_TIME) { diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 9e095fb52c8..0025f3647a1 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -423,10 +423,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) case nir_intrinsic_get_buffer_size: { nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]); - unsigned ubo_index = const_uniform_block ? const_uniform_block->u[0] : 0; + unsigned ssbo_index = const_uniform_block ? 
const_uniform_block->u[0] : 0; - src_reg surf_index = src_reg(prog_data->base.binding_table.ubo_start + - ubo_index); + src_reg surf_index = src_reg(prog_data->base.binding_table.ssbo_start + + ssbo_index); dst_reg result_dst = get_nir_dest(instr->dest); vec4_instruction *inst = new(mem_ctx) vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst); @@ -456,18 +456,18 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[1]); if (const_uniform_block) { - unsigned index = prog_data->base.binding_table.ubo_start + + unsigned index = prog_data->base.binding_table.ssbo_start + const_uniform_block->u[0]; surf_index = src_reg(index); brw_mark_surface_used(&prog_data->base, index); } else { surf_index = src_reg(this, glsl_type::uint_type); emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[1], 1), - src_reg(prog_data->base.binding_table.ubo_start))); + src_reg(prog_data->base.binding_table.ssbo_start))); surf_index = emit_uniformize(surf_index); brw_mark_surface_used(&prog_data->base, - prog_data->base.binding_table.ubo_start + + prog_data->base.binding_table.ssbo_start + nir->info.num_ssbos - 1); } @@ -599,7 +599,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) src_reg surf_index; if (const_uniform_block) { - unsigned index = prog_data->base.binding_table.ubo_start + + unsigned index = prog_data->base.binding_table.ssbo_start + const_uniform_block->u[0]; surf_index = src_reg(index); @@ -607,14 +607,14 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) } else { surf_index = src_reg(this, glsl_type::uint_type); emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], 1), - src_reg(prog_data->base.binding_table.ubo_start))); + src_reg(prog_data->base.binding_table.ssbo_start))); surf_index = emit_uniformize(surf_index); /* Assume this may touch any UBO. It would be nice to provide * a tighter bound, but the array information is already lowered away. 
*/ brw_mark_surface_used(&prog_data->base, - prog_data->base.binding_table.ubo_start + + prog_data->base.binding_table.ssbo_start + nir->info.num_ssbos - 1); } @@ -821,20 +821,20 @@ vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr) src_reg surface; nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]); if (const_surface) { - unsigned surf_index = prog_data->base.binding_table.ubo_start + + unsigned surf_index = prog_data->base.binding_table.ssbo_start + const_surface->u[0]; surface = src_reg(surf_index); brw_mark_surface_used(&prog_data->base, surf_index); } else { surface = src_reg(this, glsl_type::uint_type); emit(ADD(dst_reg(surface), get_nir_src(instr->src[0]), - src_reg(prog_data->base.binding_table.ubo_start))); + src_reg(prog_data->base.binding_table.ssbo_start))); /* Assume this may touch any UBO. This is the same we do for other * UBO/SSBO accesses with non-constant surface. */ brw_mark_surface_used(&prog_data->base, - prog_data->base.binding_table.ubo_start + + prog_data->base.binding_table.ssbo_start + nir->info.num_ssbos - 1); } diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index 3c019771603..a304eec3249 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -926,50 +926,49 @@ brw_upload_ubo_surfaces(struct brw_context *brw, if (!shader) return; - uint32_t *surf_offsets = + uint32_t *ubo_surf_offsets = &stage_state->surf_offset[prog_data->binding_table.ubo_start]; - for (int i = 0; i < shader->NumBufferInterfaceBlocks; i++) { - struct intel_buffer_object *intel_bo; + for (int i = 0; i < shader->NumUniformBlocks; i++) { + struct gl_uniform_buffer_binding *binding = + &ctx->UniformBufferBindings[shader->UniformBlocks[i]->Binding]; - /* Because behavior for referencing outside of the binding's size in the - * glBindBufferRange case is undefined, we can just bind the whole buffer - * 
glBindBufferBase wants and be a correct implementation. - */ - if (!shader->BufferInterfaceBlocks[i].IsShaderStorage) { - struct gl_uniform_buffer_binding *binding; - binding = - &ctx->UniformBufferBindings[shader->BufferInterfaceBlocks[i].Binding]; - if (binding->BufferObject == ctx->Shared->NullBufferObj) { - brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &surf_offsets[i]); - } else { - intel_bo = intel_buffer_object(binding->BufferObject); - drm_intel_bo *bo = - intel_bufferobj_buffer(brw, intel_bo, - binding->Offset, - binding->BufferObject->Size - binding->Offset); - brw_create_constant_surface(brw, bo, binding->Offset, - binding->BufferObject->Size - binding->Offset, - &surf_offsets[i], - dword_pitch); - } + if (binding->BufferObject == ctx->Shared->NullBufferObj) { + brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &ubo_surf_offsets[i]); } else { - struct gl_shader_storage_buffer_binding *binding; - binding = - &ctx->ShaderStorageBufferBindings[shader->BufferInterfaceBlocks[i].Binding]; - if (binding->BufferObject == ctx->Shared->NullBufferObj) { - brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &surf_offsets[i]); - } else { - intel_bo = intel_buffer_object(binding->BufferObject); - drm_intel_bo *bo = - intel_bufferobj_buffer(brw, intel_bo, - binding->Offset, - binding->BufferObject->Size - binding->Offset); - brw_create_buffer_surface(brw, bo, binding->Offset, - binding->BufferObject->Size - binding->Offset, - &surf_offsets[i], - dword_pitch); - } + struct intel_buffer_object *intel_bo = + intel_buffer_object(binding->BufferObject); + drm_intel_bo *bo = + intel_bufferobj_buffer(brw, intel_bo, + binding->Offset, + binding->BufferObject->Size - binding->Offset); + brw_create_constant_surface(brw, bo, binding->Offset, + binding->BufferObject->Size - binding->Offset, + &ubo_surf_offsets[i], + dword_pitch); + } + } + + uint32_t *ssbo_surf_offsets = + &stage_state->surf_offset[prog_data->binding_table.ssbo_start]; + + for (int i = 0; i < 
shader->NumShaderStorageBlocks; i++) { + struct gl_shader_storage_buffer_binding *binding = + &ctx->ShaderStorageBufferBindings[shader->ShaderStorageBlocks[i]->Binding]; + + if (binding->BufferObject == ctx->Shared->NullBufferObj) { + brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, &ssbo_surf_offsets[i]); + } else { + struct intel_buffer_object *intel_bo = + intel_buffer_object(binding->BufferObject); + drm_intel_bo *bo = + intel_bufferobj_buffer(brw, intel_bo, + binding->Offset, + binding->BufferObject->Size - binding->Offset); + brw_create_buffer_surface(brw, bo, binding->Offset, + binding->BufferObject->Size - binding->Offset, + &ssbo_surf_offsets[i], + dword_pitch); } } From b76159b09617a07b9e3d53d64d31d4835702827f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tapani=20P=C3=A4lli?= Date: Tue, 13 Oct 2015 14:17:49 +0300 Subject: [PATCH 112/270] glsl: add top level array size and stride to gl_uniform_storage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch adds 2 new fields to gl_uniform_storage so that we don't need to calculate these values during runtime shader queries. This is required by upcoming changes to free GLSL IR after linking. Patch moves 3 booleans inside structure so that structure size stays the same after this change. Signed-off-by: Tapani Pälli Reviewed-by: Samuel Iglesias Gonsálvez --- src/glsl/ir_uniform.h | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/glsl/ir_uniform.h b/src/glsl/ir_uniform.h index 50fe76b7ea2..1854279925b 100644 --- a/src/glsl/ir_uniform.h +++ b/src/glsl/ir_uniform.h @@ -161,6 +161,22 @@ struct gl_uniform_storage { /** @} */ + /** + * This is a compiler-generated uniform that should not be advertised + * via the API. + */ + bool hidden; + + /** + * This is a built-in uniform that should not be modified through any gl API. + */ + bool builtin; + + /** + * This is a shader storage buffer variable, not an uniform. 
+ */ + bool is_shader_storage; + /** * Index within gl_shader_program::AtomicBuffers[] of the atomic * counter buffer this uniform is stored in, or -1 if this is not @@ -181,20 +197,16 @@ struct gl_uniform_storage { unsigned num_compatible_subroutines; /** - * This is a compiler-generated uniform that should not be advertised - * via the API. + * A single integer identifying the number of active array elements of + * the top-level shader storage block member (GL_TOP_LEVEL_ARRAY_SIZE). */ - bool hidden; + unsigned top_level_array_size; /** - * This is a built-in uniform that should not be modified through any gl API. + * A single integer identifying the stride between array elements of the + * top-level shader storage block member. (GL_TOP_LEVEL_ARRAY_STRIDE). */ - bool builtin; - - /** - * This is a shader storage buffer variable, not an uniform. - */ - bool is_shader_storage; + unsigned top_level_array_stride; }; #ifdef __cplusplus From ac257f1070add308004c5c79a8acfdef5a6778da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tapani=20P=C3=A4lli?= Date: Wed, 14 Oct 2015 11:01:29 +0300 Subject: [PATCH 113/270] glsl: calculate TOP_LEVEL_ARRAY_SIZE and STRIDE when adding resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch moves existing calculation code from shader_query.cpp to happen during program resource list creation. No Piglit or CTS regressions were observed during testing. 
Signed-off-by: Tapani Pälli Reviewed-by: Samuel Iglesias Gonsálvez --- src/glsl/linker.cpp | 241 ++++++++++++++++++++++++++++++++ src/mesa/main/shader_query.cpp | 244 +-------------------------------- 2 files changed, 243 insertions(+), 242 deletions(-) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index 972bd40fa9f..d787b88cd39 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -3389,6 +3389,242 @@ add_packed_varyings(struct gl_shader_program *shProg, int stage) return true; } +static char* +get_top_level_name(const char *name) +{ + const char *first_dot = strchr(name, '.'); + const char *first_square_bracket = strchr(name, '['); + int name_size = 0; + /* From ARB_program_interface_query spec: + * + * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer identifying the + * number of active array elements of the top-level shader storage block + * member containing to the active variable is written to <params>. If the + * top-level block member is not declared as an array, the value one is + * written to <params>. If the top-level block member is an array with no + * declared size, the value zero is written to <params>. + */ + + /* The buffer variable is on top level.*/ + if (!first_square_bracket && !first_dot) + name_size = strlen(name); + else if ((!first_square_bracket || + (first_dot && first_dot < first_square_bracket))) + name_size = first_dot - name; + else + name_size = first_square_bracket - name; + + return strndup(name, name_size); +} + +static char* +get_var_name(const char *name) +{ + const char *first_dot = strchr(name, '.'); + + if (!first_dot) + return strdup(name); + + return strndup(first_dot+1, strlen(first_dot) - 1); +} + +static bool +is_top_level_shader_storage_block_member(const char* name, + const char* interface_name, + const char* field_name) +{ + bool result = false; + + /* If the given variable is already a top-level shader storage + * block member, then return array_size = 1.
+ * We could have two possibilities: if we have an instanced + * shader storage block or not instanced. + * + * For the first, we check create a name as it was in top level and + * compare it with the real name. If they are the same, then + * the variable is already at top-level. + * + * Full instanced name is: interface name + '.' + var name + + * NULL character + */ + int name_length = strlen(interface_name) + 1 + strlen(field_name) + 1; + char *full_instanced_name = (char *) calloc(name_length, sizeof(char)); + if (!full_instanced_name) { + fprintf(stderr, "%s: Cannot allocate space for name\n", __func__); + return false; + } + + snprintf(full_instanced_name, name_length, "%s.%s", + interface_name, field_name); + + /* Check if its top-level shader storage block member of an + * instanced interface block, or of a unnamed interface block. + */ + if (strcmp(name, full_instanced_name) == 0 || + strcmp(name, field_name) == 0) + result = true; + + free(full_instanced_name); + return result; +} + +static void +calculate_array_size(struct gl_shader_program *shProg, + struct gl_uniform_storage *uni) +{ + int block_index = uni->block_index; + int array_size = -1; + char *var_name = get_top_level_name(uni->name); + char *interface_name = + get_top_level_name(shProg->BufferInterfaceBlocks[block_index].Name); + + if (strcmp(var_name, interface_name) == 0) { + /* Deal with instanced array of SSBOs */ + char *temp_name = get_var_name(uni->name); + free(var_name); + var_name = get_top_level_name(temp_name); + free(temp_name); + } + + for (unsigned i = 0; i < shProg->NumShaders; i++) { + if (shProg->Shaders[i] == NULL) + continue; + + const gl_shader *stage = shProg->Shaders[i]; + foreach_in_list(ir_instruction, node, stage->ir) { + ir_variable *var = node->as_variable(); + if (!var || !var->get_interface_type() || + var->data.mode != ir_var_shader_storage) + continue; + + const glsl_type *interface = var->get_interface_type(); + + if (strcmp(interface_name, interface->name) != 
0) + continue; + + for (unsigned i = 0; i < interface->length; i++) { + const glsl_struct_field *field = &interface->fields.structure[i]; + if (strcmp(field->name, var_name) != 0) + continue; + /* From GL_ARB_program_interface_query spec: + * + * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer + * identifying the number of active array elements of the top-level + * shader storage block member containing to the active variable is + * written to <params>. If the top-level block member is not + * declared as an array, the value one is written to <params>. If + * the top-level block member is an array with no declared size, + * the value zero is written to <params>. + */ + if (is_top_level_shader_storage_block_member(uni->name, + interface_name, + var_name)) + array_size = 1; + else if (field->type->is_unsized_array()) + array_size = 0; + else if (field->type->is_array()) + array_size = field->type->length; + else + array_size = 1; + + goto found_top_level_array_size; + } + } + } +found_top_level_array_size: + free(interface_name); + free(var_name); + uni->top_level_array_size = array_size; +} + +static void +calculate_array_stride(struct gl_shader_program *shProg, + struct gl_uniform_storage *uni) +{ + int block_index = uni->block_index; + int array_stride = -1; + char *var_name = get_top_level_name(uni->name); + char *interface_name = + get_top_level_name(shProg->BufferInterfaceBlocks[block_index].Name); + + if (strcmp(var_name, interface_name) == 0) { + /* Deal with instanced array of SSBOs */ + char *temp_name = get_var_name(uni->name); + free(var_name); + var_name = get_top_level_name(temp_name); + free(temp_name); + } + + for (unsigned i = 0; i < shProg->NumShaders; i++) { + if (shProg->Shaders[i] == NULL) + continue; + + const gl_shader *stage = shProg->Shaders[i]; + foreach_in_list(ir_instruction, node, stage->ir) { + ir_variable *var = node->as_variable(); + if (!var || !var->get_interface_type() || + var->data.mode != ir_var_shader_storage) + continue; + + const glsl_type
*interface = var->get_interface_type(); + + if (strcmp(interface_name, interface->name) != 0) { + continue; + } + + for (unsigned i = 0; i < interface->length; i++) { + const glsl_struct_field *field = &interface->fields.structure[i]; + if (strcmp(field->name, var_name) != 0) + continue; + /* From GL_ARB_program_interface_query: + * + * "For the property TOP_LEVEL_ARRAY_STRIDE, a single integer + * identifying the stride between array elements of the top-level + * shader storage block member containing the active variable is + * written to <params>. For top-level block members declared as + * arrays, the value written is the difference, in basic machine + * units, between the offsets of the active variable for + * consecutive elements in the top-level array. For top-level + * block members not declared as an array, zero is written to + * <params>." + */ + if (field->type->is_array()) { + const enum glsl_matrix_layout matrix_layout = + glsl_matrix_layout(field->matrix_layout); + bool row_major = matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR; + const glsl_type *array_type = field->type->fields.array; + + if (is_top_level_shader_storage_block_member(uni->name, + interface_name, + var_name)) { + array_stride = 0; + goto found_top_level_array_stride; + } + if (interface->interface_packing != GLSL_INTERFACE_PACKING_STD430) { + if (array_type->is_record() || array_type->is_array()) { + array_stride = array_type->std140_size(row_major); + array_stride = glsl_align(array_stride, 16); + } else { + unsigned element_base_align = 0; + element_base_align = array_type->std140_base_alignment(row_major); + array_stride = MAX2(element_base_align, 16); + } + } else { + array_stride = array_type->std430_array_stride(row_major); + } + } else { + array_stride = 0; + } + goto found_top_level_array_stride; + } + } + } +found_top_level_array_stride: + free(interface_name); + free(var_name); + uni->top_level_array_stride = array_stride; +} + /** * Builds up a list of program resources that point to
existing * resource data. @@ -3473,6 +3709,11 @@ build_program_resource_list(struct gl_shader_program *shProg) shProg->UniformStorage[i].name)) continue; + if (is_shader_storage) { + calculate_array_size(shProg, &shProg->UniformStorage[i]); + calculate_array_stride(shProg, &shProg->UniformStorage[i]); + } + if (!add_program_resource(shProg, type, &shProg->UniformStorage[i], stageref)) return; diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index 50b44fa9e16..8182d3dcc04 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -839,244 +839,6 @@ program_resource_location(struct gl_shader_program *shProg, } } -static char* -get_top_level_name(const char *name) -{ - const char *first_dot = strchr(name, '.'); - const char *first_square_bracket = strchr(name, '['); - int name_size = 0; - /* From ARB_program_interface_query spec: - * - * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer identifying the - * number of active array elements of the top-level shader storage block - * member containing to the active variable is written to <params>. If the - * top-level block member is not declared as an array, the value one is - * written to <params>. If the top-level block member is an array with no - * declared size, the value zero is written to <params>.
- */ - - /* The buffer variable is on top level.*/ - if (!first_square_bracket && !first_dot) - name_size = strlen(name); - else if ((!first_square_bracket || - (first_dot && first_dot < first_square_bracket))) - name_size = first_dot - name; - else - name_size = first_square_bracket - name; - - return strndup(name, name_size); -} - -static char* -get_var_name(const char *name) -{ - const char *first_dot = strchr(name, '.'); - - if (!first_dot) - return strdup(name); - - return strndup(first_dot+1, strlen(first_dot) - 1); -} - -static bool -is_top_level_shader_storage_block_member(const char* name, - const char* interface_name, - const char* field_name) -{ - bool result = false; - - /* If the given variable is already a top-level shader storage - * block member, then return array_size = 1. - * We could have two possibilities: if we have an instanced - * shader storage block or not instanced. - * - * For the first, we check create a name as it was in top level and - * compare it with the real name. If they are the same, then - * the variable is already at top-level. - * - * Full instanced name is: interface name + '.' + var name + - * NULL character - */ - int name_length = strlen(interface_name) + 1 + strlen(field_name) + 1; - char *full_instanced_name = (char *) calloc(name_length, sizeof(char)); - if (!full_instanced_name) { - fprintf(stderr, "%s: Cannot allocate space for name\n", __func__); - return false; - } - - snprintf(full_instanced_name, name_length, "%s.%s", - interface_name, field_name); - - /* Check if its top-level shader storage block member of an - * instanced interface block, or of a unnamed interface block. 
- */ - if (strcmp(name, full_instanced_name) == 0 || - strcmp(name, field_name) == 0) - result = true; - - free(full_instanced_name); - return result; -} - -static GLint -program_resource_top_level_array_size(struct gl_shader_program *shProg, - struct gl_program_resource *res, - const char *name) -{ - int block_index = RESOURCE_UNI(res)->block_index; - int array_size = -1; - char *var_name = get_top_level_name(name); - char *interface_name = - get_top_level_name(shProg->BufferInterfaceBlocks[block_index].Name); - - if (strcmp(var_name, interface_name) == 0) { - /* Deal with instanced array of SSBOs */ - char *temp_name = get_var_name(name); - free(var_name); - var_name = get_top_level_name(temp_name); - free(temp_name); - } - - for (unsigned i = 0; i < shProg->NumShaders; i++) { - if (shProg->Shaders[i] == NULL) - continue; - - const gl_shader *stage = shProg->Shaders[i]; - foreach_in_list(ir_instruction, node, stage->ir) { - ir_variable *var = node->as_variable(); - if (!var || !var->get_interface_type() || - var->data.mode != ir_var_shader_storage) - continue; - - const glsl_type *interface = var->get_interface_type(); - - if (strcmp(interface_name, interface->name) != 0) - continue; - - for (unsigned i = 0; i < interface->length; i++) { - const glsl_struct_field *field = &interface->fields.structure[i]; - if (strcmp(field->name, var_name) != 0) - continue; - /* From GL_ARB_program_interface_query spec: - * - * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer - * identifying the number of active array elements of the top-level - * shader storage block member containing to the active variable is - * written to <params>. If the top-level block member is not - * declared as an array, the value one is written to <params>. If - * the top-level block member is an array with no declared size, - * the value zero is written to <params>.
- */ - if (is_top_level_shader_storage_block_member(name, - interface_name, - var_name)) - array_size = 1; - else if (field->type->is_unsized_array()) - array_size = 0; - else if (field->type->is_array()) - array_size = field->type->length; - else - array_size = 1; - - goto found_top_level_array_size; - } - } - } -found_top_level_array_size: - free(interface_name); - free(var_name); - return array_size; -} - -static GLint -program_resource_top_level_array_stride(struct gl_shader_program *shProg, - struct gl_program_resource *res, - const char *name) -{ - int block_index = RESOURCE_UNI(res)->block_index; - int array_stride = -1; - char *var_name = get_top_level_name(name); - char *interface_name = - get_top_level_name(shProg->BufferInterfaceBlocks[block_index].Name); - - if (strcmp(var_name, interface_name) == 0) { - /* Deal with instanced array of SSBOs */ - char *temp_name = get_var_name(name); - free(var_name); - var_name = get_top_level_name(temp_name); - free(temp_name); - } - - for (unsigned i = 0; i < shProg->NumShaders; i++) { - if (shProg->Shaders[i] == NULL) - continue; - - const gl_shader *stage = shProg->Shaders[i]; - foreach_in_list(ir_instruction, node, stage->ir) { - ir_variable *var = node->as_variable(); - if (!var || !var->get_interface_type() || - var->data.mode != ir_var_shader_storage) - continue; - - const glsl_type *interface = var->get_interface_type(); - - if (strcmp(interface_name, interface->name) != 0) { - continue; - } - - for (unsigned i = 0; i < interface->length; i++) { - const glsl_struct_field *field = &interface->fields.structure[i]; - if (strcmp(field->name, var_name) != 0) - continue; - /* From GL_ARB_program_interface_query: - * - * "For the property TOP_LEVEL_ARRAY_STRIDE, a single integer - * identifying the stride between array elements of the top-level - * shader storage block member containing the active variable is - * written to . 
For top-level block members declared as - * arrays, the value written is the difference, in basic machine - * units, between the offsets of the active variable for - * consecutive elements in the top-level array. For top-level - * block members not declared as an array, zero is written to - * ." - */ - if (field->type->is_array()) { - const enum glsl_matrix_layout matrix_layout = - glsl_matrix_layout(field->matrix_layout); - bool row_major = matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR; - const glsl_type *array_type = field->type->fields.array; - - if (is_top_level_shader_storage_block_member(name, - interface_name, - var_name)) { - array_stride = 0; - goto found_top_level_array_stride; - } - if (interface->interface_packing != GLSL_INTERFACE_PACKING_STD430) { - if (array_type->is_record() || array_type->is_array()) { - array_stride = array_type->std140_size(row_major); - array_stride = glsl_align(array_stride, 16); - } else { - unsigned element_base_align = 0; - element_base_align = array_type->std140_base_alignment(row_major); - array_stride = MAX2(element_base_align, 16); - } - } else { - array_stride = array_type->std430_array_stride(row_major); - } - } else { - array_stride = 0; - } - goto found_top_level_array_stride; - } - } - } -found_top_level_array_stride: - free(interface_name); - free(var_name); - return array_stride; -} - /** * Function implements following location queries: * glGetUniformLocation @@ -1444,14 +1206,12 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, case GL_TOP_LEVEL_ARRAY_SIZE: VALIDATE_TYPE(GL_BUFFER_VARIABLE); - *val = program_resource_top_level_array_size(shProg, res, - _mesa_program_resource_name(res)); + *val = RESOURCE_UNI(res)->top_level_array_size; return 1; case GL_TOP_LEVEL_ARRAY_STRIDE: VALIDATE_TYPE(GL_BUFFER_VARIABLE); - *val = program_resource_top_level_array_stride(shProg, res, - _mesa_program_resource_name(res)); + *val = RESOURCE_UNI(res)->top_level_array_stride; return 1; /* 
GL_ARB_tessellation_shader */ From 5423c1e855c65ae6f562895791aac982141266db Mon Sep 17 00:00:00 2001 From: Jose Fonseca Date: Wed, 14 Oct 2015 11:50:06 +0100 Subject: [PATCH 114/270] glsl: Include util/strndup.h. Fixes Windows builds. Trivial. --- src/glsl/linker.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index d787b88cd39..c61c76eff44 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -65,6 +65,7 @@ */ #include +#include "util/strndup.h" #include "main/core.h" #include "glsl_symbol_table.h" #include "glsl_parser_extras.h" From 93267887a06e760b4b20618523df5e8aa4e70307 Mon Sep 17 00:00:00 2001 From: Marta Lofstedt Date: Wed, 14 Oct 2015 13:35:32 +0200 Subject: [PATCH 115/270] glsl: Enable split of lower UBOs and SSBO also for compute shaders The split of Uniform blocks and shader storage block only loops up to MESA_SHADER_FRAGMENT and ignores compute shaders. This causes a segfault when running the OpenGL ES 3.1 CTS tests with GL_ARB_compute_shader enabled. V2: Changed to use MESA_SHADER_STAGES instead of MESA_SHADER_COMPUTE Reviewed-by: Francisco Jerez Signed-off-by: Marta Lofstedt --- src/glsl/linker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index c61c76eff44..c15034ba904 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -4392,7 +4392,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) * for gl_shader_program and gl_shader, so that drivers that need separate * index spaces for each set can have that. 
*/ - for (unsigned i = MESA_SHADER_VERTEX; i <= MESA_SHADER_FRAGMENT; i++) { + for (unsigned i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++) { if (prog->_LinkedShaders[i] != NULL) { gl_shader *sh = prog->_LinkedShaders[i]; split_ubos_and_ssbos(sh, From 14f7ce42484c31a45fcb6aabdf503f7496a9a94c Mon Sep 17 00:00:00 2001 From: Krzysztof Sobiecki Date: Wed, 14 Oct 2015 10:03:00 -0600 Subject: [PATCH 116/270] st/fbo: use pipe_surface_release instead of pipe_surface_reference pipe_surface_reference has problems with deleted contexts, so use of pipe_surface_release might be more appropriate. Fixes Wasteland 2 Director's Cut crash on start. Cc: mesa-stable@lists.freedesktop.org Reviewed-by: Brian Paul --- src/mesa/state_tracker/st_cb_fbo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c index ff703fa41cb..2a2eb0992c8 100644 --- a/src/mesa/state_tracker/st_cb_fbo.c +++ b/src/mesa/state_tracker/st_cb_fbo.c @@ -456,7 +456,7 @@ st_update_renderbuffer_surface(struct st_context *st, surf_tmpl.u.tex.first_layer = first_layer; surf_tmpl.u.tex.last_layer = last_layer; - pipe_surface_reference(&strb->surface, NULL); + pipe_surface_release(pipe, &strb->surface); strb->surface = pipe->create_surface(pipe, resource, &surf_tmpl); } From 77eef8137056314c4d458f215a899e3eec42e910 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Tue, 13 Oct 2015 09:32:34 -0600 Subject: [PATCH 117/270] mesa: remove unused texUnit local in _mesa_BindTextureUnit() The texture unit is error-checked before this and the texUnit var is unused, so remove it. 
Reviewed-by: Anuj Phogat --- src/mesa/main/texobj.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c index b571b1b2ff6..31829202944 100644 --- a/src/mesa/main/texobj.c +++ b/src/mesa/main/texobj.c @@ -1759,19 +1759,12 @@ _mesa_BindTextureUnit(GLuint unit, GLuint texture) { GET_CURRENT_CONTEXT(ctx); struct gl_texture_object *texObj; - struct gl_texture_unit *texUnit; if (unit >= _mesa_max_tex_unit(ctx)) { _mesa_error(ctx, GL_INVALID_VALUE, "glBindTextureUnit(unit=%u)", unit); return; } - texUnit = _mesa_get_tex_unit(ctx, unit); - assert(texUnit); - if (!texUnit) { - return; - } - if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE)) _mesa_debug(ctx, "glBindTextureUnit %s %d\n", _mesa_enum_to_string(GL_TEXTURE0+unit), (GLint) texture); From 9d4ce8073611355d94ec675500a9bc209790e86a Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Tue, 13 Oct 2015 09:34:53 -0600 Subject: [PATCH 118/270] mesa: minor indentation fix in _mesa_BindTextureUnit() --- src/mesa/main/texobj.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c index 31829202944..547055ecf39 100644 --- a/src/mesa/main/texobj.c +++ b/src/mesa/main/texobj.c @@ -1786,7 +1786,7 @@ _mesa_BindTextureUnit(GLuint unit, GLuint texture) /* Error checking */ if (!texObj) { _mesa_error(ctx, GL_INVALID_OPERATION, - "glBindTextureUnit(non-gen name)"); + "glBindTextureUnit(non-gen name)"); return; } if (texObj->Target == 0) { From 9abbf65d0ae8bd3e1e50d4bdfdf0d80191ace9aa Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Tue, 13 Oct 2015 20:12:56 -0600 Subject: [PATCH 119/270] mesa: remove unused functions in program.c replace_registers() and adjust_param_indexes() were unused. 
Reviewed-by: Matt Turner --- src/mesa/program/program.c | 51 -------------------------------------- 1 file changed, 51 deletions(-) diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c index c35a89b5983..86de5e965f1 100644 --- a/src/mesa/program/program.c +++ b/src/mesa/program/program.c @@ -449,57 +449,6 @@ _mesa_delete_instructions(struct gl_program *prog, GLuint start, GLuint count) } -/** - * Search instructions for registers that match (oldFile, oldIndex), - * replacing them with (newFile, newIndex). - */ -static void -replace_registers(struct prog_instruction *inst, GLuint numInst, - GLuint oldFile, GLuint oldIndex, - GLuint newFile, GLuint newIndex) -{ - GLuint i, j; - for (i = 0; i < numInst; i++) { - /* src regs */ - for (j = 0; j < _mesa_num_inst_src_regs(inst[i].Opcode); j++) { - if (inst[i].SrcReg[j].File == oldFile && - inst[i].SrcReg[j].Index == oldIndex) { - inst[i].SrcReg[j].File = newFile; - inst[i].SrcReg[j].Index = newIndex; - } - } - /* dst reg */ - if (inst[i].DstReg.File == oldFile && inst[i].DstReg.Index == oldIndex) { - inst[i].DstReg.File = newFile; - inst[i].DstReg.Index = newIndex; - } - } -} - - -/** - * Search instructions for references to program parameters. When found, - * increment the parameter index by 'offset'. - * Used when combining programs. - */ -static void -adjust_param_indexes(struct prog_instruction *inst, GLuint numInst, - GLuint offset) -{ - GLuint i, j; - for (i = 0; i < numInst; i++) { - for (j = 0; j < _mesa_num_inst_src_regs(inst[i].Opcode); j++) { - GLuint f = inst[i].SrcReg[j].File; - if (f == PROGRAM_CONSTANT || - f == PROGRAM_UNIFORM || - f == PROGRAM_STATE_VAR) { - inst[i].SrcReg[j].Index += offset; - } - } - } -} - - /** * Populate the 'used' array with flags indicating which registers (TEMPs, * INPUTs, OUTPUTs, etc, are used by the given program. 
From 63728dac57c18df0f45bb2482f60188fac2d1efe Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Mon, 12 Oct 2015 17:18:51 -0700 Subject: [PATCH 120/270] i965/fs: Simplify FS in brw_nir_lower_inputs to only support scalar mode Signed-off-by: Jordan Justen Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_nir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 4f35d81fc7e..5459ab59c1a 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -96,8 +96,9 @@ brw_nir_lower_inputs(nir_shader *nir, bool is_scalar) } break; case MESA_SHADER_FRAGMENT: + assert(is_scalar); nir_assign_var_locations(&nir->inputs, &nir->num_inputs, - is_scalar ? type_size_scalar : type_size_vec4); + type_size_scalar); break; default: unreachable("unsupported shader stage"); From 0d1eef536bc744f5c4dcdf854ad6adfdfe4f4dcb Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Wed, 14 Oct 2015 11:33:03 -0700 Subject: [PATCH 121/270] i965/fs: Ignore compute shaders in brw_nir_lower_inputs The commit shown below caused compute shaders to hit the unreachable in the default of the switch block. Since compute shaders don't have any inputs, we can make brw_nir_lower_inputs a no-op for CS. commit 2953c3d76178d7589947e6ea1dbd902b7b02b3d4 Author: Kenneth Graunke Date: Fri Aug 14 15:15:11 2015 -0700 i965/vs: Map scalar VS input locations properly; avoid tons of MOVs. 
Signed-off-by: Jordan Justen Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_nir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 5459ab59c1a..af9d0414d51 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -100,6 +100,10 @@ brw_nir_lower_inputs(nir_shader *nir, bool is_scalar) nir_assign_var_locations(&nir->inputs, &nir->num_inputs, type_size_scalar); break; + case MESA_SHADER_COMPUTE: + /* Compute shaders have no inputs. */ + assert(exec_list_is_empty(&nir->inputs)); + break; default: unreachable("unsupported shader stage"); } From ab04adcf63cb4553c66b703645c2991340b5637d Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Mon, 12 Oct 2015 20:22:14 -0700 Subject: [PATCH 122/270] glsl: Support uint index in do_vec_index_to_cond_assign MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ES31-CTS.compute_shader.pipeline-compute-chain test case generates an unsigned index by using gl_LocalInvocationID.x and gl_LocalInvocationID.y as array indices. Signed-off-by: Jordan Justen Reviewed-by: Tapani Pälli --- src/glsl/lower_vec_index_to_cond_assign.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/glsl/lower_vec_index_to_cond_assign.cpp b/src/glsl/lower_vec_index_to_cond_assign.cpp index 0c3394a504b..b6238825f8a 100644 --- a/src/glsl/lower_vec_index_to_cond_assign.cpp +++ b/src/glsl/lower_vec_index_to_cond_assign.cpp @@ -88,7 +88,9 @@ ir_vec_index_to_cond_assign_visitor::convert_vec_index_to_cond_assign(void *mem_ exec_list list; /* Store the index to a temporary to avoid reusing its tree. 
*/ - index = new(base_ir) ir_variable(glsl_type::int_type, + assert(orig_index->type == glsl_type::int_type || + orig_index->type == glsl_type::uint_type); + index = new(base_ir) ir_variable(orig_index->type, "vec_index_tmp_i", ir_var_temporary); list.push_tail(index); From a274eff9ffffaa7726e7e36f59c1051cd0dfa701 Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Mon, 12 Oct 2015 20:28:28 -0700 Subject: [PATCH 123/270] glsl: Support uint index in lower_vector_insert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ES31-CTS.compute_shader.pipeline-compute-chain test case generates an unsigned index by using gl_LocalInvocationID.x and gl_LocalInvocationID.y as array indices. Signed-off-by: Jordan Justen Reviewed-by: Tapani Pälli --- src/glsl/lower_vector_insert.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/glsl/lower_vector_insert.cpp b/src/glsl/lower_vector_insert.cpp index 6d7cfa94262..26d31b03c12 100644 --- a/src/glsl/lower_vector_insert.cpp +++ b/src/glsl/lower_vector_insert.cpp @@ -108,9 +108,13 @@ vector_insert_visitor::handle_rvalue(ir_rvalue **rv) factory.emit(assign(temp, expr->operands[0])); factory.emit(assign(src_temp, expr->operands[1])); + assert(expr->operands[2]->type == glsl_type::int_type || + expr->operands[2]->type == glsl_type::uint_type); + for (unsigned i = 0; i < expr->type->vector_elements; i++) { ir_constant *const cmp_index = - new(factory.mem_ctx) ir_constant(int(i)); + ir_constant::zero(factory.mem_ctx, expr->operands[2]->type); + cmp_index->value.u[0] = i; ir_variable *const cmp_result = factory.make_temp(glsl_type::bool_type, "index_condition"); From ff31c243e38332999b617d479a0dca61b15bc1c6 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Tue, 13 Oct 2015 15:15:57 -0700 Subject: [PATCH 124/270] i965: Don't hardcode FS in "validation failed!" message. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead, print "Scalar VS" or "Scalar FS". Otherwise it's really confusing which stage is broken. Signed-off-by: Kenneth Graunke Reviewed-by: Kristian Høgsberg --- src/mesa/drivers/dri/i965/brw_fs_validate.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_validate.cpp b/src/mesa/drivers/dri/i965/brw_fs_validate.cpp index d0e04f3bf47..814c551f1be 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_validate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_validate.cpp @@ -32,7 +32,7 @@ #define fsv_assert(cond) \ if (!(cond)) { \ - fprintf(stderr, "ASSERT: FS validation failed!\n"); \ + fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", stage_abbrev); \ dump_instruction(inst, stderr); \ fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, #cond); \ abort(); \ From db280e951a1bcb2318240cb6db296b31abac37cd Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Sun, 5 Jul 2015 15:18:10 +1000 Subject: [PATCH 125/270] glsl: Add support for linking uniform arrays of arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit V3: Fix setting of data.location for struct AoA UBO members V2: Handle arrays of arrays in the same way structures are handled The ARB_arrays_of_arrays spec doesn't give very many details on how AoA uniforms are intended to be implemented. However in the ARB_program_interface_query spec there are details that show AoA are intended to be handled in a similar way to structs. Issues 7 from the ARB_program_interface_query spec: We define rules consistent with our enumeration rules for other complex types. For existing one-dimensional arrays, we enumerate a single entry if the array is an array of basic types, or separate entries for each array element if the array is an array of structures. We follow similar rules here. 
For a uniform array such as: uniform vec4 a[5][4][3]; we enumerate twenty different entries ("a[0][0][0]" through "a[4][3][0]"), each of which is treated as an array with three elements. This is morally equivalent to what you'd get if you worked around the limitation in current GLSL via: struct ArrayBottom { vec4 c[3]; }; struct ArrayMid { ArrayBottom b[3]; }; uniform ArrayMid a[5]; which would enumerate "a[0].b[0].c[0]" through "a[4].b[3].c[0]". Reviewed-by: Samuel Iglesias Gonsálvez Reviewed-by: Ian Romanick --- src/glsl/link_uniform_initializers.cpp | 4 +++- src/glsl/link_uniforms.cpp | 16 +++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp index c48ca69c641..f929acab1ce 100644 --- a/src/glsl/link_uniform_initializers.cpp +++ b/src/glsl/link_uniform_initializers.cpp @@ -179,6 +179,7 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog, const char *name, const glsl_type *type, ir_constant *val, unsigned int boolean_true) { + const glsl_type *t_without_array = type->without_array(); if (type->is_record()) { ir_constant *field_constant; @@ -193,7 +194,8 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog, field_constant = (ir_constant *)field_constant->next; } return; - } else if (type->is_array() && type->fields.array->is_record()) { + } else if (t_without_array->is_record() || + (type->is_array() && type->fields.array->is_array())) { const glsl_type *const element_type = type->fields.array; for (unsigned int i = 0; i < type->length; i++) { diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp index 5465687a954..647aa2bbdd8 100644 --- a/src/glsl/link_uniforms.cpp +++ b/src/glsl/link_uniforms.cpp @@ -149,7 +149,8 @@ program_resource_visitor::process(ir_variable *var) recursion(var->type, &name, strlen(name), row_major, NULL, packing, false, record_array_count); ralloc_free(name); - } else if 
(t->without_array()->is_record()) { + } else if (t_without_array->is_record() || + (t->is_array() && t->fields.array->is_array())) { char *name = ralloc_strdup(NULL, var->name); recursion(var->type, &name, strlen(name), row_major, NULL, packing, false, record_array_count); @@ -231,7 +232,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name, this->leave_record(t, *name, row_major, packing); } } else if (t->without_array()->is_record() || - t->without_array()->is_interface()) { + t->without_array()->is_interface() || + (t->is_array() && t->fields.array->is_array())) { if (record_type == NULL && t->fields.array->is_record()) record_type = t->fields.array; @@ -387,6 +389,7 @@ private: { assert(!type->without_array()->is_record()); assert(!type->without_array()->is_interface()); + assert(!(type->is_array() && type->fields.array->is_array())); (void) row_major; @@ -712,6 +715,7 @@ private: { assert(!type->without_array()->is_record()); assert(!type->without_array()->is_interface()); + assert(!(type->is_array() && type->fields.array->is_array())); unsigned id; bool found = this->map->get(id, name); @@ -804,10 +808,11 @@ private: if (type->is_array()) { if (packing == GLSL_INTERFACE_PACKING_STD430) this->uniforms[id].array_stride = - type->fields.array->std430_array_stride(row_major); + type->without_array()->std430_array_stride(row_major); else this->uniforms[id].array_stride = - glsl_align(type->fields.array->std140_size(row_major), 16); + glsl_align(type->without_array()->std140_size(row_major), + 16); } else { this->uniforms[id].array_stride = 0; } @@ -966,7 +971,8 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) if (var->type->is_record()) { sentinel = '.'; - } else if (var->type->without_array()->is_record()) { + } else if (var->type->is_array() && (var->type->fields.array->is_array() + || var->type->without_array()->is_record())) { sentinel = '['; } From 296a7ea471fd327ab60d9723bd395e6b34dc9334 Mon Sep 17 00:00:00 2001 From: 
Timothy Arceri Date: Wed, 29 Jul 2015 11:57:44 +1000 Subject: [PATCH 126/270] glsl: add support for initialising sampler AoA Reviewed-by: Ian Romanick --- src/glsl/link_uniform_initializers.cpp | 83 +++++++++++++++----------- 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp index f929acab1ce..682a4eef13c 100644 --- a/src/glsl/link_uniform_initializers.cpp +++ b/src/glsl/link_uniform_initializers.cpp @@ -106,51 +106,64 @@ copy_constant_to_storage(union gl_constant_value *storage, * they have no storage and should be handled elsewhere. */ void -set_opaque_binding(gl_shader_program *prog, const char *name, int binding) +set_opaque_binding(void *mem_ctx, gl_shader_program *prog, + const glsl_type *type, const char *name, int *binding) { - struct gl_uniform_storage *const storage = - get_storage(prog->UniformStorage, prog->NumUniformStorage, name); - if (storage == NULL) { - assert(storage != NULL); - return; - } + if (type->is_array() && type->fields.array->is_array()) { + const glsl_type *const element_type = type->fields.array; - const unsigned elements = MAX2(storage->array_elements, 1); + for (unsigned int i = 0; i < type->length; i++) { + const char *element_name = ralloc_asprintf(mem_ctx, "%s[%d]", name, i); - /* Section 4.4.4 (Opaque-Uniform Layout Qualifiers) of the GLSL 4.20 spec - * says: - * - * "If the binding identifier is used with an array, the first element - * of the array takes the specified unit and each subsequent element - * takes the next consecutive unit." 
- */ - for (unsigned int i = 0; i < elements; i++) { - storage->storage[i].i = binding + i; - } + set_opaque_binding(mem_ctx, prog, element_type, + element_name, binding); + } + } else { + struct gl_uniform_storage *const storage = + get_storage(prog->UniformStorage, prog->NumUniformStorage, name); - for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) { - gl_shader *shader = prog->_LinkedShaders[sh]; + if (storage == NULL) { + assert(storage != NULL); + return; + } - if (shader) { - if (storage->type->base_type == GLSL_TYPE_SAMPLER && - storage->opaque[sh].active) { - for (unsigned i = 0; i < elements; i++) { - const unsigned index = storage->opaque[sh].index + i; - shader->SamplerUnits[index] = storage->storage[i].i; - } + const unsigned elements = MAX2(storage->array_elements, 1); - } else if (storage->type->base_type == GLSL_TYPE_IMAGE && + /* Section 4.4.4 (Opaque-Uniform Layout Qualifiers) of the GLSL 4.20 spec + * says: + * + * "If the binding identifier is used with an array, the first element + * of the array takes the specified unit and each subsequent element + * takes the next consecutive unit." 
+ */ + for (unsigned int i = 0; i < elements; i++) { + storage->storage[i].i = (*binding)++; + } + + for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) { + gl_shader *shader = prog->_LinkedShaders[sh]; + + if (shader) { + if (storage->type->base_type == GLSL_TYPE_SAMPLER && + storage->opaque[sh].active) { + for (unsigned i = 0; i < elements; i++) { + const unsigned index = storage->opaque[sh].index + i; + shader->SamplerUnits[index] = storage->storage[i].i; + } + + } else if (storage->type->base_type == GLSL_TYPE_IMAGE && storage->opaque[sh].active) { - for (unsigned i = 0; i < elements; i++) { - const unsigned index = storage->opaque[sh].index + i; - shader->ImageUnits[index] = storage->storage[i].i; + for (unsigned i = 0; i < elements; i++) { + const unsigned index = storage->opaque[sh].index + i; + shader->ImageUnits[index] = storage->storage[i].i; + } } } } - } - storage->initialized = true; + storage->initialized = true; + } } void @@ -285,7 +298,9 @@ link_set_uniform_initializers(struct gl_shader_program *prog, if (type->without_array()->is_sampler() || type->without_array()->is_image()) { - linker::set_opaque_binding(prog, var->name, var->data.binding); + int binding = var->data.binding; + linker::set_opaque_binding(mem_ctx, prog, var->type, + var->name, &binding); } else if (var->is_in_buffer_block()) { const glsl_type *const iface_type = var->get_interface_type(); From 3129359ed7461b90fe6ea70641ec7a858dd656de Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Thu, 15 Oct 2015 14:32:41 +1100 Subject: [PATCH 127/270] glsl: allow AoA to be sized by initializer or constructor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit V2: Split out unsized array validation to its own patch as suggested by Samuel. 
Reviewed-by: Samuel Iglesias Gonsálvez --- src/glsl/ast.h | 15 ++-------- src/glsl/ast_array_index.cpp | 7 ++--- src/glsl/ast_function.cpp | 33 ++++++++++++++++++++- src/glsl/ast_to_hir.cpp | 57 +++++++++++++++++++++++++----------- src/glsl/glsl_parser.yy | 11 +++---- 5 files changed, 82 insertions(+), 41 deletions(-) diff --git a/src/glsl/ast.h b/src/glsl/ast.h index 67faacd0ef8..57d432d4b02 100644 --- a/src/glsl/ast.h +++ b/src/glsl/ast.h @@ -183,6 +183,7 @@ enum ast_operators { ast_post_dec, ast_field_selection, ast_array_index, + ast_unsized_array_dim, ast_function_call, @@ -324,16 +325,7 @@ public: class ast_array_specifier : public ast_node { public: - /** Unsized array specifier ([]) */ - explicit ast_array_specifier(const struct YYLTYPE &locp) - : is_unsized_array(true) - { - set_location(locp); - } - - /** Sized array specifier ([dim]) */ ast_array_specifier(const struct YYLTYPE &locp, ast_expression *dim) - : is_unsized_array(false) { set_location(locp); array_dimensions.push_tail(&dim->link); @@ -346,11 +338,8 @@ public: virtual void print(void) const; - /* If true, this means that the array has an unsized outermost dimension. */ - bool is_unsized_array; - /* This list contains objects of type ast_node containing the - * sized dimensions only, in outermost-to-innermost order. + * array dimensions in outermost-to-innermost order. 
*/ exec_list array_dimensions; }; diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp index 5e8f49d70b0..7855e0a6190 100644 --- a/src/glsl/ast_array_index.cpp +++ b/src/glsl/ast_array_index.cpp @@ -28,13 +28,10 @@ void ast_array_specifier::print(void) const { - if (this->is_unsized_array) { - printf("[ ] "); - } - foreach_list_typed (ast_node, array_dimension, link, &this->array_dimensions) { printf("[ "); - array_dimension->print(); + if (((ast_expression*)array_dimension)->oper != ast_unsized_array_dim) + array_dimension->print(); printf("] "); } } diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp index b72eb3ffb9e..c5c5cae333b 100644 --- a/src/glsl/ast_function.cpp +++ b/src/glsl/ast_function.cpp @@ -991,6 +991,7 @@ process_array_constructor(exec_list *instructions, } bool all_parameters_are_constant = true; + const glsl_type *element_type = constructor_type->fields.array; /* Type cast each parameter and, if possible, fold constants. */ foreach_in_list_safe(ir_rvalue, ir, &actual_parameters) { @@ -1017,12 +1018,34 @@ process_array_constructor(exec_list *instructions, } } - if (result->type != constructor_type->fields.array) { + if (constructor_type->fields.array->is_unsized_array()) { + /* As the inner parameters of the constructor are created without + * knowledge of each other we need to check to make sure unsized + * parameters of unsized constructors all end up with the same size. 
+ * + * e.g we make sure to fail for a constructor like this: + * vec4[][] a = vec4[][](vec4[](vec4(0.0), vec4(1.0)), + * vec4[](vec4(0.0), vec4(1.0), vec4(1.0)), + * vec4[](vec4(0.0), vec4(1.0))); + */ + if (element_type->is_unsized_array()) { + /* This is the first parameter so just get the type */ + element_type = result->type; + } else if (element_type != result->type) { + _mesa_glsl_error(loc, state, "type error in array constructor: " + "expected: %s, found %s", + element_type->name, + result->type->name); + return ir_rvalue::error_value(ctx); + } + } else if (result->type != constructor_type->fields.array) { _mesa_glsl_error(loc, state, "type error in array constructor: " "expected: %s, found %s", constructor_type->fields.array->name, result->type->name); return ir_rvalue::error_value(ctx); + } else { + element_type = result->type; } /* Attempt to convert the parameter to a constant valued expression. @@ -1039,6 +1062,14 @@ process_array_constructor(exec_list *instructions, ir->replace_with(result); } + if (constructor_type->fields.array->is_unsized_array()) { + constructor_type = + glsl_type::get_array_instance(element_type, + parameter_count); + assert(constructor_type != NULL); + assert(constructor_type->length == parameter_count); + } + if (all_parameters_are_constant) return new(ctx) ir_constant(constructor_type, &actual_parameters); diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index c04db3505c1..fb2c0f7026c 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -782,8 +782,30 @@ validate_assignment(struct _mesa_glsl_parse_state *state, * Note: Whole-array assignments are not permitted in GLSL 1.10, but this * is handled by ir_dereference::is_lvalue. 
*/ - if (lhs->type->is_unsized_array() && rhs->type->is_array() - && (lhs->type->fields.array == rhs->type->fields.array)) { + const glsl_type *lhs_t = lhs->type; + const glsl_type *rhs_t = rhs->type; + bool unsized_array = false; + while(lhs_t->is_array()) { + if (rhs_t == lhs_t) + break; /* the rest of the inner arrays match so break out early */ + if (!rhs_t->is_array()) { + unsized_array = false; + break; /* number of dimensions mismatch */ + } + if (lhs_t->length == rhs_t->length) { + lhs_t = lhs_t->fields.array; + rhs_t = rhs_t->fields.array; + continue; + } else if (lhs_t->is_unsized_array()) { + unsized_array = true; + } else { + unsized_array = false; + break; /* sized array mismatch */ + } + lhs_t = lhs_t->fields.array; + rhs_t = rhs_t->fields.array; + } + if (unsized_array) { if (is_initializer) { return rhs; } else { @@ -1810,6 +1832,10 @@ ast_expression::do_hir(exec_list *instructions, break; } + case ast_unsized_array_dim: + assert(!"ast_unsized_array_dim: Should never get here."); + break; + case ast_function_call: /* Should *NEVER* get here. ast_function_call should always be handled * by ast_function_expression::hir. @@ -2047,6 +2073,14 @@ process_array_size(exec_node *node, exec_list dummy_instructions; ast_node *array_size = exec_node_data(ast_node, node, link); + + /** + * Dimensions other than the outermost dimension can by unsized if they + * are immediately sized by a constructor or initializer. 
+ */ + if (((ast_expression*)array_size)->oper == ast_unsized_array_dim) + return 0; + ir_rvalue *const ir = array_size->hir(& dummy_instructions, state); YYLTYPE loc = array_size->get_location(); @@ -2115,14 +2149,6 @@ process_array_type(YYLTYPE *loc, const glsl_type *base, base->name); return glsl_type::error_type; } - - if (base->length == 0) { - _mesa_glsl_error(loc, state, - "only the outermost array dimension can " - "be unsized", - base->name); - return glsl_type::error_type; - } } for (exec_node *node = array_specifier->array_dimensions.tail_pred; @@ -2130,9 +2156,6 @@ process_array_type(YYLTYPE *loc, const glsl_type *base, unsigned array_size = process_array_size(node, state); array_type = glsl_type::get_array_instance(array_type, array_size); } - - if (array_specifier->is_unsized_array) - array_type = glsl_type::get_array_instance(array_type, 0); } return array_type; @@ -6453,6 +6476,9 @@ ast_interface_block::hir(exec_list *instructions, ir_variable *var; if (this->array_specifier != NULL) { + const glsl_type *block_array_type = + process_array_type(&loc, block_type, this->array_specifier, state); + /* Section 4.3.7 (Interface Blocks) of the GLSL 1.50 spec says: * * For uniform blocks declared an array, each individual array @@ -6476,7 +6502,7 @@ ast_interface_block::hir(exec_list *instructions, * tessellation control shader output, and tessellation evaluation * shader input. 
*/ - if (this->array_specifier->is_unsized_array) { + if (block_array_type->is_unsized_array()) { bool allow_inputs = state->stage == MESA_SHADER_GEOMETRY || state->stage == MESA_SHADER_TESS_CTRL || state->stage == MESA_SHADER_TESS_EVAL; @@ -6503,9 +6529,6 @@ ast_interface_block::hir(exec_list *instructions, } } - const glsl_type *block_array_type = - process_array_type(&loc, block_type, this->array_specifier, state); - /* From section 4.3.9 (Interface Blocks) of the GLSL ES 3.10 spec: * * * Arrays of arrays of blocks are not allowed diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy index c1bcccc34f4..16c91710bbb 100644 --- a/src/glsl/glsl_parser.yy +++ b/src/glsl/glsl_parser.yy @@ -1962,7 +1962,9 @@ array_specifier: '[' ']' { void *ctx = state; - $$ = new(ctx) ast_array_specifier(@1); + $$ = new(ctx) ast_array_specifier(@1, new(ctx) ast_expression( + ast_unsized_array_dim, NULL, + NULL, NULL)); $$->set_location_range(@1, @2); } | '[' constant_expression ']' @@ -1973,17 +1975,16 @@ array_specifier: } | array_specifier '[' ']' { + void *ctx = state; $$ = $1; if (!state->ARB_arrays_of_arrays_enable) { _mesa_glsl_error(& @1, state, "GL_ARB_arrays_of_arrays " "required for defining arrays of arrays"); - } else { - _mesa_glsl_error(& @1, state, - "only the outermost array dimension can " - "be unsized"); } + $$->add_dimension(new(ctx) ast_expression(ast_unsized_array_dim, NULL, + NULL, NULL)); } | array_specifier '[' constant_expression ']' { From dea0af8f826ca9ad638a158fdaacb2a4436f11d7 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Thu, 15 Oct 2015 14:35:41 +1100 Subject: [PATCH 128/270] glsl: check that only the outermost array is unsized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Samuel Iglesias Gonsálvez --- src/glsl/ast_to_hir.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index fb2c0f7026c..7b54eaf4390 
100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -2694,6 +2694,25 @@ is_conflicting_fragcoord_redeclaration(struct _mesa_glsl_parse_state *state, return false; } +static inline void +validate_array_dimensions(const glsl_type *t, + struct _mesa_glsl_parse_state *state, + YYLTYPE *loc) { + if (t->is_array()) { + t = t->fields.array; + while (t->is_array()) { + if (t->is_unsized_array()) { + _mesa_glsl_error(loc, state, + "only the outermost array dimension can " + "be unsized", + t->name); + break; + } + t = t->fields.array; + } + } +} + static void apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual, ir_variable *var, @@ -4436,6 +4455,8 @@ ast_declarator_list::hir(exec_list *instructions, result = process_initializer((earlier == NULL) ? var : earlier, decl, this->type, &initializer_instructions, state); + } else { + validate_array_dimensions(var_type, state, &loc); } /* From page 23 (page 29 of the PDF) of the GLSL 1.10 spec: @@ -5961,6 +5982,7 @@ ast_process_structure_or_interface_block(exec_list *instructions, const struct glsl_type *field_type = process_array_type(&loc, decl_type, decl->array_specifier, state); + validate_array_dimensions(field_type, state, &loc); fields[i].type = field_type; fields[i].name = decl->identifier; fields[i].location = -1; From d337da81f28d6a5a65d0a09f9b1ddf905dc7c3aa Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Tue, 24 Jun 2014 07:43:05 +1000 Subject: [PATCH 129/270] glsl: dont allow gl_PerVertex to be redeclared as an array of arrays V3: move patch after fixes to ast for AoA and add const to helper as suggested by Ian V2: move single dimensional array detection into a helper Signed-off-by: Timothy Arceri Reviewed-by: Ian Romanick --- src/glsl/ast.h | 6 ++++++ src/glsl/ast_to_hir.cpp | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/glsl/ast.h b/src/glsl/ast.h index 57d432d4b02..e803e6d7675 100644 --- a/src/glsl/ast.h +++ b/src/glsl/ast.h @@ -336,6 +336,12 @@ 
public: array_dimensions.push_tail(&dim->link); } + const bool is_single_dimension() + { + return this->array_dimensions.tail_pred->prev != NULL && + this->array_dimensions.tail_pred->prev->is_head_sentinel(); + } + virtual void print(void) const; /* This list contains objects of type ast_node containing the diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index 7b54eaf4390..db617cb41ac 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -6335,7 +6335,8 @@ ast_interface_block::hir(exec_list *instructions, _mesa_shader_stage_to_string(state->stage)); } if (this->instance_name == NULL || - strcmp(this->instance_name, "gl_in") != 0 || this->array_specifier == NULL) { + strcmp(this->instance_name, "gl_in") != 0 || this->array_specifier == NULL || + !this->array_specifier->is_single_dimension()) { _mesa_glsl_error(&loc, state, "gl_PerVertex input must be redeclared as " "gl_in[]"); From 410609c9688d3f6ad808d33928a7d9589a708a40 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Sat, 11 Jul 2015 21:38:54 +1000 Subject: [PATCH 130/270] glsl: remove dead code in a single pass Currently only one ir assignment is removed for each var in a single dead code optimisation pass. This means if a var has more than one assignment, then it requires all the glsl optimisations to be run again for each additional assignment to be removed. Another pass is also required to remove the variable itself. With this change all assignments and the variable are removed in a single pass. Some of the arrays of arrays conformance tests that were looping through 8 dimensions ended up with a var with hundreds of assignments. 
This change helps ES31-CTS.arrays_of_arrays.InteractionFunctionCalls1 go from around 3 min 20 sec -> 2 min ES31-CTS.arrays_of_arrays.InteractionFunctionCalls2 went from around 9 min 20 sec to 7 min 30 sec I had difficulty getting the public shader-db to give a consistent result with or without this change but the results seemed unchanged at between 15-20 seconds. Thomas Helland measured change with shader-db on his machine from approx 117 secs to 112 secs. V3: Simplify freeing of list as suggested by Ian, and spelling fixes. V2: Add assert to be sure references are counted before assignments. Reviewed-by: Ian Romanick Tested-By: Thomas Helland Tested-by: Ian Romanick --- src/glsl/ir_variable_refcount.cpp | 26 +++++++++++++++++++++--- src/glsl/ir_variable_refcount.h | 13 +++++++++++- src/glsl/opt_dead_code.cpp | 33 ++++++++++++++++++++----------- src/glsl/opt_tree_grafting.cpp | 2 -- 4 files changed, 57 insertions(+), 17 deletions(-) diff --git a/src/glsl/ir_variable_refcount.cpp b/src/glsl/ir_variable_refcount.cpp index e4d825c454b..790627bd1e3 100644 --- a/src/glsl/ir_variable_refcount.cpp +++ b/src/glsl/ir_variable_refcount.cpp @@ -46,6 +46,15 @@ static void free_entry(struct hash_entry *entry) { ir_variable_refcount_entry *ivre = (ir_variable_refcount_entry *) entry->data; + + /* Free assignment list */ + exec_node *n; + while ((n = ivre->assign_list.pop_head()) != NULL) { + struct assignment_entry *assignment_entry = + exec_node_data(struct assignment_entry, n, link); + free(assignment_entry); + } + delete ivre; } @@ -59,7 +68,6 @@ ir_variable_refcount_visitor::~ir_variable_refcount_visitor() ir_variable_refcount_entry::ir_variable_refcount_entry(ir_variable *var) { this->var = var; - assign = NULL; assigned_count = 0; declaration = false; referenced_count = 0; @@ -125,8 +133,20 @@ ir_variable_refcount_visitor::visit_leave(ir_assignment *ir) entry = this->get_variable_entry(ir->lhs->variable_referenced()); if (entry) { entry->assigned_count++; - if 
(entry->assign == NULL) - entry->assign = ir; + + /* Build a list for dead code optimisation. Don't add assignment if it + * was declared out of scope (outside the instruction stream). Also don't + * bother adding any more to the list if there are more references than + * assignments as this means the variable is used and won't be optimised + * out. + */ + assert(entry->referenced_count >= entry->assigned_count); + if (entry->referenced_count == entry->assigned_count) { + struct assignment_entry *assignment_entry = + (struct assignment_entry *)calloc(1, sizeof(*assignment_entry)); + assignment_entry->assign = ir; + entry->assign_list.push_head(&assignment_entry->link); + } } return visit_continue; diff --git a/src/glsl/ir_variable_refcount.h b/src/glsl/ir_variable_refcount.h index c15e8110d04..5c74c314781 100644 --- a/src/glsl/ir_variable_refcount.h +++ b/src/glsl/ir_variable_refcount.h @@ -33,13 +33,24 @@ #include "ir_visitor.h" #include "glsl_types.h" +struct assignment_entry { + exec_node link; + ir_assignment *assign; +}; + class ir_variable_refcount_entry { public: ir_variable_refcount_entry(ir_variable *var); ir_variable *var; /* The key: the variable's pointer. */ - ir_assignment *assign; /* An assignment to the variable, if any */ + + /** + * List of assignments to the variable, if any. + * This is intended to be used for dead code optimisation and may + * not be a complete list. + */ + exec_list assign_list; /** Number of times the variable is referenced, including assignments. */ unsigned referenced_count; diff --git a/src/glsl/opt_dead_code.cpp b/src/glsl/opt_dead_code.cpp index 071485ad31b..c5be166e75a 100644 --- a/src/glsl/opt_dead_code.cpp +++ b/src/glsl/opt_dead_code.cpp @@ -75,24 +75,35 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned) || !entry->declaration) continue; - if (entry->assign) { - /* Remove a single dead assignment to the variable we found. 
- * Don't do so if it's a shader or function output or a shader - * storage variable though. + if (!entry->assign_list.is_empty()) { + /* Remove all the dead assignments to the variable we found. + * Don't do so if it's a shader or function output or a shader storage variable, though. */ if (entry->var->data.mode != ir_var_function_out && entry->var->data.mode != ir_var_function_inout && entry->var->data.mode != ir_var_shader_out && entry->var->data.mode != ir_var_shader_storage) { - entry->assign->remove(); - progress = true; - if (debug) { - printf("Removed assignment to %s@%p\n", - entry->var->name, (void *) entry->var); - } + while (!entry->assign_list.is_empty()) { + struct assignment_entry *assignment_entry = + exec_node_data(struct assignment_entry, + entry->assign_list.head, link); + + assignment_entry->assign->remove(); + + if (debug) { + printf("Removed assignment to %s@%p\n", + entry->var->name, (void *) entry->var); + } + + assignment_entry->link.remove(); + free(assignment_entry); + } + progress = true; } - } else { + } + + if (entry->assign_list.is_empty()) { /* If there are no assignments or references to the variable left, * then we can remove its declaration. */ diff --git a/src/glsl/opt_tree_grafting.cpp b/src/glsl/opt_tree_grafting.cpp index a7a219c55ca..e38a0e93058 100644 --- a/src/glsl/opt_tree_grafting.cpp +++ b/src/glsl/opt_tree_grafting.cpp @@ -373,8 +373,6 @@ tree_grafting_basic_block(ir_instruction *bb_first, entry->referenced_count != 2) continue; - assert(assign == entry->assign); - /* Found a possibly graftable assignment. Now, walk through the * rest of the BB seeing if the deref is here, and if nothing interfered with * pasting its expression's values in between. 
From be822b89ac81e1fa66dfa16d108bf1deb29d4db2 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Thu, 17 Sep 2015 14:17:17 +1000 Subject: [PATCH 131/270] glsl: calculate AoA uniform offset correctly for structs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows the correct offset to be calculated for use in indirect indexing of samplers. Reviewed-by: Samuel Iglesias Gonsálvez --- src/glsl/glsl_types.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp index b9cb97cbeae..575ff0e1c2b 100644 --- a/src/glsl/glsl_types.cpp +++ b/src/glsl/glsl_types.cpp @@ -1071,7 +1071,22 @@ glsl_type::record_location_offset(unsigned length) const const glsl_type *wa = st->without_array(); if (wa->is_record()) { unsigned r_offset = wa->record_location_offset(wa->length); - offset += st->is_array() ? st->length * r_offset : r_offset; + offset += st->is_array() ? + st->arrays_of_arrays_size() * r_offset : r_offset; + } else if (st->is_array() && st->fields.array->is_array()) { + unsigned outer_array_size = st->length; + const glsl_type *base_type = st->fields.array; + + /* For arrays of arrays the outer arrays take up a uniform + * slot for each element. The innermost array elements share a + * single slot so we ignore the innermost array when calculating + * the offset. 
+ */ + while (base_type->fields.array->is_array()) { + outer_array_size = outer_array_size * base_type->length; + base_type = base_type->fields.array; + } + offset += outer_array_size; } else { /* We dont worry about arrays here because unless the array * contains a structure or another array it only takes up a single From 176e6930e6c24dfce7cc730faa2612d27689a4df Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Sat, 18 Jul 2015 17:24:22 +1000 Subject: [PATCH 132/270] i965: add arrays of arrays support for varyings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit V2: get the correct vector elements value for outputs Reviewed-by: Samuel Iglesias Gonsálvez --- src/mesa/drivers/dri/i965/brw_fs.cpp | 4 ++-- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index d000f16f49a..01a7c99a4a6 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1048,11 +1048,11 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name, unsigned int array_elements; if (type->is_array()) { - array_elements = type->length; + array_elements = type->arrays_of_arrays_size(); if (array_elements == 0) { fail("dereferenced array '%s' has length 0\n", name); } - type = type->fields.array; + type = type->without_array(); } else { array_elements = 1; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 05f3f63204b..0e044d01f1e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -89,9 +89,7 @@ fs_visitor::nir_setup_outputs() nir_foreach_variable(var, &nir->outputs) { fs_reg reg = offset(nir_outputs, bld, var->data.driver_location); - int vector_elements = - var->type->is_array() ? 
var->type->fields.array->vector_elements - : var->type->vector_elements; + int vector_elements = var->type->without_array()->vector_elements; switch (stage) { case MESA_SHADER_VERTEX: From 261a434996079b29a476a3c67e1e580cf3db76ba Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Fri, 18 Sep 2015 22:51:40 +1000 Subject: [PATCH 133/270] glsl: add std140 layout support for AoA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Samuel Iglesias Gonsálvez Reviewed-by: Ian Romanick --- src/glsl/glsl_types.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp index 575ff0e1c2b..27934478e2d 100644 --- a/src/glsl/glsl_types.cpp +++ b/src/glsl/glsl_types.cpp @@ -1330,8 +1330,8 @@ glsl_type::std140_size(bool row_major) const unsigned int array_len; if (this->is_array()) { - element_type = this->fields.array; - array_len = this->length; + element_type = this->without_array(); + array_len = this->arrays_of_arrays_size(); } else { element_type = this; array_len = 1; @@ -1364,12 +1364,13 @@ glsl_type::std140_size(bool row_major) const * the array are laid out in order, according to rule (9). 
*/ if (this->is_array()) { - if (this->fields.array->is_record()) { - return this->length * this->fields.array->std140_size(row_major); + if (this->without_array()->is_record()) { + return this->arrays_of_arrays_size() * + this->without_array()->std140_size(row_major); } else { - unsigned element_base_align = - this->fields.array->std140_base_alignment(row_major); - return this->length * MAX2(element_base_align, 16); + unsigned element_base_align = + this->without_array()->std140_base_alignment(row_major); + return this->arrays_of_arrays_size() * MAX2(element_base_align, 16); } } From 2d7a98de18e4cdd69daa63b09b504d01ad4c9f50 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Sun, 30 Aug 2015 21:08:22 +1000 Subject: [PATCH 134/270] glsl: add AoA support for atomic counters This marks all counters in an AoA as active. For AoA all but the innermost array are treated as separate counters/uniforms. The Nvidia binary also goes further and finds inactive counters in the AoA, in future we should do this too, however this gets things working for the time being. This change also removes the use of UniformHash for atomic counters, this avoids having to generate name strings used as hash keys. Reviewed-by: Ian Romanick --- src/glsl/link_atomics.cpp | 77 +++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 23 deletions(-) diff --git a/src/glsl/link_atomics.cpp b/src/glsl/link_atomics.cpp index 100d03c4e8f..70ef0e1c891 100644 --- a/src/glsl/link_atomics.cpp +++ b/src/glsl/link_atomics.cpp @@ -33,7 +33,7 @@ namespace { * Atomic counter as seen by the program. 
*/ struct active_atomic_counter { - unsigned id; + unsigned uniform_loc; ir_variable *var; }; @@ -52,7 +52,7 @@ namespace { free(counters); } - void push_back(unsigned id, ir_variable *var) + void push_back(unsigned uniform_loc, ir_variable *var) { active_atomic_counter *new_counters; @@ -66,7 +66,7 @@ namespace { } counters = new_counters; - counters[num_counters].id = id; + counters[num_counters].uniform_loc = uniform_loc; counters[num_counters].var = var; num_counters++; } @@ -95,6 +95,50 @@ namespace { y->data.atomic.offset < x->data.atomic.offset + x->type->atomic_size())); } + void + process_atomic_variable(const glsl_type *t, struct gl_shader_program *prog, + unsigned *uniform_loc, ir_variable *var, + active_atomic_buffer *const buffers, + unsigned *num_buffers, int *offset, + const unsigned shader_stage) + { + /* FIXME: Arrays of arrays get counted separately. For example: + * x1[3][3][2] = 9 counters + * x2[3][2] = 3 counters + * x3[2] = 1 counter + * + * However this code marks all the counters as active even when they + * might not be used. + */ + if (t->is_array() && t->fields.array->is_array()) { + for (unsigned i = 0; i < t->length; i++) { + process_atomic_variable(t->fields.array, prog, uniform_loc, + var, buffers, num_buffers, offset, + shader_stage); + } + } else { + active_atomic_buffer *buf = &buffers[var->data.binding]; + gl_uniform_storage *const storage = + &prog->UniformStorage[*uniform_loc]; + + /* If this is the first time the buffer is used, increment + * the counter of buffers used. 
+ */ + if (buf->size == 0) + (*num_buffers)++; + + buf->push_back(*uniform_loc, var); + + buf->stage_references[shader_stage]++; + buf->size = MAX2(buf->size, *offset + t->atomic_size()); + + storage->offset = *offset; + *offset += t->atomic_size(); + + (*uniform_loc)++; + } + } + active_atomic_buffer * find_active_atomic_counters(struct gl_context *ctx, struct gl_shader_program *prog, @@ -114,23 +158,10 @@ namespace { ir_variable *var = node->as_variable(); if (var && var->type->contains_atomic()) { - unsigned id = 0; - bool found = prog->UniformHash->get(id, var->name); - assert(found); - (void) found; - active_atomic_buffer *buf = &buffers[var->data.binding]; - - /* If this is the first time the buffer is used, increment - * the counter of buffers used. - */ - if (buf->size == 0) - (*num_buffers)++; - - buf->push_back(id, var); - - buf->stage_references[i]++; - buf->size = MAX2(buf->size, var->data.atomic.offset + - var->type->atomic_size()); + int offset = var->data.atomic.offset; + unsigned uniform_loc = var->data.location; + process_atomic_variable(var->type, prog, &uniform_loc, + var, buffers, num_buffers, &offset, i); } } } @@ -197,10 +228,10 @@ link_assign_atomic_counter_resources(struct gl_context *ctx, /* Assign counter-specific fields. 
*/ for (unsigned j = 0; j < ab.num_counters; j++) { ir_variable *const var = ab.counters[j].var; - const unsigned id = ab.counters[j].id; - gl_uniform_storage *const storage = &prog->UniformStorage[id]; + gl_uniform_storage *const storage = + &prog->UniformStorage[ab.counters[j].uniform_loc]; - mab.Uniforms[j] = id; + mab.Uniforms[j] = ab.counters[j].uniform_loc; if (!var->data.explicit_binding) var->data.binding = i; From dd89880dc0924162e751141f9388e5dba089d1c0 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Mon, 12 May 2014 20:25:26 +1000 Subject: [PATCH 135/270] glsl: avoid hitting assert for arrays of arrays Also add TODO comment about adding proper support Signed-off-by: Timothy Arceri Reviewed-by: Ian Romanick --- src/glsl/ir_set_program_inouts.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/glsl/ir_set_program_inouts.cpp b/src/glsl/ir_set_program_inouts.cpp index b7a0f6e95ba..d7c29b00f88 100644 --- a/src/glsl/ir_set_program_inouts.cpp +++ b/src/glsl/ir_set_program_inouts.cpp @@ -242,6 +242,12 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var, type = type->fields.array; } + /* TODO: implement proper arrays of arrays support + * for now let the caller mark whole variable as used. 
+ */ + if (type->is_array() && type->fields.array->is_array()) + return false; + /* The code below only handles: * * - Indexing into matrices From d1d05c0f85daf3445d9b4c9cebb3940e6a251fa6 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Thu, 12 Mar 2015 19:52:47 +1100 Subject: [PATCH 136/270] glsl: add AoA support for linking interface blocks with unsized members Reviewed-by: Ian Romanick --- src/glsl/ir.cpp | 4 ++-- src/glsl/linker.cpp | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp index 4c228437d15..8933b230177 100644 --- a/src/glsl/ir.cpp +++ b/src/glsl/ir.cpp @@ -1689,8 +1689,8 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name, if (type->is_interface()) this->init_interface_type(type); - else if (type->is_array() && type->fields.array->is_interface()) - this->init_interface_type(type->fields.array); + else if (type->without_array()->is_interface()) + this->init_interface_type(type->without_array()); } } diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index c15034ba904..25ca928aa43 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -1387,8 +1387,10 @@ public: virtual ir_visitor_status visit(ir_variable *var) { + const glsl_type *type_without_array; fixup_type(&var->type, var->data.max_array_access, var->data.from_ssbo_unsized_array); + type_without_array = var->type->without_array(); if (var->type->is_interface()) { if (interface_contains_unsized_arrays(var->type)) { const glsl_type *new_type = @@ -1398,11 +1400,10 @@ public: var->type = new_type; var->change_interface_type(new_type); } - } else if (var->type->is_array() && - var->type->fields.array->is_interface()) { - if (interface_contains_unsized_arrays(var->type->fields.array)) { + } else if (type_without_array->is_interface()) { + if (interface_contains_unsized_arrays(type_without_array)) { const glsl_type *new_type = - resize_interface_members(var->type->fields.array, + 
resize_interface_members(type_without_array, var->get_max_ifc_array_access(), var->is_in_shader_storage_block()); var->change_interface_type(new_type); From 132b9e9dd97a2ab7d3be7945b3d990e94fd1513a Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Sun, 19 Jul 2015 14:08:44 +1000 Subject: [PATCH 137/270] glsl: add AoA support for an interface with unsized array members Add support for setting the max access of an unsized member of an interface array of arrays. For example ifc[j][k].foo[i] where foo is unsized. Reviewed-by: Ian Romanick --- src/glsl/ast_array_index.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp index 7855e0a6190..5927c91079b 100644 --- a/src/glsl/ast_array_index.cpp +++ b/src/glsl/ast_array_index.cpp @@ -61,21 +61,29 @@ update_max_array_access(ir_rvalue *ir, int idx, YYLTYPE *loc, } } else if (ir_dereference_record *deref_record = ir->as_dereference_record()) { - /* There are two possibilities we need to consider: + /* There are three possibilities we need to consider: * * - Accessing an element of an array that is a member of a named * interface block (e.g. ifc.foo[i]) * * - Accessing an element of an array that is a member of a named * interface block array (e.g. ifc[j].foo[i]). + * + * - Accessing an element of an array that is a member of a named + * interface block array of arrays (e.g. ifc[j][k].foo[i]). 
*/ ir_dereference_variable *deref_var = deref_record->record->as_dereference_variable(); if (deref_var == NULL) { - if (ir_dereference_array *deref_array = - deref_record->record->as_dereference_array()) { - deref_var = deref_array->array->as_dereference_variable(); + ir_dereference_array *deref_array = + deref_record->record->as_dereference_array(); + ir_dereference_array *deref_array_prev = NULL; + while (deref_array != NULL) { + deref_array_prev = deref_array; + deref_array = deref_array->array->as_dereference_array(); } + if (deref_array_prev != NULL) + deref_var = deref_array_prev->array->as_dereference_variable(); } if (deref_var != NULL) { From 082b1ca2fe75a9b4c2936d4d918b5d0d4f0c6c50 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Mon, 16 Mar 2015 15:31:11 +1100 Subject: [PATCH 138/270] glsl: Add support for lowering interface block arrays of arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit V2: make array processing functions static Reviewed-by: Samuel Iglesias Gonsálvez --- src/glsl/lower_named_interface_blocks.cpp | 52 +++++++++++++++++------ 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/src/glsl/lower_named_interface_blocks.cpp b/src/glsl/lower_named_interface_blocks.cpp index 01bbdd0587e..276a2dedf47 100644 --- a/src/glsl/lower_named_interface_blocks.cpp +++ b/src/glsl/lower_named_interface_blocks.cpp @@ -65,6 +65,39 @@ #include "ir_rvalue_visitor.h" #include "program/hash_table.h" +static const glsl_type * +process_array_type(const glsl_type *type, unsigned idx) +{ + const glsl_type *element_type = type->fields.array; + if (element_type->is_array()) { + const glsl_type *new_array_type = process_array_type(element_type, idx); + return glsl_type::get_array_instance(new_array_type, type->length); + } else { + return glsl_type::get_array_instance( + element_type->fields.structure[idx].type, type->length); + } +} + +static ir_rvalue * +process_array_ir(void * const mem_ctx, + 
ir_dereference_array *deref_array_prev, + ir_rvalue *deref_var) +{ + ir_dereference_array *deref_array = + deref_array_prev->array->as_dereference_array(); + + if (deref_array == NULL) { + return new(mem_ctx) ir_dereference_array(deref_var, + deref_array_prev->array_index); + } else { + deref_array = (ir_dereference_array *) process_array_ir(mem_ctx, + deref_array, + deref_var); + return new(mem_ctx) ir_dereference_array(deref_array, + deref_array_prev->array_index); + } +} + namespace { class flatten_named_interface_blocks_declarations : public ir_rvalue_visitor @@ -112,15 +145,9 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions) var->data.mode == ir_var_shader_storage) continue; - const glsl_type * iface_t = var->type; - const glsl_type * array_t = NULL; + const glsl_type * iface_t = var->type->without_array(); exec_node *insert_pos = var; - if (iface_t->is_array()) { - array_t = iface_t; - iface_t = array_t->fields.array; - } - assert (iface_t->is_interface()); for (unsigned i = 0; i < iface_t->length; i++) { @@ -137,7 +164,7 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions) ir_variable *new_var; char *var_name = ralloc_strdup(mem_ctx, iface_t->fields.structure[i].name); - if (array_t == NULL) { + if (!var->type->is_array()) { new_var = new(mem_ctx) ir_variable(iface_t->fields.structure[i].type, var_name, @@ -145,9 +172,7 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions) new_var->data.from_named_ifc_block_nonarray = 1; } else { const glsl_type *new_array_type = - glsl_type::get_array_instance( - iface_t->fields.structure[i].type, - array_t->length); + process_array_type(var->type, i); new_var = new(mem_ctx) ir_variable(new_array_type, var_name, @@ -236,9 +261,8 @@ flatten_named_interface_blocks_declarations::handle_rvalue(ir_rvalue **rvalue) ir_dereference_array *deref_array = ir->record->as_dereference_array(); if (deref_array != NULL) { - *rvalue = - new(mem_ctx) 
ir_dereference_array(deref_var, - deref_array->array_index); + *rvalue = process_array_ir(mem_ctx, deref_array, + (ir_rvalue *)deref_var); } else { *rvalue = deref_var; } From d9f1f2bbc6690e7ef85d79a857dbb64cf5fd1cd8 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Thu, 4 Jun 2015 11:19:05 +1000 Subject: [PATCH 139/270] glsl: Add AoA support when checking for non-const index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When checking for non-const indexing of interfaces take into account arrays of arrays Reviewed-by: Samuel Iglesias Gonsálvez --- src/glsl/ast_array_index.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp index 5927c91079b..74d403fdb65 100644 --- a/src/glsl/ast_array_index.cpp +++ b/src/glsl/ast_array_index.cpp @@ -235,7 +235,7 @@ _mesa_ast_array_index_to_hir(void *mem_ctx, ir_var_shader_storage) { _mesa_glsl_error(&loc, state, "unsized array index must be constant"); } - } else if (array->type->fields.array->is_interface() + } else if (array->type->without_array()->is_interface() && (array->variable_referenced()->data.mode == ir_var_uniform || array->variable_referenced()->data.mode == ir_var_shader_storage) && !state->is_version(400, 0) && !state->ARB_gpu_shader5_enable) { From 8cf1333b189b73dccdbccaca7e113f7b467982fa Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Tue, 25 Aug 2015 14:15:25 +1000 Subject: [PATCH 140/270] glsl: link uniform block arrays of arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for setting up the UniformBlock structures for AoA and also adds support for resizing AoA blocks with a packed layout. 
Reviewed-by: Samuel Iglesias Gonsálvez --- .../link_uniform_block_active_visitor.cpp | 166 ++++++++++++------ src/glsl/link_uniform_block_active_visitor.h | 15 +- src/glsl/link_uniform_blocks.cpp | 160 +++++++++++------ 3 files changed, 229 insertions(+), 112 deletions(-) diff --git a/src/glsl/link_uniform_block_active_visitor.cpp b/src/glsl/link_uniform_block_active_visitor.cpp index bcf17fef758..422739af063 100644 --- a/src/glsl/link_uniform_block_active_visitor.cpp +++ b/src/glsl/link_uniform_block_active_visitor.cpp @@ -71,6 +71,88 @@ process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var) return NULL; } +/* For arrays of arrays this function will give us a middle ground between + * detecting inactive uniform blocks and structuring them in a way that makes + * it easy to calculate the offset for indirect indexing. + * + * For example given the shader: + * + * uniform ArraysOfArraysBlock + * { + * vec4 a; + * } i[3][4][5]; + * + * void main() + * { + * vec4 b = i[0][1][1].a; + * gl_Position = i[2][2][3].a + b; + * } + * + * There are only 2 active blocks above but for the sake of indirect indexing + * and not over complicating the code we will end up with a count of 8. + * Here each dimension has 2 different indices counted so we end up with 2*2*2 + */ +struct uniform_block_array_elements ** +process_arrays(void *mem_ctx, ir_dereference_array *ir, + struct link_uniform_block_active *block) +{ + if (ir) { + struct uniform_block_array_elements **ub_array_ptr = + process_arrays(mem_ctx, ir->array->as_dereference_array(), block); + if (*ub_array_ptr == NULL) { + *ub_array_ptr = rzalloc(mem_ctx, struct uniform_block_array_elements); + (*ub_array_ptr)->ir = ir; + } + + struct uniform_block_array_elements *ub_array = *ub_array_ptr; + ir_constant *c = ir->array_index->as_constant(); + if (c) { + /* Index is a constant, so mark just that element used, + * if not already. 
+ */ + const unsigned idx = c->get_uint_component(0); + + unsigned i; + for (i = 0; i < ub_array->num_array_elements; i++) { + if (ub_array->array_elements[i] == idx) + break; + } + + assert(i <= ub_array->num_array_elements); + + if (i == ub_array->num_array_elements) { + ub_array->array_elements = reralloc(mem_ctx, + ub_array->array_elements, + unsigned, + ub_array->num_array_elements + 1); + + ub_array->array_elements[ub_array->num_array_elements] = idx; + + ub_array->num_array_elements++; + } + } else { + /* The array index is not a constant, + * so mark the entire array used. + */ + assert(ir->array->type->is_array()); + if (ub_array->num_array_elements < ir->array->type->length) { + ub_array->num_array_elements = ir->array->type->length; + ub_array->array_elements = reralloc(mem_ctx, + ub_array->array_elements, + unsigned, + ub_array->num_array_elements); + + for (unsigned i = 0; i < ub_array->num_array_elements; i++) { + ub_array->array_elements[i] = i; + } + } + } + return &ub_array->array; + } else { + return &block->array; + } +} + ir_visitor_status link_uniform_block_active_visitor::visit(ir_variable *var) { @@ -101,24 +183,30 @@ link_uniform_block_active_visitor::visit(ir_variable *var) return visit_stop; } - assert(b->num_array_elements == 0); - assert(b->array_elements == NULL); + assert(b->array == NULL); assert(b->type != NULL); assert(!b->type->is_array() || b->has_instance_name); /* For uniform block arrays declared with a shared or std140 layout * qualifier, mark all its instances as used. 
*/ - if (b->type->is_array() && b->type->length > 0) { - b->num_array_elements = b->type->length; - b->array_elements = reralloc(this->mem_ctx, - b->array_elements, - unsigned, - b->num_array_elements); + const glsl_type *type = b->type; + struct uniform_block_array_elements **ub_array = &b->array; + while (type->is_array()) { + assert(b->type->length > 0); - for (unsigned i = 0; i < b->num_array_elements; i++) { - b->array_elements[i] = i; + *ub_array = rzalloc(this->mem_ctx, struct uniform_block_array_elements); + (*ub_array)->num_array_elements = type->length; + (*ub_array)->array_elements = reralloc(this->mem_ctx, + (*ub_array)->array_elements, + unsigned, + (*ub_array)->num_array_elements); + + for (unsigned i = 0; i < (*ub_array)->num_array_elements; i++) { + (*ub_array)->array_elements[i] = i; } + ub_array = &(*ub_array)->array; + type = type->fields.array; } return visit_continue; @@ -127,7 +215,13 @@ link_uniform_block_active_visitor::visit(ir_variable *var) ir_visitor_status link_uniform_block_active_visitor::visit_enter(ir_dereference_array *ir) { - ir_dereference_variable *const d = ir->array->as_dereference_variable(); + /* cycle through arrays of arrays */ + ir_dereference_array *base_ir = ir; + while (base_ir->array->ir_type == ir_type_dereference_array) + base_ir = base_ir->array->as_dereference_array(); + + ir_dereference_variable *const d = + base_ir->array->as_dereference_variable(); ir_variable *const var = (d == NULL) ? NULL : d->var; /* If the r-value being dereferenced is not a variable (e.g., a field of a @@ -158,55 +252,16 @@ link_uniform_block_active_visitor::visit_enter(ir_dereference_array *ir) /* Block arrays must be declared with an instance name. 
*/ assert(b->has_instance_name); - assert((b->num_array_elements == 0) == (b->array_elements == NULL)); assert(b->type != NULL); /* If the block array was declared with a shared or * std140 layout qualifier, all its instances have been already marked * as used in link_uniform_block_active_visitor::visit(ir_variable *). */ - if (var->get_interface_type()->interface_packing != - GLSL_INTERFACE_PACKING_PACKED) - return visit_continue_with_parent; - - ir_constant *c = ir->array_index->as_constant(); - - if (c) { - /* Index is a constant, so mark just that element used, if not already */ - const unsigned idx = c->get_uint_component(0); - - unsigned i; - for (i = 0; i < b->num_array_elements; i++) { - if (b->array_elements[i] == idx) - break; - } - - assert(i <= b->num_array_elements); - - if (i == b->num_array_elements) { - b->array_elements = reralloc(this->mem_ctx, - b->array_elements, - unsigned, - b->num_array_elements + 1); - - b->array_elements[b->num_array_elements] = idx; - - b->num_array_elements++; - } - } else { - /* The array index is not a constant, so mark the entire array used. 
*/ - assert(b->type->is_array()); - if (b->num_array_elements < b->type->length) { - b->num_array_elements = b->type->length; - b->array_elements = reralloc(this->mem_ctx, - b->array_elements, - unsigned, - b->num_array_elements); - - for (unsigned i = 0; i < b->num_array_elements; i++) { - b->array_elements[i] = i; - } - } + if (var->get_interface_type()->interface_packing == + GLSL_INTERFACE_PACKING_PACKED) { + b->var = var; + process_arrays(this->mem_ctx, ir, b); } return visit_continue_with_parent; @@ -234,8 +289,7 @@ link_uniform_block_active_visitor::visit(ir_dereference_variable *ir) return visit_stop; } - assert(b->num_array_elements == 0); - assert(b->array_elements == NULL); + assert(b->array == NULL); assert(b->type != NULL); return visit_continue; diff --git a/src/glsl/link_uniform_block_active_visitor.h b/src/glsl/link_uniform_block_active_visitor.h index b663a884db4..afb52c14a37 100644 --- a/src/glsl/link_uniform_block_active_visitor.h +++ b/src/glsl/link_uniform_block_active_visitor.h @@ -28,12 +28,21 @@ #include "ir.h" #include "util/hash_table.h" -struct link_uniform_block_active { - const glsl_type *type; - +struct uniform_block_array_elements { unsigned *array_elements; unsigned num_array_elements; + ir_dereference_array *ir; + + struct uniform_block_array_elements *array; +}; + +struct link_uniform_block_active { + const glsl_type *type; + ir_variable *var; + + struct uniform_block_array_elements *array; + unsigned binding; bool has_instance_name; diff --git a/src/glsl/link_uniform_blocks.cpp b/src/glsl/link_uniform_blocks.cpp index 7ceffee799e..5285d8d01e4 100644 --- a/src/glsl/link_uniform_blocks.cpp +++ b/src/glsl/link_uniform_blocks.cpp @@ -116,7 +116,7 @@ private: char *open_bracket = strchr(v->IndexName, '['); assert(open_bracket != NULL); - char *close_bracket = strchr(open_bracket, ']'); + char *close_bracket = strchr(open_bracket, '.') - 1; assert(close_bracket != NULL); /* Length of the tail without the ']' but with the NUL. 
@@ -185,6 +185,91 @@ struct block { bool has_instance_name; }; +static void +process_block_array(struct uniform_block_array_elements *ub_array, char **name, + size_t name_length, gl_uniform_block *blocks, + ubo_visitor *parcel, gl_uniform_buffer_variable *variables, + const struct link_uniform_block_active *const b, + unsigned *block_index, unsigned *binding_offset, + struct gl_context *ctx, struct gl_shader_program *prog) +{ + if (ub_array) { + for (unsigned j = 0; j < ub_array->num_array_elements; j++) { + size_t new_length = name_length; + + /* Append the subscript to the current variable name */ + ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", + ub_array->array_elements[j]); + + process_block_array(ub_array->array, name, new_length, blocks, + parcel, variables, b, block_index, + binding_offset, ctx, prog); + } + } else { + unsigned i = *block_index; + const glsl_type *type = b->type->without_array(); + + blocks[i].Name = ralloc_strdup(blocks, *name); + blocks[i].Uniforms = &variables[(*parcel).index]; + + /* The GL_ARB_shading_language_420pack spec says: + * + * "If the binding identifier is used with a uniform block + * instanced as an array then the first element of the array + * takes the specified block binding and each subsequent + * element takes the next consecutive uniform block binding + * point." + */ + blocks[i].Binding = (b->has_binding) ? 
b->binding + *binding_offset : 0; + + blocks[i].UniformBufferSize = 0; + blocks[i]._Packing = gl_uniform_block_packing(type->interface_packing); + + parcel->process(type, blocks[i].Name); + + blocks[i].UniformBufferSize = parcel->buffer_size; + + /* Check SSBO size is lower than maximum supported size for SSBO */ + if (b->is_shader_storage && + parcel->buffer_size > ctx->Const.MaxShaderStorageBlockSize) { + linker_error(prog, "shader storage block `%s' has size %d, " + "which is larger than than the maximum allowed (%d)", + b->type->name, + parcel->buffer_size, + ctx->Const.MaxShaderStorageBlockSize); + } + blocks[i].NumUniforms = + (unsigned)(ptrdiff_t)(&variables[parcel->index] - blocks[i].Uniforms); + blocks[i].IsShaderStorage = b->is_shader_storage; + + *block_index = *block_index + 1; + *binding_offset = *binding_offset + 1; + } +} + +/* This function resizes the array types of the block so that later we can use + * this new size to correctly calculate the offset for indirect indexing. + */ +const glsl_type * +resize_block_array(const glsl_type *type, + struct uniform_block_array_elements *ub_array) +{ + if (type->is_array()) { + struct uniform_block_array_elements *child_array = + type->fields.array->is_array() ? 
ub_array->array : NULL; + const glsl_type *new_child_type = + resize_block_array(type->fields.array, child_array); + + const glsl_type *new_type = + glsl_type::get_array_instance(new_child_type, + ub_array->num_array_elements); + ub_array->ir->array->type = new_type; + return new_type; + } else { + return type; + } +} + unsigned link_uniform_blocks(void *mem_ctx, struct gl_context *ctx, @@ -223,21 +308,25 @@ link_uniform_blocks(void *mem_ctx, struct hash_entry *entry; hash_table_foreach (block_hash, entry) { - const struct link_uniform_block_active *const b = - (const struct link_uniform_block_active *) entry->data; + struct link_uniform_block_active *const b = + (struct link_uniform_block_active *) entry->data; - const glsl_type *const block_type = - b->type->is_array() ? b->type->fields.array : b->type; + assert((b->array != NULL) == b->type->is_array()); - assert((b->num_array_elements > 0) == b->type->is_array()); + if (b->array != NULL && + (b->type->without_array()->interface_packing == + GLSL_INTERFACE_PACKING_PACKED)) { + b->type = resize_block_array(b->type, b->array); + b->var->type = b->type; + } block_size.num_active_uniforms = 0; - block_size.process(block_type, ""); + block_size.process(b->type->without_array(), ""); - if (b->num_array_elements > 0) { - num_blocks += b->num_array_elements; - num_variables += b->num_array_elements - * block_size.num_active_uniforms; + if (b->array != NULL) { + unsigned aoa_size = b->type->arrays_of_arrays_size(); + num_blocks += aoa_size; + num_variables += aoa_size * block_size.num_active_uniforms; } else { num_blocks++; num_variables += block_size.num_active_uniforms; @@ -281,50 +370,15 @@ link_uniform_blocks(void *mem_ctx, (const struct link_uniform_block_active *) entry->data; const glsl_type *block_type = b->type; - if (b->num_array_elements > 0) { - const char *const name = block_type->fields.array->name; + if (b->array != NULL) { + unsigned binding_offset = 0; + char *name = ralloc_strdup(NULL, 
block_type->without_array()->name); + size_t name_length = strlen(name); assert(b->has_instance_name); - for (unsigned j = 0; j < b->num_array_elements; j++) { - blocks[i].Name = ralloc_asprintf(blocks, "%s[%u]", name, - b->array_elements[j]); - blocks[i].Uniforms = &variables[parcel.index]; - - /* The GL_ARB_shading_language_420pack spec says: - * - * "If the binding identifier is used with a uniform block - * instanced as an array then the first element of the array - * takes the specified block binding and each subsequent - * element takes the next consecutive uniform block binding - * point." - */ - blocks[i].Binding = (b->has_binding) ? b->binding + j : 0; - - blocks[i].UniformBufferSize = 0; - blocks[i]._Packing = - gl_uniform_block_packing(block_type->interface_packing); - - parcel.process(block_type->fields.array, - blocks[i].Name); - - blocks[i].UniformBufferSize = parcel.buffer_size; - - /* Check SSBO size is lower than maximum supported size for SSBO */ - if (b->is_shader_storage && - parcel.buffer_size > ctx->Const.MaxShaderStorageBlockSize) { - linker_error(prog, "shader storage block `%s' has size %d, " - "which is larger than than the maximum allowed (%d)", - block_type->name, - parcel.buffer_size, - ctx->Const.MaxShaderStorageBlockSize); - } - blocks[i].NumUniforms = - (unsigned)(ptrdiff_t)(&variables[parcel.index] - blocks[i].Uniforms); - - blocks[i].IsShaderStorage = b->is_shader_storage; - - i++; - } + process_block_array(b->array, &name, name_length, blocks, &parcel, + variables, b, &i, &binding_offset, ctx, prog); + ralloc_free(name); } else { blocks[i].Name = ralloc_strdup(blocks, block_type->name); blocks[i].Uniforms = &variables[parcel.index]; From bb5aeb854915ba67abc56257f830d002c956439e Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Fri, 9 Oct 2015 21:54:09 +1100 Subject: [PATCH 141/270] glsl: build ubo name and indexing offset for AoA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit V2: split 
out unrelated change as suggested by Samuel Reviewed-by: Samuel Iglesias Gonsálvez --- src/glsl/lower_ubo_reference.cpp | 116 +++++++++++++++++++++++-------- 1 file changed, 86 insertions(+), 30 deletions(-) diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp index da2713e4ab5..96fb91b4526 100644 --- a/src/glsl/lower_ubo_reference.cpp +++ b/src/glsl/lower_ubo_reference.cpp @@ -203,55 +203,113 @@ static const char * interface_field_name(void *mem_ctx, char *base_name, ir_rvalue *d, ir_rvalue **nonconst_block_index) { - ir_rvalue *previous_index = NULL; *nonconst_block_index = NULL; + char *name_copy = NULL; + size_t base_length = 0; + + /* Loop back through the IR until we find the uniform block */ + ir_rvalue *ir = d; + while (ir != NULL) { + switch (ir->ir_type) { + case ir_type_dereference_variable: { + /* Exit loop */ + ir = NULL; + break; + } + + case ir_type_dereference_record: { + ir_dereference_record *r = (ir_dereference_record *) ir; + ir = r->record->as_dereference(); + + /* If we got here it means any previous array subscripts belong to + * block members and not the block itself so skip over them in the + * next pass. 
+ */ + d = ir; + break; + } + + case ir_type_dereference_array: { + ir_dereference_array *a = (ir_dereference_array *) ir; + ir = a->array->as_dereference(); + break; + } + + case ir_type_swizzle: { + ir_swizzle *s = (ir_swizzle *) ir; + ir = s->val->as_dereference(); + break; + } + + default: + assert(!"Should not get here."); + break; + } + } while (d != NULL) { switch (d->ir_type) { case ir_type_dereference_variable: { ir_dereference_variable *v = (ir_dereference_variable *) d; - if (previous_index - && v->var->is_interface_instance() - && v->var->type->is_array()) { - - ir_constant *const_index = previous_index->as_constant(); - if (!const_index) { - *nonconst_block_index = previous_index; - return ralloc_asprintf(mem_ctx, "%s[0]", base_name); - } else { - return ralloc_asprintf(mem_ctx, - "%s[%d]", - base_name, - const_index->get_uint_component(0)); - } + if (name_copy != NULL && + v->var->is_interface_instance() && + v->var->type->is_array()) { + return name_copy; } else { + *nonconst_block_index = NULL; return base_name; } break; } - case ir_type_dereference_record: { - ir_dereference_record *r = (ir_dereference_record *) d; - - d = r->record->as_dereference(); - break; - } - case ir_type_dereference_array: { ir_dereference_array *a = (ir_dereference_array *) d; + size_t new_length; + + if (name_copy == NULL) { + name_copy = ralloc_strdup(mem_ctx, base_name); + base_length = strlen(name_copy); + } + + /* For arrays of arrays we start at the innermost array and work our + * way out so we need to insert the subscript at the base of the + * name string rather than just attaching it to the end. 
+ */ + new_length = base_length; + ir_constant *const_index = a->array_index->as_constant(); + char *end = ralloc_strdup(NULL, &name_copy[new_length]); + if (!const_index) { + ir_rvalue *array_index = a->array_index; + if (array_index->type != glsl_type::uint_type) + array_index = i2u(array_index); + + if (a->array->type->fields.array->is_array()) { + ir_constant *base_size = new(mem_ctx) + ir_constant(a->array->type->fields.array->arrays_of_arrays_size()); + array_index = mul(array_index, base_size); + } + + if (*nonconst_block_index) { + *nonconst_block_index = add(*nonconst_block_index, array_index); + } else { + *nonconst_block_index = array_index; + } + + ralloc_asprintf_rewrite_tail(&name_copy, &new_length, "[0]%s", + end); + } else { + ralloc_asprintf_rewrite_tail(&name_copy, &new_length, "[%d]%s", + const_index->get_uint_component(0), + end); + } + ralloc_free(end); d = a->array->as_dereference(); - previous_index = a->array_index; break; } - case ir_type_swizzle: { - ir_swizzle *s = (ir_swizzle *) d; - d = s->val->as_dereference(); - break; - } default: assert(!"Should not get here."); break; @@ -295,8 +353,6 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var, ir_constant *index = new(mem_ctx) ir_constant(i); if (nonconst_block_index) { - if (nonconst_block_index->type != glsl_type::uint_type) - nonconst_block_index = i2u(nonconst_block_index); this->uniform_block = add(nonconst_block_index, index); } else { this->uniform_block = index; From f22b7933e2e9c31b3730f5b1d9c060d2e1377d20 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Fri, 9 Oct 2015 22:00:20 +1100 Subject: [PATCH 142/270] glsl: allow for AoA in calculating offset to ubo start region MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Samuel Iglesias Gonsálvez --- src/glsl/lower_ubo_reference.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/glsl/lower_ubo_reference.cpp 
b/src/glsl/lower_ubo_reference.cpp index 96fb91b4526..1fbb09de0b1 100644 --- a/src/glsl/lower_ubo_reference.cpp +++ b/src/glsl/lower_ubo_reference.cpp @@ -397,7 +397,7 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var, if (deref_array->array->type->is_double()) array_stride *= 2; *matrix_columns = deref_array->array->type->matrix_columns; - } else if (deref_array->type->is_interface()) { + } else if (deref_array->type->without_array()->is_interface()) { /* We're processing an array dereference of an interface instance * array. The thing being dereferenced *must* be a variable * dereference because interfaces cannot be embedded in other @@ -406,7 +406,6 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var, * interface instance array will have the same offsets relative to * the base of the block that backs them. */ - assert(deref_array->array->as_dereference_variable()); deref = deref_array->array->as_dereference(); break; } else { From 8da9e154b7a2463369b32a10742af3a5695eb2ab Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Sat, 6 Jun 2015 09:10:55 +1000 Subject: [PATCH 143/270] glsl: Allow arrays of arrays in GLSL ES 3.10 and GLSL 4.30 V3: use a check_*_allowed style function for requirements checking rather than has_* which doesn't encapsulate the error message V2: add missing 's' to the extension name in error messages and add decimal place in version string Reviewed-by: Marta Lofstedt --- src/glsl/ast_to_hir.cpp | 7 +------ src/glsl/glsl_parser.yy | 17 +++++------------ src/glsl/glsl_parser_extras.h | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index db617cb41ac..cd40fe343e3 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -2141,12 +2141,7 @@ process_array_type(YYLTYPE *loc, const glsl_type *base, * * "Only one-dimensional arrays may be declared." 
*/ - if (!state->ARB_arrays_of_arrays_enable) { - _mesa_glsl_error(loc, state, - "invalid array of `%s'" - "GL_ARB_arrays_of_arrays " - "required for defining arrays of arrays", - base->name); + if (!state->check_arrays_of_arrays_allowed(loc)) { return glsl_type::error_type; } } diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy index 16c91710bbb..cd00f6e085b 100644 --- a/src/glsl/glsl_parser.yy +++ b/src/glsl/glsl_parser.yy @@ -1978,25 +1978,18 @@ array_specifier: void *ctx = state; $$ = $1; - if (!state->ARB_arrays_of_arrays_enable) { - _mesa_glsl_error(& @1, state, - "GL_ARB_arrays_of_arrays " - "required for defining arrays of arrays"); + if (state->check_arrays_of_arrays_allowed(& @1)) { + $$->add_dimension(new(ctx) ast_expression(ast_unsized_array_dim, NULL, + NULL, NULL)); } - $$->add_dimension(new(ctx) ast_expression(ast_unsized_array_dim, NULL, - NULL, NULL)); } | array_specifier '[' constant_expression ']' { $$ = $1; - if (!state->ARB_arrays_of_arrays_enable) { - _mesa_glsl_error(& @1, state, - "GL_ARB_arrays_of_arrays " - "required for defining arrays of arrays"); + if (state->check_arrays_of_arrays_allowed(& @1)) { + $$->add_dimension($3); } - - $$->add_dimension($3); } ; diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h index 7fee43ece52..e8740f9ecb9 100644 --- a/src/glsl/glsl_parser_extras.h +++ b/src/glsl/glsl_parser_extras.h @@ -115,6 +115,20 @@ struct _mesa_glsl_parse_state { unsigned required_glsl_es_version, YYLTYPE *locp, const char *fmt, ...) PRINTFLIKE(5, 6); + bool check_arrays_of_arrays_allowed(YYLTYPE *locp) + { + if (!(ARB_arrays_of_arrays_enable || is_version(430, 310))) { + const char *const requirement = this->es_shader + ? 
"GLSL ES 3.10" + : "GL_ARB_arrays_of_arrays or GLSL 4.30"; + _mesa_glsl_error(locp, this, + "%s required for defining arrays of arrays.", + requirement); + return false; + } + return true; + } + bool check_precision_qualifiers_allowed(YYLTYPE *locp) { return check_version(130, 100, locp, From 2034bdd46ce757a18fdb3498f6a0232db16522f3 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Sat, 10 Oct 2015 16:13:38 +0100 Subject: [PATCH 144/270] nir: include nir_instr_set.h in the tarball Signed-off-by: Emil Velikov --- src/glsl/Makefile.sources | 1 + 1 file changed, 1 insertion(+) diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources index 436949cd760..4da64f43873 100644 --- a/src/glsl/Makefile.sources +++ b/src/glsl/Makefile.sources @@ -34,6 +34,7 @@ NIR_FILES = \ nir/nir_intrinsics.c \ nir/nir_intrinsics.h \ nir/nir_instr_set.c \ + nir/nir_instr_set.h \ nir/nir_live_variables.c \ nir/nir_lower_alu_to_scalar.c \ nir/nir_lower_atomics.c \ From bcb56c2c69dd1695d7828d831d71c957e7e497c6 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Tue, 13 Oct 2015 11:26:09 +0100 Subject: [PATCH 145/270] program: convert _mesa_init_gl_program() to take struct gl_program * MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than accepting a void pointer, only to down and up cast around it, convert the function to take the base (struct gl_program) pointer. 
Signed-off-by: Emil Velikov Reviewed-by: Marek Olšák --- src/mesa/drivers/dri/i915/i915_fragprog.c | 9 +-- src/mesa/drivers/dri/i965/brw_program.c | 6 +- .../dri/i965/test_fs_cmod_propagation.cpp | 2 +- .../dri/i965/test_fs_saturate_propagation.cpp | 2 +- .../dri/i965/test_vec4_copy_propagation.cpp | 2 +- .../dri/i965/test_vec4_register_coalesce.cpp | 2 +- src/mesa/drivers/dri/r200/r200_vertprog.c | 17 +++--- src/mesa/program/program.c | 55 +++++++++---------- src/mesa/program/program.h | 2 +- src/mesa/state_tracker/st_cb_program.c | 38 +++++++------ 10 files changed, 68 insertions(+), 67 deletions(-) diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c index 237d219289b..59d795998c6 100644 --- a/src/mesa/drivers/dri/i915/i915_fragprog.c +++ b/src/mesa/drivers/dri/i915/i915_fragprog.c @@ -1315,9 +1315,10 @@ static struct gl_program * i915NewProgram(struct gl_context * ctx, GLenum target, GLuint id) { switch (target) { - case GL_VERTEX_PROGRAM_ARB: - return _mesa_init_gl_program(CALLOC_STRUCT(gl_vertex_program), - target, id); + case GL_VERTEX_PROGRAM_ARB: { + struct gl_vertex_program *prog = CALLOC_STRUCT(gl_vertex_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } case GL_FRAGMENT_PROGRAM_ARB:{ struct i915_fragment_program *prog = @@ -1325,7 +1326,7 @@ i915NewProgram(struct gl_context * ctx, GLenum target, GLuint id) if (prog) { i915_init_program(I915_CONTEXT(ctx), prog); - return _mesa_init_gl_program(prog, target, id); + return _mesa_init_gl_program(&prog->FragProg.Base, target, id); } else return NULL; diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 164c3d76c99..b547d07f0ca 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -69,7 +69,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx, if (prog) { prog->id = get_new_program_id(brw->intelScreen); - return 
_mesa_init_gl_program(&prog->program, target, id); + return _mesa_init_gl_program(&prog->program.Base, target, id); } else return NULL; @@ -80,7 +80,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx, if (prog) { prog->id = get_new_program_id(brw->intelScreen); - return _mesa_init_gl_program(&prog->program, target, id); + return _mesa_init_gl_program(&prog->program.Base, target, id); } else return NULL; @@ -102,7 +102,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx, if (prog) { prog->id = get_new_program_id(brw->intelScreen); - return _mesa_init_gl_program(&prog->program, target, id); + return _mesa_init_gl_program(&prog->program.Base, target, id); } else { return NULL; } diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp index 7eee42630a6..5f80f90a91d 100644 --- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp @@ -66,7 +66,7 @@ void cmod_propagation_test::SetUp() v = new cmod_propagation_fs_visitor(compiler, prog_data, shader); - _mesa_init_gl_program(&fp->program, GL_FRAGMENT_SHADER, 0); + _mesa_init_gl_program(&fp->program.Base, GL_FRAGMENT_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp index fefde4bb7bf..32e8b8f8867 100644 --- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp @@ -66,7 +66,7 @@ void saturate_propagation_test::SetUp() v = new saturate_propagation_fs_visitor(compiler, prog_data, shader); - _mesa_init_gl_program(&fp->program, GL_FRAGMENT_SHADER, 0); + _mesa_init_gl_program(&fp->program.Base, GL_FRAGMENT_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp index 
4a87e6eff96..e80b71b558d 100644 --- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp @@ -98,7 +98,7 @@ void copy_propagation_test::SetUp() v = new copy_propagation_vec4_visitor(compiler, shader); - _mesa_init_gl_program(&vp->program, GL_VERTEX_SHADER, 0); + _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp index 92d75e79837..2f824617454 100644 --- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp +++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp @@ -101,7 +101,7 @@ void register_coalesce_test::SetUp() v = new register_coalesce_vec4_visitor(compiler, shader); - _mesa_init_gl_program(&vp->program, GL_VERTEX_SHADER, 0); + _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0); devinfo->gen = 4; } diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c index d173605d110..628c5708090 100644 --- a/src/mesa/drivers/dri/r200/r200_vertprog.c +++ b/src/mesa/drivers/dri/r200/r200_vertprog.c @@ -1200,18 +1200,19 @@ r200BindProgram(struct gl_context *ctx, GLenum target, struct gl_program *prog) static struct gl_program * r200NewProgram(struct gl_context *ctx, GLenum target, GLuint id) { - struct r200_vertex_program *vp; - switch(target){ - case GL_VERTEX_PROGRAM_ARB: - vp = CALLOC_STRUCT(r200_vertex_program); - return _mesa_init_gl_program(&vp->mesa_program, target, id); - case GL_FRAGMENT_PROGRAM_ARB: - return _mesa_init_gl_program(CALLOC_STRUCT(gl_fragment_program), target, id); + case GL_VERTEX_PROGRAM_ARB: { + struct r200_vertex_program *vp = CALLOC_STRUCT(r200_vertex_program); + return _mesa_init_gl_program(&vp->mesa_program.Base, target, id); + } + case GL_FRAGMENT_PROGRAM_ARB: { + struct gl_fragment_program *prog = CALLOC_STRUCT(gl_fragment_program); + return 
_mesa_init_gl_program(&prog->Base, target, id); + } default: _mesa_problem(ctx, "Bad target in r200NewProgram"); + return NULL; } - return NULL; } diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c index 86de5e965f1..0e78e6ab25d 100644 --- a/src/mesa/program/program.c +++ b/src/mesa/program/program.c @@ -176,9 +176,8 @@ _mesa_set_program_error(struct gl_context *ctx, GLint pos, const char *string) * Initialize a new gl_program object. */ struct gl_program * -_mesa_init_gl_program(void *_prog, GLenum target, GLuint id) +_mesa_init_gl_program(struct gl_program *prog, GLenum target, GLuint id) { - struct gl_program *prog = (struct gl_program*)_prog; GLuint i; if (!prog) @@ -214,38 +213,36 @@ _mesa_init_gl_program(void *_prog, GLenum target, GLuint id) struct gl_program * _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id) { - struct gl_program *prog; switch (target) { - case GL_VERTEX_PROGRAM_ARB: /* == GL_VERTEX_PROGRAM_NV */ - prog = _mesa_init_gl_program(CALLOC_STRUCT(gl_vertex_program), - target, id); - break; + case GL_VERTEX_PROGRAM_ARB: { /* == GL_VERTEX_PROGRAM_NV */ + struct gl_vertex_program *prog = CALLOC_STRUCT(gl_vertex_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } case GL_FRAGMENT_PROGRAM_NV: - case GL_FRAGMENT_PROGRAM_ARB: - prog =_mesa_init_gl_program(CALLOC_STRUCT(gl_fragment_program), - target, id); - break; - case GL_GEOMETRY_PROGRAM_NV: - prog = _mesa_init_gl_program(CALLOC_STRUCT(gl_geometry_program), - target, id); - break; - case GL_TESS_CONTROL_PROGRAM_NV: - prog = _mesa_init_gl_program(CALLOC_STRUCT(gl_tess_ctrl_program), - target, id); - break; - case GL_TESS_EVALUATION_PROGRAM_NV: - prog = _mesa_init_gl_program(CALLOC_STRUCT(gl_tess_eval_program), - target, id); - break; - case GL_COMPUTE_PROGRAM_NV: - prog = _mesa_init_gl_program(CALLOC_STRUCT(gl_compute_program), - target, id); - break; + case GL_FRAGMENT_PROGRAM_ARB: { + struct gl_fragment_program *prog = 
CALLOC_STRUCT(gl_fragment_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } + case GL_GEOMETRY_PROGRAM_NV: { + struct gl_geometry_program *prog = CALLOC_STRUCT(gl_geometry_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } + case GL_TESS_CONTROL_PROGRAM_NV: { + struct gl_tess_ctrl_program *prog = CALLOC_STRUCT(gl_tess_ctrl_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } + case GL_TESS_EVALUATION_PROGRAM_NV: { + struct gl_tess_eval_program *prog = CALLOC_STRUCT(gl_tess_eval_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } + case GL_COMPUTE_PROGRAM_NV: { + struct gl_compute_program *prog = CALLOC_STRUCT(gl_compute_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } default: _mesa_problem(ctx, "bad target in _mesa_new_program"); - prog = NULL; + return NULL; } - return prog; } diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h index 51e10a1708b..24e05974dc3 100644 --- a/src/mesa/program/program.h +++ b/src/mesa/program/program.h @@ -64,7 +64,7 @@ extern void _mesa_set_program_error(struct gl_context *ctx, GLint pos, const char *string); extern struct gl_program * -_mesa_init_gl_program(void *prog, GLenum target, GLuint id); +_mesa_init_gl_program(struct gl_program *prog, GLenum target, GLuint id); extern struct gl_program * _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id); diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c index 40f2af0e550..26d128abd38 100644 --- a/src/mesa/state_tracker/st_cb_program.c +++ b/src/mesa/state_tracker/st_cb_program.c @@ -102,29 +102,31 @@ st_use_program(struct gl_context *ctx, struct gl_shader_program *shProg) static struct gl_program * st_new_program(struct gl_context *ctx, GLenum target, GLuint id) { - struct gl_program *prog; - switch (target) { - case GL_VERTEX_PROGRAM_ARB: - prog = (struct gl_program*)ST_CALLOC_STRUCT(st_vertex_program); - break; - 
case GL_FRAGMENT_PROGRAM_ARB: - prog = (struct gl_program*)ST_CALLOC_STRUCT(st_fragment_program); - break; - case GL_GEOMETRY_PROGRAM_NV: - prog = (struct gl_program*)ST_CALLOC_STRUCT(st_geometry_program); - break; - case GL_TESS_CONTROL_PROGRAM_NV: - prog = (struct gl_program*)ST_CALLOC_STRUCT(st_tessctrl_program); - break; - case GL_TESS_EVALUATION_PROGRAM_NV: - prog = (struct gl_program*)ST_CALLOC_STRUCT(st_tesseval_program); - break; + case GL_VERTEX_PROGRAM_ARB: { + struct st_vertex_program *prog = ST_CALLOC_STRUCT(st_vertex_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } + case GL_FRAGMENT_PROGRAM_ARB: { + struct st_fragment_program *prog = ST_CALLOC_STRUCT(st_fragment_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } + case GL_GEOMETRY_PROGRAM_NV: { + struct st_geometry_program *prog = ST_CALLOC_STRUCT(st_geometry_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } + case GL_TESS_CONTROL_PROGRAM_NV: { + struct st_tessctrl_program *prog = ST_CALLOC_STRUCT(st_tessctrl_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } + case GL_TESS_EVALUATION_PROGRAM_NV: { + struct st_tesseval_program *prog = ST_CALLOC_STRUCT(st_tesseval_program); + return _mesa_init_gl_program(&prog->Base, target, id); + } default: assert(0); return NULL; } - return _mesa_init_gl_program(prog, target, id); } From 45f0392ceb463612ec2f31c52adedf5440fffc32 Mon Sep 17 00:00:00 2001 From: Mauro Rossi Date: Sun, 11 Oct 2015 13:49:38 +0200 Subject: [PATCH 146/270] i965: android: add the i965_compiler_FILES sources to the driver i965_compiler_FILES are needed otherwise we'll error out as below: target SharedLib: i915_dri (out/target/product/x86/obj/SHARED_LIBRARIES/i915_dri_intermediates/LINKED/i915_dri.so) external/mesa/src/mesa/drivers/dri/i965/brw_ir_fs.h:181: error: undefined reference to 'fs_inst::~fs_inst()' ... ...
external/mesa/src/mesa/drivers/dri/i965/intel_screen.c:1484: error: undefined reference to 'brw_compiler_create' collect2: error: ld returned 1 exit status build/core/shared_library.mk:81: recipe for target 'out/target/product/x86/obj/SHARED_LIBRARIES/i965_dri_intermediates/LINKED/i965_dri.so' failed make: *** [out/target/product/x86/obj/SHARED_LIBRARIES/i965_dri_intermediates/LINKED/i965_dri.so] Error 1 [Emil Velikov: tweak commit message] Signed-off-by: Emil Velikov --- src/mesa/drivers/dri/i965/Android.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mesa/drivers/dri/i965/Android.mk b/src/mesa/drivers/dri/i965/Android.mk index a9b963a9eca..d30a053e10f 100644 --- a/src/mesa/drivers/dri/i965/Android.mk +++ b/src/mesa/drivers/dri/i965/Android.mk @@ -48,6 +48,7 @@ LOCAL_C_INCLUDES := \ $(MESA_DRI_C_INCLUDES) LOCAL_SRC_FILES := \ + $(i965_compiler_FILES) \ $(i965_FILES) LOCAL_WHOLE_STATIC_LIBRARIES := \ From 67d8518a0e5a3df400a6e70de667d69e4b6ce9c5 Mon Sep 17 00:00:00 2001 From: Chih-Wei Huang Date: Mon, 12 Oct 2015 23:36:59 +0800 Subject: [PATCH 147/270] mesa: android: Fix the incorrect path of sse_minmax.c Cc: "10.6 11.0" Fixes: 669cfc267a1 (android: mesa: fix the path of the SSE4_1 optimisations) Signed-off-by: Chih-Wei Huang Reviewed-by: Emil Velikov --- src/mesa/Android.libmesa_dricore.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/Android.libmesa_dricore.mk b/src/mesa/Android.libmesa_dricore.mk index 2e308b83733..fef76c8582c 100644 --- a/src/mesa/Android.libmesa_dricore.mk +++ b/src/mesa/Android.libmesa_dricore.mk @@ -50,7 +50,7 @@ endif # MESA_ENABLE_ASM ifeq ($(ARCH_X86_HAVE_SSE4_1),true) LOCAL_SRC_FILES += \ main/streaming-load-memcpy.c \ - mesa/main/sse_minmax.c + main/sse_minmax.c LOCAL_CFLAGS := \ -msse4.1 \ -DUSE_SSE41 From 0de5e0f3fb0f3671a3ecec6ab4473f9131ecd0ae Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 14 Oct 2015 09:08:50 -0600 Subject: [PATCH 148/270] mesa: remove FLUSH_VERTICES() in 
_mesa_MatrixMode() Changing the matrix mode alone has no effect on rendering and does not need to trigger a flush or state validation. Reviewed-by: Eric Anholt --- src/mesa/main/matrix.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mesa/main/matrix.c b/src/mesa/main/matrix.c index 2b8016a4a72..5ff5ac5bfe1 100644 --- a/src/mesa/main/matrix.c +++ b/src/mesa/main/matrix.c @@ -151,7 +151,6 @@ _mesa_MatrixMode( GLenum mode ) if (ctx->Transform.MatrixMode == mode && mode != GL_TEXTURE) return; - FLUSH_VERTICES(ctx, _NEW_TRANSFORM); switch (mode) { case GL_MODELVIEW: From 083b3f5cb4c5bd701d6a371282d7dc8c4f5fcaa8 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 14 Oct 2015 09:10:19 -0600 Subject: [PATCH 149/270] mesa: short-cut new_state == _NEW_LINE in _mesa_update_state_locked() We can skip to the end of _mesa_update_state_locked() if only the _NEW_LINE flag is set since none of the derived state depends on it (just like _NEW_CURRENT_ATTRIB). Note that we still call the ctx->Driver.UpdateState() function, of course. v2: use bitmask-based test, per Eric. Reviewed-by: Eric Anholt --- src/mesa/main/state.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c index d3b1c72b08d..4043c4f2057 100644 --- a/src/mesa/main/state.c +++ b/src/mesa/main/state.c @@ -391,8 +391,12 @@ _mesa_update_state_locked( struct gl_context *ctx ) GLbitfield new_state = ctx->NewState; GLbitfield prog_flags = _NEW_PROGRAM; GLbitfield new_prog_state = 0x0; + const GLbitfield computed_states = ~(_NEW_CURRENT_ATTRIB | _NEW_LINE); - if (new_state == _NEW_CURRENT_ATTRIB) + /* we can skip a bunch of state validation checks if the dirty + * state matches one or more bits in 'computed_states'. 
+ */ + if ((new_state & computed_states) == 0) goto out; if (MESA_VERBOSE & VERBOSE_STATE) From 6fd29e6c31e14e7b0f3c530798a1fc983eee17af Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 14 Oct 2015 09:31:41 -0600 Subject: [PATCH 150/270] mesa: optimize no-change check in _mesa_BlendFuncSeparate() Streamline the checking for no state change in _mesa_BlendFuncSeparate() (and _mesa_BlendFunc()). If _BlendFuncPerBuffer is false, we only need to check the 0th buffer state. Move argument validation after the no-op check. I'm looking at an app that issues about 1000 redundant glBlendFunc() calls per frame! Reviewed-by: Eric Anholt --- src/mesa/main/blend.c | 47 +++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c index dee5e29d5b8..98d28581d26 100644 --- a/src/mesa/main/blend.c +++ b/src/mesa/main/blend.c @@ -203,7 +203,7 @@ _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorA, GLenum dfactorA ) { GLuint buf, numBuffers; - GLboolean changed; + bool changed = false; GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE & VERBOSE_API) @@ -213,28 +213,41 @@ _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB, _mesa_enum_to_string(sfactorA), _mesa_enum_to_string(dfactorA)); + numBuffers = ctx->Extensions.ARB_draw_buffers_blend + ? ctx->Const.MaxDrawBuffers : 1; + + /* Check if we're really changing any state. If not, return early. 
*/ + if (ctx->Color._BlendFuncPerBuffer) { + /* Check all per-buffer states */ + for (buf = 0; buf < numBuffers; buf++) { + if (ctx->Color.Blend[buf].SrcRGB != sfactorRGB || + ctx->Color.Blend[buf].DstRGB != dfactorRGB || + ctx->Color.Blend[buf].SrcA != sfactorA || + ctx->Color.Blend[buf].DstA != dfactorA) { + changed = true; + break; + } + } + } + else { + /* only need to check 0th per-buffer state */ + if (ctx->Color.Blend[0].SrcRGB != sfactorRGB || + ctx->Color.Blend[0].DstRGB != dfactorRGB || + ctx->Color.Blend[0].SrcA != sfactorA || + ctx->Color.Blend[0].DstA != dfactorA) { + changed = true; + } + } + + if (!changed) + return; + if (!validate_blend_factors(ctx, "glBlendFuncSeparate", sfactorRGB, dfactorRGB, sfactorA, dfactorA)) { return; } - numBuffers = ctx->Extensions.ARB_draw_buffers_blend - ? ctx->Const.MaxDrawBuffers : 1; - - changed = GL_FALSE; - for (buf = 0; buf < numBuffers; buf++) { - if (ctx->Color.Blend[buf].SrcRGB != sfactorRGB || - ctx->Color.Blend[buf].DstRGB != dfactorRGB || - ctx->Color.Blend[buf].SrcA != sfactorA || - ctx->Color.Blend[buf].DstA != dfactorA) { - changed = GL_TRUE; - break; - } - } - if (!changed) - return; - FLUSH_VERTICES(ctx, _NEW_COLOR); for (buf = 0; buf < numBuffers; buf++) { From 2dfedf105d07e9a1f65f9bc76369cb33edf59cc9 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 14 Oct 2015 09:41:11 -0600 Subject: [PATCH 151/270] mesa: optimize no-change check in _mesa_BlendEquation() Same story as preceding change to _mesa_BlendFuncSeparate().
Reviewed-by: Eric Anholt --- src/mesa/main/blend.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c index 98d28581d26..01b69194814 100644 --- a/src/mesa/main/blend.c +++ b/src/mesa/main/blend.c @@ -345,33 +345,44 @@ void GLAPIENTRY _mesa_BlendEquation( GLenum mode ) { GLuint buf, numBuffers; - GLboolean changed; + bool changed = false; GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendEquation(%s)\n", _mesa_enum_to_string(mode)); + numBuffers = ctx->Extensions.ARB_draw_buffers_blend + ? ctx->Const.MaxDrawBuffers : 1; + + if (ctx->Color._BlendEquationPerBuffer) { + /* Check all per-buffer states */ + for (buf = 0; buf < numBuffers; buf++) { + if (ctx->Color.Blend[buf].EquationRGB != mode || + ctx->Color.Blend[buf].EquationA != mode) { + changed = true; + break; + } + } + } + else { + /* only need to check 0th per-buffer state */ + if (ctx->Color.Blend[0].EquationRGB != mode || + ctx->Color.Blend[0].EquationA != mode) { + changed = true; + } + } + + if (!changed) + return; + if (!legal_blend_equation(ctx, mode)) { _mesa_error(ctx, GL_INVALID_ENUM, "glBlendEquation"); return; } - numBuffers = ctx->Extensions.ARB_draw_buffers_blend - ? 
ctx->Const.MaxDrawBuffers : 1; - - changed = GL_FALSE; - for (buf = 0; buf < numBuffers; buf++) { - if (ctx->Color.Blend[buf].EquationRGB != mode || - ctx->Color.Blend[buf].EquationA != mode) { - changed = GL_TRUE; - break; - } - } - if (!changed) - return; - FLUSH_VERTICES(ctx, _NEW_COLOR); + for (buf = 0; buf < numBuffers; buf++) { ctx->Color.Blend[buf].EquationRGB = mode; ctx->Color.Blend[buf].EquationA = mode; From 34de3c4c1635a42c884da3321fc35ef07be34a6e Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 14 Oct 2015 09:45:36 -0600 Subject: [PATCH 152/270] mesa: optimize no-change check in _mesa_BlendEquationSeparate() Reviewed-by: Eric Anholt --- src/mesa/main/blend.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c index 01b69194814..14742d0bb6a 100644 --- a/src/mesa/main/blend.c +++ b/src/mesa/main/blend.c @@ -432,7 +432,7 @@ void GLAPIENTRY _mesa_BlendEquationSeparate( GLenum modeRGB, GLenum modeA ) { GLuint buf, numBuffers; - GLboolean changed; + bool changed = false; GET_CURRENT_CONTEXT(ctx); if (MESA_VERBOSE & VERBOSE_API) @@ -440,6 +440,30 @@ _mesa_BlendEquationSeparate( GLenum modeRGB, GLenum modeA ) _mesa_enum_to_string(modeRGB), _mesa_enum_to_string(modeA)); + numBuffers = ctx->Extensions.ARB_draw_buffers_blend + ? 
ctx->Const.MaxDrawBuffers : 1; + + if (ctx->Color._BlendEquationPerBuffer) { + /* Check all per-buffer states */ + for (buf = 0; buf < numBuffers; buf++) { + if (ctx->Color.Blend[buf].EquationRGB != modeRGB || + ctx->Color.Blend[buf].EquationA != modeA) { + changed = true; + break; + } + } + } + else { + /* only need to check 0th per-buffer state */ + if (ctx->Color.Blend[0].EquationRGB != modeRGB || + ctx->Color.Blend[0].EquationA != modeA) { + changed = true; + } + } + + if (!changed) + return; + if ( (modeRGB != modeA) && !ctx->Extensions.EXT_blend_equation_separate ) { _mesa_error(ctx, GL_INVALID_OPERATION, "glBlendEquationSeparateEXT not supported by driver"); @@ -456,21 +480,8 @@ _mesa_BlendEquationSeparate( GLenum modeRGB, GLenum modeA ) return; } - numBuffers = ctx->Extensions.ARB_draw_buffers_blend - ? ctx->Const.MaxDrawBuffers : 1; - - changed = GL_FALSE; - for (buf = 0; buf < numBuffers; buf++) { - if (ctx->Color.Blend[buf].EquationRGB != modeRGB || - ctx->Color.Blend[buf].EquationA != modeA) { - changed = GL_TRUE; - break; - } - } - if (!changed) - return; - FLUSH_VERTICES(ctx, _NEW_COLOR); + for (buf = 0; buf < numBuffers; buf++) { ctx->Color.Blend[buf].EquationRGB = modeRGB; ctx->Color.Blend[buf].EquationA = modeA; From 1d751655012c7b8aafbeb90d02e5314a8eedc890 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 14 Oct 2015 09:35:53 -0600 Subject: [PATCH 153/270] mesa: move validate_blend_factors() call after no-change check A redundant call to glBlendFuncSeparateiARB() is more likely than getting invalid values, so do the no-op check first. 
Reviewed-by: Eric Anholt --- src/mesa/main/blend.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c index 14742d0bb6a..d225f3d171c 100644 --- a/src/mesa/main/blend.c +++ b/src/mesa/main/blend.c @@ -296,18 +296,18 @@ _mesa_BlendFuncSeparateiARB(GLuint buf, GLenum sfactorRGB, GLenum dfactorRGB, return; } - if (!validate_blend_factors(ctx, "glBlendFuncSeparatei", - sfactorRGB, dfactorRGB, - sfactorA, dfactorA)) { - return; - } - if (ctx->Color.Blend[buf].SrcRGB == sfactorRGB && ctx->Color.Blend[buf].DstRGB == dfactorRGB && ctx->Color.Blend[buf].SrcA == sfactorA && ctx->Color.Blend[buf].DstA == dfactorA) return; /* no change */ + if (!validate_blend_factors(ctx, "glBlendFuncSeparatei", + sfactorRGB, dfactorRGB, + sfactorA, dfactorA)) { + return; + } + FLUSH_VERTICES(ctx, _NEW_COLOR); ctx->Color.Blend[buf].SrcRGB = sfactorRGB; From d21e17f48f8e155918de9420378a3b233e4ca98c Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 14 Oct 2015 09:34:39 -0600 Subject: [PATCH 154/270] mesa: fix incorrect error string in _mesa_BlendEquationiARB() Reviewed-by: Eric Anholt --- src/mesa/main/blend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c index d225f3d171c..a7b7c5b7e8a 100644 --- a/src/mesa/main/blend.c +++ b/src/mesa/main/blend.c @@ -407,7 +407,7 @@ _mesa_BlendEquationiARB(GLuint buf, GLenum mode) buf, _mesa_enum_to_string(mode)); if (buf >= ctx->Const.MaxDrawBuffers) { - _mesa_error(ctx, GL_INVALID_VALUE, "glBlendFuncSeparatei(buffer=%u)", + _mesa_error(ctx, GL_INVALID_VALUE, "glBlendEquationi(buffer=%u)", buf); return; } From dfbd62e772d4373f4ab7553b556931085a70488a Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 14 Oct 2015 10:08:18 -0600 Subject: [PATCH 155/270] mesa: optimize _UsesDualSrc blend flag setting For glBlendFunc and glBlendFuncSeparate(), the _UsesDualSrc flag will be the same for all buffers, so no need to compute 
it N times. Reviewed-by: Eric Anholt --- src/mesa/main/blend.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c index a7b7c5b7e8a..9dec2d41408 100644 --- a/src/mesa/main/blend.c +++ b/src/mesa/main/blend.c @@ -255,8 +255,13 @@ _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB, ctx->Color.Blend[buf].DstRGB = dfactorRGB; ctx->Color.Blend[buf].SrcA = sfactorA; ctx->Color.Blend[buf].DstA = dfactorA; - update_uses_dual_src(ctx, buf); } + + update_uses_dual_src(ctx, 0); + for (buf = 1; buf < numBuffers; buf++) { + ctx->Color.Blend[buf]._UsesDualSrc = ctx->Color.Blend[0]._UsesDualSrc; + } + ctx->Color._BlendFuncPerBuffer = GL_FALSE; if (ctx->Driver.BlendFuncSeparate) { From d8c23d156d9b014a52d83d1a2eb051981b5203f3 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 14 Oct 2015 09:53:26 -0600 Subject: [PATCH 156/270] mesa: add num_buffers() helper in blend.c Reviewed-by: Eric Anholt --- src/mesa/main/blend.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c index 9dec2d41408..20aa4980935 100644 --- a/src/mesa/main/blend.c +++ b/src/mesa/main/blend.c @@ -190,6 +190,19 @@ update_uses_dual_src(struct gl_context *ctx, int buf) blend_factor_is_dual_src(ctx->Color.Blend[buf].DstA)); } + +/** + * Return the number of per-buffer blend states to update in + * glBlendFunc, glBlendFuncSeparate, glBlendEquation, etc. + */ +static inline unsigned +num_buffers(const struct gl_context *ctx) +{ + return ctx->Extensions.ARB_draw_buffers_blend + ? ctx->Const.MaxDrawBuffers : 1; +} + + /** * Set the separate blend source/dest factors for all draw buffers. 
* @@ -202,9 +215,10 @@ void GLAPIENTRY _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorA, GLenum dfactorA ) { - GLuint buf, numBuffers; - bool changed = false; GET_CURRENT_CONTEXT(ctx); + const unsigned numBuffers = num_buffers(ctx); + unsigned buf; + bool changed = false; if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendFuncSeparate %s %s %s %s\n", @@ -213,9 +227,6 @@ _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB, _mesa_enum_to_string(sfactorA), _mesa_enum_to_string(dfactorA)); - numBuffers = ctx->Extensions.ARB_draw_buffers_blend - ? ctx->Const.MaxDrawBuffers : 1; - /* Check if we're really changing any state. If not, return early. */ if (ctx->Color._BlendFuncPerBuffer) { /* Check all per-buffer states */ @@ -349,17 +360,15 @@ legal_blend_equation(const struct gl_context *ctx, GLenum mode) void GLAPIENTRY _mesa_BlendEquation( GLenum mode ) { - GLuint buf, numBuffers; - bool changed = false; GET_CURRENT_CONTEXT(ctx); + const unsigned numBuffers = num_buffers(ctx); + unsigned buf; + bool changed = false; if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendEquation(%s)\n", _mesa_enum_to_string(mode)); - numBuffers = ctx->Extensions.ARB_draw_buffers_blend - ? ctx->Const.MaxDrawBuffers : 1; - if (ctx->Color._BlendEquationPerBuffer) { /* Check all per-buffer states */ for (buf = 0; buf < numBuffers; buf++) { @@ -436,18 +445,16 @@ _mesa_BlendEquationiARB(GLuint buf, GLenum mode) void GLAPIENTRY _mesa_BlendEquationSeparate( GLenum modeRGB, GLenum modeA ) { - GLuint buf, numBuffers; - bool changed = false; GET_CURRENT_CONTEXT(ctx); + const unsigned numBuffers = num_buffers(ctx); + unsigned buf; + bool changed = false; if (MESA_VERBOSE & VERBOSE_API) _mesa_debug(ctx, "glBlendEquationSeparateEXT(%s %s)\n", _mesa_enum_to_string(modeRGB), _mesa_enum_to_string(modeA)); - numBuffers = ctx->Extensions.ARB_draw_buffers_blend - ? 
ctx->Const.MaxDrawBuffers : 1; - if (ctx->Color._BlendEquationPerBuffer) { /* Check all per-buffer states */ for (buf = 0; buf < numBuffers; buf++) { From 5d954fd5cb4b35b896b90100956d45eaa30edacb Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 14 Oct 2015 09:53:53 -0600 Subject: [PATCH 157/270] mesa: wrap a ridiculously long line in es1_conversion.c Reviewed-by: Eric Anholt --- src/mesa/main/es1_conversion.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/mesa/main/es1_conversion.c b/src/mesa/main/es1_conversion.c index b254a6ef1c7..1dfe8278e71 100644 --- a/src/mesa/main/es1_conversion.c +++ b/src/mesa/main/es1_conversion.c @@ -1,3 +1,4 @@ + #include #include "api_loopback.h" @@ -326,7 +327,24 @@ _mesa_GetTexEnvxv(GLenum target, GLenum pname, GLfixed *params) } break; case GL_TEXTURE_ENV: - if (pname != GL_TEXTURE_ENV_COLOR && pname != GL_RGB_SCALE && pname != GL_ALPHA_SCALE && pname != GL_TEXTURE_ENV_MODE && pname != GL_COMBINE_RGB && pname != GL_COMBINE_ALPHA && pname != GL_SRC0_RGB && pname != GL_SRC1_RGB && pname != GL_SRC2_RGB && pname != GL_SRC0_ALPHA && pname != GL_SRC1_ALPHA && pname != GL_SRC2_ALPHA && pname != GL_OPERAND0_RGB && pname != GL_OPERAND1_RGB && pname != GL_OPERAND2_RGB && pname != GL_OPERAND0_ALPHA && pname != GL_OPERAND1_ALPHA && pname != GL_OPERAND2_ALPHA) { + if (pname != GL_TEXTURE_ENV_COLOR && + pname != GL_RGB_SCALE && + pname != GL_ALPHA_SCALE && + pname != GL_TEXTURE_ENV_MODE && + pname != GL_COMBINE_RGB && + pname != GL_COMBINE_ALPHA && + pname != GL_SRC0_RGB && + pname != GL_SRC1_RGB && + pname != GL_SRC2_RGB && + pname != GL_SRC0_ALPHA && + pname != GL_SRC1_ALPHA && + pname != GL_SRC2_ALPHA && + pname != GL_OPERAND0_RGB && + pname != GL_OPERAND1_RGB && + pname != GL_OPERAND2_RGB && + pname != GL_OPERAND0_ALPHA && + pname != GL_OPERAND1_ALPHA && + pname != GL_OPERAND2_ALPHA) { _mesa_error(_mesa_get_current_context(), GL_INVALID_ENUM, "glGetTexEnvxv(target=0x%x)", target); return; From 
635daef76ede735d97e202446d6477b7d2cf2f86 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 9 Oct 2015 07:02:57 -0700 Subject: [PATCH 158/270] nir/prog: Use nir_foreach_variable Reviewed-by: Matt Turner Reviewed-by: Iago Toral Quiroga --- src/mesa/program/prog_to_nir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c index fc00534028f..d9b185461fe 100644 --- a/src/mesa/program/prog_to_nir.c +++ b/src/mesa/program/prog_to_nir.c @@ -923,7 +923,7 @@ ptn_add_output_stores(struct ptn_compile *c) { nir_builder *b = &c->build; - foreach_list_typed(nir_variable, var, node, &b->shader->outputs) { + nir_foreach_variable(var, &b->shader->outputs) { nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var); store->num_components = glsl_get_vector_elements(var->type); From eb893c220c8af9e7400973ce3eca41246379889b Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 9 Oct 2015 07:05:11 -0700 Subject: [PATCH 159/270] nir: Add helpers for creating variables and adding them to lists Reviewed-by: Iago Toral Quiroga --- src/glsl/nir/glsl_to_nir.cpp | 40 ++++----------------- src/glsl/nir/nir.c | 66 ++++++++++++++++++++++++++++++++++ src/glsl/nir/nir.h | 20 +++++++++++ src/mesa/program/prog_to_nir.c | 19 ++++------ 4 files changed, 99 insertions(+), 46 deletions(-) diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index 6f67b1dae5b..0e4289b32eb 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -389,35 +389,10 @@ nir_visitor::visit(ir_variable *ir) var->interface_type = ir->get_interface_type(); - switch (var->data.mode) { - case nir_var_local: - exec_list_push_tail(&impl->locals, &var->node); - break; - - case nir_var_global: - exec_list_push_tail(&shader->globals, &var->node); - break; - - case nir_var_shader_in: - exec_list_push_tail(&shader->inputs, &var->node); - break; - - case nir_var_shader_out: - 
exec_list_push_tail(&shader->outputs, &var->node); - break; - - case nir_var_uniform: - case nir_var_shader_storage: - exec_list_push_tail(&shader->uniforms, &var->node); - break; - - case nir_var_system_value: - exec_list_push_tail(&shader->system_values, &var->node); - break; - - default: - unreachable("not reached"); - } + if (var->data.mode == nir_var_local) + nir_function_impl_add_variable(impl, var); + else + nir_shader_add_variable(shader, var); _mesa_hash_table_insert(var_table, ir, var); this->var = var; @@ -2074,13 +2049,10 @@ nir_visitor::visit(ir_constant *ir) * constant initializer and return a dereference. */ - nir_variable *var = ralloc(this->shader, nir_variable); - var->name = ralloc_strdup(var, "const_temp"); - var->type = ir->type; - var->data.mode = nir_var_local; + nir_variable *var = + nir_local_variable_create(this->impl, ir->type, "const_temp"); var->data.read_only = true; var->constant_initializer = constant_copy(ir, var); - exec_list_push_tail(&this->impl->locals, &var->node); this->deref_head = nir_deref_var_create(this->shader, var); this->deref_tail = &this->deref_head->deref; diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c index 83670889a29..793bdafb54b 100644 --- a/src/glsl/nir/nir.c +++ b/src/glsl/nir/nir.c @@ -103,6 +103,72 @@ nir_reg_remove(nir_register *reg) exec_node_remove(&reg->node); } +void +nir_shader_add_variable(nir_shader *shader, nir_variable *var) +{ + switch (var->data.mode) { + case nir_var_local: + assert(!"nir_shader_add_variable cannot be used for local variables"); + break; + + case nir_var_global: + exec_list_push_tail(&shader->globals, &var->node); + break; + + case nir_var_shader_in: + exec_list_push_tail(&shader->inputs, &var->node); + break; + + case nir_var_shader_out: + exec_list_push_tail(&shader->outputs, &var->node); + break; + + case nir_var_uniform: + case nir_var_shader_storage: + exec_list_push_tail(&shader->uniforms, &var->node); + break; + + case nir_var_system_value: +
exec_list_push_tail(&shader->system_values, &var->node); + break; + } +} + +nir_variable * +nir_variable_create(nir_shader *shader, nir_variable_mode mode, + const struct glsl_type *type, const char *name) +{ + nir_variable *var = rzalloc(shader, nir_variable); + var->name = ralloc_strdup(var, name); + var->type = type; + var->data.mode = mode; + + if ((mode == nir_var_shader_in && shader->stage != MESA_SHADER_VERTEX) || + (mode == nir_var_shader_out && shader->stage != MESA_SHADER_FRAGMENT)) + var->data.interpolation = INTERP_QUALIFIER_SMOOTH; + + if (mode == nir_var_shader_in || mode == nir_var_uniform) + var->data.read_only = true; + + nir_shader_add_variable(shader, var); + + return var; +} + +nir_variable * +nir_local_variable_create(nir_function_impl *impl, + const struct glsl_type *type, const char *name) +{ + nir_variable *var = rzalloc(impl->overload->function->shader, nir_variable); + var->name = ralloc_strdup(var, name); + var->type = type; + var->data.mode = nir_var_local; + + nir_function_impl_add_variable(impl, var); + + return var; +} + nir_function * nir_function_create(nir_shader *shader, const char *name) { diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index 112c6b5412a..c867e6d9f18 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -1559,6 +1559,26 @@ nir_register *nir_local_reg_create(nir_function_impl *impl); void nir_reg_remove(nir_register *reg); +/** Adds a variable to the appropreate list in nir_shader */ +void nir_shader_add_variable(nir_shader *shader, nir_variable *var); + +static inline void +nir_function_impl_add_variable(nir_function_impl *impl, nir_variable *var) +{ + assert(var->data.mode == nir_var_local); + exec_list_push_tail(&impl->locals, &var->node); +} + +/** creates a variable, sets a few defaults, and adds it to the list */ +nir_variable *nir_variable_create(nir_shader *shader, + nir_variable_mode mode, + const struct glsl_type *type, + const char *name); +/** creates a local variable and adds it to the list 
*/ +nir_variable *nir_local_variable_create(nir_function_impl *impl, + const struct glsl_type *type, + const char *name); + /** creates a function and adds it to the shader's list of functions */ nir_function *nir_function_create(nir_shader *shader, const char *name); diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c index d9b185461fe..fe8c238b159 100644 --- a/src/mesa/program/prog_to_nir.c +++ b/src/mesa/program/prog_to_nir.c @@ -958,11 +958,10 @@ setup_registers_and_variables(struct ptn_compile *c) for (int i = 0; i < num_inputs; i++) { if (!(c->prog->InputsRead & BITFIELD64_BIT(i))) continue; - nir_variable *var = rzalloc(shader, nir_variable); - var->type = glsl_vec4_type(); - var->data.read_only = true; - var->data.mode = nir_var_shader_in; - var->name = ralloc_asprintf(var, "in_%d", i); + + nir_variable *var = + nir_variable_create(shader, nir_var_shader_in, glsl_vec4_type(), + ralloc_asprintf(shader, "in_%d", i)); var->data.location = i; var->data.index = 0; @@ -992,12 +991,9 @@ setup_registers_and_variables(struct ptn_compile *c) nir_ssa_def *f001 = nir_vec4(b, &load_x->dest.ssa, nir_imm_float(b, 0.0), nir_imm_float(b, 0.0), nir_imm_float(b, 1.0)); - nir_variable *fullvar = rzalloc(shader, nir_variable); - fullvar->type = glsl_vec4_type(); - fullvar->data.mode = nir_var_local; - fullvar->name = "fogcoord_tmp"; - exec_list_push_tail(&b->impl->locals, &fullvar->node); - + nir_variable *fullvar = + nir_local_variable_create(b->impl, glsl_vec4_type(), + "fogcoord_tmp"); nir_intrinsic_instr *store = nir_intrinsic_instr_create(shader, nir_intrinsic_store_var); store->num_components = 4; @@ -1015,7 +1011,6 @@ setup_registers_and_variables(struct ptn_compile *c) } } - exec_list_push_tail(&shader->inputs, &var->node); c->input_vars[i] = var; } From b705005584730cff75f5bbe057832d8b1106124a Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Oct 2015 18:36:27 -0700 Subject: [PATCH 160/270] nir/glsl: Use shader_prog->Name for naming 
the NIR shader This has the better name to use. Apparently, sh->Name is usually 0. Reviewed-by: Kenneth Graunke Reviewed-by: Neil Roberts --- src/glsl/nir/glsl_to_nir.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index 0e4289b32eb..5aba8f80b6b 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -150,7 +150,7 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, if (sh->Program->SamplersUsed & (1 << i)) num_textures = i; - shader->info.name = ralloc_asprintf(shader, "GLSL%d", sh->Name); + shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name); shader->info.num_textures = num_textures; shader->info.num_ubos = sh->NumBufferInterfaceBlocks; shader->info.num_abos = shader_prog->NumAtomicBuffers; From 5f106153f55219d5092f9ba7c019316ea69baef4 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 15 Oct 2015 09:20:24 -0700 Subject: [PATCH 161/270] nir/prog: Don't double-insert the fog-coord variable nir_variable_create already inserts it in the right list for us so inserting it again causes a linked list corruption. Reviewed-by: Matt Turner --- src/mesa/program/prog_to_nir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c index fe8c238b159..da61a2b9bd3 100644 --- a/src/mesa/program/prog_to_nir.c +++ b/src/mesa/program/prog_to_nir.c @@ -1001,11 +1001,10 @@ setup_registers_and_variables(struct ptn_compile *c) store->src[0] = nir_src_for_ssa(f001); nir_builder_instr_insert(b, &store->instr); - /* Insert the real input into the list so the driver has real - * inputs, but set c->input_vars[i] to the temporary so we use + /* We inserted the real input into the list so the driver has real + * inputs, but we set c->input_vars[i] to the temporary so we use * the splatted value.
*/ - exec_list_push_tail(&shader->inputs, &var->node); c->input_vars[i] = fullvar; continue; } From d31005e3e5588b20760c774f14ac0ea80375a181 Mon Sep 17 00:00:00 2001 From: Chih-Wei Huang Date: Thu, 15 Oct 2015 23:46:30 +0800 Subject: [PATCH 162/270] nv50/ir: use C++11 standard std::unordered_map if possible Note Android version before Lollipop is not supported. Signed-off-by: Chih-Wei Huang Reviewed-by: Ilia Mirkin Cc: mesa-stable@lists.freedesktop.org --- .../drivers/nouveau/codegen/nv50_ir_ra.cpp | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index 400b9f09e51..7859c8e79bd 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -25,10 +25,24 @@ #include #include +#if __cplusplus >= 201103L +#include +#else #include +#endif namespace nv50_ir { +#if __cplusplus >= 201103L +using std::hash; +using std::unordered_map; +#elif !defined(ANDROID) +using std::tr1::hash; +using std::tr1::unordered_map; +#else +#error Android release before Lollipop is not supported! +#endif + #define MAX_REGISTER_FILE_SIZE 256 class RegisterSet @@ -349,12 +363,12 @@ RegAlloc::PhiMovesPass::needNewElseBlock(BasicBlock *b, BasicBlock *p) struct PhiMapHash { size_t operator()(const std::pair& val) const { - return std::tr1::hash()(val.first) * 31 + - std::tr1::hash()(val.second); + return hash()(val.first) * 31 + + hash()(val.second); } }; -typedef std::tr1::unordered_map< +typedef unordered_map< std::pair, Value *, PhiMapHash> PhiMap; // Critical edges need to be split up so that work can be inserted along From 7599f8b167321cb8adb2ba51a53163752b668532 Mon Sep 17 00:00:00 2001 From: Chih-Wei Huang Date: Thu, 15 Oct 2015 23:46:32 +0800 Subject: [PATCH 163/270] nv30: include the header of ffs prototype It fixes a building error of the android 6.0 64-bit target. 
Signed-off-by: Chih-Wei Huang Reviewed-by: Ilia Mirkin Cc: mesa-stable@lists.freedesktop.org --- src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c index 5757eb1fb16..dbbb8baad79 100644 --- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c +++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c @@ -1,3 +1,4 @@ +#include #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_state.h" From fefffdc2b21c35f4a08a55103ec1932faafe5993 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 15 Oct 2015 13:30:24 -0400 Subject: [PATCH 164/270] gallium/util: fix debug_get_flags_option on 32-bit harder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (yes, we want PRI?64, but we want the x version rather than the u version) Signed-off-by: Rob Clark Reviewed-by: Marek Olšák --- src/gallium/auxiliary/util/u_debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c index 5fe9e33e208..7388a499c74 100644 --- a/src/gallium/auxiliary/util/u_debug.c +++ b/src/gallium/auxiliary/util/u_debug.c @@ -276,7 +276,7 @@ debug_get_flags_option(const char *name, for (; flags->name; ++flags) namealign = MAX2(namealign, strlen(flags->name)); for (flags = orig; flags->name; ++flags) - _debug_printf("| %*s [0x%0*"PRIu64"]%s%s\n", namealign, flags->name, + _debug_printf("| %*s [0x%0*"PRIx64"]%s%s\n", namealign, flags->name, (int)sizeof(uint64_t)*CHAR_BIT/4, flags->value, flags->desc ? " " : "", flags->desc ? 
flags->desc : ""); } @@ -291,9 +291,9 @@ debug_get_flags_option(const char *name, if (debug_get_option_should_print()) { if (str) { - debug_printf("%s: %s = 0x%"PRIu64" (%s)\n", __FUNCTION__, name, result, str); + debug_printf("%s: %s = 0x%"PRIx64" (%s)\n", __FUNCTION__, name, result, str); } else { - debug_printf("%s: %s = 0x%"PRIu64"\n", __FUNCTION__, name, result); + debug_printf("%s: %s = 0x%"PRIx64"\n", __FUNCTION__, name, result); } } From 6206da736c84c4f7316ab586c886b4865fda8805 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 15 Oct 2015 16:22:23 -0400 Subject: [PATCH 165/270] freedreno/a3xx: cache-flush is needed after MEM_WRITE Otherwise the mem2gmem blit would see potentially bogus texture coordinates. Fixes an issue that shows up with glamor. CC: "11.0" Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/a3xx/fd3_emit.c | 6 +----- src/gallium/drivers/freedreno/a3xx/fd3_emit.h | 11 +++++++++++ src/gallium/drivers/freedreno/a3xx/fd3_gmem.c | 2 ++ 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 6153d92dc21..411f5b76329 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -798,11 +798,7 @@ fd3_emit_restore(struct fd_context *ctx) OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) | A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0)); - OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2); - OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0)); - OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) | - A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) | - A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE); + fd3_emit_cache_flush(ctx, ring); OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); OUT_RING(ring, 0x00000000); /* GRAS_CL_CLIP_CNTL */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h index 
795654706a7..42483f6c39b 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h @@ -90,4 +90,15 @@ void fd3_emit_restore(struct fd_context *ctx); void fd3_emit_init(struct pipe_context *pctx); +static inline void +fd3_emit_cache_flush(struct fd_context *ctx, struct fd_ringbuffer *ring) +{ + fd_wfi(ctx, ring); + OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2); + OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0)); + OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE); +} + #endif /* FD3_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index 9a5b45e2fcb..21fb59e450d 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -558,6 +558,8 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, fui(x1)); OUT_RING(ring, fui(y1)); + fd3_emit_cache_flush(ctx, ring); + for (i = 0; i < 4; i++) { OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1); OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) | From ef7a5638290234a9d1f0574585174539e2c126eb Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 15 Oct 2015 16:28:17 -0400 Subject: [PATCH 166/270] freedreno: add debug option to dirty state after draw Similar to "dclear", "ddraw" will mark all state dirty after each draw. 
Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/freedreno_draw.c | 3 +++ src/gallium/drivers/freedreno/freedreno_screen.c | 3 ++- src/gallium/drivers/freedreno/freedreno_util.h | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index 6831a58749c..7bf3343f43a 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -187,6 +187,9 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) for (i = 0; i < ctx->streamout.num_targets; i++) ctx->streamout.offsets[i] += prims; + if (fd_mesa_debug & FD_DBG_DDRAW) + ctx->dirty = 0xffffffff; + /* if an app (or, well, piglit test) does many thousands of draws * without flush (or anything which implicitly flushes, like * changing render targets), we can exceed the ringbuffer size. diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 0d0100590d6..b64f78ca32b 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -61,7 +61,7 @@ static const struct debug_named_value debug_options[] = { {"msgs", FD_DBG_MSGS, "Print debug messages"}, {"disasm", FD_DBG_DISASM, "Dump TGSI and adreno shader disassembly"}, {"dclear", FD_DBG_DCLEAR, "Mark all state dirty after clear"}, - {"flush", FD_DBG_FLUSH, "Force flush after every draw"}, + {"ddraw", FD_DBG_DDRAW, "Mark all state dirty after draw"}, {"noscis", FD_DBG_NOSCIS, "Disable scissor optimization"}, {"direct", FD_DBG_DIRECT, "Force inline (SS_DIRECT) state loads"}, {"nobypass", FD_DBG_NOBYPASS, "Disable GMEM bypass"}, @@ -70,6 +70,7 @@ static const struct debug_named_value debug_options[] = { {"optmsgs", FD_DBG_OPTMSGS,"Enable optimizer debug messages"}, {"glsl120", FD_DBG_GLSL120,"Temporary flag to force GLSL 1.20 (rather than 1.30) on a3xx+"}, {"shaderdb", 
FD_DBG_SHADERDB, "Enable shaderdb output"}, + {"flush", FD_DBG_FLUSH, "Force flush after every draw"}, DEBUG_NAMED_VALUE_END }; diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index 7129a1bddd1..0d2418e1e00 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -63,7 +63,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); #define FD_DBG_MSGS 0x0001 #define FD_DBG_DISASM 0x0002 #define FD_DBG_DCLEAR 0x0004 -#define FD_DBG_FLUSH 0x0008 +#define FD_DBG_DDRAW 0x0008 #define FD_DBG_NOSCIS 0x0010 #define FD_DBG_DIRECT 0x0020 #define FD_DBG_NOBYPASS 0x0040 @@ -72,6 +72,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); #define FD_DBG_OPTMSGS 0x0200 #define FD_DBG_GLSL120 0x0400 #define FD_DBG_SHADERDB 0x0800 +#define FD_DBG_FLUSH 0x1000 extern int fd_mesa_debug; extern bool fd_binning_enabled; From 6f9ca3026693e061ee55fa6d5f16d9ec0e744b59 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Thu, 15 Oct 2015 08:47:40 +0200 Subject: [PATCH 167/270] i965/fs: use the right number of UBOs Reviewed-by: Jason Ekstrand --- src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 0e044d01f1e..792663f2644 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -1432,7 +1432,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr */ brw_mark_surface_used(prog_data, stage_prog_data->binding_table.ubo_start + - nir->info.num_ssbos - 1); + nir->info.num_ubos - 1); } if (has_indirect) { From f534f331ca354bcb138e2b8f6d6d80147ee4a186 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Thu, 15 Oct 2015 08:48:03 +0200 Subject: [PATCH 168/270] i965/vec4: Use the right number of UBOs Reviewed-by: Jason Ekstrand --- 
src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 0025f3647a1..ea1e3e7bbcf 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -765,7 +765,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) */ brw_mark_surface_used(&prog_data->base, prog_data->base.binding_table.ubo_start + - nir->info.num_ssbos - 1); + nir->info.num_ubos - 1); } unsigned const_offset = instr->const_index[0]; From c8f5274b52682f4e1b767251b50f6191d8251079 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Thu, 15 Oct 2015 08:47:09 +0200 Subject: [PATCH 169/270] nir: Get the number of SSBOs and UBOs right Before d31f98a272e429d and 56e2bdbca36a20 we had a single index space for UBOs and SSBOs, so NumBufferInterfaceBlocks would contain the combined number of blocks, not just one kind. This means that for shader programs using both UBOs and SSBOs, we were setting num_ssbos and num_ubos to a larger number than we should. Since the above commits we have separate index spaces for each so we can just get the right numbers.
Reviewed-by: Jason Ekstrand --- src/glsl/nir/glsl_to_nir.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index 5aba8f80b6b..cf5bb9360c8 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -152,9 +152,9 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name); shader->info.num_textures = num_textures; - shader->info.num_ubos = sh->NumBufferInterfaceBlocks; + shader->info.num_ubos = sh->NumUniformBlocks; shader->info.num_abos = shader_prog->NumAtomicBuffers; - shader->info.num_ssbos = shader_prog->NumBufferInterfaceBlocks; + shader->info.num_ssbos = sh->NumShaderStorageBlocks; shader->info.num_images = sh->NumImages; shader->info.inputs_read = sh->Program->InputsRead; shader->info.outputs_written = sh->Program->OutputsWritten; From dc8c221e2890cc9913dfc99e1e0fcb73c89af52c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tapani=20P=C3=A4lli?= Date: Tue, 13 Oct 2015 08:49:57 +0300 Subject: [PATCH 170/270] mesa: Set api prefix to version string when overriding version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise there are problems when user overrides version and application such as Piglit wants to detect used api with glGetString(GL_VERSION). This makes it currently impossible to run glslparsertest tests for OpenGL ES when using version override. Below is example when using MESA_GLES_VERSION_OVERRIDE=3.1. 
Before: "3.1 Mesa 11.1.0-devel (git-24a1a15)" After: "OpenGL ES 3.1 Mesa 11.1.0-devel (git-78042ff)" v2: only include api prefix for OpenGL ES (Boyan Ding) Signed-off-by: Tapani Pälli Reviewed-by: Iago Toral Quiroga Cc: "11.0" --- src/mesa/main/version.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c index 498b2f867d0..5635a643200 100644 --- a/src/mesa/main/version.c +++ b/src/mesa/main/version.c @@ -24,6 +24,7 @@ #include +#include "context.h" #include "imports.h" #include "mtypes.h" #include "version.h" @@ -181,7 +182,23 @@ _mesa_override_gl_version(struct gl_context *ctx) { if (_mesa_override_gl_version_contextless(&ctx->Const, &ctx->API, &ctx->Version)) { - create_version_string(ctx, ""); + /* We need to include API in version string for OpenGL ES, otherwise + * application can not detect GLES via glGetString(GL_VERSION) query. + * + * From OpenGL ES 3.2 spec, Page 436: + * + * "The VERSION string is laid out as follows: + * + * OpenGL ES N.M vendor-specific information" + * + * From OpenGL 4.5 spec, Page 538: + * + * "The VERSION and SHADING_LANGUAGE_VERSION strings are laid out as + * follows: + * + * <version number><space><vendor-specific information>" + */ + create_version_string(ctx, _mesa_is_gles(ctx) ? "OpenGL ES " : ""); } } From ccbb52ac1117aa99144785bc032dd459f24b8ba1 Mon Sep 17 00:00:00 2001 From: Samuel Iglesias Gonsalvez Date: Fri, 16 Oct 2015 08:44:38 +0200 Subject: [PATCH 171/270] glsl: fix check SSBOs support for builtin functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit has_shader_storage_buffer_objects() returns true also if the OpenGL context is 4.30 or ES 3.1. Previously, we were saying that all atomic*() GLSL builtin functions for SSBOs were not available when OpenGL ES 3.1 context was in use.
Fixes 48 dEQP-GLES31 tests: dEQP-GLES31.functional.ssbo.atomic.* Signed-off-by: Samuel Iglesias Gonsalvez Reviewed-by: Tapani Pälli Reviewed-by: Iago Toral Quiroga --- src/glsl/builtin_functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp index f0f6be21b7d..aae25f893e8 100644 --- a/src/glsl/builtin_functions.cpp +++ b/src/glsl/builtin_functions.cpp @@ -403,7 +403,7 @@ shader_atomic_counters(const _mesa_glsl_parse_state *state) static bool shader_storage_buffer_object(const _mesa_glsl_parse_state *state) { - return state->ARB_shader_storage_buffer_object_enable; + return state->has_shader_storage_buffer_objects(); } static bool From 4627e8058ec1bc02cc2b6464ff83394b2d16879e Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Fri, 16 Oct 2015 09:10:22 -0600 Subject: [PATCH 172/270] Revert "mesa: remove FLUSH_VERTICES() in _mesa_MatrixMode()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 0de5e0f3fb0f3671a3ecec6ab4473f9131ecd0ae. Michel Dänzer spotted two piglit regressions from the change. I suspect that removing the FLUSH_VERTICES() actually exposed a bug elsewhere but I don't have time to hunt down the root issue at this time. 
--- src/mesa/main/matrix.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mesa/main/matrix.c b/src/mesa/main/matrix.c index 5ff5ac5bfe1..2b8016a4a72 100644 --- a/src/mesa/main/matrix.c +++ b/src/mesa/main/matrix.c @@ -151,6 +151,7 @@ _mesa_MatrixMode( GLenum mode ) if (ctx->Transform.MatrixMode == mode && mode != GL_TEXTURE) return; + FLUSH_VERTICES(ctx, _NEW_TRANSFORM); switch (mode) { case GL_MODELVIEW: From afff809fea373f849b648983bff8390c090b7145 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 15 Oct 2015 07:25:51 -0600 Subject: [PATCH 173/270] st/mesa: fix incorrect pointer type arguments in st_new_program() Silences 5 warnings of the type: state_tracker/st_cb_program.c: In function 'st_new_program': state_tracker/st_cb_program.c:108:7: warning: passing argument 1 of '_mesa_init_gl_program' from incompatible pointer type [enabled by default] return _mesa_init_gl_program(&prog->Base, target, id); ^ Reviewed-by: Emil Velikov --- src/mesa/state_tracker/st_cb_program.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c index 26d128abd38..708bdf5011e 100644 --- a/src/mesa/state_tracker/st_cb_program.c +++ b/src/mesa/state_tracker/st_cb_program.c @@ -105,23 +105,23 @@ st_new_program(struct gl_context *ctx, GLenum target, GLuint id) switch (target) { case GL_VERTEX_PROGRAM_ARB: { struct st_vertex_program *prog = ST_CALLOC_STRUCT(st_vertex_program); - return _mesa_init_gl_program(&prog->Base, target, id); + return _mesa_init_gl_program(&prog->Base.Base, target, id); } case GL_FRAGMENT_PROGRAM_ARB: { struct st_fragment_program *prog = ST_CALLOC_STRUCT(st_fragment_program); - return _mesa_init_gl_program(&prog->Base, target, id); + return _mesa_init_gl_program(&prog->Base.Base, target, id); } case GL_GEOMETRY_PROGRAM_NV: { struct st_geometry_program *prog = ST_CALLOC_STRUCT(st_geometry_program); - return _mesa_init_gl_program(&prog->Base, target, id); + 
return _mesa_init_gl_program(&prog->Base.Base, target, id); } case GL_TESS_CONTROL_PROGRAM_NV: { struct st_tessctrl_program *prog = ST_CALLOC_STRUCT(st_tessctrl_program); - return _mesa_init_gl_program(&prog->Base, target, id); + return _mesa_init_gl_program(&prog->Base.Base, target, id); } case GL_TESS_EVALUATION_PROGRAM_NV: { struct st_tesseval_program *prog = ST_CALLOC_STRUCT(st_tesseval_program); - return _mesa_init_gl_program(&prog->Base, target, id); + return _mesa_init_gl_program(&prog->Base.Base, target, id); } default: assert(0); From cb473c46feb17d652f69836bf0d8843803fe77cd Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 15 Oct 2015 07:26:49 -0600 Subject: [PATCH 174/270] glsl: silence warning about unhandled ast_unsized_array_dim case in switch Reviewed-by: Timothy Arceri --- src/glsl/ast_to_hir.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index cd40fe343e3..ede02d94cb2 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -2017,6 +2017,9 @@ ast_expression::has_sequence_subexpression() const case ast_function_call: unreachable("should be handled by ast_function_expression::hir"); + + case ast_unsized_array_dim: + unreachable("ast_unsized_array_dim: Should never get here."); } return false; From 615b37a0e260ad8bd108a3e57a2a6f0eb6284246 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 8 Oct 2015 21:00:37 -0600 Subject: [PATCH 175/270] svga: remove svga_tgsi_vgpu9_translate() call in GS path We can never have geometry shaders with vgpu9. 
Reviewed-by: Charmaine Lee --- src/gallium/drivers/svga/svga_state_gs.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/svga/svga_state_gs.c b/src/gallium/drivers/svga/svga_state_gs.c index 7f75410fb57..0b336baee86 100644 --- a/src/gallium/drivers/svga/svga_state_gs.c +++ b/src/gallium/drivers/svga/svga_state_gs.c @@ -53,13 +53,9 @@ translate_geometry_program(struct svga_context *svga, const struct svga_geometry_shader *gs, const struct svga_compile_key *key) { - if (svga_have_vgpu10(svga)) { - return svga_tgsi_vgpu10_translate(svga, &gs->base, key, - PIPE_SHADER_GEOMETRY); - } - else { - return svga_tgsi_vgpu9_translate(&gs->base, key, PIPE_SHADER_GEOMETRY); - } + assert(svga_have_vgpu10(svga)); + return svga_tgsi_vgpu10_translate(svga, &gs->base, key, + PIPE_SHADER_GEOMETRY); } From 8d0d5dca5bd076ef363d3e923e6473916f3a9d35 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 8 Oct 2015 21:03:27 -0600 Subject: [PATCH 176/270] svga: pass context to svga_tgsi_vgpu9_translate() Will be used for upcoming change. 
Reviewed-by: Charmaine Lee --- src/gallium/drivers/svga/svga_state_fs.c | 3 ++- src/gallium/drivers/svga/svga_state_vs.c | 3 ++- src/gallium/drivers/svga/svga_tgsi.c | 3 ++- src/gallium/drivers/svga/svga_tgsi.h | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c index c244d5352d9..e392778c2fb 100644 --- a/src/gallium/drivers/svga/svga_state_fs.c +++ b/src/gallium/drivers/svga/svga_state_fs.c @@ -90,7 +90,8 @@ translate_fragment_program(struct svga_context *svga, PIPE_SHADER_FRAGMENT); } else { - return svga_tgsi_vgpu9_translate(&fs->base, key, PIPE_SHADER_FRAGMENT); + return svga_tgsi_vgpu9_translate(svga, &fs->base, key, + PIPE_SHADER_FRAGMENT); } } diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c index a846b779e70..24574c1bf85 100644 --- a/src/gallium/drivers/svga/svga_state_vs.c +++ b/src/gallium/drivers/svga/svga_state_vs.c @@ -81,7 +81,8 @@ translate_vertex_program(struct svga_context *svga, PIPE_SHADER_VERTEX); } else { - return svga_tgsi_vgpu9_translate(&vs->base, key, PIPE_SHADER_VERTEX); + return svga_tgsi_vgpu9_translate(svga, &vs->base, key, + PIPE_SHADER_VERTEX); } } diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c index 9a6fb465ccb..a047570ae36 100644 --- a/src/gallium/drivers/svga/svga_tgsi.c +++ b/src/gallium/drivers/svga/svga_tgsi.c @@ -175,7 +175,8 @@ svga_shader_emit_header(struct svga_shader_emitter *emit) * it is, it will be copied to a hardware buffer for upload. 
*/ struct svga_shader_variant * -svga_tgsi_vgpu9_translate(const struct svga_shader *shader, +svga_tgsi_vgpu9_translate(struct svga_context *svga, + const struct svga_shader *shader, const struct svga_compile_key *key, unsigned unit) { struct svga_shader_variant *variant = NULL; diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h index 207a3f0a845..2581135701f 100644 --- a/src/gallium/drivers/svga/svga_tgsi.h +++ b/src/gallium/drivers/svga/svga_tgsi.h @@ -63,7 +63,8 @@ static inline void svga_generate_vdecl_semantics( unsigned idx, struct svga_shader_variant * -svga_tgsi_vgpu9_translate(const struct svga_shader *shader, +svga_tgsi_vgpu9_translate(struct svga_context *svga, + const struct svga_shader *shader, const struct svga_compile_key *key, unsigned unit); struct svga_shader_variant * From f413f1a17c506d5d4474a1baa0556a9e9f554c63 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Thu, 8 Oct 2015 21:06:18 -0600 Subject: [PATCH 177/270] svga: use new svga_new_shader_variant() function To simplify upcoming new HUD shader count implementation. 
Reviewed-by: Charmaine Lee --- src/gallium/drivers/svga/svga_shader.c | 7 +++++++ src/gallium/drivers/svga/svga_shader.h | 3 +++ src/gallium/drivers/svga/svga_tgsi.c | 2 +- src/gallium/drivers/svga/svga_tgsi_vgpu10.c | 2 +- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/svga/svga_shader.c b/src/gallium/drivers/svga/svga_shader.c index d46e7ebbc38..7b847558db1 100644 --- a/src/gallium/drivers/svga/svga_shader.c +++ b/src/gallium/drivers/svga/svga_shader.c @@ -414,6 +414,13 @@ svga_set_shader(struct svga_context *svga, } +struct svga_shader_variant * +svga_new_shader_variant(struct svga_context *svga) +{ + return CALLOC_STRUCT(svga_shader_variant); +} + + enum pipe_error svga_destroy_shader_variant(struct svga_context *svga, SVGA3dShaderType type, diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h index b0800c1ecad..efcac408626 100644 --- a/src/gallium/drivers/svga/svga_shader.h +++ b/src/gallium/drivers/svga/svga_shader.h @@ -273,6 +273,9 @@ svga_set_shader(struct svga_context *svga, SVGA3dShaderType type, struct svga_shader_variant *variant); +struct svga_shader_variant * +svga_new_shader_variant(struct svga_context *svga); + enum pipe_error svga_destroy_shader_variant(struct svga_context *svga, SVGA3dShaderType type, diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c index a047570ae36..202eee276b7 100644 --- a/src/gallium/drivers/svga/svga_tgsi.c +++ b/src/gallium/drivers/svga/svga_tgsi.c @@ -228,7 +228,7 @@ svga_tgsi_vgpu9_translate(struct svga_context *svga, goto fail; } - variant = CALLOC_STRUCT(svga_shader_variant); + variant = svga_new_shader_variant(svga); if (variant == NULL) goto fail; diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c index e4f027b9567..d62f2bbcc96 100644 --- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c +++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c @@ -6735,7 +6735,7 
@@ svga_tgsi_vgpu10_translate(struct svga_context *svga, /* * Create, initialize the 'variant' object. */ - variant = CALLOC_STRUCT(svga_shader_variant); + variant = svga_new_shader_variant(svga); if (!variant) goto cleanup; From 9bc7e3105aeadbe360ca9f060c50a181d3fa7a3d Mon Sep 17 00:00:00 2001 From: Neha Bhende Date: Fri, 9 Oct 2015 16:10:16 -0600 Subject: [PATCH 178/270] svga: add new GALLIUM_HUD queries Add new GALLIUM_HUD queries for: num-shaders num-resources num-state-objects num-validations map-buffer-time num-surface-views num-resources-mapped num-flushes Most of this patch was originally written by Neha. Additional clean-ups and num-flushes counter added by Brian Paul. Reviewed-by: Brian Paul Reviewed-by: Charmaine Lee --- src/gallium/drivers/svga/svga_context.c | 2 + src/gallium/drivers/svga/svga_context.h | 34 +++++-- src/gallium/drivers/svga/svga_pipe_blend.c | 3 + .../drivers/svga/svga_pipe_depthstencil.c | 3 + src/gallium/drivers/svga/svga_pipe_draw.c | 4 +- src/gallium/drivers/svga/svga_pipe_query.c | 97 ++++++++++++++++--- .../drivers/svga/svga_pipe_rasterizer.c | 3 + src/gallium/drivers/svga/svga_pipe_sampler.c | 3 + src/gallium/drivers/svga/svga_pipe_vertex.c | 4 + .../drivers/svga/svga_resource_buffer.c | 17 +++- .../drivers/svga/svga_resource_texture.c | 27 ++++-- src/gallium/drivers/svga/svga_screen.c | 19 +++- src/gallium/drivers/svga/svga_screen.h | 8 +- src/gallium/drivers/svga/svga_shader.c | 3 + src/gallium/drivers/svga/svga_state.c | 3 + src/gallium/drivers/svga/svga_surface.c | 4 + 16 files changed, 196 insertions(+), 38 deletions(-) diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c index 2bf795de22d..f8622b96f45 100644 --- a/src/gallium/drivers/svga/svga_context.c +++ b/src/gallium/drivers/svga/svga_context.c @@ -312,6 +312,8 @@ void svga_context_flush( struct svga_context *svga, */ svga->swc->flush(svga->swc, &fence); + svga->hud.num_flushes++; + svga_screen_cache_flush(svgascreen, fence); 
/* To force the re-emission of rendertargets and texture sampler bindings on diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h index e8575f36c3b..bcce18a3502 100644 --- a/src/gallium/drivers/svga/svga_context.h +++ b/src/gallium/drivers/svga/svga_context.h @@ -44,10 +44,21 @@ /** Non-GPU queries for gallium HUD */ -#define SVGA_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0) -#define SVGA_QUERY_FALLBACKS (PIPE_QUERY_DRIVER_SPECIFIC + 1) -#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 2) -#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 3) +/* per-frame counters */ +#define SVGA_QUERY_NUM_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0) +#define SVGA_QUERY_NUM_FALLBACKS (PIPE_QUERY_DRIVER_SPECIFIC + 1) +#define SVGA_QUERY_NUM_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 2) +#define SVGA_QUERY_NUM_VALIDATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 3) +#define SVGA_QUERY_MAP_BUFFER_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 4) +#define SVGA_QUERY_NUM_RESOURCES_MAPPED (PIPE_QUERY_DRIVER_SPECIFIC + 5) +/* running total counters */ +#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 6) +#define SVGA_QUERY_NUM_SHADERS (PIPE_QUERY_DRIVER_SPECIFIC + 7) +#define SVGA_QUERY_NUM_RESOURCES (PIPE_QUERY_DRIVER_SPECIFIC + 8) +#define SVGA_QUERY_NUM_STATE_OBJECTS (PIPE_QUERY_DRIVER_SPECIFIC + 9) +#define SVGA_QUERY_NUM_SURFACE_VIEWS (PIPE_QUERY_DRIVER_SPECIFIC + 10) +/*SVGA_QUERY_MAX has to be last because it is size of an array*/ +#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 11) /** * Maximum supported number of constant buffers per shader @@ -463,9 +474,18 @@ struct svga_context /** List of buffers with queued transfers */ struct list_head dirty_buffers; - /** performance / info queries */ - uint64_t num_draw_calls; /**< SVGA_QUERY_DRAW_CALLS */ - uint64_t num_fallbacks; /**< SVGA_QUERY_FALLBACKS */ + /** performance / info queries for HUD */ + struct { + uint64_t num_draw_calls; /**< SVGA_QUERY_DRAW_CALLS */ + uint64_t 
num_fallbacks; /**< SVGA_QUERY_NUM_FALLBACKS */ + uint64_t num_flushes; /**< SVGA_QUERY_NUM_FLUSHES */ + uint64_t num_validations; /**< SVGA_QUERY_NUM_VALIDATIONS */ + uint64_t map_buffer_time; /**< SVGA_QUERY_MAP_BUFFER_TIME */ + uint64_t num_resources_mapped; /**< SVGA_QUERY_NUM_RESOURCES_MAPPED */ + uint64_t num_shaders; /**< SVGA_QUERY_NUM_SHADERS */ + uint64_t num_state_objects; /**< SVGA_QUERY_NUM_STATE_OBJECTS */ + uint64_t num_surface_views; /**< SVGA_QUERY_NUM_SURFACE_VIEWS */ + } hud; /** The currently bound stream output targets */ unsigned num_so_targets; diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c index 06bb3e3bd7e..0c9d6129b53 100644 --- a/src/gallium/drivers/svga/svga_pipe_blend.c +++ b/src/gallium/drivers/svga/svga_pipe_blend.c @@ -321,6 +321,8 @@ svga_create_blend_state(struct pipe_context *pipe, define_blend_state_object(svga, blend); } + svga->hud.num_state_objects++; + return blend; } @@ -359,6 +361,7 @@ static void svga_delete_blend_state(struct pipe_context *pipe, } FREE(blend); + svga->hud.num_state_objects--; } static void svga_set_blend_color( struct pipe_context *pipe, diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c index 5ea623be4d9..d84ed1df48e 100644 --- a/src/gallium/drivers/svga/svga_pipe_depthstencil.c +++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c @@ -202,6 +202,8 @@ svga_create_depth_stencil_state(struct pipe_context *pipe, define_depth_stencil_state_object(svga, ds); } + svga->hud.num_state_objects++; + return ds; } @@ -248,6 +250,7 @@ static void svga_delete_depth_stencil_state(struct pipe_context *pipe, } FREE(depth_stencil); + svga->hud.num_state_objects--; } diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c index 303d4565cdb..50ebb53df90 100644 --- a/src/gallium/drivers/svga/svga_pipe_draw.c +++ b/src/gallium/drivers/svga/svga_pipe_draw.c @@ -177,7 
+177,7 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) enum pipe_error ret = 0; boolean needed_swtnl; - svga->num_draw_calls++; /* for SVGA_QUERY_DRAW_CALLS */ + svga->hud.num_draw_calls++; /* for SVGA_QUERY_NUM_DRAW_CALLS */ if (u_reduced_prim(info->mode) == PIPE_PRIM_TRIANGLES && svga->curr.rast->templ.cull_face == PIPE_FACE_FRONT_AND_BACK) @@ -219,7 +219,7 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) #endif if (svga->state.sw.need_swtnl) { - svga->num_fallbacks++; /* for SVGA_QUERY_FALLBACKS */ + svga->hud.num_fallbacks++; /* for SVGA_QUERY_NUM_FALLBACKS */ if (!needed_swtnl) { /* * We're switching from HW to SW TNL. SW TNL will require mapping all diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c index 7081e5a1c43..8b9818334ca 100644 --- a/src/gallium/drivers/svga/svga_pipe_query.c +++ b/src/gallium/drivers/svga/svga_pipe_query.c @@ -720,9 +720,17 @@ svga_create_query(struct pipe_context *pipe, define_query_vgpu10(svga, sq, sizeof(SVGADXTimestampQueryResult)); break; - case SVGA_QUERY_DRAW_CALLS: - case SVGA_QUERY_FALLBACKS: + case SVGA_QUERY_NUM_DRAW_CALLS: + case SVGA_QUERY_NUM_FALLBACKS: + case SVGA_QUERY_NUM_FLUSHES: case SVGA_QUERY_MEMORY_USED: + case SVGA_QUERY_NUM_SHADERS: + case SVGA_QUERY_NUM_RESOURCES: + case SVGA_QUERY_NUM_STATE_OBJECTS: + case SVGA_QUERY_NUM_VALIDATIONS: + case SVGA_QUERY_MAP_BUFFER_TIME: + case SVGA_QUERY_NUM_SURFACE_VIEWS: + case SVGA_QUERY_NUM_RESOURCES_MAPPED: break; default: assert(!"unexpected query type in svga_create_query()"); @@ -778,9 +786,17 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q) destroy_query_vgpu10(svga, sq); sws->fence_reference(sws, &sq->fence, NULL); break; - case SVGA_QUERY_DRAW_CALLS: - case SVGA_QUERY_FALLBACKS: + case SVGA_QUERY_NUM_DRAW_CALLS: + case SVGA_QUERY_NUM_FALLBACKS: + case SVGA_QUERY_NUM_FLUSHES: case SVGA_QUERY_MEMORY_USED: + case SVGA_QUERY_NUM_SHADERS: 
+ case SVGA_QUERY_NUM_RESOURCES: + case SVGA_QUERY_NUM_STATE_OBJECTS: + case SVGA_QUERY_NUM_VALIDATIONS: + case SVGA_QUERY_MAP_BUFFER_TIME: + case SVGA_QUERY_NUM_SURFACE_VIEWS: + case SVGA_QUERY_NUM_RESOURCES_MAPPED: /* nothing */ break; default: @@ -842,13 +858,29 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q) ret = begin_query_vgpu10(svga, sq); assert(ret == PIPE_OK); break; - case SVGA_QUERY_DRAW_CALLS: - sq->begin_count = svga->num_draw_calls; + case SVGA_QUERY_NUM_DRAW_CALLS: + sq->begin_count = svga->hud.num_draw_calls; break; - case SVGA_QUERY_FALLBACKS: - sq->begin_count = svga->num_fallbacks; + case SVGA_QUERY_NUM_FALLBACKS: + sq->begin_count = svga->hud.num_fallbacks; + break; + case SVGA_QUERY_NUM_FLUSHES: + sq->begin_count = svga->hud.num_flushes; + break; + case SVGA_QUERY_NUM_VALIDATIONS: + sq->begin_count = svga->hud.num_validations; + break; + case SVGA_QUERY_MAP_BUFFER_TIME: + sq->begin_count = svga->hud.map_buffer_time; + break; + case SVGA_QUERY_NUM_RESOURCES_MAPPED: + sq->begin_count = svga->hud.num_resources_mapped; break; case SVGA_QUERY_MEMORY_USED: + case SVGA_QUERY_NUM_SHADERS: + case SVGA_QUERY_NUM_RESOURCES: + case SVGA_QUERY_NUM_STATE_OBJECTS: + case SVGA_QUERY_NUM_SURFACE_VIEWS: /* nothing */ break; default: @@ -916,13 +948,29 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q) ret = end_query_vgpu10(svga, sq); assert(ret == PIPE_OK); break; - case SVGA_QUERY_DRAW_CALLS: - sq->end_count = svga->num_draw_calls; + case SVGA_QUERY_NUM_DRAW_CALLS: + sq->end_count = svga->hud.num_draw_calls; break; - case SVGA_QUERY_FALLBACKS: - sq->end_count = svga->num_fallbacks; + case SVGA_QUERY_NUM_FALLBACKS: + sq->end_count = svga->hud.num_fallbacks; + break; + case SVGA_QUERY_NUM_FLUSHES: + sq->end_count = svga->hud.num_flushes; + break; + case SVGA_QUERY_NUM_VALIDATIONS: + sq->end_count = svga->hud.num_validations; + break; + case SVGA_QUERY_MAP_BUFFER_TIME: + sq->end_count = svga->hud.map_buffer_time; + break; + 
case SVGA_QUERY_NUM_RESOURCES_MAPPED: + sq->end_count = svga->hud.num_resources_mapped; break; case SVGA_QUERY_MEMORY_USED: + case SVGA_QUERY_NUM_SHADERS: + case SVGA_QUERY_NUM_RESOURCES: + case SVGA_QUERY_NUM_STATE_OBJECTS: + case SVGA_QUERY_NUM_SURFACE_VIEWS: /* nothing */ break; default: @@ -1007,13 +1055,30 @@ svga_get_query_result(struct pipe_context *pipe, *result = (uint64_t)sResult.numPrimitivesWritten; break; } - case SVGA_QUERY_DRAW_CALLS: - /* fall-through */ - case SVGA_QUERY_FALLBACKS: + /* These are per-frame counters */ + case SVGA_QUERY_NUM_DRAW_CALLS: + case SVGA_QUERY_NUM_FALLBACKS: + case SVGA_QUERY_NUM_FLUSHES: + case SVGA_QUERY_NUM_VALIDATIONS: + case SVGA_QUERY_NUM_RESOURCES_MAPPED: + case SVGA_QUERY_MAP_BUFFER_TIME: vresult->u64 = sq->end_count - sq->begin_count; break; + /* These are running total counters */ case SVGA_QUERY_MEMORY_USED: - vresult->u64 = svgascreen->total_resource_bytes; + vresult->u64 = svgascreen->hud.total_resource_bytes; + break; + case SVGA_QUERY_NUM_SHADERS: + vresult->u64 = svga->hud.num_shaders; + break; + case SVGA_QUERY_NUM_RESOURCES: + vresult->u64 = svgascreen->hud.num_resources; + break; + case SVGA_QUERY_NUM_STATE_OBJECTS: + vresult->u64 = svga->hud.num_state_objects; + break; + case SVGA_QUERY_NUM_SURFACE_VIEWS: + vresult->u64 = svga->hud.num_surface_views; break; default: assert(!"unexpected query type in svga_get_query_result"); diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c index a7aadac0111..6310b7a5e86 100644 --- a/src/gallium/drivers/svga/svga_pipe_rasterizer.c +++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c @@ -352,6 +352,8 @@ svga_create_rasterizer_state(struct pipe_context *pipe, define_rasterizer_object(svga, rast); } + svga->hud.num_state_objects++; + return rast; } @@ -392,6 +394,7 @@ svga_delete_rasterizer_state(struct pipe_context *pipe, void *state) } FREE(state); + svga->hud.num_state_objects--; } diff --git 
a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c index 60e2d44ace4..95241176510 100644 --- a/src/gallium/drivers/svga/svga_pipe_sampler.c +++ b/src/gallium/drivers/svga/svga_pipe_sampler.c @@ -273,6 +273,8 @@ svga_create_sampler_state(struct pipe_context *pipe, cso->min_lod, cso->view_min_lod, cso->view_max_lod, cso->mipfilter == SVGA3D_TEX_FILTER_NONE ? "SVGA3D_TEX_FILTER_NONE" : "SOMETHING"); + svga->hud.num_state_objects++; + return cso; } @@ -328,6 +330,7 @@ static void svga_delete_sampler_state(struct pipe_context *pipe, } FREE(sampler); + svga->hud.num_state_objects--; } diff --git a/src/gallium/drivers/svga/svga_pipe_vertex.c b/src/gallium/drivers/svga/svga_pipe_vertex.c index e0932a9dbc1..b932c568f53 100644 --- a/src/gallium/drivers/svga/svga_pipe_vertex.c +++ b/src/gallium/drivers/svga/svga_pipe_vertex.c @@ -274,6 +274,9 @@ svga_create_vertex_elements_state(struct pipe_context *pipe, translate_vertex_decls(svga, velems); } } + + svga->hud.num_state_objects++; + return velems; } @@ -315,6 +318,7 @@ svga_delete_vertex_elements_state(struct pipe_context *pipe, void *state) } FREE(velems); + svga->hud.num_state_objects--; } void svga_cleanup_vertex_state( struct svga_context *svga ) diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c index 57e37fcfe14..6a8fff454e4 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer.c +++ b/src/gallium/drivers/svga/svga_resource_buffer.c @@ -29,6 +29,7 @@ #include "pipe/p_defines.h" #include "util/u_inlines.h" #include "os/os_thread.h" +#include "os/os_time.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_resource.h" @@ -77,6 +78,7 @@ svga_buffer_transfer_map(struct pipe_context *pipe, struct svga_buffer *sbuf = svga_buffer(resource); struct pipe_transfer *transfer; uint8_t *map; + int64_t begin = os_time_get(); transfer = CALLOC_STRUCT(pipe_transfer); if (transfer == NULL) { @@ -244,6 +246,9 @@ 
svga_buffer_transfer_map(struct pipe_context *pipe, FREE(transfer); } + svga->hud.map_buffer_time += (os_time_get() - begin); + svga->hud.num_resources_mapped++; + return map; } @@ -331,7 +336,10 @@ svga_buffer_destroy( struct pipe_screen *screen, if (sbuf->swbuf && !sbuf->user) align_free(sbuf->swbuf); - ss->total_resource_bytes -= sbuf->size; + ss->hud.total_resource_bytes -= sbuf->size; + assert(ss->hud.num_resources > 0); + if (ss->hud.num_resources > 0) + ss->hud.num_resources--; FREE(sbuf); } @@ -409,7 +417,9 @@ svga_buffer_create(struct pipe_screen *screen, (debug_reference_descriptor)debug_describe_resource, 0); sbuf->size = util_resource_size(&sbuf->b.b); - ss->total_resource_bytes += sbuf->size; + ss->hud.total_resource_bytes += sbuf->size; + + ss->hud.num_resources++; return &sbuf->b.b; @@ -427,6 +437,7 @@ svga_user_buffer_create(struct pipe_screen *screen, unsigned bind) { struct svga_buffer *sbuf; + struct svga_screen *ss = svga_screen(screen); sbuf = CALLOC_STRUCT(svga_buffer); if (!sbuf) @@ -450,6 +461,8 @@ svga_user_buffer_create(struct pipe_screen *screen, debug_reference(&sbuf->b.b.reference, (debug_reference_descriptor)debug_describe_resource, 0); + ss->hud.num_resources++; + return &sbuf->b.b; no_sbuf: diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c index 90787be8073..a02d1e495ff 100644 --- a/src/gallium/drivers/svga/svga_resource_texture.c +++ b/src/gallium/drivers/svga/svga_resource_texture.c @@ -29,6 +29,7 @@ #include "pipe/p_state.h" #include "pipe/p_defines.h" #include "os/os_thread.h" +#include "os/os_time.h" #include "util/u_format.h" #include "util/u_inlines.h" #include "util/u_math.h" @@ -229,11 +230,15 @@ svga_texture_destroy(struct pipe_screen *screen, SVGA_DBG(DEBUG_DMA, "unref sid %p (texture)\n", tex->handle); svga_screen_surface_destroy(ss, &tex->key, &tex->handle); - ss->total_resource_bytes -= tex->size; + ss->hud.total_resource_bytes -= tex->size; 
FREE(tex->defined); FREE(tex->rendered_to); FREE(tex); + + assert(ss->hud.num_resources > 0); + if (ss->hud.num_resources > 0) + ss->hud.num_resources--; } @@ -322,6 +327,8 @@ svga_texture_transfer_map(struct pipe_context *pipe, boolean use_direct_map = svga_have_gb_objects(svga) && !svga_have_gb_dma(svga); unsigned d; + void *returnVal; + int64_t begin = os_time_get(); /* We can't map texture storage directly unless we have GB objects */ if (usage & PIPE_TRANSFER_MAP_DIRECTLY) { @@ -464,10 +471,10 @@ svga_texture_transfer_map(struct pipe_context *pipe, * Begin mapping code */ if (st->swbuf) { - return st->swbuf; + returnVal = st->swbuf; } else if (!st->use_direct_map) { - return sws->buffer_map(sws, st->hwbuf, usage); + returnVal = sws->buffer_map(sws, st->hwbuf, usage); } else { SVGA3dSize baseLevelSize; @@ -518,9 +525,13 @@ svga_texture_transfer_map(struct pipe_context *pipe, offset += svga3dsurface_get_pixel_offset(tex->key.format, mip_width, mip_height, xoffset, yoffset, zoffset); - - return (void *) (map + offset); + returnVal = (void *) (map + offset); } + + svga->hud.map_buffer_time += (os_time_get() - begin); + svga->hud.num_resources_mapped++; + + return returnVal; } @@ -889,7 +900,8 @@ svga_texture_create(struct pipe_screen *screen, (debug_reference_descriptor)debug_describe_resource, 0); tex->size = util_resource_size(template); - svgascreen->total_resource_bytes += tex->size; + svgascreen->hud.total_resource_bytes += tex->size; + svgascreen->hud.num_resources++; return &tex->b.b; } @@ -901,6 +913,7 @@ svga_texture_from_handle(struct pipe_screen *screen, struct winsys_handle *whandle) { struct svga_winsys_screen *sws = svga_winsys_screen(screen); + struct svga_screen *ss = svga_screen(screen); struct svga_winsys_surface *srf; struct svga_texture *tex; enum SVGA3dSurfaceFormat format = 0; @@ -970,5 +983,7 @@ svga_texture_from_handle(struct pipe_screen *screen, tex->rendered_to = CALLOC(1, sizeof(tex->rendered_to[0])); tex->imported = TRUE; + 
ss->hud.num_resources++; + return &tex->b.b; } diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index e0a28788238..dab89814334 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -772,9 +772,22 @@ svga_get_driver_query_info(struct pipe_screen *screen, struct pipe_driver_query_info *info) { static const struct pipe_driver_query_info queries[] = { - {"draw-calls", SVGA_QUERY_DRAW_CALLS, {0}}, - {"fallbacks", SVGA_QUERY_FALLBACKS, {0}}, - {"memory-used", SVGA_QUERY_MEMORY_USED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES} + /* per-frame counters */ + {"num-draw-calls", SVGA_QUERY_NUM_DRAW_CALLS, {0}}, + {"num-fallbacks", SVGA_QUERY_NUM_FALLBACKS, {0}}, + {"num-flushes", SVGA_QUERY_NUM_FLUSHES, {0}}, + {"num-validations", SVGA_QUERY_NUM_VALIDATIONS, {0}}, + {"map-buffer-time", SVGA_QUERY_MAP_BUFFER_TIME, {0}, + PIPE_DRIVER_QUERY_TYPE_MICROSECONDS}, + {"num-resources-mapped", SVGA_QUERY_NUM_RESOURCES_MAPPED, {0}}, + + /* running total counters */ + {"memory-used", SVGA_QUERY_MEMORY_USED, {0}, + PIPE_DRIVER_QUERY_TYPE_BYTES}, + {"num-shaders", SVGA_QUERY_NUM_SHADERS, {0}}, + {"num-resources", SVGA_QUERY_NUM_RESOURCES, {0}}, + {"num-state-objects", SVGA_QUERY_NUM_STATE_OBJECTS, {0}}, + {"num-surface-views", SVGA_QUERY_NUM_SURFACE_VIEWS, {0}}, }; if (!info) diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h index 5581d2e1ffd..98b56b2a6d1 100644 --- a/src/gallium/drivers/svga/svga_screen.h +++ b/src/gallium/drivers/svga/svga_screen.h @@ -80,8 +80,12 @@ struct svga_screen struct svga_host_surface_cache cache; - /** Memory used by all resources (buffers and surfaces) */ - uint64_t total_resource_bytes; + /** HUD counters */ + struct { + /** Memory used by all resources (buffers and surfaces) */ + uint64_t total_resource_bytes; + uint64_t num_resources; + } hud; }; #ifndef DEBUG diff --git a/src/gallium/drivers/svga/svga_shader.c 
b/src/gallium/drivers/svga/svga_shader.c index 7b847558db1..5c99e16d976 100644 --- a/src/gallium/drivers/svga/svga_shader.c +++ b/src/gallium/drivers/svga/svga_shader.c @@ -417,6 +417,7 @@ svga_set_shader(struct svga_context *svga, struct svga_shader_variant * svga_new_shader_variant(struct svga_context *svga) { + svga->hud.num_shaders++; return CALLOC_STRUCT(svga_shader_variant); } @@ -462,6 +463,8 @@ svga_destroy_shader_variant(struct svga_context *svga, FREE((unsigned *)variant->tokens); FREE(variant); + svga->hud.num_shaders--; + return ret; } diff --git a/src/gallium/drivers/svga/svga_state.c b/src/gallium/drivers/svga/svga_state.c index 37d16dc9afe..722b369fd4b 100644 --- a/src/gallium/drivers/svga/svga_state.c +++ b/src/gallium/drivers/svga/svga_state.c @@ -225,6 +225,9 @@ svga_update_state(struct svga_context *svga, unsigned max_level) svga->state.dirty[i] |= svga->dirty; svga->dirty = 0; + + svga->hud.num_validations++; + return PIPE_OK; } diff --git a/src/gallium/drivers/svga/svga_surface.c b/src/gallium/drivers/svga/svga_surface.c index aca5abcdfce..9f09311116e 100644 --- a/src/gallium/drivers/svga/svga_surface.c +++ b/src/gallium/drivers/svga/svga_surface.c @@ -317,6 +317,8 @@ svga_create_surface_view(struct pipe_context *pipe, s->real_level = surf_tmpl->u.tex.level; } + svga->hud.num_surface_views++; + return &s->base; } @@ -509,6 +511,8 @@ svga_surface_destroy(struct pipe_context *pipe, pipe_resource_reference(&surf->texture, NULL); FREE(surf); + + svga->hud.num_surface_views--; } From efe37519b0cb0cadea57455d1d2457af09e2e7dd Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 12 Oct 2015 18:40:27 -0600 Subject: [PATCH 179/270] svga: only count hardware buffer mappings for HUD Don't count client memory buffer mappings since they're basically free. 
Reviewed-by: Charmaine Lee --- src/gallium/drivers/svga/svga_resource_buffer.c | 1 - src/gallium/drivers/svga/svga_resource_buffer.h | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c index 6a8fff454e4..71f2f4f2779 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer.c +++ b/src/gallium/drivers/svga/svga_resource_buffer.c @@ -247,7 +247,6 @@ svga_buffer_transfer_map(struct pipe_context *pipe, } svga->hud.map_buffer_time += (os_time_get() - begin); - svga->hud.num_resources_mapped++; return map; } diff --git a/src/gallium/drivers/svga/svga_resource_buffer.h b/src/gallium/drivers/svga/svga_resource_buffer.h index 75e12c3220c..0591f8960b9 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer.h +++ b/src/gallium/drivers/svga/svga_resource_buffer.h @@ -253,6 +253,9 @@ svga_buffer_hw_storage_map(struct svga_context *svga, unsigned flags, boolean *retry) { struct svga_winsys_screen *sws = svga_buffer_winsys_screen(sbuf); + + svga->hud.num_resources_mapped++; + if (sws->have_gb_objects) { return svga->swc->surface_map(svga->swc, sbuf->handle, flags, retry); } else { From d4ecc2bce458c28355f7ecdea72b619578a69fd5 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 9 Oct 2015 10:36:39 +0200 Subject: [PATCH 180/270] nvc0: remove useless call to query_get_cfg() in nvc0_hw_sm_query_end() Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index 3bdb90a8d7a..8e2239fd146 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -439,9 +439,6 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) const uint block[3] = { 32, is_nve4 
? 4 : 1, 1 }; const uint grid[3] = { screen->mp_count, 1, 1 }; unsigned c; - const struct nvc0_hw_sm_query_cfg *cfg; - - cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq); if (unlikely(!screen->pm.prog)) { struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); @@ -495,6 +492,7 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) PUSH_SPACE(push, 16); mask = 0; for (c = 0; c < 8; ++c) { + const struct nvc0_hw_sm_query_cfg *cfg; unsigned i; hsq = screen->pm.mp_counter[c]; From dab7e0ed09073b0fd7c37d0fabd70f302acbce32 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 9 Oct 2015 10:45:04 +0200 Subject: [PATCH 181/270] nvc0: split out begin_query() hook used by MP counters The way we configure MP performance counters is going to be pretty different between Fermi and Kepler. Having two separate functions is much better. Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 112 ++++++++++++++---- 1 file changed, 86 insertions(+), 26 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index 8e2239fd146..f83966a6c74 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -338,11 +338,10 @@ nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) } static boolean -nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { struct nvc0_screen *screen = nvc0->screen; struct nouveau_pushbuf *push = nvc0->base.pushbuf; - const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); const struct nvc0_hw_sm_query_cfg *cfg; unsigned i, c; @@ -361,7 +360,7 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) } assert(cfg->num_counters <= 4); - PUSH_SPACE(push, 4 * 8 * 
(is_nve4 ? 1 : 6) + 6); + PUSH_SPACE(push, 4 * 8 * + 6); if (!screen->pm.mp_counters_enabled) { screen->pm.mp_counters_enabled = true; @@ -396,32 +395,93 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */ /* configure and reset the counter(s) */ - if (is_nve4) { - if (d == 0) - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1); - else - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1); - PUSH_DATA (push, cfg->ctr[i].sig_sel); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1); - PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3)); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1); - PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1); - PUSH_DATA (push, 0); - } else { - unsigned s; + if (d == 0) + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1); + else + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1); + PUSH_DATA (push, cfg->ctr[i].sig_sel); + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1); + PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3)); + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1); + PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); + BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1); + PUSH_DATA (push, 0); + } + return true; +} - for (s = 0; s < cfg->ctr[i].num_src; s++) { - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1); - PUSH_DATA (push, cfg->ctr[i].sig_sel); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1); - PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1); - PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1); - PUSH_DATA (push, 0); +static boolean +nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_pushbuf 
*push = nvc0->base.pushbuf; + struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); + const struct nvc0_hw_sm_query_cfg *cfg; + unsigned i, c; + unsigned num_ab[2] = { 0, 0 }; + + if (screen->base.class_3d >= NVE4_3D_CLASS) + return nve4_hw_sm_begin_query(nvc0, hq); + + cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq); + + /* check if we have enough free counter slots */ + for (i = 0; i < cfg->num_counters; ++i) + num_ab[cfg->ctr[i].sig_dom]++; + + if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 || + screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) { + NOUVEAU_ERR("Not enough free MP counter slots !\n"); + return false; + } + + assert(cfg->num_counters <= 4); + PUSH_SPACE(push, 4 * 8 * 6 + 6); + + if (!screen->pm.mp_counters_enabled) { + screen->pm.mp_counters_enabled = true; + BEGIN_NVC0(push, SUBC_SW(0x06ac), 1); + PUSH_DATA (push, 0x1fcb); + } + + /* set sequence field to 0 (used to check if result is available) */ + for (i = 0; i < screen->mp_count; ++i) + hq->data[i * 10 + 10] = 0; + hq->sequence++; + + for (i = 0; i < cfg->num_counters; ++i) { + const unsigned d = cfg->ctr[i].sig_dom; + unsigned s; + + if (!screen->pm.num_hw_sm_active[d]) { + uint32_t m = (1 << 22) | (1 << (7 + (8 * !d))); + if (screen->pm.num_hw_sm_active[!d]) + m |= 1 << (7 + (8 * d)); + BEGIN_NVC0(push, SUBC_SW(0x0600), 1); + PUSH_DATA (push, m); + } + screen->pm.num_hw_sm_active[d]++; + + for (c = d * 4; c < (d * 4 + 4); ++c) { + if (!screen->pm.mp_counter[c]) { + hsq->ctr[i] = c; + screen->pm.mp_counter[c] = hsq; + break; } } + assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */ + + /* configure and reset the counter(s) */ + for (s = 0; s < cfg->ctr[i].num_src; s++) { + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1); + PUSH_DATA (push, cfg->ctr[i].sig_sel); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1); + PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1); + PUSH_DATA (push, (cfg->ctr[i].func << 4) | 
cfg->ctr[i].mode); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1); + PUSH_DATA (push, 0); + } } return true; } From c3570c3fb9ee78ab7af2de781a7056b3642bebb9 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 9 Oct 2015 10:54:15 +0200 Subject: [PATCH 182/270] nvc0: rip off the kepler MP-enabling logic from the Fermi codepath Writing 0x1fcb to 0x419eac is definitely not related to MP counters and has no effect on Fermi (although this enables MP counters on Kepler). Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index f83966a6c74..a74bfee60e6 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -436,13 +436,7 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) } assert(cfg->num_counters <= 4); - PUSH_SPACE(push, 4 * 8 * 6 + 6); - - if (!screen->pm.mp_counters_enabled) { - screen->pm.mp_counters_enabled = true; - BEGIN_NVC0(push, SUBC_SW(0x06ac), 1); - PUSH_DATA (push, 0x1fcb); - } + PUSH_SPACE(push, 4 * 8 * 6 + 4); /* set sequence field to 0 (used to check if result is available) */ for (i = 0; i < screen->mp_count; ++i) From 409658c367c50bfb2fae6dec7d5b0ab5db16fe57 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 9 Oct 2015 10:55:25 +0200 Subject: [PATCH 183/270] nvc0: correctly enable the MP counters' multiplexer on Fermi Writing 0x408000 to 0x419e00 (like on Kepler) has no effect on Fermi because we only have one domain of 8 counters. Instead, we have to write 0x80000000. 
Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index a74bfee60e6..f7b49da8d43 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -448,11 +448,8 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) unsigned s; if (!screen->pm.num_hw_sm_active[d]) { - uint32_t m = (1 << 22) | (1 << (7 + (8 * !d))); - if (screen->pm.num_hw_sm_active[!d]) - m |= 1 << (7 + (8 * d)); BEGIN_NVC0(push, SUBC_SW(0x0600), 1); - PUSH_DATA (push, m); + PUSH_DATA (push, 0x80000000); } screen->pm.num_hw_sm_active[d]++; From cac897197b4ab021e06ed8b023f11035557be55e Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 9 Oct 2015 15:33:23 +0200 Subject: [PATCH 184/270] nvc0: fix sequence field init for MP counters on Fermi Sequence fields are located at MP[i] + 0x20 in the buffer object. This is used to check if result is available for MP[i]. 
Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index f7b49da8d43..b810d254c21 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -439,8 +439,10 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) PUSH_SPACE(push, 4 * 8 * 6 + 4); /* set sequence field to 0 (used to check if result is available) */ - for (i = 0; i < screen->mp_count; ++i) - hq->data[i * 10 + 10] = 0; + for (i = 0; i < screen->mp_count; ++i) { + const unsigned b = (0x24 / 4) * i; + hq->data[b + 8] = 0; + } hq->sequence++; for (i = 0; i < cfg->num_counters; ++i) { From 6353f620cdb86009812ad2e850b79374e85ecec2 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 9 Oct 2015 11:01:56 +0200 Subject: [PATCH 185/270] nvc0: allow to use 8 MP counters on Fermi On Fermi, we have one domain of 8 MP counters while we have two domains of 4 MP counters on Kepler. 
Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 30 ++++++++----------- .../drivers/nouveau/nvc0/nvc0_query_hw_sm.h | 2 +- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index b810d254c21..20606622174 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -144,7 +144,7 @@ struct nvc0_hw_sm_counter_cfg struct nvc0_hw_sm_query_cfg { - struct nvc0_hw_sm_counter_cfg ctr[4]; + struct nvc0_hw_sm_counter_cfg ctr[8]; uint8_t num_counters; uint8_t op; uint8_t norm[2]; /* normalization num,denom */ @@ -418,7 +418,6 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); const struct nvc0_hw_sm_query_cfg *cfg; unsigned i, c; - unsigned num_ab[2] = { 0, 0 }; if (screen->base.class_3d >= NVE4_3D_CLASS) return nve4_hw_sm_begin_query(nvc0, hq); @@ -426,17 +425,13 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq); /* check if we have enough free counter slots */ - for (i = 0; i < cfg->num_counters; ++i) - num_ab[cfg->ctr[i].sig_dom]++; - - if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 || - screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) { + if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) { NOUVEAU_ERR("Not enough free MP counter slots !\n"); return false; } - assert(cfg->num_counters <= 4); - PUSH_SPACE(push, 4 * 8 * 6 + 4); + assert(cfg->num_counters <= 8); + PUSH_SPACE(push, 4 * 8 * 6 + 2); /* set sequence field to 0 (used to check if result is available) */ for (i = 0; i < screen->mp_count; ++i) { @@ -446,23 +441,21 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) hq->sequence++; for (i = 0; i < cfg->num_counters; ++i) { - const unsigned d = 
cfg->ctr[i].sig_dom; unsigned s; - if (!screen->pm.num_hw_sm_active[d]) { + if (!screen->pm.num_hw_sm_active[0]) { BEGIN_NVC0(push, SUBC_SW(0x0600), 1); PUSH_DATA (push, 0x80000000); } - screen->pm.num_hw_sm_active[d]++; + screen->pm.num_hw_sm_active[0]++; - for (c = d * 4; c < (d * 4 + 4); ++c) { + for (c = 0; c < 8; ++c) { if (!screen->pm.mp_counter[c]) { hsq->ctr[i] = c; screen->pm.mp_counter[c] = hsq; break; } } - assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */ /* configure and reset the counter(s) */ for (s = 0; s < cfg->ctr[i].num_src; s++) { @@ -522,7 +515,8 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) /* release counters for this query */ for (c = 0; c < 8; ++c) { if (screen->pm.mp_counter[c] == hsq) { - screen->pm.num_hw_sm_active[c / 4]--; + uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */ + screen->pm.num_hw_sm_active[d]--; screen->pm.mp_counter[c] = NULL; } } @@ -568,7 +562,7 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) } static inline bool -nvc0_hw_sm_query_read_data(uint32_t count[32][4], +nvc0_hw_sm_query_read_data(uint32_t count[32][8], struct nvc0_context *nvc0, bool wait, struct nvc0_hw_query *hq, const struct nvc0_hw_sm_query_cfg *cfg, @@ -594,7 +588,7 @@ nvc0_hw_sm_query_read_data(uint32_t count[32][4], } static inline bool -nve4_hw_sm_query_read_data(uint32_t count[32][4], +nve4_hw_sm_query_read_data(uint32_t count[32][8], struct nvc0_context *nvc0, bool wait, struct nvc0_hw_query *hq, const struct nvc0_hw_sm_query_cfg *cfg, @@ -640,7 +634,7 @@ static boolean nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq, boolean wait, union pipe_query_result *result) { - uint32_t count[32][4]; + uint32_t count[32][8]; uint64_t value = 0; unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32); unsigned p, c; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h 
b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h index bab6f34afc8..0ad8a91ee6d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h @@ -5,7 +5,7 @@ struct nvc0_hw_sm_query { struct nvc0_hw_query base; - int8_t ctr[4]; + uint8_t ctr[8]; }; static inline struct nvc0_hw_sm_query * From 4fcb661711c19a6c315267733e4cdcaee9f5e95b Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 9 Oct 2015 11:14:31 +0200 Subject: [PATCH 186/270] nvc0: fix queries which use multiple MP counters on Fermi Queries which use more than one MP counter were misconfigured and computing the final result was also wrong because sources need to be configured on different hardware counters instead. According to the blob, computing the result is now as follows: FOR i..n val += ctr[i] * pow(2, i) Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 128 +++++++++++------- 1 file changed, 81 insertions(+), 47 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index 20606622174..99e907388d4 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -131,7 +131,7 @@ struct nvc0_hw_sm_counter_cfg uint32_t num_src : 3; /* number of sources (1 - 6, only for NVC0:NVE4) */ uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */ uint32_t sig_sel : 8; /* signal group */ - uint64_t src_sel; /* signal selection for up to 6 sources (48 bit) */ + uint32_t src_sel; /* signal selection for up to 4 sources */ }; #define NVC0_COUNTER_OPn_SUM 0 @@ -280,44 +280,82 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = 0x8000000000001de7ULL }; -#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL <<
32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } } +#define _C(f, o, g, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, 0, g, s } +#define _Q(n, c, ...) [NVC0_HW_SM_QUERY_##n] = { \ + { __VA_ARGS__ }, c, NVC0_COUNTER_OPn_SUM, { 1, 1 }, \ +} static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] = { - _Q(ACTIVE_CYCLES, 0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(ACTIVE_WARPS, 0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), - _Q(ATOM_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(BRANCH, 0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00), - _Q(DIVERGENT_BRANCH, 0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00), - _Q(GLD_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(GRED_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(GST_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_EXECUTED, 0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00), - _Q(INST_ISSUED1_0, 0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED1_1, 0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED2_0, 0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(INST_ISSUED2_1, 0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(LOCAL_LD, 0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(LOCAL_ST, 0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_0, 0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_1, 0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_2, 0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_3, 0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_4, 0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_5, 0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 
0x00, 0x00), - _Q(PROF_TRIGGER_6, 0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(PROF_TRIGGER_7, 0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(SHARED_LD, 0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(SHARED_ST, 0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00), - _Q(THREADS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65), - _Q(TH_INST_EXECUTED_0, 0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_1, 0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_2, 0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(TH_INST_EXECUTED_3, 0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55), - _Q(WARPS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00), + _Q(ACTIVE_CYCLES, 1, _C(0xaaaa, LOGOP, 0x11, 0x00000000)), + _Q(ACTIVE_WARPS, 6, _C(0xaaaa, LOGOP, 0x24, 0x00000010), + _C(0xaaaa, LOGOP, 0x24, 0x00000021), + _C(0xaaaa, LOGOP, 0x24, 0x00000032), + _C(0xaaaa, LOGOP, 0x24, 0x00000043), + _C(0xaaaa, LOGOP, 0x24, 0x00000054), + _C(0xaaaa, LOGOP, 0x24, 0x00000065)), + _Q(ATOM_COUNT, 1, _C(0xaaaa, LOGOP, 0x63, 0x00000030)), + _Q(BRANCH, 2, _C(0xaaaa, LOGOP, 0x1a, 0x00000000), + _C(0xaaaa, LOGOP, 0x1a, 0x00000011)), + _Q(DIVERGENT_BRANCH, 2, _C(0xaaaa, LOGOP, 0x19, 0x00000020), + _C(0xaaaa, LOGOP, 0x19, 0x00000031)), + _Q(GLD_REQUEST, 1, _C(0xaaaa, LOGOP, 0x64, 0x00000030)), + _Q(GRED_COUNT, 1, _C(0xaaaa, LOGOP, 0x63, 0x00000040)), + _Q(GST_REQUEST, 1, _C(0xaaaa, LOGOP, 0x64, 0x00000060)), + _Q(INST_EXECUTED, 3, _C(0xaaaa, LOGOP, 0x2d, 0x00000000), + _C(0xaaaa, LOGOP, 0x2d, 0x00000011), + _C(0xaaaa, LOGOP, 0x2d, 0x00000022)), + _Q(INST_ISSUED1_0, 1, _C(0xaaaa, LOGOP, 0x7e, 0x00000010)), + _Q(INST_ISSUED1_1, 1, _C(0xaaaa, LOGOP, 0x7e, 0x00000040)), + _Q(INST_ISSUED2_0, 1, _C(0xaaaa, LOGOP, 0x7e, 0x00000020)), + _Q(INST_ISSUED2_1, 1, _C(0xaaaa, LOGOP, 0x7e, 0x00000050)), + _Q(LOCAL_LD, 1, 
_C(0xaaaa, LOGOP, 0x64, 0x00000020)), + _Q(LOCAL_ST, 1, _C(0xaaaa, LOGOP, 0x64, 0x00000050)), + _Q(PROF_TRIGGER_0, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000000)), + _Q(PROF_TRIGGER_1, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000010)), + _Q(PROF_TRIGGER_2, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000020)), + _Q(PROF_TRIGGER_3, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000030)), + _Q(PROF_TRIGGER_4, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000040)), + _Q(PROF_TRIGGER_5, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000050)), + _Q(PROF_TRIGGER_6, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000060)), + _Q(PROF_TRIGGER_7, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000070)), + _Q(SHARED_LD, 1, _C(0xaaaa, LOGOP, 0x64, 0x00000010)), + _Q(SHARED_ST, 1, _C(0xaaaa, LOGOP, 0x64, 0x00000040)), + _Q(THREADS_LAUNCHED, 6, _C(0xaaaa, LOGOP, 0x26, 0x00000010), + _C(0xaaaa, LOGOP, 0x26, 0x00000021), + _C(0xaaaa, LOGOP, 0x26, 0x00000032), + _C(0xaaaa, LOGOP, 0x26, 0x00000043), + _C(0xaaaa, LOGOP, 0x26, 0x00000054), + _C(0xaaaa, LOGOP, 0x26, 0x00000065)), + _Q(TH_INST_EXECUTED_0, 6, _C(0xaaaa, LOGOP, 0xa3, 0x00000000), + _C(0xaaaa, LOGOP, 0xa3, 0x00000011), + _C(0xaaaa, LOGOP, 0xa3, 0x00000022), + _C(0xaaaa, LOGOP, 0xa3, 0x00000033), + _C(0xaaaa, LOGOP, 0xa3, 0x00000044), + _C(0xaaaa, LOGOP, 0xa3, 0x00000055)), + _Q(TH_INST_EXECUTED_1, 6, _C(0xaaaa, LOGOP, 0xa5, 0x00000000), + _C(0xaaaa, LOGOP, 0xa5, 0x00000011), + _C(0xaaaa, LOGOP, 0xa5, 0x00000022), + _C(0xaaaa, LOGOP, 0xa5, 0x00000033), + _C(0xaaaa, LOGOP, 0xa5, 0x00000044), + _C(0xaaaa, LOGOP, 0xa5, 0x00000055)), + _Q(TH_INST_EXECUTED_2, 6, _C(0xaaaa, LOGOP, 0xa4, 0x00000000), + _C(0xaaaa, LOGOP, 0xa4, 0x00000011), + _C(0xaaaa, LOGOP, 0xa4, 0x00000022), + _C(0xaaaa, LOGOP, 0xa4, 0x00000033), + _C(0xaaaa, LOGOP, 0xa4, 0x00000044), + _C(0xaaaa, LOGOP, 0xa4, 0x00000055)), + _Q(TH_INST_EXECUTED_3, 6, _C(0xaaaa, LOGOP, 0xa6, 0x00000000), + _C(0xaaaa, LOGOP, 0xa6, 0x00000011), + _C(0xaaaa, LOGOP, 0xa6, 0x00000022), + _C(0xaaaa, LOGOP, 0xa6, 0x00000033), + _C(0xaaaa, LOGOP, 0xa6, 0x00000044), + _C(0xaaaa, LOGOP, 0xa6, 
0x00000055)), + _Q(WARPS_LAUNCHED, 1, _C(0xaaaa, LOGOP, 0x26, 0x00000000)), }; #undef _Q +#undef _C static const struct nvc0_hw_sm_query_cfg * nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) @@ -431,7 +469,7 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) } assert(cfg->num_counters <= 8); - PUSH_SPACE(push, 4 * 8 * 6 + 2); + PUSH_SPACE(push, 8 * 8 + 2); /* set sequence field to 0 (used to check if result is available) */ for (i = 0; i < screen->mp_count; ++i) { @@ -441,8 +479,6 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) hq->sequence++; for (i = 0; i < cfg->num_counters; ++i) { - unsigned s; - if (!screen->pm.num_hw_sm_active[0]) { BEGIN_NVC0(push, SUBC_SW(0x0600), 1); PUSH_DATA (push, 0x80000000); @@ -458,16 +494,14 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) } /* configure and reset the counter(s) */ - for (s = 0; s < cfg->ctr[i].num_src; s++) { - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1); - PUSH_DATA (push, cfg->ctr[i].sig_sel); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1); - PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1); - PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); - BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1); - PUSH_DATA (push, 0); - } + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(c)), 1); + PUSH_DATA (push, cfg->ctr[i].sig_sel); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(c)), 1); + PUSH_DATA (push, cfg->ctr[i].src_sel); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 1); + PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); + BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(c)), 1); + PUSH_DATA (push, 0); } return true; } @@ -581,7 +615,7 @@ nvc0_hw_sm_query_read_data(uint32_t count[32][8], if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client)) return false; } - count[p][c] = hq->data[b + hsq->ctr[c]]; + count[p][c] = 
hq->data[b + hsq->ctr[c]] * (1 << c); } } return true; From 7abd707251f29aaf27f83644e47d2dc8b75e10c6 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 9 Oct 2015 11:18:45 +0200 Subject: [PATCH 187/270] nvc0: fix monitoring multiple MP counters queries on Fermi For strange reasons, the signal id depends on the slot selected on Fermi but not on Kepler. Fortunately, the signal ids are just offset by the slot id! Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 163 ++++++++++-------- 1 file changed, 87 insertions(+), 76 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index 99e907388d4..2f827b0a944 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -128,9 +128,9 @@ struct nvc0_hw_sm_counter_cfg { uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */ uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */ - uint32_t num_src : 3; /* number of sources (1 - 6, only for NVC0:NVE4) */ uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */ uint32_t sig_sel : 8; /* signal group */ + uint32_t src_mask; /* mask for signal selection (only for NVC0:NVE4) */ uint32_t src_sel; /* signal selection for up to 4 sources */ }; @@ -150,19 +150,19 @@ struct nvc0_hw_sm_query_cfg uint8_t norm[2]; /* normalization num,denom */ }; -#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } +#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0,
NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } +#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } } #define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ - { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \ + { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, 0, s1 }, \ {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } #define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \ - { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, 0, s0 }, \ + { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \ {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } #define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \ - { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \ - { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \ + { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \ + { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \ {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } } /* NOTES: @@ -280,78 +280,78 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = 0x8000000000001de7ULL }; -#define _C(f, o, g, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 
0, 0, g, s } +#define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s } #define _Q(n, c, ...) [NVC0_HW_SM_QUERY_##n] = { \ { __VA_ARGS__ }, c, NVC0_COUNTER_OPn_SUM, { 1, 1 }, \ } static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] = { - _Q(ACTIVE_CYCLES, 1, _C(0xaaaa, LOGOP, 0x11, 0x00000000)), - _Q(ACTIVE_WARPS, 6, _C(0xaaaa, LOGOP, 0x24, 0x00000010), - _C(0xaaaa, LOGOP, 0x24, 0x00000021), - _C(0xaaaa, LOGOP, 0x24, 0x00000032), - _C(0xaaaa, LOGOP, 0x24, 0x00000043), - _C(0xaaaa, LOGOP, 0x24, 0x00000054), - _C(0xaaaa, LOGOP, 0x24, 0x00000065)), - _Q(ATOM_COUNT, 1, _C(0xaaaa, LOGOP, 0x63, 0x00000030)), - _Q(BRANCH, 2, _C(0xaaaa, LOGOP, 0x1a, 0x00000000), - _C(0xaaaa, LOGOP, 0x1a, 0x00000011)), - _Q(DIVERGENT_BRANCH, 2, _C(0xaaaa, LOGOP, 0x19, 0x00000020), - _C(0xaaaa, LOGOP, 0x19, 0x00000031)), - _Q(GLD_REQUEST, 1, _C(0xaaaa, LOGOP, 0x64, 0x00000030)), - _Q(GRED_COUNT, 1, _C(0xaaaa, LOGOP, 0x63, 0x00000040)), - _Q(GST_REQUEST, 1, _C(0xaaaa, LOGOP, 0x64, 0x00000060)), - _Q(INST_EXECUTED, 3, _C(0xaaaa, LOGOP, 0x2d, 0x00000000), - _C(0xaaaa, LOGOP, 0x2d, 0x00000011), - _C(0xaaaa, LOGOP, 0x2d, 0x00000022)), - _Q(INST_ISSUED1_0, 1, _C(0xaaaa, LOGOP, 0x7e, 0x00000010)), - _Q(INST_ISSUED1_1, 1, _C(0xaaaa, LOGOP, 0x7e, 0x00000040)), - _Q(INST_ISSUED2_0, 1, _C(0xaaaa, LOGOP, 0x7e, 0x00000020)), - _Q(INST_ISSUED2_1, 1, _C(0xaaaa, LOGOP, 0x7e, 0x00000050)), - _Q(LOCAL_LD, 1, _C(0xaaaa, LOGOP, 0x64, 0x00000020)), - _Q(LOCAL_ST, 1, _C(0xaaaa, LOGOP, 0x64, 0x00000050)), - _Q(PROF_TRIGGER_0, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000000)), - _Q(PROF_TRIGGER_1, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000010)), - _Q(PROF_TRIGGER_2, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000020)), - _Q(PROF_TRIGGER_3, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000030)), - _Q(PROF_TRIGGER_4, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000040)), - _Q(PROF_TRIGGER_5, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000050)), - _Q(PROF_TRIGGER_6, 1, _C(0xaaaa, LOGOP, 0x01, 0x00000060)), - _Q(PROF_TRIGGER_7, 1, _C(0xaaaa, LOGOP, 0x01, 
0x00000070)), - _Q(SHARED_LD, 1, _C(0xaaaa, LOGOP, 0x64, 0x00000010)), - _Q(SHARED_ST, 1, _C(0xaaaa, LOGOP, 0x64, 0x00000040)), - _Q(THREADS_LAUNCHED, 6, _C(0xaaaa, LOGOP, 0x26, 0x00000010), - _C(0xaaaa, LOGOP, 0x26, 0x00000021), - _C(0xaaaa, LOGOP, 0x26, 0x00000032), - _C(0xaaaa, LOGOP, 0x26, 0x00000043), - _C(0xaaaa, LOGOP, 0x26, 0x00000054), - _C(0xaaaa, LOGOP, 0x26, 0x00000065)), - _Q(TH_INST_EXECUTED_0, 6, _C(0xaaaa, LOGOP, 0xa3, 0x00000000), - _C(0xaaaa, LOGOP, 0xa3, 0x00000011), - _C(0xaaaa, LOGOP, 0xa3, 0x00000022), - _C(0xaaaa, LOGOP, 0xa3, 0x00000033), - _C(0xaaaa, LOGOP, 0xa3, 0x00000044), - _C(0xaaaa, LOGOP, 0xa3, 0x00000055)), - _Q(TH_INST_EXECUTED_1, 6, _C(0xaaaa, LOGOP, 0xa5, 0x00000000), - _C(0xaaaa, LOGOP, 0xa5, 0x00000011), - _C(0xaaaa, LOGOP, 0xa5, 0x00000022), - _C(0xaaaa, LOGOP, 0xa5, 0x00000033), - _C(0xaaaa, LOGOP, 0xa5, 0x00000044), - _C(0xaaaa, LOGOP, 0xa5, 0x00000055)), - _Q(TH_INST_EXECUTED_2, 6, _C(0xaaaa, LOGOP, 0xa4, 0x00000000), - _C(0xaaaa, LOGOP, 0xa4, 0x00000011), - _C(0xaaaa, LOGOP, 0xa4, 0x00000022), - _C(0xaaaa, LOGOP, 0xa4, 0x00000033), - _C(0xaaaa, LOGOP, 0xa4, 0x00000044), - _C(0xaaaa, LOGOP, 0xa4, 0x00000055)), - _Q(TH_INST_EXECUTED_3, 6, _C(0xaaaa, LOGOP, 0xa6, 0x00000000), - _C(0xaaaa, LOGOP, 0xa6, 0x00000011), - _C(0xaaaa, LOGOP, 0xa6, 0x00000022), - _C(0xaaaa, LOGOP, 0xa6, 0x00000033), - _C(0xaaaa, LOGOP, 0xa6, 0x00000044), - _C(0xaaaa, LOGOP, 0xa6, 0x00000055)), - _Q(WARPS_LAUNCHED, 1, _C(0xaaaa, LOGOP, 0x26, 0x00000000)), + _Q(ACTIVE_CYCLES, 1, _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000)), + _Q(ACTIVE_WARPS, 6, _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010), + _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020), + _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030), + _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040), + _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050), + _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060)), + _Q(ATOM_COUNT, 1, _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030)), + _Q(BRANCH, 2, _C(0xaaaa, LOGOP, 
0x1a, 0x000000ff, 0x00000000), + _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010)), + _Q(DIVERGENT_BRANCH, 2, _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020), + _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030)), + _Q(GLD_REQUEST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030)), + _Q(GRED_COUNT, 1, _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040)), + _Q(GST_REQUEST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060)), + _Q(INST_EXECUTED, 3, _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000), + _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010), + _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020)), + _Q(INST_ISSUED1_0, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010)), + _Q(INST_ISSUED1_1, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040)), + _Q(INST_ISSUED2_0, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020)), + _Q(INST_ISSUED2_1, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050)), + _Q(LOCAL_LD, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020)), + _Q(LOCAL_ST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050)), + _Q(PROF_TRIGGER_0, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000)), + _Q(PROF_TRIGGER_1, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010)), + _Q(PROF_TRIGGER_2, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020)), + _Q(PROF_TRIGGER_3, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030)), + _Q(PROF_TRIGGER_4, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040)), + _Q(PROF_TRIGGER_5, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050)), + _Q(PROF_TRIGGER_6, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060)), + _Q(PROF_TRIGGER_7, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070)), + _Q(SHARED_LD, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010)), + _Q(SHARED_ST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040)), + _Q(THREADS_LAUNCHED, 6, _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010), + _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020), + _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030), + _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040), + _C(0xaaaa, 
LOGOP, 0x26, 0x000000ff, 0x00000050), + _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060)), + _Q(TH_INST_EXECUTED_0, 6, _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000), + _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010), + _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020), + _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030), + _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040), + _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050)), + _Q(TH_INST_EXECUTED_1, 6, _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000), + _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010), + _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020), + _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030), + _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040), + _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050)), + _Q(TH_INST_EXECUTED_2, 6, _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000), + _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010), + _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020), + _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030), + _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040), + _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050)), + _Q(TH_INST_EXECUTED_3, 6, _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000), + _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010), + _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020), + _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030), + _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040), + _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050)), + _Q(WARPS_LAUNCHED, 1, _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000)), }; #undef _Q @@ -479,6 +479,8 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) hq->sequence++; for (i = 0; i < cfg->num_counters; ++i) { + uint32_t mask_sel = 0x00000000; + if (!screen->pm.num_hw_sm_active[0]) { BEGIN_NVC0(push, SUBC_SW(0x0600), 1); PUSH_DATA (push, 0x80000000); @@ -493,11 +495,20 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) } } + /* Oddly-enough, the signal id depends on the slot selected on Fermi but + * not on 
Kepler. Fortunately, the signal ids are just offseted by the + * slot id! */ + mask_sel |= c; + mask_sel |= (c << 8); + mask_sel |= (c << 16); + mask_sel |= (c << 24); + mask_sel &= cfg->ctr[i].src_mask; + /* configure and reset the counter(s) */ BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(c)), 1); PUSH_DATA (push, cfg->ctr[i].sig_sel); BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(c)), 1); - PUSH_DATA (push, cfg->ctr[i].src_sel); + PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel); BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 1); PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(c)), 1); From c4896c99cbe10b829981250465baf0b00e18ba40 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 9 Oct 2015 11:22:20 +0200 Subject: [PATCH 188/270] nvc0: fix unaligned mem access when reading MP counters on Fermi Memory accesses have to be aligned to 128 bits. Note that this doesn't happen when the card only has TPC. This patch fixes the following dmesg fail: gr: GPC0/TPC1/MP trap: global 00000004 [MULTIPLE_WARP_ERRORS] warp 000f [UNALIGNED_MEM_ACCESS] Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index 2f827b0a944..8eb3b3ef14a 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -247,7 +247,7 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = * mov b32 $r11 c0[0x4] * ext u32 $r8 $r9 0x414 * (not $p0) exit - * mul $r8 u32 $r8 u32 36 + * mul $r8 u32 $r8 u32 48 * add b32 $r10 $c $r10 $r8 * add b32 $r11 $r11 0x0 $c * mov b32 $r8 c0[0x8] @@ -270,7 +270,7 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = 0x280040001002dde4ULL, 0x7000c01050921c03ULL, 0x80000000000021e7ULL, - 0x1000000090821c02ULL, +
0x10000000c0821c02ULL, 0x4801000020a29c03ULL, 0x0800000000b2dc42ULL, 0x2800400020021de4ULL, @@ -473,7 +473,7 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) /* set sequence field to 0 (used to check if result is available) */ for (i = 0; i < screen->mp_count; ++i) { - const unsigned b = (0x24 / 4) * i; + const unsigned b = (0x30 / 4) * i; hq->data[b + 8] = 0; } hq->sequence++; @@ -617,7 +617,7 @@ nvc0_hw_sm_query_read_data(uint32_t count[32][8], unsigned p, c; for (p = 0; p < mp_count; ++p) { - const unsigned b = (0x24 / 4) * p; + const unsigned b = (0x30 / 4) * p; for (c = 0; c < cfg->num_counters; ++c) { if (hq->data[b + 8] != hq->sequence) { @@ -815,7 +815,10 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) */ space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t); } else { - /* for each MP: + /* + * Note that padding is used to align memory access to 128 bits. + * + * for each MP: * [00] = MP.C0 * [04] = MP.C1 * [08] = MP.C2 @@ -825,8 +828,11 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) * [18] = MP.C6 * [1c] = MP.C7 * [20] = MP.sequence + * [24] = padding + * [28] = padding + * [2c] = padding */ - space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t); + space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t); } if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) { From 1825898e0471915673e572db4f61f1fd42461150 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 6 Oct 2015 22:24:31 +0200 Subject: [PATCH 189/270] nvc0: store the number of GPCs to nvc0_screen NOUVEAU_GETPARAM_GRAPH_UNITS param returns the number of GPCs, the total number of TPCs and the number of ROP units. Note that when the DRM version is too old the default number of GPCs is fixed to 4. This will be used to launch the compute kernel which is used to read MP performance counters over all GPCs. 
Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 1 + src/gallium/drivers/nouveau/nvc0/nvc0_screen.h | 1 + 2 files changed, 2 insertions(+) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index afd91e6feee..7f0ada0c032 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -914,6 +914,7 @@ nvc0_screen_create(struct nouveau_device *dev) else value = (16 << 8) | 4; } + screen->gpc_count = value & 0x000000ff; screen->mp_count = value >> 8; screen->mp_count_compute = screen->mp_count; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h index 8cf7560e21f..857eb0316c7 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h @@ -67,6 +67,7 @@ struct nvc0_screen { struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */ struct nouveau_bo *poly_cache; + uint8_t gpc_count; uint16_t mp_count; uint16_t mp_count_compute; /* magic reg can make compute use fewer MPs */ From cef22f3490f9809a6e77949f73448efac23be7ee Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 9 Oct 2015 16:10:19 +0200 Subject: [PATCH 190/270] nvc0: read MP counters of all GPCs on Fermi When a card has more than one GPC, the grid used by the compute kernel which reads MP performance counters seems to be too small. The consequence is that the kernel is not launched on all TPCs. Increasing the grid size using the number of GPCs now launches enough blocks and we can read MP performance counters of all TPCs. 
Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index 8eb3b3ef14a..0b4a36f57dd 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -528,7 +528,7 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) uint32_t mask; uint32_t input[3]; const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 }; - const uint grid[3] = { screen->mp_count, 1, 1 }; + const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 }; unsigned c; if (unlikely(!screen->pm.prog)) { From 8cd4b8478aac56f0ed516c4ff13f8af012fb8eaa Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Mon, 28 Sep 2015 17:29:37 +0200 Subject: [PATCH 191/270] nvc0: allow only one active query for the MP counters group Because we can't expose the number of hardware counters needed for each different query, we don't want to allow more than one active query simultaneously to avoid failure when the maximum number of counters is reached. Note that these groups of GPU counters are currently only used by AMD_performance_monitor. Like for Kepler, this limits the maximum number of active queries to 1 on Fermi. 
Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index f8d4ba16237..c81b85a1804 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -371,22 +371,20 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, info->name = "MP counters"; info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU; + /* Because we can't expose the number of hardware counters needed for + * each different query, we don't want to allow more than one active + * query simultaneously to avoid failure when the maximum number of + * counters is reached. Note that these groups of GPU counters are + * currently only used by AMD_performance_monitor. + */ + info->max_active_queries = 1; + if (screen->base.class_3d == NVE4_3D_CLASS) { info->num_queries = NVE4_HW_SM_QUERY_COUNT; - - /* On NVE4+, each multiprocessor have 8 hardware counters separated - * in two distinct domains, but we allow only one active query - * simultaneously because some of them use more than one hardware - * counter and this will result in an undefined behaviour. */ - info->max_active_queries = 1; /* TODO: handle multiple hw counters */ - return 1; + return 1; } else if (screen->base.class_3d < NVE4_3D_CLASS) { info->num_queries = NVC0_HW_SM_QUERY_COUNT; - - /* On NVC0:NVE4, each multiprocessor have 8 hardware counters - * in a single domain. */ - info->max_active_queries = 8; return 1; } } From 00d61869a5e8e8ecdb5613f1b2aab5019d71d77e Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 9 Oct 2015 16:53:18 +0200 Subject: [PATCH 192/270] nvc0: enable compute support by default on Fermi Compute support was not enabled by default because weird effects on 3D state happened, but I can't reproduce them anymore. 
This also enables MP performance counters by default on Fermi. Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 3 +-- src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 7 +------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index c81b85a1804..80f311be2e8 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -291,7 +291,6 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, count += NVE4_HW_SM_QUERY_COUNT; } else if (screen->base.class_3d < NVE4_3D_CLASS) { - /* NVC0_COMPUTE is not always enabled */ count += NVC0_HW_SM_QUERY_COUNT; } } @@ -358,7 +357,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, count++; } else if (screen->base.class_3d < NVE4_3D_CLASS) { - count++; /* NVC0_COMPUTE is not always enabled */ + count++; } } } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 7f0ada0c032..f34ad0ed5d1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -561,12 +561,7 @@ nvc0_screen_init_compute(struct nvc0_screen *screen) switch (screen->base.device->chipset & ~0xf) { case 0xc0: case 0xd0: - /* Using COMPUTE has weird effects on 3D state, we need to - * investigate this further before enabling it by default. 
- */ - if (debug_get_bool_option("NVC0_COMPUTE", false)) - return nvc0_screen_compute_setup(screen, screen->base.pushbuf); - return 0; + return nvc0_screen_compute_setup(screen, screen->base.pushbuf); case 0xe0: return nve4_screen_compute_setup(screen, screen->base.pushbuf); case 0xf0: From ec5001d25b281455869149bff5fa9d8c497b0cd4 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 13 Oct 2015 22:16:23 +0200 Subject: [PATCH 193/270] nvc0: move SW/HW queries info to their respective files This will help for handling HW SM queries variants on Fermi. Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 185 +----------------- .../drivers/nouveau/nvc0/nvc0_query_hw.c | 14 ++ .../drivers/nouveau/nvc0/nvc0_query_hw.h | 3 + .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 133 +++++++++++++ .../drivers/nouveau/nvc0/nvc0_query_hw_sm.h | 4 +- .../drivers/nouveau/nvc0/nvc0_query_sw.c | 64 ++++++ .../drivers/nouveau/nvc0/nvc0_query_sw.h | 3 + 7 files changed, 228 insertions(+), 178 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index 80f311be2e8..e4752e2dbc5 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -141,163 +141,19 @@ nvc0_render_condition(struct pipe_context *pipe, PUSH_DATA (push, hq->bo->offset + hq->offset); } -/* === DRIVER STATISTICS === */ - -#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - -static const char *nvc0_sw_query_drv_stat_names[] = -{ - "drv-tex_obj_current_count", - "drv-tex_obj_current_bytes", - "drv-buf_obj_current_count", - "drv-buf_obj_current_bytes_vid", - "drv-buf_obj_current_bytes_sys", - "drv-tex_transfers_rd", - "drv-tex_transfers_wr", - "drv-tex_copy_count", - "drv-tex_blit_count", - "drv-tex_cache_flush_count", - "drv-buf_transfers_rd", - "drv-buf_transfers_wr", - "drv-buf_read_bytes_staging_vid", - "drv-buf_write_bytes_direct", - 
"drv-buf_write_bytes_staging_vid", - "drv-buf_write_bytes_staging_sys", - "drv-buf_copy_bytes", - "drv-buf_non_kernel_fence_sync_count", - "drv-any_non_kernel_fence_sync_count", - "drv-query_sync_count", - "drv-gpu_serialize_count", - "drv-draw_calls_array", - "drv-draw_calls_indexed", - "drv-draw_calls_fallback_count", - "drv-user_buffer_upload_bytes", - "drv-constbuf_upload_count", - "drv-constbuf_upload_bytes", - "drv-pushbuf_count", - "drv-resource_validate_count" -}; - -#endif /* NOUVEAU_ENABLE_DRIVER_STATISTICS */ - -/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ - -/* NOTE: intentionally using the same names as NV */ -static const char *nve4_hw_sm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "global_ld_mem_divergence_replays", - "global_store_transaction", - "global_st_mem_divergence_replays", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued", - "inst_issued1", - "inst_issued2", - "l1_global_load_hit", - "l1_global_load_miss", - "l1_local_load_hit", - "l1_local_load_miss", - "l1_local_store_hit", - "l1_local_store_miss", - "l1_shared_load_transactions", - "l1_shared_store_transactions", - "local_load", - "local_load_transactions", - "local_store", - "local_store_transactions", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_load_replay", - "shared_store", - "shared_store_replay", - "sm_cta_launched", - "threads_launched", - "uncached_global_load_transaction", - "warps_launched", - /* metrics, i.e. 
functions of the MP counters */ - "metric-ipc", /* inst_executed, clock */ - "metric-ipac", /* inst_executed, active_cycles */ - "metric-ipec", /* inst_executed, (bool)inst_executed */ - "metric-achieved_occupancy", /* active_warps, active_cycles */ - "metric-sm_efficiency", /* active_cycles, clock */ - "metric-inst_replay_overhead" /* inst_issued, inst_executed */ -}; - -/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ -static const char *nvc0_hw_sm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued1_0", - "inst_issued1_1", - "inst_issued2_0", - "inst_issued2_1", - "local_load", - "local_store", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_store", - "threads_launched", - "thread_inst_executed_0", - "thread_inst_executed_1", - "thread_inst_executed_2", - "thread_inst_executed_3", - "warps_launched", -}; - int nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, unsigned id, struct pipe_driver_query_info *info) { struct nvc0_screen *screen = nvc0_screen(pscreen); - int count = 0; + int num_sw_queries = 0, num_hw_queries = 0; - count += NVC0_SW_QUERY_DRV_STAT_COUNT; - - if (screen->base.device->drm_version >= 0x01000101) { - if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += NVE4_HW_SM_QUERY_COUNT; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - count += NVC0_HW_SM_QUERY_COUNT; - } - } - } + num_sw_queries = nvc0_sw_get_driver_query_info(screen, 0, NULL); + num_hw_queries = nvc0_hw_get_driver_query_info(screen, 0, NULL); if (!info) - return count; + return num_sw_queries + num_hw_queries; /* Init default values. 
*/ info->name = "this_is_not_the_query_you_are_looking_for"; @@ -307,36 +163,11 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen, info->group_id = -1; #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS - if (id < NVC0_SW_QUERY_DRV_STAT_COUNT) { - info->name = nvc0_sw_query_drv_stat_names[id]; - info->query_type = NVC0_SW_QUERY_DRV_STAT(id); - info->max_value.u64 = 0; - if (strstr(info->name, "bytes")) - info->type = PIPE_DRIVER_QUERY_TYPE_BYTES; - info->group_id = NVC0_SW_QUERY_DRV_STAT_GROUP; - return 1; - } else + if (id < num_sw_queries) + return nvc0_sw_get_driver_query_info(screen, id, info); #endif - if (id < count) { - if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = nve4_hw_sm_query_names[id - NVC0_SW_QUERY_DRV_STAT_COUNT]; - info->query_type = NVE4_HW_SM_QUERY(id - NVC0_SW_QUERY_DRV_STAT_COUNT); - info->max_value.u64 = - (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100; - info->group_id = NVC0_HW_SM_QUERY_GROUP; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - info->name = nvc0_hw_sm_query_names[id - NVC0_SW_QUERY_DRV_STAT_COUNT]; - info->query_type = NVC0_HW_SM_QUERY(id - NVC0_SW_QUERY_DRV_STAT_COUNT); - info->group_id = NVC0_HW_SM_QUERY_GROUP; - return 1; - } - } - } - /* user asked for info about non-existing query */ - return 0; + + return nvc0_hw_get_driver_query_info(screen, id - num_sw_queries, info); } int diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c index 16a639e3c48..91254bedf1e 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -431,6 +431,20 @@ nvc0_hw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index) return q; } +int +nvc0_hw_get_driver_query_info(struct nvc0_screen *screen, unsigned id, + struct pipe_driver_query_info *info) +{ + int num_hw_sm_queries = 0; + + num_hw_sm_queries = nvc0_hw_sm_get_driver_query_info(screen, 
0, NULL); + + if (!info) + return num_hw_sm_queries; + + return nvc0_hw_sm_get_driver_query_info(screen, id, info); +} + void nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push, struct nvc0_query *q, unsigned result_offset) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h index d72d894cc5a..3701eb7100f 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h @@ -42,6 +42,9 @@ nvc0_hw_query(struct nvc0_query *q) struct nvc0_query * nvc0_hw_create_query(struct nvc0_context *, unsigned, unsigned); +int +nvc0_hw_get_driver_query_info(struct nvc0_screen *, unsigned, + struct pipe_driver_query_info *); bool nvc0_hw_query_allocate(struct nvc0_context *, struct nvc0_query *, int); void diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index 0b4a36f57dd..f4c1e52be38 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -32,6 +32,62 @@ /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ +/* NOTE: intentionally using the same names as NV */ +static const char *nve4_hw_sm_query_names[] = +{ + /* MP counters */ + "active_cycles", + "active_warps", + "atom_count", + "branch", + "divergent_branch", + "gld_request", + "global_ld_mem_divergence_replays", + "global_store_transaction", + "global_st_mem_divergence_replays", + "gred_count", + "gst_request", + "inst_executed", + "inst_issued", + "inst_issued1", + "inst_issued2", + "l1_global_load_hit", + "l1_global_load_miss", + "l1_local_load_hit", + "l1_local_load_miss", + "l1_local_store_hit", + "l1_local_store_miss", + "l1_shared_load_transactions", + "l1_shared_store_transactions", + "local_load", + "local_load_transactions", + "local_store", + "local_store_transactions", + "prof_trigger_00", + "prof_trigger_01", + "prof_trigger_02", + "prof_trigger_03", + 
"prof_trigger_04", + "prof_trigger_05", + "prof_trigger_06", + "prof_trigger_07", + "shared_load", + "shared_load_replay", + "shared_store", + "shared_store_replay", + "sm_cta_launched", + "threads_launched", + "uncached_global_load_transaction", + "warps_launched", + /* metrics, i.e. functions of the MP counters */ + "metric-ipc", /* inst_executed, clock */ + "metric-ipac", /* inst_executed, active_cycles */ + "metric-ipec", /* inst_executed, (bool)inst_executed */ + "metric-achieved_occupancy", /* active_warps, active_cycles */ + "metric-sm_efficiency", /* active_cycles, clock */ + "metric-inst_replay_overhead" /* inst_issued, inst_executed */ +}; + /* Code to read out MP counters: They are accessible via mmio, too, but let's * just avoid mapping registers in userspace. We'd have to know which MPs are * enabled/present, too, and that information is not presently exposed. @@ -230,6 +286,42 @@ static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = #undef _M2B /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ +static const char *nvc0_hw_sm_query_names[] = +{ + /* MP counters */ + "active_cycles", + "active_warps", + "atom_count", + "branch", + "divergent_branch", + "gld_request", + "gred_count", + "gst_request", + "inst_executed", + "inst_issued1_0", + "inst_issued1_1", + "inst_issued2_0", + "inst_issued2_1", + "local_load", + "local_store", + "prof_trigger_00", + "prof_trigger_01", + "prof_trigger_02", + "prof_trigger_03", + "prof_trigger_04", + "prof_trigger_05", + "prof_trigger_06", + "prof_trigger_07", + "shared_load", + "shared_store", + "threads_launched", + "thread_inst_executed_0", + "thread_inst_executed_1", + "thread_inst_executed_2", + "thread_inst_executed_3", + "warps_launched", +}; + static const uint64_t nvc0_read_hw_sm_counters_code[] = { /* mov b32 $r8 $tidx @@ -842,3 +934,44 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) return hq; } + +int +nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, 
unsigned id, + struct pipe_driver_query_info *info) +{ + int count = 0; + + if (screen->base.device->drm_version >= 0x01000101) { + if (screen->compute) { + if (screen->base.class_3d == NVE4_3D_CLASS) { + count += NVE4_HW_SM_QUERY_COUNT; + } else + if (screen->base.class_3d < NVE4_3D_CLASS) { + count += NVC0_HW_SM_QUERY_COUNT; + } + } + } + + if (!info) + return count; + + if (id < count) { + if (screen->compute) { + if (screen->base.class_3d == NVE4_3D_CLASS) { + info->name = nve4_hw_sm_query_names[id]; + info->query_type = NVE4_HW_SM_QUERY(id); + info->max_value.u64 = + (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100; + info->group_id = NVC0_HW_SM_QUERY_GROUP; + return 1; + } else + if (screen->base.class_3d < NVE4_3D_CLASS) { + info->name = nvc0_hw_sm_query_names[id]; + info->query_type = NVC0_HW_SM_QUERY(id); + info->group_id = NVC0_HW_SM_QUERY_GROUP; + return 1; + } + } + } + return 0; +} diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h index 0ad8a91ee6d..bb1166d1a85 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h @@ -113,5 +113,7 @@ enum nvc0_hw_sm_queries struct nvc0_hw_query * nvc0_hw_sm_create_query(struct nvc0_context *, unsigned); - +int +nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *, unsigned, + struct pipe_driver_query_info *); #endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c index 5f33b1e019a..cd24618d564 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.c @@ -25,6 +25,45 @@ #include "nvc0_query_sw.h" +/* === DRIVER STATISTICS === */ + +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + +static const char *nvc0_sw_query_drv_stat_names[] = +{ + "drv-tex_obj_current_count", + "drv-tex_obj_current_bytes", + "drv-buf_obj_current_count", + "drv-buf_obj_current_bytes_vid", + 
"drv-buf_obj_current_bytes_sys", + "drv-tex_transfers_rd", + "drv-tex_transfers_wr", + "drv-tex_copy_count", + "drv-tex_blit_count", + "drv-tex_cache_flush_count", + "drv-buf_transfers_rd", + "drv-buf_transfers_wr", + "drv-buf_read_bytes_staging_vid", + "drv-buf_write_bytes_direct", + "drv-buf_write_bytes_staging_vid", + "drv-buf_write_bytes_staging_sys", + "drv-buf_copy_bytes", + "drv-buf_non_kernel_fence_sync_count", + "drv-any_non_kernel_fence_sync_count", + "drv-query_sync_count", + "drv-gpu_serialize_count", + "drv-draw_calls_array", + "drv-draw_calls_indexed", + "drv-draw_calls_fallback_count", + "drv-user_buffer_upload_bytes", + "drv-constbuf_upload_count", + "drv-constbuf_upload_bytes", + "drv-pushbuf_count", + "drv-resource_validate_count" +}; + +#endif /* NOUVEAU_ENABLE_DRIVER_STATISTICS */ + static void nvc0_sw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q) { @@ -96,3 +135,28 @@ nvc0_sw_create_query(struct nvc0_context *nvcO, unsigned type, unsigned index) return q; } + +int +nvc0_sw_get_driver_query_info(struct nvc0_screen *screen, unsigned id, + struct pipe_driver_query_info *info) +{ + int count = 0; + + count += NVC0_SW_QUERY_DRV_STAT_COUNT; + if (!info) + return count; + +#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS + if (id < count) { + info->name = nvc0_sw_query_drv_stat_names[id]; + info->query_type = NVC0_SW_QUERY_DRV_STAT(id); + info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; + info->max_value.u64 = 0; + if (strstr(info->name, "bytes")) + info->type = PIPE_DRIVER_QUERY_TYPE_BYTES; + info->group_id = NVC0_SW_QUERY_DRV_STAT_GROUP; + return 1; + } +#endif + return 0; +} diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h index 71d23d9b41e..eaa890e4fc0 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_sw.h @@ -57,5 +57,8 @@ enum nvc0_sw_query_drv_stat struct nvc0_query * nvc0_sw_create_query(struct nvc0_context *, 
unsigned, unsigned); +int +nvc0_sw_get_driver_query_info(struct nvc0_screen *, unsigned, + struct pipe_driver_query_info *); #endif From 0461260d772ed91bec7cd36727c82ca4e6d71275 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 14 Oct 2015 01:15:43 +0200 Subject: [PATCH 194/270] nvc0: add MP counters variants for GF100/GF110 GF100 and GF110 chipsets are compute capability 2.0, while the other Fermi chipsets are compute capability 2.1. That's why some MP counters differ between these chipsets and we need to handle variants. Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 557 +++++++++++++++--- .../drivers/nouveau/nvc0/nvc0_query_hw_sm.h | 1 + 2 files changed, 482 insertions(+), 76 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index f4c1e52be38..6ab5090f7ff 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -298,6 +298,7 @@ static const char *nvc0_hw_sm_query_names[] = "gred_count", "gst_request", "inst_executed", + "inst_issued", "inst_issued1_0", "inst_issued1_1", "inst_issued2_0", @@ -373,82 +374,456 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = }; #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s } -#define _Q(n, c, ...)
[NVC0_HW_SM_QUERY_##n] = { \ - { __VA_ARGS__ }, c, NVC0_COUNTER_OPn_SUM, { 1, 1 }, \ -} +#define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c -static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] = +/* ==== Compute capability 2.0 (GF100/GF110) ==== */ +static const struct nvc0_hw_sm_query_cfg +sm20_active_cycles = { - _Q(ACTIVE_CYCLES, 1, _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000)), - _Q(ACTIVE_WARPS, 6, _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010), - _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020), - _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030), - _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040), - _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050), - _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060)), - _Q(ATOM_COUNT, 1, _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030)), - _Q(BRANCH, 2, _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000), - _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010)), - _Q(DIVERGENT_BRANCH, 2, _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020), - _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030)), - _Q(GLD_REQUEST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030)), - _Q(GRED_COUNT, 1, _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040)), - _Q(GST_REQUEST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060)), - _Q(INST_EXECUTED, 3, _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000), - _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010), - _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020)), - _Q(INST_ISSUED1_0, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010)), - _Q(INST_ISSUED1_1, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040)), - _Q(INST_ISSUED2_0, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020)), - _Q(INST_ISSUED2_1, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050)), - _Q(LOCAL_LD, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020)), - _Q(LOCAL_ST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050)), - _Q(PROF_TRIGGER_0, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000)), - _Q(PROF_TRIGGER_1, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010)), 
- _Q(PROF_TRIGGER_2, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020)), - _Q(PROF_TRIGGER_3, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030)), - _Q(PROF_TRIGGER_4, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040)), - _Q(PROF_TRIGGER_5, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050)), - _Q(PROF_TRIGGER_6, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060)), - _Q(PROF_TRIGGER_7, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070)), - _Q(SHARED_LD, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010)), - _Q(SHARED_ST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040)), - _Q(THREADS_LAUNCHED, 6, _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010), - _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020), - _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030), - _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040), - _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050), - _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060)), - _Q(TH_INST_EXECUTED_0, 6, _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000), - _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010), - _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020), - _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030), - _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040), - _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050)), - _Q(TH_INST_EXECUTED_1, 6, _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000), - _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010), - _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020), - _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030), - _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040), - _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050)), - _Q(TH_INST_EXECUTED_2, 6, _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000), - _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010), - _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020), - _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030), - _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040), - _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050)), - _Q(TH_INST_EXECUTED_3, 6, _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000), - 
_C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010), - _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020), - _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030), - _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040), - _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050)), - _Q(WARPS_LAUNCHED, 1, _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000)), + .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_active_warps = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010), + .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020), + .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030), + .ctr[3] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040), + .ctr[4] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050), + .ctr[5] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_atom_count = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_branch = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010), + .num_counters = 2, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_divergent_branch = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020), + .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030), + .num_counters = 2, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_gld_request = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_gred_count = +{ + .ctr[0] = _C(0xaaaa, 
LOGOP, 0x63, 0x000000ff, 0x00000040), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_gst_request = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_inst_executed = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000), + .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010), + .num_counters = 2, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_inst_issued = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060), + .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070), + .num_counters = 2, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_local_ld = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_local_st = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_0 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_1 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_2 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_3 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030), + 
.num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_4 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_5 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_6 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_prof_trigger_7 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_shared_ld = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_shared_st = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_threads_launched = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010), + .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020), + .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030), + .ctr[3] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040), + .ctr[4] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050), + .ctr[5] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_th_inst_executed_0 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 
0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_th_inst_executed_1 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm20_warps_launched = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] = +{ + _Q(ACTIVE_CYCLES, &sm20_active_cycles), + _Q(ACTIVE_WARPS, &sm20_active_warps), + _Q(ATOM_COUNT, &sm20_atom_count), + _Q(BRANCH, &sm20_branch), + _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), + _Q(GLD_REQUEST, &sm20_gld_request), + _Q(GRED_COUNT, &sm20_gred_count), + _Q(GST_REQUEST, &sm20_gst_request), + _Q(INST_EXECUTED, &sm20_inst_executed), + _Q(INST_ISSUED, &sm20_inst_issued), + _Q(INST_ISSUED1_0, NULL), + _Q(INST_ISSUED1_1, NULL), + _Q(INST_ISSUED2_0, NULL), + _Q(INST_ISSUED2_1, NULL), + _Q(LOCAL_LD, &sm20_local_ld), + _Q(LOCAL_ST, &sm20_local_st), + _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0), + _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), + _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), + _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), + _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), + _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), + _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), + 
_Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), + _Q(SHARED_LD, &sm20_shared_ld), + _Q(SHARED_ST, &sm20_shared_st), + _Q(THREADS_LAUNCHED, &sm20_threads_launched), + _Q(TH_INST_EXECUTED_0, &sm20_th_inst_executed_0), + _Q(TH_INST_EXECUTED_1, &sm20_th_inst_executed_1), + _Q(TH_INST_EXECUTED_2, NULL), + _Q(TH_INST_EXECUTED_3, NULL), + _Q(WARPS_LAUNCHED, &sm20_warps_launched), +}; + +/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ +static const struct nvc0_hw_sm_query_cfg +sm21_inst_executed = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020), + .num_counters = 3, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_inst_issued1_0 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_inst_issued1_1 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_inst_issued2_0 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_inst_issued2_1 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050), + .num_counters = 1, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_th_inst_executed_0 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 
0xa3, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_th_inst_executed_1 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_th_inst_executed_2 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm21_th_inst_executed_3 = +{ + .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000), + .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010), + .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020), + .ctr[3] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030), + .ctr[4] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040), + .ctr[5] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050), + .num_counters = 6, + .op = NVC0_COUNTER_OPn_SUM, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] = +{ + _Q(ACTIVE_CYCLES, &sm20_active_cycles), + _Q(ACTIVE_WARPS, &sm20_active_warps), + _Q(ATOM_COUNT, &sm20_atom_count), + _Q(BRANCH, &sm20_branch), + _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), + _Q(GLD_REQUEST, &sm20_gld_request), + _Q(GRED_COUNT, &sm20_gred_count), + 
_Q(GST_REQUEST, &sm20_gst_request), + _Q(INST_EXECUTED, &sm21_inst_executed), + _Q(INST_ISSUED, NULL), + _Q(INST_ISSUED1_0, &sm21_inst_issued1_0), + _Q(INST_ISSUED1_1, &sm21_inst_issued1_1), + _Q(INST_ISSUED2_0, &sm21_inst_issued2_0), + _Q(INST_ISSUED2_1, &sm21_inst_issued2_1), + _Q(LOCAL_LD, &sm20_local_ld), + _Q(LOCAL_ST, &sm20_local_st), + _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0), + _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), + _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), + _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), + _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), + _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), + _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), + _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), + _Q(SHARED_LD, &sm20_shared_ld), + _Q(SHARED_ST, &sm20_shared_st), + _Q(THREADS_LAUNCHED, &sm20_threads_launched), + _Q(TH_INST_EXECUTED_0, &sm21_th_inst_executed_0), + _Q(TH_INST_EXECUTED_1, &sm21_th_inst_executed_1), + _Q(TH_INST_EXECUTED_2, &sm21_th_inst_executed_2), + _Q(TH_INST_EXECUTED_3, &sm21_th_inst_executed_3), + _Q(WARPS_LAUNCHED, &sm20_warps_launched), }; #undef _Q #undef _C +static inline const struct nvc0_hw_sm_query_cfg ** +nvc0_hw_sm_get_queries(struct nvc0_screen *screen) +{ + struct nouveau_device *dev = screen->base.device; + + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return sm20_hw_sm_queries; + return sm21_hw_sm_queries; +} + static const struct nvc0_hw_sm_query_cfg * nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { @@ -457,7 +832,14 @@ nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) if (screen->base.class_3d >= NVE4_3D_CLASS) return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; - return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)]; + + if (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST) { + const struct nvc0_hw_sm_query_cfg **queries = + nvc0_hw_sm_get_queries(screen); + return queries[q->type - NVC0_HW_SM_QUERY(0)]; + } + debug_printf("invalid query 
type: %d\n", q->type); + return NULL; } static void @@ -884,11 +1266,6 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) * [04] = WS0.C1 * [08] = WS0.C2 * [0c] = WS0.C3 - * [10] = WS1.C0 - * [14] = WS1.C1 - * [18] = WS1.C2 - * [1c] = WS1.C3 - * [20] = WS2.C0 * [24] = WS2.C1 * [28] = WS2.C2 * [2c] = WS2.C3 @@ -935,6 +1312,23 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) return hq; } +static int +nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg **queries, + unsigned id) +{ + unsigned i, next = 0; + + for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { + if (!queries[i]) { + next++; + } else + if (i >= id && queries[id + next]) { + break; + } + } + return id + next; +} + int nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, struct pipe_driver_query_info *info) @@ -947,7 +1341,14 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, count += NVE4_HW_SM_QUERY_COUNT; } else if (screen->base.class_3d < NVE4_3D_CLASS) { - count += NVC0_HW_SM_QUERY_COUNT; + const struct nvc0_hw_sm_query_cfg **queries = + nvc0_hw_sm_get_queries(screen); + unsigned i; + + for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { + if (queries[i]) + count++; + } } } } @@ -966,6 +1367,10 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, return 1; } else if (screen->base.class_3d < NVE4_3D_CLASS) { + const struct nvc0_hw_sm_query_cfg **queries = + nvc0_hw_sm_get_queries(screen); + + id = nvc0_hw_sm_get_next_query_id(queries, id); info->name = nvc0_hw_sm_query_names[id]; info->query_type = NVC0_HW_SM_QUERY(id); info->group_id = NVC0_HW_SM_QUERY_GROUP; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h index bb1166d1a85..26bde0c3e0d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h @@ -86,6 +86,7 @@ enum nvc0_hw_sm_queries 
NVC0_HW_SM_QUERY_GRED_COUNT, NVC0_HW_SM_QUERY_GST_REQUEST, NVC0_HW_SM_QUERY_INST_EXECUTED, + NVC0_HW_SM_QUERY_INST_ISSUED, NVC0_HW_SM_QUERY_INST_ISSUED1_0, NVC0_HW_SM_QUERY_INST_ISSUED1_1, NVC0_HW_SM_QUERY_INST_ISSUED2_0, From a3b17575512f0bb614234ab1834b7b36e1124082 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 16 Oct 2015 10:21:44 +0200 Subject: [PATCH 195/270] nvc0: add a note about MP counters on GF100/GF110 MP counters on GF100/GF110 (compute capability 2.0) are buggy because there is a context-switch problem that we need to fix. Results might be wrong sometimes, be careful! Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index 6ab5090f7ff..44b222e5134 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -286,6 +286,11 @@ static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = #undef _M2B /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ +/* NOTES: + * - MP counters on GF100/GF110 (compute capability 2.0) are buggy + * because there is a context-switch problem that we need to fix. + * Results might be wrong sometimes, be careful! + */ static const char *nvc0_hw_sm_query_names[] = { /* MP counters */ From 2023906667cf349284cd3d3921dc1dcbb7506014 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Fri, 16 Oct 2015 14:31:22 -0600 Subject: [PATCH 196/270] tgsi: initialize ctx.file in tgsi_dump_instruction() Fixes segfault because of uninitialized file pointer. Trivial. 
--- src/gallium/auxiliary/tgsi/tgsi_dump.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c index 8ceb5b47584..5d80cca5b0e 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.c +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c @@ -648,6 +648,7 @@ tgsi_dump_instruction( ctx.indent = 0; ctx.dump_printf = dump_ctx_printf; ctx.indentation = 0; + ctx.file = NULL; iter_instruction( &ctx.iter, (struct tgsi_full_instruction *)inst ); } From fd5e0581dda40fbf72f166cb583ea37e0f812d09 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Wed, 14 Oct 2015 15:49:58 -0400 Subject: [PATCH 197/270] configure: show which gallium drivers/sts are built MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilia Mirkin Reviewed-by: Emil Velikov Reviewed-by: Michel Dänzer --- configure.ac | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 217281f7950..0a3329021c0 100644 --- a/configure.ac +++ b/configure.ac @@ -1576,6 +1576,8 @@ fi AM_CONDITIONAL(HAVE_EGL, test "x$enable_egl" = xyes) AC_SUBST([EGL_LIB_DEPS]) +gallium_st="mesa" + dnl dnl XA configuration dnl @@ -1589,6 +1591,7 @@ if test "x$enable_xa" = xyes; then Example: ./configure --enable-xa --with-gallium-drivers=svga...]) fi enable_gallium_loader=$enable_shared_pipe_drivers + gallium_st="$gallium_st xa" fi AM_CONDITIONAL(HAVE_ST_XA, test "x$enable_xa" = xyes) @@ -1634,24 +1637,28 @@ AM_CONDITIONAL(NEED_GALLIUM_VL_WINSYS, test "x$need_gallium_vl_winsys" = xyes) if test "x$enable_xvmc" = xyes; then PKG_CHECK_MODULES([XVMC], [xvmc >= $XVMC_REQUIRED]) enable_gallium_loader=$enable_shared_pipe_drivers + gallium_st="$gallium_st xvmc" fi AM_CONDITIONAL(HAVE_ST_XVMC, test "x$enable_xvmc" = xyes) if test "x$enable_vdpau" = xyes; then PKG_CHECK_MODULES([VDPAU], [vdpau >= $VDPAU_REQUIRED]) enable_gallium_loader=$enable_shared_pipe_drivers + gallium_st="$gallium_st 
vdpau" fi AM_CONDITIONAL(HAVE_ST_VDPAU, test "x$enable_vdpau" = xyes) if test "x$enable_omx" = xyes; then PKG_CHECK_MODULES([OMX], [libomxil-bellagio >= $LIBOMXIL_BELLAGIO_REQUIRED]) enable_gallium_loader=$enable_shared_pipe_drivers + gallium_st="$gallium_st omx" fi AM_CONDITIONAL(HAVE_ST_OMX, test "x$enable_omx" = xyes) if test "x$enable_va" = xyes; then PKG_CHECK_MODULES([VA], [libva >= $LIBVA_REQUIRED]) enable_gallium_loader=$enable_shared_pipe_drivers + gallium_st="$gallium_st va" fi AM_CONDITIONAL(HAVE_ST_VA, test "x$enable_va" = xyes) @@ -1674,6 +1681,7 @@ if test "x$enable_nine" = xyes; then fi enable_gallium_loader=$enable_shared_pipe_drivers + gallium_st="$gallium_st nine" fi AM_CONDITIONAL(HAVE_ST_NINE, test "x$enable_nine" = xyes) @@ -1713,6 +1721,7 @@ if test "x$enable_opencl" = xyes; then # XXX: Use $enable_shared_pipe_drivers once converted to use static/shared pipe-drivers enable_gallium_loader=yes + gallium_st="$gallium_st clover" if test "x$enable_opencl_icd" = xyes; then OPENCL_LIBNAME="MesaOpenCL" @@ -2513,7 +2522,8 @@ fi echo "" if test -n "$with_gallium_drivers"; then - echo " Gallium: yes" + echo " Gallium drivers: $gallium_drivers" + echo " Gallium st: $gallium_st" else echo " Gallium: no" fi From 2e1798f183427d704d32d68165248519781ae864 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Thu, 23 Jul 2015 08:31:59 +1000 Subject: [PATCH 198/270] nir: wrapper for glsl_type arrays_of_arrays_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Tapani Pälli Reviewed-by: Ian Romanick --- src/glsl/nir/nir_types.cpp | 6 ++++++ src/glsl/nir/nir_types.h | 2 ++ 2 files changed, 8 insertions(+) diff --git a/src/glsl/nir/nir_types.cpp b/src/glsl/nir/nir_types.cpp index da9807f0e62..965f42320be 100644 --- a/src/glsl/nir/nir_types.cpp +++ b/src/glsl/nir/nir_types.cpp @@ -106,6 +106,12 @@ glsl_get_length(const struct glsl_type *type) return type->is_matrix() ? 
type->matrix_columns : type->length; } +unsigned +glsl_get_aoa_size(const struct glsl_type *type) +{ + return type->arrays_of_arrays_size(); +} + const char * glsl_get_struct_elem_name(const struct glsl_type *type, unsigned index) { diff --git a/src/glsl/nir/nir_types.h b/src/glsl/nir/nir_types.h index 49d6a65e7c4..009a0fb9918 100644 --- a/src/glsl/nir/nir_types.h +++ b/src/glsl/nir/nir_types.h @@ -59,6 +59,8 @@ unsigned glsl_get_matrix_columns(const struct glsl_type *type); unsigned glsl_get_length(const struct glsl_type *type); +unsigned glsl_get_aoa_size(const struct glsl_type *type); + const char *glsl_get_struct_elem_name(const struct glsl_type *type, unsigned index); From 3c87377d0b0d07249ba94d6cb22a8b324bb06ba6 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Thu, 23 Jul 2015 08:32:00 +1000 Subject: [PATCH 199/270] nir: add atomic lowering support for AoA Cc: Francisco Jerez Reviewed-by: Jason Ekstrand --- src/glsl/nir/nir_lower_atomics.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/glsl/nir/nir_lower_atomics.c b/src/glsl/nir/nir_lower_atomics.c index 6f9ecc019ec..46e137652a1 100644 --- a/src/glsl/nir/nir_lower_atomics.c +++ b/src/glsl/nir/nir_lower_atomics.c @@ -72,20 +72,22 @@ lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl) nir_ssa_def *offset_def = &offset_const->def; - if (instr->variables[0]->deref.child != NULL) { - assert(instr->variables[0]->deref.child->deref_type == - nir_deref_type_array); - nir_deref_array *deref_array = - nir_deref_as_array(instr->variables[0]->deref.child); - assert(deref_array->deref.child == NULL); + nir_deref *tail = &instr->variables[0]->deref; + while (tail->child != NULL) { + assert(tail->child->deref_type == nir_deref_type_array); + nir_deref_array *deref_array = nir_deref_as_array(tail->child); + tail = tail->child; - offset_const->value.u[0] += - deref_array->base_offset * ATOMIC_COUNTER_SIZE; + unsigned child_array_elements = tail->child != NULL ? 
+ glsl_get_aoa_size(tail->type) : 1; + + offset_const->value.u[0] += deref_array->base_offset * + child_array_elements * ATOMIC_COUNTER_SIZE; if (deref_array->deref_array_type == nir_deref_array_type_indirect) { nir_load_const_instr *atomic_counter_size = nir_load_const_instr_create(mem_ctx, 1); - atomic_counter_size->value.u[0] = ATOMIC_COUNTER_SIZE; + atomic_counter_size->value.u[0] = child_array_elements * ATOMIC_COUNTER_SIZE; nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr); nir_alu_instr *mul = nir_alu_instr_create(mem_ctx, nir_op_imul); @@ -102,7 +104,7 @@ lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl) add->src[0].src.is_ssa = true; add->src[0].src.ssa = &mul->dest.dest.ssa; add->src[1].src.is_ssa = true; - add->src[1].src.ssa = &offset_const->def; + add->src[1].src.ssa = offset_def; nir_instr_insert_before(&instr->instr, &add->instr); offset_def = &add->dest.dest.ssa; From 698cdbf49207c30bc2ac38e2f16e531dd3a54db3 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Thu, 15 Oct 2015 14:10:35 +1100 Subject: [PATCH 200/270] glsl: initialise record array count to 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was only being done in one of the two process methods. Fixes an issue with samplers using the array size of a previous record. 
Tested-by: Marek Olšák Cc: Jason Ekstrand --- src/glsl/link_uniforms.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp index 647aa2bbdd8..fe00aa30d07 100644 --- a/src/glsl/link_uniforms.cpp +++ b/src/glsl/link_uniforms.cpp @@ -161,6 +161,7 @@ program_resource_visitor::process(ir_variable *var) false, record_array_count); ralloc_free(name); } else { + this->set_record_array_count(record_array_count); this->visit_field(t, var->name, row_major, NULL, packing, false); } } From 33de998230414bf5603927424ffca85792b176d1 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 9 Oct 2015 16:27:45 -0400 Subject: [PATCH 201/270] glsl: couple shader_enums cleanups Add missing enum to gl_system_value_name() and move VARYING_SLOT_MAX / FRAG_RESULT_MAX / etc into shader_enums.h as suggested by Emil. v2: add STATIC_ASSERT()'s Reported-by: Emil Velikov Acked-by: Emil Velikov Reviewed-by: Jason Ekstrand Signed-off-by: Rob Clark --- src/glsl/nir/shader_enums.c | 8 ++++++++ src/glsl/nir/shader_enums.h | 7 +++++++ src/mesa/main/mtypes.h | 5 ----- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/glsl/nir/shader_enums.c b/src/glsl/nir/shader_enums.c index 3722475731b..66a25e72344 100644 --- a/src/glsl/nir/shader_enums.c +++ b/src/glsl/nir/shader_enums.c @@ -28,6 +28,7 @@ #include "shader_enums.h" #include "util/macros.h" +#include "mesa/main/config.h" #define ENUM(x) [x] = #x #define NAME(val) ((((val) < ARRAY_SIZE(names)) && names[(val)]) ? 
names[(val)] : "UNKNOWN") @@ -42,6 +43,7 @@ const char * gl_shader_stage_name(gl_shader_stage stage) ENUM(MESA_SHADER_FRAGMENT), ENUM(MESA_SHADER_COMPUTE), }; + STATIC_ASSERT(ARRAY_SIZE(names) == MESA_SHADER_STAGES); return NAME(stage); } @@ -82,6 +84,7 @@ const char * gl_vert_attrib_name(gl_vert_attrib attrib) ENUM(VERT_ATTRIB_GENERIC14), ENUM(VERT_ATTRIB_GENERIC15), }; + STATIC_ASSERT(ARRAY_SIZE(names) == VERT_ATTRIB_MAX); return NAME(attrib); } @@ -147,6 +150,7 @@ const char * gl_varying_slot_name(gl_varying_slot slot) ENUM(VARYING_SLOT_VAR30), ENUM(VARYING_SLOT_VAR31), }; + STATIC_ASSERT(ARRAY_SIZE(names) == VARYING_SLOT_MAX); return NAME(slot); } @@ -169,8 +173,10 @@ const char * gl_system_value_name(gl_system_value sysval) ENUM(SYSTEM_VALUE_TESS_LEVEL_INNER), ENUM(SYSTEM_VALUE_LOCAL_INVOCATION_ID), ENUM(SYSTEM_VALUE_WORK_GROUP_ID), + ENUM(SYSTEM_VALUE_NUM_WORK_GROUPS), ENUM(SYSTEM_VALUE_VERTEX_CNT), }; + STATIC_ASSERT(ARRAY_SIZE(names) == SYSTEM_VALUE_MAX); return NAME(sysval); } @@ -182,6 +188,7 @@ const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual) ENUM(INTERP_QUALIFIER_FLAT), ENUM(INTERP_QUALIFIER_NOPERSPECTIVE), }; + STATIC_ASSERT(ARRAY_SIZE(names) == INTERP_QUALIFIER_COUNT); return NAME(qual); } @@ -201,5 +208,6 @@ const char * gl_frag_result_name(gl_frag_result result) ENUM(FRAG_RESULT_DATA6), ENUM(FRAG_RESULT_DATA7), }; + STATIC_ASSERT(ARRAY_SIZE(names) == FRAG_RESULT_MAX); return NAME(result); } diff --git a/src/glsl/nir/shader_enums.h b/src/glsl/nir/shader_enums.h index 2a5d2c5bfa7..77638ba4e34 100644 --- a/src/glsl/nir/shader_enums.h +++ b/src/glsl/nir/shader_enums.h @@ -233,6 +233,11 @@ typedef enum VARYING_SLOT_VAR31, } gl_varying_slot; + +#define VARYING_SLOT_MAX (VARYING_SLOT_VAR0 + MAX_VARYING) +#define VARYING_SLOT_PATCH0 (VARYING_SLOT_MAX) +#define VARYING_SLOT_TESS_MAX (VARYING_SLOT_PATCH0 + MAX_VARYING) + const char * gl_varying_slot_name(gl_varying_slot slot); /** @@ -473,4 +478,6 @@ typedef enum const char * 
gl_frag_result_name(gl_frag_result result); +#define FRAG_RESULT_MAX (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS) + #endif /* SHADER_ENUMS_H */ diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index f7118c1e7a6..e9d8ea42bce 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -94,11 +94,6 @@ struct vbo_context; #define PRIM_OUTSIDE_BEGIN_END (PRIM_MAX + 1) #define PRIM_UNKNOWN (PRIM_MAX + 2) -#define VARYING_SLOT_MAX (VARYING_SLOT_VAR0 + MAX_VARYING) -#define VARYING_SLOT_PATCH0 (VARYING_SLOT_MAX) -#define VARYING_SLOT_TESS_MAX (VARYING_SLOT_PATCH0 + MAX_VARYING) -#define FRAG_RESULT_MAX (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS) - /** * Determine if the given gl_varying_slot appears in the fragment shader. */ From 60690cb3b3082b7397c48769ec28b5570f6b7d7e Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 10 Oct 2015 12:39:57 -0400 Subject: [PATCH 202/270] glsl: move builtin vector types to glsl_types.cpp First step at untangling NIR's dependency on glsl_types without bringing in the dependency on glsl_symbol_table. The builtin types are now in glsl_types (which will end up in NIR), but adding them to the symbol- table stays in builtin_types.cpp (which will not be part of NIR). Reviewed-by: Jason Ekstrand Reviewed-by: Emil Velikov Signed-off-by: Rob Clark --- src/glsl/builtin_types.cpp | 4 +--- src/glsl/glsl_types.cpp | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/glsl/builtin_types.cpp b/src/glsl/builtin_types.cpp index 0aedbb3546a..bbdcd199e92 100644 --- a/src/glsl/builtin_types.cpp +++ b/src/glsl/builtin_types.cpp @@ -43,9 +43,7 @@ * convenience pointers (glsl_type::foo_type). * @{ */ -#define DECL_TYPE(NAME, ...) \ - const glsl_type glsl_type::_##NAME##_type = glsl_type(__VA_ARGS__, #NAME); \ - const glsl_type *const glsl_type::NAME##_type = &glsl_type::_##NAME##_type; +#define DECL_TYPE(NAME, ...) 
#define STRUCT_TYPE(NAME) \ const glsl_type glsl_type::_struct_##NAME##_type = \ diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp index 27934478e2d..1c66dce85c4 100644 --- a/src/glsl/glsl_types.cpp +++ b/src/glsl/glsl_types.cpp @@ -1729,3 +1729,17 @@ glsl_type::coordinate_components() const return size; } + +/** + * Declarations of type flyweights (glsl_type::_foo_type) and + * convenience pointers (glsl_type::foo_type). + * @{ + */ +#define DECL_TYPE(NAME, ...) \ + const glsl_type glsl_type::_##NAME##_type = glsl_type(__VA_ARGS__, #NAME); \ + const glsl_type *const glsl_type::NAME##_type = &glsl_type::_##NAME##_type; + +#define STRUCT_TYPE(NAME) + +#include "builtin_type_macros.h" +/** @} */ From 183db3a64557d5d231ef58ab5666286f323ff333 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 10 Oct 2015 13:26:03 -0400 Subject: [PATCH 203/270] glsl: move half<->float convertion to util Needed in NIR too, so move out of mesa/main/imports.c Reviewed-by: Jason Ekstrand Reviewed-by: Emil Velikov Signed-off-by: Rob Clark --- src/glsl/Makefile.am | 1 + src/glsl/ir_constant_expression.cpp | 1 + src/glsl/nir/nir_constant_expressions.py | 1 + src/mesa/drivers/dri/i965/brw_sampler_state.c | 1 + src/mesa/main/format_utils.h | 1 + src/mesa/main/imports.c | 148 --------------- src/mesa/main/imports.h | 7 - src/mesa/main/mipmap.c | 1 + src/mesa/main/texcompress_bptc.c | 1 + src/mesa/tnl/t_draw.c | 1 + src/util/Makefile.sources | 2 + src/util/half_float.c | 177 ++++++++++++++++++ src/util/half_float.h | 41 ++++ 13 files changed, 228 insertions(+), 155 deletions(-) create mode 100644 src/util/half_float.c create mode 100644 src/util/half_float.h diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am index 32653911f6c..347919b1d0a 100644 --- a/src/glsl/Makefile.am +++ b/src/glsl/Makefile.am @@ -160,6 +160,7 @@ glsl_compiler_SOURCES = \ glsl_compiler_LDADD = \ libglsl.la \ $(top_builddir)/src/libglsl_util.la \ + $(top_builddir)/src/util/libmesautil.la \ $(PTHREAD_LIBS) 
glsl_test_SOURCES = \ diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp index 309b6b72b5b..67ed3605a8c 100644 --- a/src/glsl/ir_constant_expression.cpp +++ b/src/glsl/ir_constant_expression.cpp @@ -36,6 +36,7 @@ #include #include "main/core.h" /* for MAX2, MIN2, CLAMP */ #include "util/rounding.h" /* for _mesa_roundeven */ +#include "util/half_float.h" #include "ir.h" #include "glsl_types.h" #include "program/hash_table.h" diff --git a/src/glsl/nir/nir_constant_expressions.py b/src/glsl/nir/nir_constant_expressions.py index 8fd9b1039a7..2ba8554645d 100644 --- a/src/glsl/nir/nir_constant_expressions.py +++ b/src/glsl/nir/nir_constant_expressions.py @@ -29,6 +29,7 @@ template = """\ #include #include "main/core.h" #include "util/rounding.h" /* for _mesa_roundeven */ +#include "util/half_float.h" #include "nir_constant_expressions.h" #if defined(__SUNPRO_CC) diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c index c2db5f69560..6d73444dad0 100644 --- a/src/mesa/drivers/dri/i965/brw_sampler_state.c +++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c @@ -44,6 +44,7 @@ #include "main/macros.h" #include "main/samplerobj.h" +#include "util/half_float.h" /** * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet. diff --git a/src/mesa/main/format_utils.h b/src/mesa/main/format_utils.h index 618f43d0aaa..378997b38b2 100644 --- a/src/mesa/main/format_utils.h +++ b/src/mesa/main/format_utils.h @@ -34,6 +34,7 @@ #include "imports.h" #include "macros.h" #include "util/rounding.h" +#include "util/half_float.h" extern const mesa_array_format RGBA32_FLOAT; extern const mesa_array_format RGBA8_UBYTE; diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c index 350e6752c8b..230ebbc67f4 100644 --- a/src/mesa/main/imports.c +++ b/src/mesa/main/imports.c @@ -307,154 +307,6 @@ _mesa_bitcount_64(uint64_t n) } #endif - -/** - * Convert a 4-byte float to a 2-byte half float. 
- * - * Not all float32 values can be represented exactly as a float16 value. We - * round such intermediate float32 values to the nearest float16. When the - * float32 lies exactly between to float16 values, we round to the one with - * an even mantissa. - * - * This rounding behavior has several benefits: - * - It has no sign bias. - * - * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's - * GPU ISA. - * - * - By reproducing the behavior of the GPU (at least on Intel hardware), - * compile-time evaluation of constant packHalf2x16 GLSL expressions will - * result in the same value as if the expression were executed on the GPU. - */ -GLhalfARB -_mesa_float_to_half(float val) -{ - const fi_type fi = {val}; - const int flt_m = fi.i & 0x7fffff; - const int flt_e = (fi.i >> 23) & 0xff; - const int flt_s = (fi.i >> 31) & 0x1; - int s, e, m = 0; - GLhalfARB result; - - /* sign bit */ - s = flt_s; - - /* handle special cases */ - if ((flt_e == 0) && (flt_m == 0)) { - /* zero */ - /* m = 0; - already set */ - e = 0; - } - else if ((flt_e == 0) && (flt_m != 0)) { - /* denorm -- denorm float maps to 0 half */ - /* m = 0; - already set */ - e = 0; - } - else if ((flt_e == 0xff) && (flt_m == 0)) { - /* infinity */ - /* m = 0; - already set */ - e = 31; - } - else if ((flt_e == 0xff) && (flt_m != 0)) { - /* NaN */ - m = 1; - e = 31; - } - else { - /* regular number */ - const int new_exp = flt_e - 127; - if (new_exp < -14) { - /* The float32 lies in the range (0.0, min_normal16) and is rounded - * to a nearby float16 value. The result will be either zero, subnormal, - * or normal. - */ - e = 0; - m = _mesa_lroundevenf((1 << 24) * fabsf(fi.f)); - } - else if (new_exp > 15) { - /* map this value to infinity */ - /* m = 0; - already set */ - e = 31; - } - else { - /* The float32 lies in the range - * [min_normal16, max_normal16 + max_step16) - * and is rounded to a nearby float16 value. The result will be - * either normal or infinite. 
- */ - e = new_exp + 15; - m = _mesa_lroundevenf(flt_m / (float) (1 << 13)); - } - } - - assert(0 <= m && m <= 1024); - if (m == 1024) { - /* The float32 was rounded upwards into the range of the next exponent, - * so bump the exponent. This correctly handles the case where f32 - * should be rounded up to float16 infinity. - */ - ++e; - m = 0; - } - - result = (s << 15) | (e << 10) | m; - return result; -} - - -/** - * Convert a 2-byte half float to a 4-byte float. - * Based on code from: - * http://www.opengl.org/discussion_boards/ubb/Forum3/HTML/008786.html - */ -float -_mesa_half_to_float(GLhalfARB val) -{ - /* XXX could also use a 64K-entry lookup table */ - const int m = val & 0x3ff; - const int e = (val >> 10) & 0x1f; - const int s = (val >> 15) & 0x1; - int flt_m, flt_e, flt_s; - fi_type fi; - float result; - - /* sign bit */ - flt_s = s; - - /* handle special cases */ - if ((e == 0) && (m == 0)) { - /* zero */ - flt_m = 0; - flt_e = 0; - } - else if ((e == 0) && (m != 0)) { - /* denorm -- denorm half will fit in non-denorm single */ - const float half_denorm = 1.0f / 16384.0f; /* 2^-14 */ - float mantissa = ((float) (m)) / 1024.0f; - float sign = s ? 
-1.0f : 1.0f; - return sign * mantissa * half_denorm; - } - else if ((e == 31) && (m == 0)) { - /* infinity */ - flt_e = 0xff; - flt_m = 0; - } - else if ((e == 31) && (m != 0)) { - /* NaN */ - flt_e = 0xff; - flt_m = 1; - } - else { - /* regular */ - flt_e = e + 112; - flt_m = m << 13; - } - - fi.i = (flt_s << 31) | (flt_e << 23) | flt_m; - result = fi.f; - return result; -} - /*@}*/ diff --git a/src/mesa/main/imports.h b/src/mesa/main/imports.h index 90247587be3..042147fd8bb 100644 --- a/src/mesa/main/imports.h +++ b/src/mesa/main/imports.h @@ -396,13 +396,6 @@ _mesa_flsll(uint64_t n) #endif } - -extern GLhalfARB -_mesa_float_to_half(float f); - -extern float -_mesa_half_to_float(GLhalfARB h); - static inline bool _mesa_half_is_negative(GLhalfARB h) { diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c index ab16c2854a8..50469956c6e 100644 --- a/src/mesa/main/mipmap.c +++ b/src/mesa/main/mipmap.c @@ -37,6 +37,7 @@ #include "texstore.h" #include "image.h" #include "macros.h" +#include "util/half_float.h" #include "../../gallium/auxiliary/util/u_format_rgb9e5.h" #include "../../gallium/auxiliary/util/u_format_r11g11b10f.h" diff --git a/src/mesa/main/texcompress_bptc.c b/src/mesa/main/texcompress_bptc.c index f0f6553a01b..26e59158007 100644 --- a/src/mesa/main/texcompress_bptc.c +++ b/src/mesa/main/texcompress_bptc.c @@ -30,6 +30,7 @@ #include "texcompress.h" #include "texcompress_bptc.h" #include "util/format_srgb.h" +#include "util/half_float.h" #include "texstore.h" #include "macros.h" #include "image.h" diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c index c130ab3f93d..6f29abbe1ba 100644 --- a/src/mesa/tnl/t_draw.c +++ b/src/mesa/tnl/t_draw.c @@ -35,6 +35,7 @@ #include "main/mtypes.h" #include "main/macros.h" #include "main/enums.h" +#include "util/half_float.h" #include "t_context.h" #include "tnl.h" diff --git a/src/util/Makefile.sources b/src/util/Makefile.sources index e45431d1de8..a87114601c8 100644 --- a/src/util/Makefile.sources +++ 
b/src/util/Makefile.sources @@ -3,6 +3,8 @@ MESA_UTIL_FILES := \ debug.c \ debug.h \ format_srgb.h \ + half_float.c \ + half_float.h \ hash_table.c \ hash_table.h \ list.h \ diff --git a/src/util/half_float.c b/src/util/half_float.c new file mode 100644 index 00000000000..4df64c2ccf9 --- /dev/null +++ b/src/util/half_float.c @@ -0,0 +1,177 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <math.h> +#include <assert.h> +#include "half_float.h" +#include "rounding.h" + +typedef union { float f; int32_t i; uint32_t u; } fi_type; + +/** + * Convert a 4-byte float to a 2-byte half float. + * + * Not all float32 values can be represented exactly as a float16 value. We + * round such intermediate float32 values to the nearest float16. When the + * float32 lies exactly between two float16 values, we round to the one with + * an even mantissa.
+ * + * This rounding behavior has several benefits: + * - It has no sign bias. + * + * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's + * GPU ISA. + * + * - By reproducing the behavior of the GPU (at least on Intel hardware), + * compile-time evaluation of constant packHalf2x16 GLSL expressions will + * result in the same value as if the expression were executed on the GPU. + */ +uint16_t +_mesa_float_to_half(float val) +{ + const fi_type fi = {val}; + const int flt_m = fi.i & 0x7fffff; + const int flt_e = (fi.i >> 23) & 0xff; + const int flt_s = (fi.i >> 31) & 0x1; + int s, e, m = 0; + uint16_t result; + + /* sign bit */ + s = flt_s; + + /* handle special cases */ + if ((flt_e == 0) && (flt_m == 0)) { + /* zero */ + /* m = 0; - already set */ + e = 0; + } + else if ((flt_e == 0) && (flt_m != 0)) { + /* denorm -- denorm float maps to 0 half */ + /* m = 0; - already set */ + e = 0; + } + else if ((flt_e == 0xff) && (flt_m == 0)) { + /* infinity */ + /* m = 0; - already set */ + e = 31; + } + else if ((flt_e == 0xff) && (flt_m != 0)) { + /* NaN */ + m = 1; + e = 31; + } + else { + /* regular number */ + const int new_exp = flt_e - 127; + if (new_exp < -14) { + /* The float32 lies in the range (0.0, min_normal16) and is rounded + * to a nearby float16 value. The result will be either zero, subnormal, + * or normal. + */ + e = 0; + m = _mesa_lroundevenf((1 << 24) * fabsf(fi.f)); + } + else if (new_exp > 15) { + /* map this value to infinity */ + /* m = 0; - already set */ + e = 31; + } + else { + /* The float32 lies in the range + * [min_normal16, max_normal16 + max_step16) + * and is rounded to a nearby float16 value. The result will be + * either normal or infinite. + */ + e = new_exp + 15; + m = _mesa_lroundevenf(flt_m / (float) (1 << 13)); + } + } + + assert(0 <= m && m <= 1024); + if (m == 1024) { + /* The float32 was rounded upwards into the range of the next exponent, + * so bump the exponent. 
This correctly handles the case where f32 + * should be rounded up to float16 infinity. + */ + ++e; + m = 0; + } + + result = (s << 15) | (e << 10) | m; + return result; +} + + +/** + * Convert a 2-byte half float to a 4-byte float. + * Based on code from: + * http://www.opengl.org/discussion_boards/ubb/Forum3/HTML/008786.html + */ +float +_mesa_half_to_float(uint16_t val) +{ + /* XXX could also use a 64K-entry lookup table */ + const int m = val & 0x3ff; + const int e = (val >> 10) & 0x1f; + const int s = (val >> 15) & 0x1; + int flt_m, flt_e, flt_s; + fi_type fi; + float result; + + /* sign bit */ + flt_s = s; + + /* handle special cases */ + if ((e == 0) && (m == 0)) { + /* zero */ + flt_m = 0; + flt_e = 0; + } + else if ((e == 0) && (m != 0)) { + /* denorm -- denorm half will fit in non-denorm single */ + const float half_denorm = 1.0f / 16384.0f; /* 2^-14 */ + float mantissa = ((float) (m)) / 1024.0f; + float sign = s ? -1.0f : 1.0f; + return sign * mantissa * half_denorm; + } + else if ((e == 31) && (m == 0)) { + /* infinity */ + flt_e = 0xff; + flt_m = 0; + } + else if ((e == 31) && (m != 0)) { + /* NaN */ + flt_e = 0xff; + flt_m = 1; + } + else { + /* regular */ + flt_e = e + 112; + flt_m = m << 13; + } + + fi.i = (flt_s << 31) | (flt_e << 23) | flt_m; + result = fi.f; + return result; +} diff --git a/src/util/half_float.h b/src/util/half_float.h new file mode 100644 index 00000000000..64f20421018 --- /dev/null +++ b/src/util/half_float.h @@ -0,0 +1,41 @@ +/* + * Mesa 3-D graphics library + * + * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _HALF_FLOAT_H_ +#define _HALF_FLOAT_H_ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +uint16_t _mesa_float_to_half(float val); +float _mesa_half_to_float(uint16_t val); + +#ifdef __cplusplus +} /* extern C */ +#endif + +#endif /* _HALF_FLOAT_H_ */ From b9b40ef9b7644ea24768bc8b7464b1719efe99bf Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 10 Oct 2015 13:55:07 -0400 Subject: [PATCH 204/270] nir: remove dependency on glsl Move glsl_types into NIR, now that the dependency on glsl_symbol_table has been split out. Possibly makes sense to rename things at this point, but if we do that I'd like to keep it split out into a separate patch to make git history easier to follow (IMHO). v2: fix android build v3: I f***ing hate scons.. 
but at least it builds Reviewed-by: Jason Ekstrand Signed-off-by: Rob Clark --- src/gallium/targets/libgl-xlib/SConscript | 3 +++ src/gallium/targets/libgl-xlib/glsl_types_hack.cpp | 3 +++ src/glsl/Makefile.am | 3 --- src/glsl/Makefile.sources | 4 ++-- src/glsl/SConscript | 2 ++ src/glsl/{ => nir}/builtin_type_macros.h | 0 src/glsl/{ => nir}/glsl_types.cpp | 0 src/glsl/{ => nir}/glsl_types.h | 0 src/glsl/nir/nir_types.h | 2 +- src/mesa/Android.libmesa_dricore.mk | 1 + src/mesa/Android.libmesa_glsl_utils.mk | 2 ++ src/mesa/Android.libmesa_st_mesa.mk | 1 + src/mesa/Makefile.sources | 1 + src/mesa/SConscript | 1 + src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp | 2 +- src/mesa/drivers/dri/i965/brw_fs.cpp | 2 +- src/mesa/drivers/dri/i965/brw_fs.h | 2 +- src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp | 2 +- src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 2 +- src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp | 2 +- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 +- src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp | 2 +- src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp | 2 +- src/mesa/drivers/x11/SConscript | 3 +++ src/mesa/main/ff_fragment_shader.cpp | 2 +- src/mesa/main/uniforms.h | 2 +- src/mesa/program/Android.mk | 1 + src/mesa/program/ir_to_mesa.cpp | 2 +- src/mesa/program/sampler.cpp | 2 +- 29 files changed, 34 insertions(+), 19 deletions(-) create mode 100644 src/gallium/targets/libgl-xlib/glsl_types_hack.cpp rename src/glsl/{ => nir}/builtin_type_macros.h (100%) rename src/glsl/{ => nir}/glsl_types.cpp (100%) rename src/glsl/{ => nir}/glsl_types.h (100%) diff --git a/src/gallium/targets/libgl-xlib/SConscript b/src/gallium/targets/libgl-xlib/SConscript index df5a220ac25..fedc522fbdc 100644 --- a/src/gallium/targets/libgl-xlib/SConscript +++ b/src/gallium/targets/libgl-xlib/SConscript @@ -6,6 +6,8 @@ Import('*') env = env.Clone() env.Append(CPPPATH = [ + '#/src/glsl', + '#/src/glsl/nir', '#/src/mapi', '#/src/mesa', 
'#/src/mesa/main', @@ -36,6 +38,7 @@ env.Prepend(LIBS = [ sources = [ 'xlib.c', + 'glsl_types_hack.cpp', ] if True: diff --git a/src/gallium/targets/libgl-xlib/glsl_types_hack.cpp b/src/gallium/targets/libgl-xlib/glsl_types_hack.cpp new file mode 100644 index 00000000000..5c042f23e3b --- /dev/null +++ b/src/gallium/targets/libgl-xlib/glsl_types_hack.cpp @@ -0,0 +1,3 @@ +/* errrg scons.. otherwise "scons: *** Two environments with different actions were specified for the same target: $mesa/build/linux-x86_64-debug/glsl/nir/glsl_types.os" */ +#include "glsl_types.cpp" + diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am index 347919b1d0a..437c6a5fbcd 100644 --- a/src/glsl/Makefile.am +++ b/src/glsl/Makefile.am @@ -148,9 +148,6 @@ libglsl_la_SOURCES = \ libnir_la_SOURCES = \ - glsl_types.cpp \ - builtin_types.cpp \ - glsl_symbol_table.cpp \ $(NIR_FILES) \ $(NIR_GENERATED_FILES) diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources index 4da64f43873..ca870367640 100644 --- a/src/glsl/Makefile.sources +++ b/src/glsl/Makefile.sources @@ -20,6 +20,8 @@ NIR_GENERATED_FILES = \ NIR_FILES = \ nir/glsl_to_nir.cpp \ nir/glsl_to_nir.h \ + nir/glsl_types.cpp \ + nir/glsl_types.h \ nir/nir.c \ nir/nir.h \ nir/nir_array.h \ @@ -104,8 +106,6 @@ LIBGLSL_FILES = \ glsl_parser_extras.h \ glsl_symbol_table.cpp \ glsl_symbol_table.h \ - glsl_types.cpp \ - glsl_types.h \ hir_field_selection.cpp \ ir_basic_block.cpp \ ir_basic_block.h \ diff --git a/src/glsl/SConscript b/src/glsl/SConscript index 89c603580a5..927cbdcdb78 100644 --- a/src/glsl/SConscript +++ b/src/glsl/SConscript @@ -16,6 +16,7 @@ env.Prepend(CPPPATH = [ '#src/gallium/include', '#src/gallium/auxiliary', '#src/glsl', + '#src/glsl/nir', '#src/glsl/glcpp', ]) @@ -80,6 +81,7 @@ mesa_objs = env.StaticObject([ 'prog_hash_table.c', 'symbol_table.c', 'dummy_errors.c', + 'nir/glsl_types.cpp', ]) compiler_objs += mesa_objs diff --git a/src/glsl/builtin_type_macros.h b/src/glsl/nir/builtin_type_macros.h similarity 
index 100% rename from src/glsl/builtin_type_macros.h rename to src/glsl/nir/builtin_type_macros.h diff --git a/src/glsl/glsl_types.cpp b/src/glsl/nir/glsl_types.cpp similarity index 100% rename from src/glsl/glsl_types.cpp rename to src/glsl/nir/glsl_types.cpp diff --git a/src/glsl/glsl_types.h b/src/glsl/nir/glsl_types.h similarity index 100% rename from src/glsl/glsl_types.h rename to src/glsl/nir/glsl_types.h diff --git a/src/glsl/nir/nir_types.h b/src/glsl/nir/nir_types.h index 009a0fb9918..60d561b25ee 100644 --- a/src/glsl/nir/nir_types.h +++ b/src/glsl/nir/nir_types.h @@ -31,7 +31,7 @@ /* C wrapper around glsl_types.h */ -#include "../glsl_types.h" +#include "glsl_types.h" #ifdef __cplusplus extern "C" { diff --git a/src/mesa/Android.libmesa_dricore.mk b/src/mesa/Android.libmesa_dricore.mk index fef76c8582c..cd31e148222 100644 --- a/src/mesa/Android.libmesa_dricore.mk +++ b/src/mesa/Android.libmesa_dricore.mk @@ -60,6 +60,7 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/mesa/main \ $(MESA_TOP)/src/glsl \ + $(MESA_TOP)/src/glsl/nir \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary diff --git a/src/mesa/Android.libmesa_glsl_utils.mk b/src/mesa/Android.libmesa_glsl_utils.mk index ed620ac648c..9e150eaa3c0 100644 --- a/src/mesa/Android.libmesa_glsl_utils.mk +++ b/src/mesa/Android.libmesa_glsl_utils.mk @@ -37,6 +37,7 @@ LOCAL_MODULE := libmesa_glsl_utils LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/glsl \ + $(MESA_TOP)/src/glsl/nir \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary @@ -62,6 +63,7 @@ LOCAL_CFLAGS := -D_POSIX_C_SOURCE=199309L LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/glsl \ + $(MESA_TOP)/src/glsl/nir \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary diff --git a/src/mesa/Android.libmesa_st_mesa.mk b/src/mesa/Android.libmesa_st_mesa.mk index b4b7fd97722..427a35f4f6e 100644 --- a/src/mesa/Android.libmesa_st_mesa.mk +++ 
b/src/mesa/Android.libmesa_st_mesa.mk @@ -55,6 +55,7 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/mesa/main \ $(MESA_TOP)/src/glsl \ + $(MESA_TOP)/src/glsl/nir \ $(MESA_TOP)/src/gallium/auxiliary \ $(MESA_TOP)/src/gallium/include diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources index 13208b5e421..34fb4461985 100644 --- a/src/mesa/Makefile.sources +++ b/src/mesa/Makefile.sources @@ -620,6 +620,7 @@ INCLUDE_DIRS = \ -I$(top_srcdir)/include \ -I$(top_srcdir)/src \ -I$(top_srcdir)/src/glsl \ + -I$(top_srcdir)/src/glsl/nir \ -I$(top_builddir)/src/glsl \ -I$(top_builddir)/src/glsl/nir \ -I$(top_srcdir)/src/glsl/glcpp \ diff --git a/src/mesa/SConscript b/src/mesa/SConscript index 5b80a216fef..c986326d2bf 100644 --- a/src/mesa/SConscript +++ b/src/mesa/SConscript @@ -16,6 +16,7 @@ env.Append(CPPPATH = [ '#/src', '#/src/mapi', '#/src/glsl', + '#/src/glsl/nir', '#/src/mesa', '#/src/gallium/include', '#/src/gallium/auxiliary', diff --git a/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp b/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp index 33571292007..33d2048e657 100644 --- a/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp +++ b/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp @@ -30,7 +30,7 @@ * \author Eric Anholt */ -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir.h" #include "program/prog_instruction.h" /* For WRITEMASK_* */ diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 01a7c99a4a6..a2fd4411d38 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -47,7 +47,7 @@ #include "brw_dead_control_flow.h" #include "main/uniforms.h" #include "brw_fs_live_variables.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "program/sampler.h" using namespace brw; diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index e8b511f9ce6..29a009ed406 100644 --- 
a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -48,7 +48,7 @@ extern "C" { #include "brw_wm.h" #include "intel_asm_annotation.h" } -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir.h" #include "glsl/nir/nir.h" #include "program/sampler.h" diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp index 277b6cc3a60..a13d001291c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp @@ -45,7 +45,7 @@ #include "brw_wm.h" #include "glsl/ir.h" #include "glsl/ir_expression_flattening.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" class ir_channel_expressions_visitor : public ir_hierarchical_visitor { public: diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index c3a037be4b1..36388fad98d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -27,7 +27,7 @@ #include "brw_fs.h" #include "brw_cfg.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir_optimization.h" using namespace brw; diff --git a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp index 6000e35b9b9..cab5af318a2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp @@ -42,7 +42,7 @@ #include "glsl/ir.h" #include "glsl/ir_visitor.h" #include "glsl/ir_rvalue_visitor.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "util/hash_table.h" static bool debug = false; diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 8aee2c087f7..eac1ec0c932 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -43,7 +43,7 @@ #include "brw_vec4.h" #include "brw_fs.h" #include "main/uniforms.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir_optimization.h" #include "program/sampler.h" diff --git a/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp b/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp index 8c59b9e415b..4219d471def 100644 --- a/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp +++ b/src/mesa/drivers/dri/i965/brw_lower_unnormalized_offset.cpp @@ -31,7 +31,7 @@ * \author Chris Forbes */ -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir.h" #include "glsl/ir_builder.h" diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 4e43e5ccdbd..b710c60148c 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -29,7 +29,7 @@ #include "brw_vec4.h" #include "brw_cfg.h" #include "brw_shader.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir_optimization.h" using namespace brw; diff --git a/src/mesa/drivers/x11/SConscript b/src/mesa/drivers/x11/SConscript index d29f9874f44..aa1e73a9d4a 100644 --- a/src/mesa/drivers/x11/SConscript +++ b/src/mesa/drivers/x11/SConscript @@ -4,6 +4,8 @@ env = env.Clone() env.Append(CPPPATH = [ '#/src', + '#/src/glsl', + '#/src/glsl/nir', '#/src/mapi', '#/src/mesa', '#/src/mesa/main', @@ -31,6 +33,7 @@ sources = [ 'xm_dd.c', 'xm_line.c', 'xm_tri.c', + '../../../glsl/nir/glsl_types.cpp', ] # Disallow undefined symbols diff --git a/src/mesa/main/ff_fragment_shader.cpp b/src/mesa/main/ff_fragment_shader.cpp index aad726689cc..e63d0f1ec55 100644 --- a/src/mesa/main/ff_fragment_shader.cpp +++ b/src/mesa/main/ff_fragment_shader.cpp @@ -40,7 +40,7 @@ #include "glsl/ir_optimization.h" #include "glsl/glsl_parser_extras.h" 
#include "glsl/glsl_symbol_table.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "program/ir_to_mesa.h" #include "program/program.h" #include "program/programopt.h" diff --git a/src/mesa/main/uniforms.h b/src/mesa/main/uniforms.h index bec035cdc97..2f88b65043d 100644 --- a/src/mesa/main/uniforms.h +++ b/src/mesa/main/uniforms.h @@ -27,7 +27,7 @@ #define UNIFORMS_H #include "main/glheader.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir_uniform.h" #include "program/prog_parameter.h" diff --git a/src/mesa/program/Android.mk b/src/mesa/program/Android.mk index ccb0fa5f32b..cc67f8aeadd 100644 --- a/src/mesa/program/Android.mk +++ b/src/mesa/program/Android.mk @@ -75,6 +75,7 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/mesa \ $(MESA_TOP)/src/glsl \ + $(MESA_TOP)/src/glsl/nir \ $(MESA_TOP)/src/gallium/auxiliary \ $(MESA_TOP)/src/gallium/include diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp index e81f459da28..61a5064d074 100644 --- a/src/mesa/program/ir_to_mesa.cpp +++ b/src/mesa/program/ir_to_mesa.cpp @@ -42,7 +42,7 @@ #include "glsl/ir_optimization.h" #include "glsl/ir_uniform.h" #include "glsl/glsl_parser_extras.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/linker.h" #include "glsl/program.h" #include "program/hash_table.h" diff --git a/src/mesa/program/sampler.cpp b/src/mesa/program/sampler.cpp index 1198a3c45f1..84e2504baba 100644 --- a/src/mesa/program/sampler.cpp +++ b/src/mesa/program/sampler.cpp @@ -24,7 +24,7 @@ */ #include "main/mtypes.h" -#include "glsl/glsl_types.h" +#include "glsl/nir/glsl_types.h" #include "glsl/ir.h" #include "glsl/ir_uniform.h" #include "glsl/ir_visitor.h" From a6a6a71092ba912803ae2b47eb56e3afdf36feb5 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 10 Oct 2015 14:13:50 -0400 Subject: [PATCH 205/270] glsl: (mostly) remove libglsl_util Now that NIR does not depend on glsl, we can 
(mostly[*]) get rid of the libglsl_util hack. [*] glsl_compiler is the one remaining user of libglsl_util Reviewed-by: Jason Ekstrand Reviewed-by: Emil Velikov Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/Makefile.am | 3 +-- src/gallium/targets/d3dadapter9/Makefile.am | 1 - src/gallium/targets/pipe-loader/Makefile.am | 1 - src/gallium/targets/xa/Makefile.am | 1 - src/glsl/Makefile.am | 6 ------ 5 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am index dff95ba5270..3de8e0fd5ad 100644 --- a/src/gallium/drivers/freedreno/Makefile.am +++ b/src/gallium/drivers/freedreno/Makefile.am @@ -19,7 +19,7 @@ libfreedreno_la_SOURCES = \ noinst_PROGRAMS = ir3_compiler -# XXX: Required due to the C++ sources in libnir/libglsl_util +# XXX: Required due to the C++ sources in libnir nodist_EXTRA_ir3_compiler_SOURCES = dummy.cpp ir3_compiler_SOURCES = \ ir3/ir3_cmdline.c @@ -28,7 +28,6 @@ ir3_compiler_LDADD = \ libfreedreno.la \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/glsl/libnir.la \ - $(top_builddir)/src/libglsl_util.la \ $(top_builddir)/src/util/libmesautil.la \ $(GALLIUM_COMMON_LIB_DEPS) \ $(FREEDRENO_LIBS) diff --git a/src/gallium/targets/d3dadapter9/Makefile.am b/src/gallium/targets/d3dadapter9/Makefile.am index e26ca33a521..b5221472ef0 100644 --- a/src/gallium/targets/d3dadapter9/Makefile.am +++ b/src/gallium/targets/d3dadapter9/Makefile.am @@ -76,7 +76,6 @@ d3dadapter9_la_LIBADD = \ $(top_builddir)/src/gallium/auxiliary/libgalliumvl_stub.la \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/glsl/libnir.la \ - $(top_builddir)/src/libglsl_util.la \ $(top_builddir)/src/gallium/state_trackers/nine/libninetracker.la \ $(top_builddir)/src/util/libmesautil.la \ $(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \ diff --git a/src/gallium/targets/pipe-loader/Makefile.am b/src/gallium/targets/pipe-loader/Makefile.am 
index 4d9f7be2ec9..4f25b4f6073 100644 --- a/src/gallium/targets/pipe-loader/Makefile.am +++ b/src/gallium/targets/pipe-loader/Makefile.am @@ -53,7 +53,6 @@ endif PIPE_LIBS += \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/glsl/libnir.la \ - $(top_builddir)/src/libglsl_util.la \ $(top_builddir)/src/util/libmesautil.la \ $(top_builddir)/src/gallium/drivers/rbug/librbug.la \ $(top_builddir)/src/gallium/drivers/trace/libtrace.la \ diff --git a/src/gallium/targets/xa/Makefile.am b/src/gallium/targets/xa/Makefile.am index 92173dedce3..02c42c665ed 100644 --- a/src/gallium/targets/xa/Makefile.am +++ b/src/gallium/targets/xa/Makefile.am @@ -38,7 +38,6 @@ libxatracker_la_LIBADD = \ $(top_builddir)/src/gallium/auxiliary/libgalliumvl_stub.la \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/glsl/libnir.la \ - $(top_builddir)/src/libglsl_util.la \ $(top_builddir)/src/util/libmesautil.la \ $(LIBDRM_LIBS) \ $(GALLIUM_COMMON_LIB_DEPS) diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am index 437c6a5fbcd..ebea816b96d 100644 --- a/src/glsl/Makefile.am +++ b/src/glsl/Makefile.am @@ -96,7 +96,6 @@ tests_general_ir_test_CFLAGS = \ tests_general_ir_test_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ $(top_builddir)/src/glsl/libglsl.la \ - $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) tests_uniform_initializer_test_SOURCES = \ @@ -109,7 +108,6 @@ tests_uniform_initializer_test_CFLAGS = \ tests_uniform_initializer_test_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ $(top_builddir)/src/glsl/libglsl.la \ - $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) tests_sampler_types_test_SOURCES = \ @@ -119,7 +117,6 @@ tests_sampler_types_test_CFLAGS = \ tests_sampler_types_test_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ $(top_builddir)/src/glsl/libglsl.la \ - $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) libglcpp_la_LIBADD = \ @@ -134,7 +131,6 @@ glcpp_glcpp_SOURCES = \ glcpp/glcpp.c glcpp_glcpp_LDADD = \ 
libglcpp.la \ - $(top_builddir)/src/libglsl_util.la \ -lm libglsl_la_LIBADD = libglcpp.la @@ -168,7 +164,6 @@ glsl_test_SOURCES = \ glsl_test_LDADD = \ libglsl.la \ - $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) # We write our own rules for yacc and lex below. We'd rather use automake, @@ -272,6 +267,5 @@ nir_tests_control_flow_tests_CFLAGS = \ nir_tests_control_flow_tests_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ $(top_builddir)/src/glsl/libnir.la \ - $(top_builddir)/src/libglsl_util.la \ $(top_builddir)/src/util/libmesautil.la \ $(PTHREAD_LIBS) From c188235d1be7d4fc65d99d2620d0dcdec5a21c84 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Sun, 11 Oct 2015 11:23:54 +0200 Subject: [PATCH 206/270] nvc0: add support for performance monitoring metrics on Fermi As explained in the CUDA toolkit documentation, "a metric is a characteristic of an application that is calculated from one or more event values." Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/Makefile.sources | 2 + .../drivers/nouveau/nvc0/nvc0_query_hw.c | 19 +- .../nouveau/nvc0/nvc0_query_hw_metric.c | 440 ++++++++++++++++++ .../nouveau/nvc0/nvc0_query_hw_metric.h | 42 ++ 4 files changed, 500 insertions(+), 3 deletions(-) create mode 100644 src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c create mode 100644 src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources index edc6cf41885..c18e9f5b435 100644 --- a/src/gallium/drivers/nouveau/Makefile.sources +++ b/src/gallium/drivers/nouveau/Makefile.sources @@ -154,6 +154,8 @@ NVC0_C_SOURCES := \ nvc0/nvc0_query.h \ nvc0/nvc0_query_hw.c \ nvc0/nvc0_query_hw.h \ + nvc0/nvc0_query_hw_metric.c \ + nvc0/nvc0_query_hw_metric.h \ nvc0/nvc0_query_hw_sm.c \ nvc0/nvc0_query_hw_sm.h \ nvc0/nvc0_query_sw.c \ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c 
b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c index 91254bedf1e..90ee82f21e5 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -25,6 +25,7 @@ #include "nvc0/nvc0_context.h" #include "nvc0/nvc0_query_hw.h" +#include "nvc0/nvc0_query_hw_metric.h" #include "nvc0/nvc0_query_hw_sm.h" #define NVC0_HW_QUERY_STATE_READY 0 @@ -371,6 +372,12 @@ nvc0_hw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index) return (struct nvc0_query *)hq; } + hq = nvc0_hw_metric_create_query(nvc0, type); + if (hq) { + hq->base.funcs = &hw_query_funcs; + return (struct nvc0_query *)hq; + } + hq = CALLOC_STRUCT(nvc0_hw_query); if (!hq) return NULL; @@ -435,14 +442,20 @@ int nvc0_hw_get_driver_query_info(struct nvc0_screen *screen, unsigned id, struct pipe_driver_query_info *info) { - int num_hw_sm_queries = 0; + int num_hw_sm_queries = 0, num_hw_metric_queries = 0; num_hw_sm_queries = nvc0_hw_sm_get_driver_query_info(screen, 0, NULL); + num_hw_metric_queries = + nvc0_hw_metric_get_driver_query_info(screen, 0, NULL); if (!info) - return num_hw_sm_queries; + return num_hw_sm_queries + num_hw_metric_queries; - return nvc0_hw_sm_get_driver_query_info(screen, id, info); + if (id < num_hw_sm_queries) + return nvc0_hw_sm_get_driver_query_info(screen, id, info); + + return nvc0_hw_metric_get_driver_query_info(screen, + id - num_hw_sm_queries, info); } void diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c new file mode 100644 index 00000000000..25aa09be42a --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c @@ -0,0 +1,440 @@ +/* + * Copyright 2015 Samuel Pitoiset + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to 
use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "nvc0/nvc0_context.h" +#include "nvc0/nvc0_query_hw_metric.h" +#include "nvc0/nvc0_query_hw_sm.h" + +/* === PERFORMANCE MONITORING METRICS for NVC0:NVE4 === */ +static const char *nvc0_hw_metric_names[] = +{ + "metric-achieved_occupancy", + "metric-branch_efficiency", + "metric-inst_issued", + "metric-inst_per_wrap", + "metric-inst_replay_overhead", + "metric-issued_ipc", + "metric-issue_slots", + "metric-issue_slot_utilization", + "metric-ipc", +}; + +struct nvc0_hw_metric_query_cfg { + uint32_t queries[8]; + uint32_t num_queries; +}; + +#define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n) +#define _M(n, c) [NVC0_HW_METRIC_QUERY_##n] = c + +/* ==== Compute capability 2.0 (GF100/GF110) ==== */ +static const struct nvc0_hw_metric_query_cfg +sm20_achieved_occupancy = +{ + .queries[0] = _SM(ACTIVE_WARPS), + .queries[1] = _SM(ACTIVE_CYCLES), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_branch_efficiency = +{ + .queries[0] = _SM(BRANCH), + .queries[1] = _SM(DIVERGENT_BRANCH), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_inst_per_wrap = +{ + .queries[0] = 
_SM(INST_EXECUTED), + .queries[1] = _SM(WARPS_LAUNCHED), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_inst_replay_overhead = +{ + .queries[0] = _SM(INST_ISSUED), + .queries[1] = _SM(INST_EXECUTED), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_issued_ipc = +{ + .queries[0] = _SM(INST_ISSUED), + .queries[1] = _SM(ACTIVE_CYCLES), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_ipc = +{ + .queries[0] = _SM(INST_EXECUTED), + .queries[1] = _SM(ACTIVE_CYCLES), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] = +{ + _M(ACHIEVED_OCCUPANCY, &sm20_achieved_occupancy), + _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency), + _M(INST_ISSUED, NULL), + _M(INST_PER_WRAP, &sm20_inst_per_wrap), + _M(INST_REPLAY_OVERHEAD, &sm20_inst_replay_overhead), + _M(ISSUED_IPC, &sm20_issued_ipc), + _M(ISSUE_SLOTS, NULL), + _M(ISSUE_SLOT_UTILIZATION, &sm20_issued_ipc), + _M(IPC, &sm20_ipc), +}; + +/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ +static const struct nvc0_hw_metric_query_cfg +sm21_inst_issued = +{ + .queries[0] = _SM(INST_ISSUED1_0), + .queries[1] = _SM(INST_ISSUED1_1), + .queries[2] = _SM(INST_ISSUED2_0), + .queries[3] = _SM(INST_ISSUED2_1), + .num_queries = 4, +}; + +static const struct nvc0_hw_metric_query_cfg +sm21_inst_replay_overhead = +{ + .queries[0] = _SM(INST_ISSUED1_0), + .queries[1] = _SM(INST_ISSUED1_1), + .queries[2] = _SM(INST_ISSUED2_0), + .queries[3] = _SM(INST_ISSUED2_1), + .queries[4] = _SM(INST_EXECUTED), + .num_queries = 5, +}; + +static const struct nvc0_hw_metric_query_cfg +sm21_issued_ipc = +{ + .queries[0] = _SM(INST_ISSUED1_0), + .queries[1] = _SM(INST_ISSUED1_1), + .queries[2] = _SM(INST_ISSUED2_0), + .queries[3] = _SM(INST_ISSUED2_1), + .queries[4] = _SM(ACTIVE_CYCLES), + .num_queries = 5, +}; + +static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] = +{ + _M(ACHIEVED_OCCUPANCY, 
&sm20_achieved_occupancy), + _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency), + _M(INST_ISSUED, &sm21_inst_issued), + _M(INST_PER_WRAP, &sm20_inst_per_wrap), + _M(INST_REPLAY_OVERHEAD, &sm21_inst_replay_overhead), + _M(ISSUED_IPC, &sm21_issued_ipc), + _M(ISSUE_SLOTS, &sm21_inst_issued), + _M(ISSUE_SLOT_UTILIZATION, &sm21_issued_ipc), + _M(IPC, &sm20_ipc), +}; + +#undef _SM +#undef _M + +static inline const struct nvc0_hw_metric_query_cfg ** +nvc0_hw_metric_get_queries(struct nvc0_screen *screen) +{ + struct nouveau_device *dev = screen->base.device; + + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return sm20_hw_metric_queries; + return sm21_hw_metric_queries; +} + +static const struct nvc0_hw_metric_query_cfg * +nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, + struct nvc0_hw_query *hq) +{ + const struct nvc0_hw_metric_query_cfg **queries; + struct nvc0_screen *screen = nvc0->screen; + struct nvc0_query *q = &hq->base; + + queries = nvc0_hw_metric_get_queries(screen); + return queries[q->type - NVC0_HW_METRIC_QUERY(0)]; +} + +static void +nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0, + struct nvc0_hw_query *hq) +{ + struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq); + unsigned i; + + for (i = 0; i < hmq->num_queries; i++) + hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]); + FREE(hmq); +} + +static boolean +nvc0_hw_metric_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq); + boolean ret = false; + unsigned i; + + for (i = 0; i < hmq->num_queries; i++) { + ret = hmq->queries[i]->funcs->begin_query(nvc0, hmq->queries[i]); + if (!ret) + return ret; + } + return ret; +} + +static void +nvc0_hw_metric_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) +{ + struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq); + unsigned i; + + for (i = 0; i < hmq->num_queries; i++) + hmq->queries[i]->funcs->end_query(nvc0, hmq->queries[i]); +} 
+ +static uint64_t +sm20_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) +{ + switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) { + case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: + /* (active_warps / active_cycles) / max. number of warps on a MP */ + if (res64[1]) + return (res64[0] / (double)res64[1]) / 48; + break; + case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY: + /* (branch / (branch + divergent_branch)) * 100 */ + if (res64[0] + res64[1]) + return (res64[0] / (double)(res64[0] + res64[1])) * 100; + break; + case NVC0_HW_METRIC_QUERY_INST_PER_WRAP: + /* inst_executed / warps_launched */ + if (res64[1]) + return res64[0] / (double)res64[1]; + break; + case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD: + /* (inst_issued - inst_executed) / inst_executed */ + if (res64[1]) + return (res64[0] - res64[1]) / (double)res64[1]; + break; + case NVC0_HW_METRIC_QUERY_ISSUED_IPC: + /* inst_issued / active_cycles */ + if (res64[1]) + return res64[0] / (double)res64[1]; + break; + case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION: + /* ((inst_issued / 2) / active_cycles) * 100 */ + if (res64[1]) + return ((res64[0] / 2) / (double)res64[1]) * 100; + break; + case NVC0_HW_METRIC_QUERY_IPC: + /* inst_executed / active_cycles */ + if (res64[1]) + return res64[0] / (double)res64[1]; + break; + default: + debug_printf("invalid metric type: %d\n", + hq->base.type - NVC0_HW_METRIC_QUERY(0)); + break; + } + return 0; +} + +static uint64_t +sm21_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) +{ + switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) { + case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: + return sm20_hw_metric_calc_result(hq, res64); + case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY: + return sm20_hw_metric_calc_result(hq, res64); + case NVC0_HW_METRIC_QUERY_INST_ISSUED: + /* issued1_0 + issued1_1 + (issued2_0 + issued2_1) * 2 */ + return res64[0] + res64[1] + (res64[2] + res64[3]) * 2; + break; + case NVC0_HW_METRIC_QUERY_INST_PER_WRAP: + return 
sm20_hw_metric_calc_result(hq, res64); + case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD: + /* (metric-inst_issued - inst_executed) / inst_executed */ + if (res64[4]) + return (((res64[0] + res64[1] + (res64[2] + res64[3]) * 2) - + res64[4]) / (double)res64[4]); + break; + case NVC0_HW_METRIC_QUERY_ISSUED_IPC: + /* metric-inst_issued / active_cycles */ + if (res64[4]) + return (res64[0] + res64[1] + (res64[2] + res64[3]) * 2) / + (double)res64[4]; + break; + case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS: + /* issued1_0 + issued1_1 + issued2_0 + issued2_1 */ + return res64[0] + res64[1] + res64[2] + res64[3]; + break; + case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION: + /* ((metric-issue_slots / 2) / active_cycles) * 100 */ + if (res64[4]) + return (((res64[0] + res64[1] + res64[2] + res64[3]) / 2) / + (double)res64[4]) * 100; + break; + case NVC0_HW_METRIC_QUERY_IPC: + return sm20_hw_metric_calc_result(hq, res64); + default: + debug_printf("invalid metric type: %d\n", + hq->base.type - NVC0_HW_METRIC_QUERY(0)); + break; + } + return 0; +} + +static boolean +nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0, + struct nvc0_hw_query *hq, boolean wait, + union pipe_query_result *result) +{ + struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq); + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_device *dev = screen->base.device; + union pipe_query_result results[8] = {}; + uint64_t res64[8] = {}; + uint64_t value = 0; + boolean ret = false; + unsigned i; + + for (i = 0; i < hmq->num_queries; i++) { + ret = hmq->queries[i]->funcs->get_query_result(nvc0, hmq->queries[i], + wait, &results[i]); + if (!ret) + return ret; + res64[i] = *(uint64_t *)&results[i]; + } + + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + value = sm20_hw_metric_calc_result(hq, res64); + else + value = sm21_hw_metric_calc_result(hq, res64); + + *(uint64_t *)result = value; + return ret; +} + +static const struct nvc0_hw_query_funcs hw_metric_query_funcs = { + .destroy_query = 
nvc0_hw_metric_destroy_query, + .begin_query = nvc0_hw_metric_begin_query, + .end_query = nvc0_hw_metric_end_query, + .get_query_result = nvc0_hw_metric_get_query_result, +}; + +struct nvc0_hw_query * +nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type) +{ + const struct nvc0_hw_metric_query_cfg *cfg; + struct nvc0_hw_metric_query *hmq; + struct nvc0_hw_query *hq; + unsigned i; + + if (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST) + return NULL; + + hmq = CALLOC_STRUCT(nvc0_hw_metric_query); + if (!hmq) + return NULL; + + hq = &hmq->base; + hq->funcs = &hw_metric_query_funcs; + hq->base.type = type; + + cfg = nvc0_hw_metric_query_get_cfg(nvc0, hq); + + for (i = 0; i < cfg->num_queries; i++) { + hmq->queries[i] = nvc0_hw_sm_create_query(nvc0, cfg->queries[i]); + if (!hmq->queries[i]) { + nvc0_hw_metric_destroy_query(nvc0, hq); + return NULL; + } + hmq->num_queries++; + } + + return hq; +} + +static int +nvc0_hw_metric_get_next_query_id(const struct nvc0_hw_metric_query_cfg **queries, + unsigned id) +{ + unsigned i, next = 0; + + for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) { + if (!queries[i]) { + next++; + } else + if (i >= id && queries[id + next]) { + break; + } + } + return id + next; +} + +int +nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id, + struct pipe_driver_query_info *info) +{ + uint16_t class_3d = screen->base.class_3d; + int count = 0; + + if (screen->base.device->drm_version >= 0x01000101) { + if (screen->compute) { + if (class_3d < NVE4_3D_CLASS) { + const struct nvc0_hw_metric_query_cfg **queries = + nvc0_hw_metric_get_queries(screen); + unsigned i; + + for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) { + if (queries[i]) + count++; + } + } + } + } + + if (!info) + return count; + + if (id < count) { + if (screen->compute) { + if (class_3d < NVE4_3D_CLASS) { + const struct nvc0_hw_metric_query_cfg **queries = + nvc0_hw_metric_get_queries(screen); + + id = 
nvc0_hw_metric_get_next_query_id(queries, id); + info->name = nvc0_hw_metric_names[id]; + info->query_type = NVC0_HW_METRIC_QUERY(id); + info->group_id = -1; + return 1; + } + } + } + return 0; +} diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h new file mode 100644 index 00000000000..95675fd19b7 --- /dev/null +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h @@ -0,0 +1,42 @@ +#ifndef __NVC0_QUERY_HW_METRIC_H__ +#define __NVC0_QUERY_HW_METRIC_H__ + +#include "nvc0_query_hw.h" + +struct nvc0_hw_metric_query { + struct nvc0_hw_query base; + struct nvc0_hw_query *queries[8]; + unsigned num_queries; +}; + +static inline struct nvc0_hw_metric_query * +nvc0_hw_metric_query(struct nvc0_hw_query *hq) +{ + return (struct nvc0_hw_metric_query *)hq; +} + +/* + * Driver metrics queries: + */ +#define NVC0_HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 3072 + (i)) +#define NVC0_HW_METRIC_QUERY_LAST NVC0_HW_METRIC_QUERY(NVC0_HW_METRIC_QUERY_COUNT - 1) +enum nvc0_hw_metric_queries +{ + NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY = 0, + NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY, + NVC0_HW_METRIC_QUERY_INST_ISSUED, + NVC0_HW_METRIC_QUERY_INST_PER_WRAP, + NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, + NVC0_HW_METRIC_QUERY_ISSUED_IPC, + NVC0_HW_METRIC_QUERY_ISSUE_SLOTS, + NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, + NVC0_HW_METRIC_QUERY_IPC, + NVC0_HW_METRIC_QUERY_COUNT +}; + +struct nvc0_hw_query * +nvc0_hw_metric_create_query(struct nvc0_context *, unsigned); +int +nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *, unsigned, + struct pipe_driver_query_info *); +#endif From b7963b6926c06666b24453344c47f12e369e8fd1 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 17 Oct 2015 09:28:23 -0400 Subject: [PATCH 207/270] build: fix out-of-tree build after b9b40ef commit b9b40ef9b7644ea24768bc8b7464b1719efe99bf Author: Rob Clark AuthorDate: Sat Oct 10 13:55:07 2015 -0400 nir: remove dependency on 
glsl broke things for i965 out of tree build. Signed-off-by: Rob Clark --- src/mesa/drivers/dri/i965/Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am index 2e241511049..04b3f9cc8ce 100644 --- a/src/mesa/drivers/dri/i965/Makefile.am +++ b/src/mesa/drivers/dri/i965/Makefile.am @@ -33,6 +33,7 @@ AM_CFLAGS = \ -I$(top_srcdir)/src/mesa/drivers/dri/common \ -I$(top_srcdir)/src/mesa/drivers/dri/intel/server \ -I$(top_srcdir)/src/gtest/include \ + -I$(top_srcdir)/src/glsl/nir \ -I$(top_builddir)/src/glsl/nir \ -I$(top_builddir)/src/mesa/drivers/dri/common \ $(DEFINES) \ From 7e6aafd6ab89ad8bc113f67eb666a90b979cdb8e Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 17 Oct 2015 09:49:19 -0400 Subject: [PATCH 208/270] build: fix make-check after a6a6a71 commit a6a6a71092ba912803ae2b47eb56e3afdf36feb5 Author: Rob Clark AuthorDate: Sat Oct 10 14:13:50 2015 -0400 glsl: (mostly) remove libglsl_util Was a bit too ambitious on removal of libglsl_util.. it is still needed by some of the tests. 
Signed-off-by: Rob Clark --- src/glsl/Makefile.am | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am index ebea816b96d..33a34e4ccc8 100644 --- a/src/glsl/Makefile.am +++ b/src/glsl/Makefile.am @@ -96,6 +96,7 @@ tests_general_ir_test_CFLAGS = \ tests_general_ir_test_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ $(top_builddir)/src/glsl/libglsl.la \ + $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) tests_uniform_initializer_test_SOURCES = \ @@ -108,6 +109,7 @@ tests_uniform_initializer_test_CFLAGS = \ tests_uniform_initializer_test_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ $(top_builddir)/src/glsl/libglsl.la \ + $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) tests_sampler_types_test_SOURCES = \ @@ -117,6 +119,7 @@ tests_sampler_types_test_CFLAGS = \ tests_sampler_types_test_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ $(top_builddir)/src/glsl/libglsl.la \ + $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) libglcpp_la_LIBADD = \ @@ -131,6 +134,7 @@ glcpp_glcpp_SOURCES = \ glcpp/glcpp.c glcpp_glcpp_LDADD = \ libglcpp.la \ + $(top_builddir)/src/libglsl_util.la \ -lm libglsl_la_LIBADD = libglcpp.la @@ -164,6 +168,7 @@ glsl_test_SOURCES = \ glsl_test_LDADD = \ libglsl.la \ + $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) # We write our own rules for yacc and lex below. We'd rather use automake, From 3272f632eec768c79000836e9bc45b90229789e7 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Sat, 17 Oct 2015 10:02:04 -0600 Subject: [PATCH 209/270] scons: fix MSVC, MinGW build Duplicate the glsl_types_hack.cpp work-around from the libgl-xlib target. 
--- src/gallium/targets/libgl-gdi/SConscript | 10 +++++++++- src/gallium/targets/libgl-gdi/glsl_types_hack.cpp | 3 +++ src/gallium/targets/osmesa/SConscript | 7 ++++++- src/gallium/targets/osmesa/glsl_types_hack.cpp | 3 +++ 4 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 src/gallium/targets/libgl-gdi/glsl_types_hack.cpp create mode 100644 src/gallium/targets/osmesa/glsl_types_hack.cpp diff --git a/src/gallium/targets/libgl-gdi/SConscript b/src/gallium/targets/libgl-gdi/SConscript index 594f34d7fad..eb777a86cb2 100644 --- a/src/gallium/targets/libgl-gdi/SConscript +++ b/src/gallium/targets/libgl-gdi/SConscript @@ -7,6 +7,10 @@ env = env.Clone() env.Append(CPPPATH = [ '#src', + '#src/mesa', + '#src/mapi', + '#src/glsl', + '#src/glsl/nir', '#src/gallium/state_trackers/wgl', '#src/gallium/winsys/sw', ]) @@ -20,7 +24,11 @@ env.Append(LIBS = [ env.Prepend(LIBS = [mesautil]) -sources = ['libgl_gdi.c'] +sources = [ + 'libgl_gdi.c', + 'glsl_types_hack.cpp' +] + drivers = [] if True: diff --git a/src/gallium/targets/libgl-gdi/glsl_types_hack.cpp b/src/gallium/targets/libgl-gdi/glsl_types_hack.cpp new file mode 100644 index 00000000000..5c042f23e3b --- /dev/null +++ b/src/gallium/targets/libgl-gdi/glsl_types_hack.cpp @@ -0,0 +1,3 @@ +/* errrg scons.. 
otherwise "scons: *** Two environments with different actions were specified for the same target: $mesa/build/linux-x86_64-debug/glsl/nir/glsl_types.os" */ +#include "glsl_types.cpp" + diff --git a/src/gallium/targets/osmesa/SConscript b/src/gallium/targets/osmesa/SConscript index 4a9115ba1cf..78930a98e03 100644 --- a/src/gallium/targets/osmesa/SConscript +++ b/src/gallium/targets/osmesa/SConscript @@ -5,6 +5,8 @@ env = env.Clone() env.Prepend(CPPPATH = [ '#src/mapi', '#src/mesa', + '#src/glsl', + '#src/glsl/nir', #Dir('../../../mapi'), # src/mapi build path for python-generated GL API files/headers ]) @@ -22,7 +24,10 @@ env.Prepend(LIBS = [ env.Append(CPPDEFINES = ['GALLIUM_TRACE', 'GALLIUM_SOFTPIPE']) -sources = ['target.c'] +sources = [ + 'target.c', + 'glsl_types_hack.cpp' +] if env['llvm']: env.Append(CPPDEFINES = 'GALLIUM_LLVMPIPE') diff --git a/src/gallium/targets/osmesa/glsl_types_hack.cpp b/src/gallium/targets/osmesa/glsl_types_hack.cpp new file mode 100644 index 00000000000..5c042f23e3b --- /dev/null +++ b/src/gallium/targets/osmesa/glsl_types_hack.cpp @@ -0,0 +1,3 @@ +/* errrg scons.. otherwise "scons: *** Two environments with different actions were specified for the same target: $mesa/build/linux-x86_64-debug/glsl/nir/glsl_types.os" */ +#include "glsl_types.cpp" + From 006fcc0da674ca18ebf07771e3c309997ab32798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 17 Oct 2015 19:05:46 +0200 Subject: [PATCH 210/270] gallium/hud: fix possible NULL pointer dereference Trivial. 
--- src/gallium/auxiliary/hud/hud_context.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c index 95eed2698bc..ffe30b8fa79 100644 --- a/src/gallium/auxiliary/hud/hud_context.c +++ b/src/gallium/auxiliary/hud/hud_context.c @@ -987,6 +987,9 @@ hud_parse_env_var(struct hud_context *hud, const char *env) case ',': env++; + if (!pane) + break; + y += height + hud->font.glyph_height * (pane->num_graphs + 2); height = 100; From 3c6156a4a7b647cc55cbe3a4c13d53b5ffe505e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 17 Oct 2015 14:20:01 +0200 Subject: [PATCH 211/270] st/mesa: fix clip state dependencies This allows removing FLUSH_VERTICES in MatrixMode. Cc: mesa-stable@lists.freedesktop.org Reviewed-by: Brian Paul --- src/mesa/state_tracker/st_atom_clip.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mesa/state_tracker/st_atom_clip.c b/src/mesa/state_tracker/st_atom_clip.c index 506a770499f..b820d843385 100644 --- a/src/mesa/state_tracker/st_atom_clip.c +++ b/src/mesa/state_tracker/st_atom_clip.c @@ -56,6 +56,9 @@ static void update_clip( struct st_context *st ) use_eye = TRUE; } + /* _ClipUserPlane = _NEW_TRANSFORM | _NEW_PROJECTION + * EyeUserPlane = _NEW_TRANSFORM + */ memcpy(clip.ucp, use_eye ? 
ctx->Transform.EyeUserPlane : ctx->Transform._ClipUserPlane, sizeof(clip.ucp)); @@ -70,7 +73,7 @@ static void update_clip( struct st_context *st ) const struct st_tracked_state st_update_clip = { "st_update_clip", /* name */ { /* dirty */ - _NEW_TRANSFORM, /* mesa */ + _NEW_TRANSFORM | _NEW_PROJECTION, /* mesa */ ST_NEW_VERTEX_PROGRAM, /* st */ }, update_clip /* update */ From 8c5647db5e7ade454745caf97ac7c04f64b08c79 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Wed, 14 Oct 2015 09:08:50 -0600 Subject: [PATCH 212/270] mesa: remove FLUSH_VERTICES() in _mesa_MatrixMode() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changing the matrix mode alone has no effect on rendering and does not need to trigger a flush or state validation. Reviewed-by: Eric Anholt Signed-off-by: Marek Olšák --- src/mesa/main/matrix.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mesa/main/matrix.c b/src/mesa/main/matrix.c index 2b8016a4a72..5ff5ac5bfe1 100644 --- a/src/mesa/main/matrix.c +++ b/src/mesa/main/matrix.c @@ -151,7 +151,6 @@ _mesa_MatrixMode( GLenum mode ) if (ctx->Transform.MatrixMode == mode && mode != GL_TEXTURE) return; - FLUSH_VERTICES(ctx, _NEW_TRANSFORM); switch (mode) { case GL_MODELVIEW: From 82335978bb3a68f1acf200bdfb683db4d8dd3cea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 9 Oct 2015 01:11:31 +0200 Subject: [PATCH 213/270] tgsi: move pipe_shader_from_tgsi_processor function to util MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Michel Dänzer --- src/gallium/auxiliary/tgsi/tgsi_ureg.c | 26 ++------------------------ src/gallium/auxiliary/util/u_inlines.h | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c index 3d213195090..f2f518130fb 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c +++ 
b/src/gallium/auxiliary/tgsi/tgsi_ureg.c @@ -35,6 +35,7 @@ #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_sanity.h" #include "util/u_debug.h" +#include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_bitmask.h" @@ -1830,29 +1831,6 @@ void ureg_free_tokens( const struct tgsi_token *tokens ) } -static inline unsigned -pipe_shader_from_tgsi_processor(unsigned processor) -{ - switch (processor) { - case TGSI_PROCESSOR_VERTEX: - return PIPE_SHADER_VERTEX; - case TGSI_PROCESSOR_TESS_CTRL: - return PIPE_SHADER_TESS_CTRL; - case TGSI_PROCESSOR_TESS_EVAL: - return PIPE_SHADER_TESS_EVAL; - case TGSI_PROCESSOR_GEOMETRY: - return PIPE_SHADER_GEOMETRY; - case TGSI_PROCESSOR_FRAGMENT: - return PIPE_SHADER_FRAGMENT; - case TGSI_PROCESSOR_COMPUTE: - return PIPE_SHADER_COMPUTE; - default: - assert(0); - return PIPE_SHADER_VERTEX; - } -} - - struct ureg_program * ureg_create(unsigned processor) { @@ -1872,7 +1850,7 @@ ureg_create_with_screen(unsigned processor, struct pipe_screen *screen) ureg->supports_any_inout_decl_range = screen && screen->get_shader_param(screen, - pipe_shader_from_tgsi_processor(processor), + util_pipe_shader_from_tgsi_processor(processor), PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE) != 0; for (i = 0; i < Elements(ureg->properties); i++) diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h index bb99a02ce49..384e267b593 100644 --- a/src/gallium/auxiliary/util/u_inlines.h +++ b/src/gallium/auxiliary/util/u_inlines.h @@ -651,6 +651,28 @@ util_max_layer(const struct pipe_resource *r, unsigned level) } } +static inline unsigned +util_pipe_shader_from_tgsi_processor(unsigned processor) +{ + switch (processor) { + case TGSI_PROCESSOR_VERTEX: + return PIPE_SHADER_VERTEX; + case TGSI_PROCESSOR_TESS_CTRL: + return PIPE_SHADER_TESS_CTRL; + case TGSI_PROCESSOR_TESS_EVAL: + return PIPE_SHADER_TESS_EVAL; + case TGSI_PROCESSOR_GEOMETRY: + return PIPE_SHADER_GEOMETRY; + case 
TGSI_PROCESSOR_FRAGMENT: + return PIPE_SHADER_FRAGMENT; + case TGSI_PROCESSOR_COMPUTE: + return PIPE_SHADER_COMPUTE; + default: + assert(0); + return PIPE_SHADER_VERTEX; + } +} + #ifdef __cplusplus } #endif From 73e3fba3356a58dadf46f2cc5d68d8eda824fccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 7 Oct 2015 01:28:18 +0200 Subject: [PATCH 214/270] radeonsi: clean up si_llvm_init_export_args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeonsi/si_shader.c | 77 +++++++++++------------- 1 file changed, 35 insertions(+), 42 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 32a702fcdf5..4b9f6912b73 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1306,6 +1306,23 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, unsigned compressed = 0; unsigned chan; + /* XXX: This controls which components of the output + * registers actually get exported. (e.g bit 0 means export + * X component, bit 1 means export Y component, etc.) I'm + * hard coding this to 0xf for now. In the future, we might + * want to do something else. + */ + args[0] = lp_build_const_int32(base->gallivm, 0xf); + + /* Specify whether the EXEC mask represents the valid mask */ + args[1] = uint->zero; + + /* Specify whether this is the last export */ + args[2] = uint->zero; + + /* Specify the target we are exporting */ + args[3] = lp_build_const_int32(base->gallivm, target); + if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) { int cbuf = target - V_008DFC_SQ_EXP_MRT; @@ -1323,55 +1340,31 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, } } + /* Set COMPR flag */ + args[4] = compressed ? 
uint->one : uint->zero; + if (compressed) { /* Pixel shader needs to pack output values before export */ - for (chan = 0; chan < 2; chan++ ) { - args[0] = values[2 * chan]; - args[1] = values[2 * chan + 1]; - args[chan + 5] = - lp_build_intrinsic(base->gallivm->builder, - "llvm.SI.packf16", - LLVMInt32TypeInContext(base->gallivm->context), - args, 2, - LLVMReadNoneAttribute | LLVMNoUnwindAttribute); + for (chan = 0; chan < 2; chan++) { + LLVMValueRef pack_args[2] = { + values[2 * chan], + values[2 * chan + 1] + }; + LLVMValueRef packed; + + packed = lp_build_intrinsic(base->gallivm->builder, + "llvm.SI.packf16", + LLVMInt32TypeInContext(base->gallivm->context), + pack_args, 2, + LLVMReadNoneAttribute | LLVMNoUnwindAttribute); args[chan + 7] = args[chan + 5] = LLVMBuildBitCast(base->gallivm->builder, - args[chan + 5], + packed, LLVMFloatTypeInContext(base->gallivm->context), ""); } - - /* Set COMPR flag */ - args[4] = uint->one; - } else { - for (chan = 0; chan < 4; chan++ ) - /* +5 because the first output value will be - * the 6th argument to the intrinsic. */ - args[chan + 5] = values[chan]; - - /* Clear COMPR flag */ - args[4] = uint->zero; - } - - /* XXX: This controls which components of the output - * registers actually get exported. (e.g bit 0 means export - * X component, bit 1 means export Y component, etc.) I'm - * hard coding this to 0xf for now. In the future, we might - * want to do something else. */ - args[0] = lp_build_const_int32(base->gallivm, 0xf); - - /* Specify whether the EXEC mask represents the valid mask */ - args[1] = uint->zero; - - /* Specify whether this is the last export */ - args[2] = uint->zero; - - /* Specify the target we are exporting */ - args[3] = lp_build_const_int32(base->gallivm, target); - - /* XXX: We probably need to keep track of the output - * values, so we know what we are passing to the next - * stage. 
*/ + } else + memcpy(&args[5], values, sizeof(values[0]) * 4); } /* Load from output pointers and initialize arguments for the shader export intrinsic */ From b11edf887236b53b489f5df14152ac651b0b3857 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 8 Oct 2015 22:23:18 +0200 Subject: [PATCH 215/270] radeonsi: disable NaNs for LS and HS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit They're disabled for all other shaders except compute, but I forgot to do this for tess stages. Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index f673388b121..24891018679 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -122,7 +122,8 @@ static void si_shader_ls(struct si_shader *shader) shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) | S_00B528_SGPRS((num_sgprs - 1) / 8) | - S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt); + S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) | + S_00B528_DX10_CLAMP(shader->dx10_clamp_mode); shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) | S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0); } @@ -154,7 +155,8 @@ static void si_shader_hs(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40); si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, S_00B428_VGPRS((shader->num_vgprs - 1) / 4) | - S_00B428_SGPRS((num_sgprs - 1) / 8)); + S_00B428_SGPRS((num_sgprs - 1) / 8) | + S_00B428_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, S_00B42C_USER_SGPR(num_user_sgprs) | S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); From 4f4f477d6dd8ba5e0d37dbba50d28c1a85accf39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: 
Fri, 9 Oct 2015 00:20:30 +0200 Subject: [PATCH 216/270] radeonsi: print export_prim_id from the shader key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeonsi/si_shader.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 4b9f6912b73..fa1529056b1 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3975,6 +3975,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) key->vs.es_enabled_outputs); fprintf(f, " as_es = %u\n", key->vs.as_es); fprintf(f, " as_ls = %u\n", key->vs.as_ls); + fprintf(f, " export_prim_id = %u\n", key->vs.export_prim_id); break; case PIPE_SHADER_TESS_CTRL: @@ -3986,6 +3987,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) fprintf(f, " es_enabled_outputs = 0x%"PRIx64"\n", key->tes.es_enabled_outputs); fprintf(f, " as_es = %u\n", key->tes.as_es); + fprintf(f, " export_prim_id = %u\n", key->tes.export_prim_id); break; case PIPE_SHADER_GEOMETRY: From c4f086f3999894c9b6fe2de466add3bb09c8b354 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 9 Oct 2015 00:49:13 +0200 Subject: [PATCH 217/270] radeonsi: remove an unused ctx parameter in si_shader_destroy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeonsi/si_compute.c | 4 ++-- src/gallium/drivers/radeonsi/si_shader.c | 4 ++-- src/gallium/drivers/radeonsi/si_shader.h | 2 +- src/gallium/drivers/radeonsi/si_state_shaders.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index c6605346771..697e60a50d9 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ 
b/src/gallium/drivers/radeonsi/si_compute.c @@ -469,7 +469,7 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){ if (program->kernels) { for (int i = 0; i < program->num_kernels; i++){ if (program->kernels[i].bo){ - si_shader_destroy(ctx, &program->kernels[i]); + si_shader_destroy(&program->kernels[i]); } } FREE(program->kernels); @@ -482,7 +482,7 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){ FREE(program->shader.binary.config); FREE(program->shader.binary.rodata); FREE(program->shader.binary.global_symbol_offsets); - si_shader_destroy(ctx, &program->shader); + si_shader_destroy(&program->shader); #endif pipe_resource_reference( diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index fa1529056b1..085794ba3b3 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -4191,10 +4191,10 @@ out: return r; } -void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader) +void si_shader_destroy(struct si_shader *shader) { if (shader->gs_copy_shader) - si_shader_destroy(ctx, shader->gs_copy_shader); + si_shader_destroy(shader->gs_copy_shader); if (shader->scratch_bo) r600_resource_reference(&shader->scratch_bo, NULL); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b92fa02a171..cccc4607977 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -324,7 +324,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f); int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, LLVMTargetMachineRef tm, LLVMModuleRef mod); -void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader); +void si_shader_destroy(struct si_shader *shader); unsigned si_shader_io_get_unique_index(unsigned 
semantic_name, unsigned index); int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader); int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 24891018679..9d05cb5aad6 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -942,7 +942,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, break; } - si_shader_destroy(ctx, p); + si_shader_destroy(p); free(p); p = c; } From aa060e276c203baf4691d4a4722accd5bdbb8526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 9 Oct 2015 00:54:17 +0200 Subject: [PATCH 218/270] radeonsi: fix a GS copy shader leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: mesa-stable@lists.freedesktop.org Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeonsi/si_shader.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 085794ba3b3..49ab9404b81 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -4193,8 +4193,10 @@ out: void si_shader_destroy(struct si_shader *shader) { - if (shader->gs_copy_shader) + if (shader->gs_copy_shader) { si_shader_destroy(shader->gs_copy_shader); + FREE(shader->gs_copy_shader); + } if (shader->scratch_bo) r600_resource_reference(&shader->scratch_bo, NULL); From b0167809f1e88f3676db78b1c1934aea35e55be5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 9 Oct 2015 01:08:42 +0200 Subject: [PATCH 219/270] radeonsi: unify shader delete functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Michel Dänzer --- .../drivers/radeonsi/si_state_shaders.c | 84 
++++--------------- 1 file changed, 17 insertions(+), 67 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 9d05cb5aad6..cc053bb6ef2 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -907,11 +907,21 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->cb_target_mask); } -static void si_delete_shader_selector(struct pipe_context *ctx, - struct si_shader_selector *sel) +static void si_delete_shader_selector(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)state; struct si_shader *p = sel->current, *c; + struct si_shader_selector **current_shader[SI_NUM_SHADERS] = { + [PIPE_SHADER_VERTEX] = &sctx->vs_shader, + [PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader, + [PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader, + [PIPE_SHADER_GEOMETRY] = &sctx->gs_shader, + [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader, + }; + + if (*current_shader[sel->type] == sel) + *current_shader[sel->type] = NULL; while (p) { c = p->next_variant; @@ -951,66 +961,6 @@ static void si_delete_shader_selector(struct pipe_context *ctx, free(sel); } -static void si_delete_vs_shader(struct pipe_context *ctx, void *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - if (sctx->vs_shader == sel) { - sctx->vs_shader = NULL; - } - - si_delete_shader_selector(ctx, sel); -} - -static void si_delete_gs_shader(struct pipe_context *ctx, void *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - if (sctx->gs_shader == sel) { - sctx->gs_shader = NULL; - } - - si_delete_shader_selector(ctx, sel); -} - -static void si_delete_ps_shader(struct pipe_context *ctx, 
void *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - if (sctx->ps_shader == sel) { - sctx->ps_shader = NULL; - } - - si_delete_shader_selector(ctx, sel); -} - -static void si_delete_tcs_shader(struct pipe_context *ctx, void *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - if (sctx->tcs_shader == sel) { - sctx->tcs_shader = NULL; - } - - si_delete_shader_selector(ctx, sel); -} - -static void si_delete_tes_shader(struct pipe_context *ctx, void *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - if (sctx->tes_shader == sel) { - sctx->tes_shader = NULL; - } - - si_delete_shader_selector(ctx, sel); -} - static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom) { struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; @@ -1675,9 +1625,9 @@ void si_init_shader_functions(struct si_context *sctx) sctx->b.b.bind_gs_state = si_bind_gs_shader; sctx->b.b.bind_fs_state = si_bind_ps_shader; - sctx->b.b.delete_vs_state = si_delete_vs_shader; - sctx->b.b.delete_tcs_state = si_delete_tcs_shader; - sctx->b.b.delete_tes_state = si_delete_tes_shader; - sctx->b.b.delete_gs_state = si_delete_gs_shader; - sctx->b.b.delete_fs_state = si_delete_ps_shader; + sctx->b.b.delete_vs_state = si_delete_shader_selector; + sctx->b.b.delete_tcs_state = si_delete_shader_selector; + sctx->b.b.delete_tes_state = si_delete_shader_selector; + sctx->b.b.delete_gs_state = si_delete_shader_selector; + sctx->b.b.delete_fs_state = si_delete_shader_selector; } From 938a1bee34cc8e0c22a9b99c6b3d247b88e94cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 9 Oct 2015 01:14:12 +0200 Subject: [PATCH 220/270] radeonsi: unify shader create functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit The shader specifies the processor type, so use that instead. Reviewed-by: Michel Dänzer --- .../drivers/radeonsi/si_state_shaders.c | 49 ++++--------------- 1 file changed, 9 insertions(+), 40 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index cc053bb6ef2..c1d61d527cb 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -647,9 +647,8 @@ static int si_shader_select(struct pipe_context *ctx, return 0; } -static void *si_create_shader_state(struct pipe_context *ctx, - const struct pipe_shader_state *state, - unsigned pipe_shader_type) +static void *si_create_shader_selector(struct pipe_context *ctx, + const struct pipe_shader_state *state) { struct si_screen *sscreen = (struct si_screen *)ctx->screen; struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector); @@ -658,7 +657,6 @@ static void *si_create_shader_state(struct pipe_context *ctx, if (!sel) return NULL; - sel->type = pipe_shader_type; sel->tokens = tgsi_dup_tokens(state->tokens); if (!sel->tokens) { FREE(sel); @@ -667,6 +665,7 @@ static void *si_create_shader_state(struct pipe_context *ctx, sel->so = state->stream_output; tgsi_scan_shader(state->tokens, &sel->info); + sel->type = util_pipe_shader_from_tgsi_processor(sel->info.processor); p_atomic_inc(&sscreen->b.num_shaders_created); /* First set which opcode uses which (i,j) pair. 
*/ @@ -697,7 +696,7 @@ static void *si_create_shader_state(struct pipe_context *ctx, sel->info.uses_linear_centroid + sel->info.uses_linear_sample >= 2; - switch (pipe_shader_type) { + switch (sel->type) { case PIPE_SHADER_GEOMETRY: sel->gs_output_prim = sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]; @@ -763,36 +762,6 @@ static void *si_create_shader_state(struct pipe_context *ctx, return sel; } -static void *si_create_fs_state(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - return si_create_shader_state(ctx, state, PIPE_SHADER_FRAGMENT); -} - -static void *si_create_gs_state(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - return si_create_shader_state(ctx, state, PIPE_SHADER_GEOMETRY); -} - -static void *si_create_vs_state(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - return si_create_shader_state(ctx, state, PIPE_SHADER_VERTEX); -} - -static void *si_create_tcs_state(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_CTRL); -} - -static void *si_create_tes_state(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_EVAL); -} - /** * Normally, we only emit 1 viewport and 1 scissor if no shader is using * the VIEWPORT_INDEX output, and emitting the other viewports and scissors @@ -1613,11 +1582,11 @@ void si_init_shader_functions(struct si_context *sctx) si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map); si_init_atom(sctx, &sctx->spi_ps_input, &sctx->atoms.s.spi_ps_input, si_emit_spi_ps_input); - sctx->b.b.create_vs_state = si_create_vs_state; - sctx->b.b.create_tcs_state = si_create_tcs_state; - sctx->b.b.create_tes_state = si_create_tes_state; - sctx->b.b.create_gs_state = si_create_gs_state; - sctx->b.b.create_fs_state = si_create_fs_state; + sctx->b.b.create_vs_state = si_create_shader_selector; + 
sctx->b.b.create_tcs_state = si_create_shader_selector; + sctx->b.b.create_tes_state = si_create_shader_selector; + sctx->b.b.create_gs_state = si_create_shader_selector; + sctx->b.b.create_fs_state = si_create_shader_selector; sctx->b.b.bind_vs_state = si_bind_vs_shader; sctx->b.b.bind_tcs_state = si_bind_tcs_shader; From 9098d7e9bd97fd3d674fc93441ea9ab01c2779c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 9 Oct 2015 01:35:32 +0200 Subject: [PATCH 221/270] radeonsi: clean up copy-pasted scratch buffer updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Michel Dänzer --- .../drivers/radeonsi/si_state_shaders.c | 39 +++++++------------ 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index c1d61d527cb..9395c3149d0 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1243,7 +1243,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) int r; if (scratch_needed_size > 0) { - if (scratch_needed_size > current_scratch_buffer_size) { /* Create a bigger scratch buffer */ pipe_resource_reference( @@ -1282,38 +1281,26 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4); /* VS can be bound as LS, ES, or VS. 
*/ - if (sctx->tes_shader) { - r = si_update_scratch_buffer(sctx, sctx->vs_shader); - if (r < 0) - return false; - if (r == 1) + r = si_update_scratch_buffer(sctx, sctx->vs_shader); + if (r < 0) + return false; + if (r == 1) { + if (sctx->tes_shader) si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4); - } else if (sctx->gs_shader) { - r = si_update_scratch_buffer(sctx, sctx->vs_shader); - if (r < 0) - return false; - if (r == 1) + else if (sctx->gs_shader) si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4); - } else { - r = si_update_scratch_buffer(sctx, sctx->vs_shader); - if (r < 0) - return false; - if (r == 1) + else si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4); } /* TES can be bound as ES or VS. */ - if (sctx->gs_shader) { - r = si_update_scratch_buffer(sctx, sctx->tes_shader); - if (r < 0) - return false; - if (r == 1) + r = si_update_scratch_buffer(sctx, sctx->tes_shader); + if (r < 0) + return false; + if (r == 1) { + if (sctx->gs_shader) si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4); - } else { - r = si_update_scratch_buffer(sctx, sctx->tes_shader); - if (r < 0) - return false; - if (r == 1) + else si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4); } } From acc6a07874e6af133310adcc6c58898cf4312a6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 9 Oct 2015 01:37:57 +0200 Subject: [PATCH 222/270] radeonsi: clean up other scratch buffer functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Michel Dänzer --- .../drivers/radeonsi/si_state_shaders.c | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 9395c3149d0..71349a56323 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1205,30 +1205,23 @@ static int 
si_update_scratch_buffer(struct si_context *sctx, static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx) { - if (!sctx->scratch_buffer) - return 0; - - return sctx->scratch_buffer->b.b.width0; + return sctx->scratch_buffer ? sctx->scratch_buffer->b.b.width0 : 0; } -static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_context *sctx, - struct si_shader_selector *sel) +static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader_selector *sel) { - if (!sel) - return 0; - - return sel->current->scratch_bytes_per_wave; + return sel ? sel->current->scratch_bytes_per_wave : 0; } static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx) { unsigned bytes = 0; - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tcs_shader)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tes_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader)); return bytes; } From 208d1ed38ddb7de8211a9ffc3d89ae176ef7e9d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 7 Oct 2015 01:47:00 +0200 Subject: [PATCH 223/270] radeonsi: implement fragment color clamping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit using the shader key for now. 
Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeonsi/si_pipe.c | 2 +- src/gallium/drivers/radeonsi/si_shader.c | 13 +++++++++++++ src/gallium/drivers/radeonsi/si_shader.h | 1 + src/gallium/drivers/radeonsi/si_state.c | 2 +- src/gallium/drivers/radeonsi/si_state.h | 1 + src/gallium/drivers/radeonsi/si_state_shaders.c | 1 + 6 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index a0283b7c966..f03d02bd287 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -271,6 +271,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_START_INSTANCE: case PIPE_CAP_NPOT_TEXTURES: case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: case PIPE_CAP_TGSI_INSTANCEID: case PIPE_CAP_COMPUTE: @@ -330,7 +331,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) /* Unsupported features. 
*/ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: - case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: case PIPE_CAP_VERTEX_COLOR_CLAMPED: case PIPE_CAP_USER_VERTEX_BUFFERS: case PIPE_CAP_FAKE_SW_MSAA: diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 49ab9404b81..c7ebb0f29bd 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2110,6 +2110,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) struct lp_build_context * base = &bld_base->base; struct lp_build_context * uint = &bld_base->uint_bld; struct tgsi_shader_info *info = &shader->selector->info; + LLVMBuilderRef builder = base->gallivm->builder; LLVMValueRef args[9]; LLVMValueRef last_args[9] = { 0 }; int depth_index = -1, stencil_index = -1, samplemask_index = -1; @@ -2136,6 +2137,16 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) target = V_008DFC_SQ_EXP_MRT + semantic_index; alpha_ptr = si_shader_ctx->radeon_bld.soa.outputs[i][3]; + if (si_shader_ctx->shader->key.ps.clamp_color) { + for (int j = 0; j < 4; j++) { + LLVMValueRef ptr = si_shader_ctx->radeon_bld.soa.outputs[i][j]; + LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); + + result = radeon_llvm_saturate(bld_base, result); + LLVMBuildStore(builder, result, ptr); + } + } + if (si_shader_ctx->shader->key.ps.alpha_to_one) LLVMBuildStore(base->gallivm->builder, base->one, alpha_ptr); @@ -2146,6 +2157,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base) if (si_shader_ctx->shader->key.ps.poly_line_smoothing) si_scale_alpha_by_sample_mask(bld_base, alpha_ptr); + break; default: target = 0; @@ -4000,6 +4012,7 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) fprintf(f, " alpha_func = %u\n", key->ps.alpha_func); fprintf(f, " alpha_to_one = %u\n", key->ps.alpha_to_one); fprintf(f, " poly_stipple = %u\n", 
key->ps.poly_stipple); + fprintf(f, " clamp_color = %u\n", key->ps.clamp_color); break; default: diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index cccc4607977..fa5930ad1d5 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -227,6 +227,7 @@ union si_shader_key { unsigned alpha_to_one:1; unsigned poly_stipple:1; unsigned poly_line_smoothing:1; + unsigned clamp_color:1; } ps; struct { unsigned instance_divisors[SI_NUM_VERTEX_BUFFERS]; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 00d4bc1fbc2..3aafe8a602f 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -694,7 +694,7 @@ static void *si_create_rs_state(struct pipe_context *ctx, rs->poly_smooth = state->poly_smooth; rs->uses_poly_offset = state->offset_point || state->offset_line || state->offset_tri; - + rs->clamp_fragment_color = state->clamp_fragment_color; rs->flatshade = state->flatshade; rs->sprite_coord_enable = state->sprite_coord_enable; rs->pa_sc_line_stipple = state->line_stipple_enable ? 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 6a567688ee4..fba6619d2fd 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -60,6 +60,7 @@ struct si_state_rasterizer { bool line_smooth; bool poly_smooth; bool uses_poly_offset; + bool clamp_fragment_color; }; struct si_dsa_stencil_ref_part { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 71349a56323..c00f8f4101c 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -572,6 +572,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, key->ps.poly_line_smoothing = ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && sctx->framebuffer.nr_samples <= 1; + key->ps.clamp_color = rs->clamp_fragment_color; } key->ps.alpha_func = PIPE_FUNC_ALWAYS; From 5bc871a4caf97f4e07830ea463f445994c8d13b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 7 Oct 2015 02:36:38 +0200 Subject: [PATCH 224/270] radeonsi: implement vertex color clamping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is only supported in the compatibility profile (without GS and tess). 
Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeonsi/si_pipe.c | 2 +- src/gallium/drivers/radeonsi/si_shader.c | 42 +++++++++++++++++++ src/gallium/drivers/radeonsi/si_shader.h | 8 +++- src/gallium/drivers/radeonsi/si_state.c | 2 + .../drivers/radeonsi/si_state_shaders.c | 2 +- 5 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index f03d02bd287..53c80dba602 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -271,6 +271,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_START_INSTANCE: case PIPE_CAP_NPOT_TEXTURES: case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + case PIPE_CAP_VERTEX_COLOR_CLAMPED: case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: case PIPE_CAP_TGSI_INSTANCEID: @@ -331,7 +332,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) /* Unsupported features. */ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: - case PIPE_CAP_VERTEX_COLOR_CLAMPED: case PIPE_CAP_USER_VERTEX_BUFFERS: case PIPE_CAP_FAKE_SW_MSAA: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index c7ebb0f29bd..a119cbdc16c 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2076,6 +2076,45 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base) outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); + /* Vertex color clamping. + * + * This uses a state constant loaded in a user data SGPR and + * an IF statement is added that clamps all colors if the constant + * is true. 
+ */ + if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && + !si_shader_ctx->shader->is_gs_copy_shader) { + struct lp_build_if_state if_ctx; + LLVMValueRef cond = NULL; + LLVMValueRef addr, val; + + for (i = 0; i < info->num_outputs; i++) { + if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR && + info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR) + continue; + + /* We've found a color. */ + if (!cond) { + /* The state is in the first bit of the user SGPR. */ + cond = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, + SI_PARAM_VS_STATE_BITS); + cond = LLVMBuildTrunc(gallivm->builder, cond, + LLVMInt1TypeInContext(gallivm->context), ""); + lp_build_if(&if_ctx, gallivm, cond); + } + + for (j = 0; j < 4; j++) { + addr = si_shader_ctx->radeon_bld.soa.outputs[i][j]; + val = LLVMBuildLoad(gallivm->builder, addr, ""); + val = radeon_llvm_saturate(bld_base, val); + LLVMBuildStore(gallivm->builder, val, addr); + } + } + + if (cond) + lp_build_endif(&if_ctx); + } + for (i = 0; i < info->num_outputs; i++) { outputs[i].name = info->output_semantic_name[i]; outputs[i].sid = info->output_semantic_index[i]; @@ -3445,6 +3484,9 @@ static void create_function(struct si_shader_context *si_shader_ctx) if (shader->is_gs_copy_shader) { last_array_pointer = SI_PARAM_CONST; num_params = SI_PARAM_CONST+1; + } else { + params[SI_PARAM_VS_STATE_BITS] = i32; + num_params = SI_PARAM_VS_STATE_BITS+1; } /* The locations of the other parameters are assigned dynamically. 
*/ diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index fa5930ad1d5..54dad726d01 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -83,6 +83,7 @@ struct radeon_shader_reloc; #define SI_SGPR_VERTEX_BUFFER 8 /* VS only */ #define SI_SGPR_BASE_VERTEX 10 /* VS only */ #define SI_SGPR_START_INSTANCE 11 /* VS only */ +#define SI_SGPR_VS_STATE_BITS 12 /* VS(VS) only */ #define SI_SGPR_LS_OUT_LAYOUT 12 /* VS(LS) only */ #define SI_SGPR_TCS_OUT_OFFSETS 8 /* TCS & TES only */ #define SI_SGPR_TCS_OUT_LAYOUT 9 /* TCS & TES only */ @@ -90,8 +91,9 @@ struct radeon_shader_reloc; #define SI_SGPR_ALPHA_REF 8 /* PS only */ #define SI_SGPR_PS_STATE_BITS 9 /* PS only */ -#define SI_VS_NUM_USER_SGPR 12 -#define SI_LS_NUM_USER_SGPR 13 +#define SI_VS_NUM_USER_SGPR 13 /* API VS */ +#define SI_ES_NUM_USER_SGPR 12 /* API VS */ +#define SI_LS_NUM_USER_SGPR 13 /* API VS */ #define SI_TCS_NUM_USER_SGPR 11 #define SI_TES_NUM_USER_SGPR 10 #define SI_GS_NUM_USER_SGPR 8 @@ -108,6 +110,8 @@ struct radeon_shader_reloc; #define SI_PARAM_VERTEX_BUFFER 4 #define SI_PARAM_BASE_VERTEX 5 #define SI_PARAM_START_INSTANCE 6 +/* [0] = clamp vertex color */ +#define SI_PARAM_VS_STATE_BITS 7 /* the other VS parameters are assigned dynamically */ /* Offsets where TCS outputs and TCS patch outputs live in LDS: diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 3aafe8a602f..e6475364f98 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -760,6 +760,8 @@ static void *si_create_rs_state(struct pipe_context *ctx, state->fill_back != PIPE_POLYGON_MODE_FILL) | S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back))); + si_pm4_set_reg(pm4, R_00B130_SPI_SHADER_USER_DATA_VS_0 + + SI_SGPR_VS_STATE_BITS * 4, state->clamp_vertex_color); /* 
Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */ for (i = 0; i < 3; i++) { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index c00f8f4101c..c98509bb0b9 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -179,7 +179,7 @@ static void si_shader_es(struct si_shader *shader) if (shader->selector->type == PIPE_SHADER_VERTEX) { vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0; - num_user_sgprs = SI_VS_NUM_USER_SGPR; + num_user_sgprs = SI_ES_NUM_USER_SGPR; } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { vgpr_comp_cnt = 3; /* all components are needed for TES */ num_user_sgprs = SI_TES_NUM_USER_SGPR; From a2197cac7fd6ce3f897a89121f5e499d28a3888e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 10 Oct 2015 21:24:28 +0200 Subject: [PATCH 225/270] gallivm: set correct opcode info from unary/binary/ternary emits and clear the emit_data structure. The new radeonsi min/max opcode implementation requires this. (it looks good according to Roland S.) 
--- src/gallium/auxiliary/gallivm/lp_bld_tgsi.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c index c4ae30461cb..c88dfbf974a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c @@ -129,7 +129,8 @@ lp_build_emit_llvm_unary( unsigned tgsi_opcode, LLVMValueRef arg0) { - struct lp_build_emit_data emit_data; + struct lp_build_emit_data emit_data = {{0}}; + emit_data.info = tgsi_get_opcode_info(tgsi_opcode); emit_data.arg_count = 1; emit_data.args[0] = arg0; return lp_build_emit_llvm(bld_base, tgsi_opcode, &emit_data); @@ -142,7 +143,8 @@ lp_build_emit_llvm_binary( LLVMValueRef arg0, LLVMValueRef arg1) { - struct lp_build_emit_data emit_data; + struct lp_build_emit_data emit_data = {{0}}; + emit_data.info = tgsi_get_opcode_info(tgsi_opcode); emit_data.arg_count = 2; emit_data.args[0] = arg0; emit_data.args[1] = arg1; @@ -157,7 +159,8 @@ lp_build_emit_llvm_ternary( LLVMValueRef arg1, LLVMValueRef arg2) { - struct lp_build_emit_data emit_data; + struct lp_build_emit_data emit_data = {{0}}; + emit_data.info = tgsi_get_opcode_info(tgsi_opcode); emit_data.arg_count = 3; emit_data.args[0] = arg0; emit_data.args[1] = arg1; From 529c5e77402012aa0c0a11ee71a1a65e51edb496 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 10 Oct 2015 22:05:58 +0200 Subject: [PATCH 226/270] gallivm: implement the correct version of LRP The previous version has precision issues. This can be a problem with tessellation. Sadly, I can't find the article where I read it anymore. I'm not sure if the unsafe-fp-math flag would be enough to revert this. 
v2: added the comment --- .../auxiliary/gallivm/lp_bld_tgsi_action.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c index 0ad78b0ace2..3d5e2cb316b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c @@ -538,12 +538,19 @@ lrp_emit( struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data) { - LLVMValueRef tmp; - tmp = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_SUB, - emit_data->args[1], - emit_data->args[2]); - emit_data->output[emit_data->chan] = lp_build_emit_llvm_ternary(bld_base, - TGSI_OPCODE_MAD, emit_data->args[0], tmp, emit_data->args[2]); + struct lp_build_context *bld = &bld_base->base; + LLVMValueRef inv, a, b; + + /* This uses the correct version: (1 - t)*a + t*b + * + * An alternative version is "a + t*(b-a)". The problem is this version + * doesn't return "b" for t = 1, because "a + (b-a)" isn't equal to "b" + * because of the floating-point rounding. + */ + inv = lp_build_sub(bld, bld_base->base.one, emit_data->args[0]); + a = lp_build_mul(bld, emit_data->args[1], emit_data->args[0]); + b = lp_build_mul(bld, emit_data->args[2], inv); + emit_data->output[emit_data->chan] = lp_build_add(bld, a, b); } /* TGSI_OPCODE_MAD */ From 6660ca7121183b5ce777b1bd3613afdd19640b13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 10 Oct 2015 02:40:20 +0200 Subject: [PATCH 227/270] radeonsi: initialize output, temp, and address registers to "undef" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This removes "v_mov v0, 0" which typically occurs before exports. 
Totals: SGPRS: 345216 -> 344552 (-0.19 %) VGPRS: 197684 -> 197132 (-0.28 %) Code Size: 7390408 -> 7375376 (-0.20 %) bytes LDS: 91 -> 91 (0.00 %) blocks Scratch: 1842176 -> 1679360 (-8.84 %) bytes per wave Totals from affected shaders: SGPRS: 101336 -> 100672 (-0.66 %) VGPRS: 53920 -> 53368 (-1.02 %) Code Size: 2170176 -> 2155144 (-0.69 %) bytes LDS: 2 -> 2 (0.00 %) blocks Scratch: 1015808 -> 852992 (-16.03 %) bytes per wave Reviewed-by: Michel Dänzer Reviewed-by: Tom Stellard --- .../drivers/radeon/radeon_setup_tgsi_llvm.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 2e9a0135647..f548d1ac36f 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -272,6 +272,15 @@ static LLVMValueRef fetch_system_value( return bitcast(bld_base, type, cval); } +static LLVMValueRef si_build_alloca_undef(struct gallivm_state *gallivm, + LLVMTypeRef type, + const char *name) +{ + LLVMValueRef ptr = lp_build_alloca(gallivm, type, name); + LLVMBuildStore(gallivm->builder, LLVMGetUndef(type), ptr); + return ptr; +} + static void emit_declaration( struct lp_build_tgsi_context * bld_base, const struct tgsi_full_declaration *decl) @@ -285,7 +294,7 @@ static void emit_declaration( for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { unsigned chan; for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - ctx->soa.addr[idx][chan] = lp_build_alloca( + ctx->soa.addr[idx][chan] = si_build_alloca_undef( &ctx->gallivm, ctx->soa.bld_base.uint_bld.elem_type, ""); } @@ -315,8 +324,9 @@ static void emit_declaration( for (idx = first; idx <= last; idx++) { for (i = 0; i < TGSI_NUM_CHANNELS; i++) { ctx->temps[idx * TGSI_NUM_CHANNELS + i] = - lp_build_alloca(bld_base->base.gallivm, bld_base->base.vec_type, - "temp"); + si_build_alloca_undef(bld_base->base.gallivm, + 
bld_base->base.vec_type, + "temp"); } } break; @@ -347,7 +357,8 @@ static void emit_declaration( unsigned chan; assert(idx < RADEON_LLVM_MAX_OUTPUTS); for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - ctx->soa.outputs[idx][chan] = lp_build_alloca(&ctx->gallivm, + ctx->soa.outputs[idx][chan] = si_build_alloca_undef( + &ctx->gallivm, ctx->soa.bld_base.base.elem_type, ""); } } From d72a26ec5de04d0b951b4acd66e00352afdc279d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 10 Oct 2015 19:59:57 +0200 Subject: [PATCH 228/270] radeonsi: don't emit AMDGPU intrinsics for EX2, ROUND, TRUNC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No difference according to shader-db. Reviewed-by: Michel Dänzer Reviewed-by: Tom Stellard --- src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index f548d1ac36f..91cf6587181 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -1481,7 +1481,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit; bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit; bld_base->op_actions[TGSI_OPCODE_EX2].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.AMDIL.exp."; + bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.exp2.f32"; bld_base->op_actions[TGSI_OPCODE_FLR].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.floor.f32"; bld_base->op_actions[TGSI_OPCODE_FMA].emit = build_tgsi_intrinsic_nomem; @@ -1530,7 +1530,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_POW].emit = build_tgsi_intrinsic_nomem; 
bld_base->op_actions[TGSI_OPCODE_POW].intr_name = "llvm.pow.f32"; bld_base->op_actions[TGSI_OPCODE_ROUND].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.AMDIL.round.nearest."; + bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.rint.f32"; bld_base->op_actions[TGSI_OPCODE_RSQ].intr_name = "llvm.AMDGPU.rsq.clamped.f32"; bld_base->op_actions[TGSI_OPCODE_RSQ].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_cmp; @@ -1546,7 +1546,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_SQRT].intr_name = "llvm.sqrt.f32"; bld_base->op_actions[TGSI_OPCODE_SSG].emit = emit_ssg; bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.AMDGPU.trunc"; + bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.trunc.f32"; bld_base->op_actions[TGSI_OPCODE_UADD].emit = emit_uadd; bld_base->op_actions[TGSI_OPCODE_UBFE].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_UBFE].intr_name = "llvm.AMDGPU.bfe.u32"; From eb11efc989020d6786e834d07dbfdb426802a696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 10 Oct 2015 21:27:24 +0200 Subject: [PATCH 229/270] radeonsi: don't emit AMDGPU intrinsics for integer abs, min, max MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No difference according to shader-db. 
(with the new S_ABS_I32 pattern) Reviewed-by: Michel Dänzer Reviewed-by: Tom Stellard --- .../drivers/radeon/radeon_setup_tgsi_llvm.c | 60 +++++++++++++++---- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 91cf6587181..23ea23a0c28 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -1393,6 +1393,51 @@ static void emit_imsb(const struct lp_build_tgsi_action * action, LLVMBuildSelect(builder, cond, all_ones, msb, ""); } +static void emit_iabs(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + + emit_data->output[emit_data->chan] = + lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_IMAX, + emit_data->args[0], + LLVMBuildNeg(builder, + emit_data->args[0], "")); +} + +static void emit_minmax_int(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMIntPredicate op; + + switch (emit_data->info->opcode) { + default: + assert(0); + case TGSI_OPCODE_IMAX: + op = LLVMIntSGT; + break; + case TGSI_OPCODE_IMIN: + op = LLVMIntSLT; + break; + case TGSI_OPCODE_UMAX: + op = LLVMIntUGT; + break; + case TGSI_OPCODE_UMIN: + op = LLVMIntULT; + break; + } + + emit_data->output[emit_data->chan] = + LLVMBuildSelect(builder, + LLVMBuildICmp(builder, op, emit_data->args[0], + emit_data->args[1], ""), + emit_data->args[0], + emit_data->args[1], ""); +} + void radeon_llvm_context_init(struct radeon_llvm_context * ctx) { struct lp_type type; @@ -1493,17 +1538,14 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_FSGE].emit = emit_fcmp; 
bld_base->op_actions[TGSI_OPCODE_FSLT].emit = emit_fcmp; bld_base->op_actions[TGSI_OPCODE_FSNE].emit = emit_fcmp; - bld_base->op_actions[TGSI_OPCODE_IABS].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_IABS].intr_name = "llvm.AMDIL.abs."; + bld_base->op_actions[TGSI_OPCODE_IABS].emit = emit_iabs; bld_base->op_actions[TGSI_OPCODE_IBFE].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_IBFE].intr_name = "llvm.AMDGPU.bfe.i32"; bld_base->op_actions[TGSI_OPCODE_IDIV].emit = emit_idiv; bld_base->op_actions[TGSI_OPCODE_IF].emit = if_emit; bld_base->op_actions[TGSI_OPCODE_UIF].emit = uif_emit; - bld_base->op_actions[TGSI_OPCODE_IMAX].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_IMAX].intr_name = "llvm.AMDGPU.imax"; - bld_base->op_actions[TGSI_OPCODE_IMIN].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_IMIN].intr_name = "llvm.AMDGPU.imin"; + bld_base->op_actions[TGSI_OPCODE_IMAX].emit = emit_minmax_int; + bld_base->op_actions[TGSI_OPCODE_IMIN].emit = emit_minmax_int; bld_base->op_actions[TGSI_OPCODE_IMSB].emit = emit_imsb; bld_base->op_actions[TGSI_OPCODE_INEG].emit = emit_ineg; bld_base->op_actions[TGSI_OPCODE_ISHR].emit = emit_ishr; @@ -1551,10 +1593,8 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_UBFE].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_UBFE].intr_name = "llvm.AMDGPU.bfe.u32"; bld_base->op_actions[TGSI_OPCODE_UDIV].emit = emit_udiv; - bld_base->op_actions[TGSI_OPCODE_UMAX].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_UMAX].intr_name = "llvm.AMDGPU.umax"; - bld_base->op_actions[TGSI_OPCODE_UMIN].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_UMIN].intr_name = "llvm.AMDGPU.umin"; + bld_base->op_actions[TGSI_OPCODE_UMAX].emit = emit_minmax_int; + bld_base->op_actions[TGSI_OPCODE_UMIN].emit = emit_minmax_int; bld_base->op_actions[TGSI_OPCODE_UMOD].emit = 
emit_umod; bld_base->op_actions[TGSI_OPCODE_USEQ].emit = emit_icmp; bld_base->op_actions[TGSI_OPCODE_USGE].emit = emit_icmp; From f2cdb68c8bb905cd76edae383e1cbbe0ae2c69ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 10 Oct 2015 22:19:19 +0200 Subject: [PATCH 230/270] radeonsi: use LRP from gallivm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Totals: SGPRS: 344552 -> 344368 (-0.05 %) VGPRS: 197132 -> 197552 (0.21 %) Code Size: 7375376 -> 7366304 (-0.12 %) bytes LDS: 91 -> 91 (0.00 %) blocks Scratch: 1679360 -> 1615872 (-3.78 %) bytes per wave Totals from affected shaders: SGPRS: 47736 -> 47552 (-0.39 %) VGPRS: 27952 -> 28372 (1.50 %) Code Size: 1392724 -> 1383652 (-0.65 %) bytes LDS: 39 -> 39 (0.00 %) blocks Scratch: 513024 -> 449536 (-12.38 %) bytes per wave Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 23ea23a0c28..c22ea7c2552 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -1561,8 +1561,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_LSB].emit = emit_lsb; bld_base->op_actions[TGSI_OPCODE_LG2].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_LG2].intr_name = "llvm.log2.f32"; - bld_base->op_actions[TGSI_OPCODE_LRP].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_LRP].intr_name = "llvm.AMDGPU.lrp"; bld_base->op_actions[TGSI_OPCODE_MOD].emit = emit_mod; bld_base->op_actions[TGSI_OPCODE_UMSB].emit = emit_umsb; bld_base->op_actions[TGSI_OPCODE_NOT].emit = emit_not; From 7c10af64258ca3a839d9fc1f14957ef556878b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 10 Oct 2015 22:43:19 +0200 Subject: 
[PATCH 231/270] radeonsi: don't use the AMDGPU intrinsic for CMP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No difference according to shader-db. Reviewed-by: Michel Dänzer Reviewed-by: Tom Stellard --- .../drivers/radeon/radeon_setup_tgsi_llvm.c | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index c22ea7c2552..ac99e732c94 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -919,7 +919,21 @@ static void emit_ucmp( LLVMBuildSelect(builder, v, emit_data->args[1], emit_data->args[2], ""); } -static void emit_cmp( +static void emit_cmp(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMValueRef cond, *args = emit_data->args; + + cond = LLVMBuildFCmp(builder, LLVMRealOLT, args[0], + bld_base->base.zero, ""); + + emit_data->output[emit_data->chan] = + LLVMBuildSelect(builder, cond, args[1], args[2], ""); +} + +static void emit_set_cond( const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data) @@ -1503,8 +1517,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "llvm.ceil.f32"; bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_CLAMP].intr_name = "llvm.AMDIL.clamp."; - bld_base->op_actions[TGSI_OPCODE_CMP].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_CMP].intr_name = "llvm.AMDGPU.cndlt"; + bld_base->op_actions[TGSI_OPCODE_CMP].emit = emit_cmp; bld_base->op_actions[TGSI_OPCODE_CONT].emit = cont_emit; bld_base->op_actions[TGSI_OPCODE_COS].emit = 
build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.cos.f32"; @@ -1573,13 +1586,13 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.rint.f32"; bld_base->op_actions[TGSI_OPCODE_RSQ].intr_name = "llvm.AMDGPU.rsq.clamped.f32"; bld_base->op_actions[TGSI_OPCODE_RSQ].emit = build_tgsi_intrinsic_nomem; - bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_cmp; - bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_cmp; + bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_set_cond; + bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_set_cond; bld_base->op_actions[TGSI_OPCODE_SHL].emit = emit_shl; - bld_base->op_actions[TGSI_OPCODE_SLE].emit = emit_cmp; - bld_base->op_actions[TGSI_OPCODE_SLT].emit = emit_cmp; - bld_base->op_actions[TGSI_OPCODE_SNE].emit = emit_cmp; - bld_base->op_actions[TGSI_OPCODE_SGT].emit = emit_cmp; + bld_base->op_actions[TGSI_OPCODE_SLE].emit = emit_set_cond; + bld_base->op_actions[TGSI_OPCODE_SLT].emit = emit_set_cond; + bld_base->op_actions[TGSI_OPCODE_SNE].emit = emit_set_cond; + bld_base->op_actions[TGSI_OPCODE_SGT].emit = emit_set_cond; bld_base->op_actions[TGSI_OPCODE_SIN].emit = build_tgsi_intrinsic_nomem; bld_base->op_actions[TGSI_OPCODE_SIN].intr_name = "llvm.sin.f32"; bld_base->op_actions[TGSI_OPCODE_SQRT].emit = build_tgsi_intrinsic_nomem; From dbac0a6352053bd6106feff88d95b0fd38b82afe Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Thu, 15 Oct 2015 15:17:19 -0700 Subject: [PATCH 232/270] i965/nir: Switch on shader stage in nir_lower_outputs(). VS, GS, and FS continue doing the same thing they did before. We can simplify the FS code a bit because it is always scalar. Compute shaders now assert that there are no outputs instead of doing a loop over 0 outputs. 
Signed-off-by: Kenneth Graunke Reviewed-by: Jason Ekstrand --- src/mesa/drivers/dri/i965/brw_nir.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index af9d0414d51..1b4dace84fb 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -112,11 +112,27 @@ brw_nir_lower_inputs(nir_shader *nir, bool is_scalar) static void brw_nir_lower_outputs(nir_shader *nir, bool is_scalar) { - if (is_scalar) { - nir_assign_var_locations(&nir->outputs, &nir->num_outputs, type_size_scalar); - } else { - nir_foreach_variable(var, &nir->outputs) - var->data.driver_location = var->data.location; + switch (nir->stage) { + case MESA_SHADER_VERTEX: + case MESA_SHADER_GEOMETRY: + if (is_scalar) { + nir_assign_var_locations(&nir->outputs, &nir->num_outputs, + type_size_scalar); + } else { + nir_foreach_variable(var, &nir->outputs) + var->data.driver_location = var->data.location; + } + break; + case MESA_SHADER_FRAGMENT: + nir_assign_var_locations(&nir->outputs, &nir->num_outputs, + type_size_scalar); + break; + case MESA_SHADER_COMPUTE: + /* Compute shaders have no outputs. */ + assert(exec_list_is_empty(&nir->outputs)); + break; + default: + unreachable("unsupported shader stage"); } } From ca2b807ca32dcf531fbf96d9fa0026679abbf111 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Thu, 15 Oct 2015 15:34:06 -0700 Subject: [PATCH 233/270] i965/vs: Drop hack that created NIR for fixed function vertex programs. Marek made core Mesa call ProgramStringNotify(), which solves this properly. The hack is no longer needed. 
Signed-off-by: Kenneth Graunke Reviewed-by: Matt Turner --- src/mesa/drivers/dri/i965/brw_vs.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index de9a8677599..725311732ce 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -57,18 +57,6 @@ brw_codegen_vs_prog(struct brw_context *brw, bool start_busy = false; double start_time = 0; - if (!vp->program.Base.nir) { - /* Normally we generate NIR in LinkShader() or - * ProgramStringNotify(), but Mesa's fixed-function vertex program - * handling doesn't notify the driver at all. Just do it here, at - * the last minute, even though it's lame. - */ - assert(vp->program.Base.Id == 0 && prog == NULL); - vp->program.Base.nir = - brw_create_nir(brw, NULL, &vp->program.Base, MESA_SHADER_VERTEX, - brw->intelScreen->compiler->scalar_vs); - } - if (prog) vs = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX]; From fc5ae0c13f71f049065b1422c20491d2264ae164 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Sat, 17 Oct 2015 17:33:14 +0200 Subject: [PATCH 234/270] nvc0: do not bind input params at compute state init on Fermi It looks like binding a constant buffer on compute overwrites the 3D state. To avoid that, we already re-bind all the 3D constant buffers after launching a compute grid but this is not enough. Binding the constant buffer of input parameters for the compute state at initialization corrupts the 3D constant buffers, and it's just useless to bind it because this is not needed until we really launch a grid. This fixes some piglit regressions related to interpolation tests introduced in "nvc0: enable compute support by default on Fermi". 
Fixes: 00d6186 (nvc0: enable compute support by default on Fermi) Signed-off-by: Samuel Pitoiset Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_compute.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index 96d753c79f3..e33af042620 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -105,14 +105,6 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen, PUSH_DATAh(push, screen->text->offset); PUSH_DATA (push, screen->text->offset); - /* bind parameters buffer */ - BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3); - PUSH_DATA (push, screen->parm->size); - PUSH_DATAh(push, screen->parm->offset); - PUSH_DATA (push, screen->parm->offset); - BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1); - PUSH_DATA (push, (0 << 8) | 1); - /* TODO: textures & samplers */ return 0; From 3fe568e2a472b764e96b67cf57ef63f40cdce5a6 Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Tue, 13 Oct 2015 14:05:41 +0800 Subject: [PATCH 235/270] ilo: remove u_memory.h inclusion from ilo_core.h We do not make allocations generally in the core. 
--- src/gallium/drivers/ilo/core/ilo_builder.c | 2 ++ src/gallium/drivers/ilo/core/ilo_core.h | 1 - src/gallium/drivers/ilo/ilo_common.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/ilo/core/ilo_builder.c b/src/gallium/drivers/ilo/core/ilo_builder.c index 4e05a3aca1e..9d5195129b7 100644 --- a/src/gallium/drivers/ilo/core/ilo_builder.c +++ b/src/gallium/drivers/ilo/core/ilo_builder.c @@ -25,6 +25,8 @@ * Chia-I Wu */ +#include "util/u_memory.h" + #include "ilo_builder.h" #include "ilo_builder_render.h" /* for ilo_builder_batch_patch_sba() */ diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h index da7db90a54b..36d14b0b2ff 100644 --- a/src/gallium/drivers/ilo/core/ilo_core.h +++ b/src/gallium/drivers/ilo/core/ilo_core.h @@ -32,6 +32,5 @@ #include "util/u_debug.h" #include "util/u_math.h" -#include "util/u_memory.h" #endif /* ILO_CORE_H */ diff --git a/src/gallium/drivers/ilo/ilo_common.h b/src/gallium/drivers/ilo/ilo_common.h index 3dbe79fb872..d3016590551 100644 --- a/src/gallium/drivers/ilo/ilo_common.h +++ b/src/gallium/drivers/ilo/ilo_common.h @@ -34,6 +34,7 @@ #include "util/list.h" #include "util/u_format.h" #include "util/u_inlines.h" +#include "util/u_memory.h" #include "util/u_pointer.h" #include "core/ilo_core.h" From 29a0f7479dd09ca60bed084fd6e5f736a6340cb5 Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Tue, 13 Oct 2015 14:09:24 +0800 Subject: [PATCH 236/270] ilo: remove u_debug.h inclusion from ilo_core.h Move it to ilo_debug.h. 
--- src/gallium/drivers/ilo/core/ilo_core.h | 1 - src/gallium/drivers/ilo/core/ilo_debug.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h index 36d14b0b2ff..cbc568c4cd0 100644 --- a/src/gallium/drivers/ilo/core/ilo_core.h +++ b/src/gallium/drivers/ilo/core/ilo_core.h @@ -30,7 +30,6 @@ #include "pipe/p_compiler.h" -#include "util/u_debug.h" #include "util/u_math.h" #endif /* ILO_CORE_H */ diff --git a/src/gallium/drivers/ilo/core/ilo_debug.h b/src/gallium/drivers/ilo/core/ilo_debug.h index 9833233d796..532a2aa7ed6 100644 --- a/src/gallium/drivers/ilo/core/ilo_debug.h +++ b/src/gallium/drivers/ilo/core/ilo_debug.h @@ -28,6 +28,8 @@ #ifndef ILO_DEBUG_H #define ILO_DEBUG_H +#include "util/u_debug.h" + #include "ilo_core.h" /* enable debug flags affecting hot pathes only with debug builds */ From 6e132f4730a41baa36bf31c1b5f47933d07cee8c Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Fri, 16 Oct 2015 09:46:25 +0800 Subject: [PATCH 237/270] ilo: remove unused ilo_shader_get_type() --- src/gallium/drivers/ilo/ilo_shader.c | 9 --------- src/gallium/drivers/ilo/ilo_shader.h | 3 --- 2 files changed, 12 deletions(-) diff --git a/src/gallium/drivers/ilo/ilo_shader.c b/src/gallium/drivers/ilo/ilo_shader.c index 5f2b01017e2..1ba3edce4ae 100644 --- a/src/gallium/drivers/ilo/ilo_shader.c +++ b/src/gallium/drivers/ilo/ilo_shader.c @@ -986,15 +986,6 @@ ilo_shader_destroy(struct ilo_shader_state *shader) FREE(shader); } -/** - * Return the type (PIPE_SHADER_x) of the shader. - */ -int -ilo_shader_get_type(const struct ilo_shader_state *shader) -{ - return shader->info.type; -} - /** * Select a kernel for the given context. This will compile a new kernel if * none of the existing kernels work with the context. 
diff --git a/src/gallium/drivers/ilo/ilo_shader.h b/src/gallium/drivers/ilo/ilo_shader.h index d9f02a4746a..afa1efa718d 100644 --- a/src/gallium/drivers/ilo/ilo_shader.h +++ b/src/gallium/drivers/ilo/ilo_shader.h @@ -149,9 +149,6 @@ ilo_shader_create_cs(const struct ilo_dev *dev, void ilo_shader_destroy(struct ilo_shader_state *shader); -int -ilo_shader_get_type(const struct ilo_shader_state *shader); - bool ilo_shader_select_kernel(struct ilo_shader_state *shader, const struct ilo_state_vector *vec, From a445e0f7efcaa0ff21b5fe4faa3f73f3b73dcfe8 Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Fri, 16 Oct 2015 09:50:12 +0800 Subject: [PATCH 238/270] ilo: remove some unused kernel params --- src/gallium/drivers/ilo/ilo_shader.c | 16 ---------------- src/gallium/drivers/ilo/ilo_shader.h | 6 ------ 2 files changed, 22 deletions(-) diff --git a/src/gallium/drivers/ilo/ilo_shader.c b/src/gallium/drivers/ilo/ilo_shader.c index 1ba3edce4ae..73b625e9de4 100644 --- a/src/gallium/drivers/ilo/ilo_shader.c +++ b/src/gallium/drivers/ilo/ilo_shader.c @@ -1248,9 +1248,6 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader, case ILO_KERNEL_SAMPLER_COUNT: val = shader->info.num_samplers; break; - case ILO_KERNEL_URB_DATA_START_REG: - val = kernel->in.start_grf; - break; case ILO_KERNEL_SKIP_CBUF0_UPLOAD: val = kernel->skip_cbuf0_upload; break; @@ -1302,9 +1299,6 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader, case ILO_KERNEL_VS_GEN6_SO: val = kernel->stream_output; break; - case ILO_KERNEL_VS_GEN6_SO_START_REG: - val = kernel->gs_start_grf; - break; case ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET: val = kernel->gs_offsets[0]; break; @@ -1331,16 +1325,6 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader, val = kernel->bt.gen6_so_count; break; - case ILO_KERNEL_FS_INPUT_Z: - case ILO_KERNEL_FS_INPUT_W: - val = kernel->in.has_pos; - break; - case ILO_KERNEL_FS_OUTPUT_Z: - val = kernel->out.has_pos; - break; - case ILO_KERNEL_FS_USE_KILL: - 
val = kernel->has_kill; - break; case ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS: val = kernel->in.barycentric_interpolation_mode; break; diff --git a/src/gallium/drivers/ilo/ilo_shader.h b/src/gallium/drivers/ilo/ilo_shader.h index afa1efa718d..01de54146b1 100644 --- a/src/gallium/drivers/ilo/ilo_shader.h +++ b/src/gallium/drivers/ilo/ilo_shader.h @@ -36,7 +36,6 @@ enum ilo_kernel_param { ILO_KERNEL_INPUT_COUNT, ILO_KERNEL_OUTPUT_COUNT, ILO_KERNEL_SAMPLER_COUNT, - ILO_KERNEL_URB_DATA_START_REG, ILO_KERNEL_SKIP_CBUF0_UPLOAD, ILO_KERNEL_PCB_CBUF0_SIZE, @@ -53,7 +52,6 @@ enum ilo_kernel_param { ILO_KERNEL_VS_INPUT_EDGEFLAG, ILO_KERNEL_VS_PCB_UCP_SIZE, ILO_KERNEL_VS_GEN6_SO, - ILO_KERNEL_VS_GEN6_SO_START_REG, ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET, ILO_KERNEL_VS_GEN6_SO_LINE_OFFSET, ILO_KERNEL_VS_GEN6_SO_TRI_OFFSET, @@ -64,10 +62,6 @@ enum ilo_kernel_param { ILO_KERNEL_GS_GEN6_SURFACE_SO_BASE, ILO_KERNEL_GS_GEN6_SURFACE_SO_COUNT, - ILO_KERNEL_FS_INPUT_Z, - ILO_KERNEL_FS_INPUT_W, - ILO_KERNEL_FS_OUTPUT_Z, - ILO_KERNEL_FS_USE_KILL, ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS, ILO_KERNEL_FS_DISPATCH_16_OFFSET, ILO_KERNEL_FS_SURFACE_RT_BASE, From d04126a773f8a70ff6ae549751dc674133ea26b0 Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Fri, 16 Oct 2015 16:19:30 +0800 Subject: [PATCH 239/270] ilo: ignore prefer_linear_threshold when zero This was the intended behavior but it did not work as intended until now. 
--- src/gallium/drivers/ilo/core/ilo_image.c | 4 ++-- src/gallium/drivers/ilo/core/ilo_image.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c index fa547ac5c36..6eefc8f46d2 100644 --- a/src/gallium/drivers/ilo/core/ilo_image.c +++ b/src/gallium/drivers/ilo/core/ilo_image.c @@ -286,8 +286,8 @@ image_get_gen6_tiling(const struct ilo_dev *dev, info->bind_surface_dp_typed)) return GEN6_TILING_NONE; - if (estimated_size <= 64 || - estimated_size > info->prefer_linear_threshold) + if (estimated_size <= 64 || (info->prefer_linear_threshold && + estimated_size > info->prefer_linear_threshold)) return GEN6_TILING_NONE; if (estimated_size <= 2048) diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h index 646ed6f5727..546e0ff7739 100644 --- a/src/gallium/drivers/ilo/core/ilo_image.h +++ b/src/gallium/drivers/ilo/core/ilo_image.h @@ -102,7 +102,7 @@ struct ilo_image_info { /* * prefer GEN6_TILING_NONE when the (estimated) image size exceeds the - * threshold + * threshold; ignored when zero */ uint32_t prefer_linear_threshold; From 86ccb2a16f6d21be29cd99d38831eab6079ce107 Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Fri, 16 Oct 2015 22:53:05 +0800 Subject: [PATCH 240/270] ilo: set VME for 3DSTATE_PS When the bit is not set, we can see sampling artifacts on triangle edges when the mip filter is not GEN6_MIPFILTER_NONE. 
--- src/gallium/drivers/ilo/core/ilo_state_shader_ps.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c index f4d801e9b56..ceeb68a460e 100644 --- a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c +++ b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c @@ -592,7 +592,12 @@ ps_set_gen8_3DSTATE_PS(struct ilo_state_ps *ps, ILO_DEV_ASSERT(dev, 8, 8); - dw3 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | + /* + * Set VME here for correct computation of LODs and others. Not sure why + * it is needed now. + */ + dw3 = GEN6_THREADDISP_VME | + ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT | ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT; if (false) From fba582efc7d66140b5c18ada4d5cd93c9c1e5967 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Fri, 16 Oct 2015 11:16:46 +0200 Subject: [PATCH 241/270] main: Use NumUniformBlocks to count UBOs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have separate index spaces for UBOs and SSBOs we do not need to iterate through BufferInterfaceBlocks any more, we can just take the UBO count directly from NumUniformBlocks. 
Reviewed-by: Kristian Høgsberg Reviewed-by: Marek Olšák --- src/mesa/main/shaderapi.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c index 6a2f60db77e..26995add386 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -729,11 +729,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname, if (!has_ubo) break; - *params = 0; - for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) { - if (!shProg->BufferInterfaceBlocks[i].IsShaderStorage) - (*params)++; - } + *params = shProg->NumUniformBlocks; return; case GL_PROGRAM_BINARY_RETRIEVABLE_HINT: /* This enum isn't part of the OES extension for OpenGL ES 2.0. It is From 14c3db7bc59a6b10f5a13930c0274d4155cb8791 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Fri, 16 Oct 2015 11:27:43 +0200 Subject: [PATCH 242/270] main: GL_ACTIVE_UNIFORM_BLOCK_MAX_NAME_LENGTH is about UBOs, not SSBOs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Kristian Høgsberg Reviewed-by: Marek Olšák --- src/mesa/main/shaderapi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c index 26995add386..18e463d4ccc 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -713,10 +713,10 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname, if (!has_ubo) break; - for (i = 0; i < shProg->NumBufferInterfaceBlocks; i++) { + for (i = 0; i < shProg->NumUniformBlocks; i++) { /* Add one for the terminating NUL character.
*/ - const GLint len = strlen(shProg->BufferInterfaceBlocks[i].Name) + 1; + const GLint len = strlen(shProg->UniformBlocks[i]->Name) + 1; if (len > max_len) max_len = len; From 55403665b6bff3778ba335e9fd7821fc9a11ad2b Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Fri, 16 Oct 2015 11:31:46 +0200 Subject: [PATCH 243/270] i965: Do not use NumBufferInterfaceBlocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the only place in the driver where we use this. Since we now work with separate index spaces, always use NumUniformBlocks and NumShaderStorageBlocks instead of NumBufferInterfaceBlocks to be more consistent with the rest of the code. Reviewed-by: Kristian Høgsberg Reviewed-by: Marek Olšák --- src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index a304eec3249..6ebe6481c32 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -972,7 +972,7 @@ brw_upload_ubo_surfaces(struct brw_context *brw, } } - if (shader->NumBufferInterfaceBlocks) + if (shader->NumUniformBlocks || shader->NumShaderStorageBlocks) brw->ctx.NewDriverState |= BRW_NEW_SURFACES; } From 5a9ff87d0f10bae5dba4b2bebd28f2625cd485aa Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Fri, 16 Oct 2015 11:40:52 +0200 Subject: [PATCH 244/270] st/mesa: Use {Num}UniformBlocks instead of {Num}BufferInterfaceBlocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The latter holds both UBOs and SSBOs, but here we only want UBOs. 
Reviewed-by: Kristian Høgsberg Reviewed-by: Marek Olšák --- src/mesa/state_tracker/st_atom_constbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c index 69e26cb6c26..acaa85d9356 100644 --- a/src/mesa/state_tracker/st_atom_constbuf.c +++ b/src/mesa/state_tracker/st_atom_constbuf.c @@ -234,11 +234,11 @@ static void st_bind_ubos(struct st_context *st, if (!shader) return; - for (i = 0; i < shader->NumBufferInterfaceBlocks; i++) { + for (i = 0; i < shader->NumUniformBlocks; i++) { struct gl_uniform_buffer_binding *binding; struct st_buffer_object *st_obj; - binding = &st->ctx->UniformBufferBindings[shader->BufferInterfaceBlocks[i].Binding]; + binding = &st->ctx->UniformBufferBindings[shader->UniformBlocks[i]->Binding]; st_obj = st_buffer_object(binding->BufferObject); cb.buffer = st_obj->buffer; From 36c93e96590b39362bb0159f5f55f1ad1db5e145 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Fri, 16 Oct 2015 11:43:18 +0200 Subject: [PATCH 245/270] glsl_to_tgsi: Use {Num}UniformBlocks instead of {Num}BufferInterfaceBlocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The latter holds both UBOs and SSBOs, but here we only want UBOs. 
Reviewed-by: Kristian Høgsberg Reviewed-by: Marek Olšák --- src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index 06f510db536..f481e8902d8 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -5388,10 +5388,10 @@ st_translate_program( } if (program->shader) { - unsigned num_ubos = program->shader->NumBufferInterfaceBlocks; + unsigned num_ubos = program->shader->NumUniformBlocks; for (i = 0; i < num_ubos; i++) { - unsigned size = program->shader->BufferInterfaceBlocks[i].UniformBufferSize; + unsigned size = program->shader->UniformBlocks[i]->UniformBufferSize; unsigned num_const_vecs = (size + 15) / 16; unsigned first, last; assert(num_const_vecs > 0); From 381c17d695b39f9ab501f5aa5a3cc42c8519ac3b Mon Sep 17 00:00:00 2001 From: Indrajit Das Date: Thu, 15 Oct 2015 15:42:43 +0530 Subject: [PATCH 246/270] st/va: Used correct parameter to derive the value of the "h" variable in vlVaCreateImage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: "11.0" Reviewed-by: Christian König Reviewed-by: Emil Velikov --- src/gallium/state_trackers/va/image.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c index 022240df84f..3b36430541e 100644 --- a/src/gallium/state_trackers/va/image.c +++ b/src/gallium/state_trackers/va/image.c @@ -116,7 +116,7 @@ vlVaCreateImage(VADriverContextP ctx, VAImageFormat *format, int width, int heig img->width = width; img->height = height; w = align(width, 2); - h = align(width, 2); + h = align(height, 2); switch (format->fourcc) { case VA_FOURCC('N','V','1','2'): From b0a44f1017be51c3eb612da2a6ccd5df5695c25a Mon Sep 17 00:00:00 2001 From: Indrajit Das Date: Fri, 16 Oct 2015 12:18:45 +0530 Subject: [PATCH 
247/270] st/va: Added support for NV12 to IYUV conversion in vlVaGetImage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Christian König --- src/gallium/state_trackers/va/image.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c index 3b36430541e..b37a9714437 100644 --- a/src/gallium/state_trackers/va/image.c +++ b/src/gallium/state_trackers/va/image.c @@ -240,9 +240,11 @@ vlVaGetImage(VADriverContextP ctx, VASurfaceID surface, int x, int y, return VA_STATUS_ERROR_OPERATION_FAILED; if (format != surf->buffer->buffer_format) { - /* support NV12 to YV12 conversion now only */ - if (format == PIPE_FORMAT_YV12 && - surf->buffer->buffer_format == PIPE_FORMAT_NV12) + /* support NV12 to YV12 and IYUV conversion now only */ + if ((format == PIPE_FORMAT_YV12 && + surf->buffer->buffer_format == PIPE_FORMAT_NV12) || + (format == PIPE_FORMAT_IYUV && + surf->buffer->buffer_format == PIPE_FORMAT_NV12)) convert = true; else return VA_STATUS_ERROR_OPERATION_FAILED; From 6f3954618b0fe273af76af79ce9ec56566b79b2a Mon Sep 17 00:00:00 2001 From: Samuel Iglesias Gonsalvez Date: Mon, 19 Oct 2015 10:37:14 +0200 Subject: [PATCH 248/270] glsl: fix segfault when indirect indexing a buffer variable which is an array Fixes a regression added by bb5aeb854915ba67abc56257f830d002c956439e. 
Signed-off-by: Samuel Iglesias Gonsalvez Reviewed-by: Timothy Arceri --- src/glsl/lower_ubo_reference.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp index 1fbb09de0b1..e818c048461 100644 --- a/src/glsl/lower_ubo_reference.cpp +++ b/src/glsl/lower_ubo_reference.cpp @@ -284,7 +284,8 @@ interface_field_name(void *mem_ctx, char *base_name, ir_rvalue *d, if (array_index->type != glsl_type::uint_type) array_index = i2u(array_index); - if (a->array->type->fields.array->is_array()) { + if (a->array->type->is_array() && + a->array->type->fields.array->is_array()) { ir_constant *base_size = new(mem_ctx) ir_constant(a->array->type->fields.array->arrays_of_arrays_size()); array_index = mul(array_index, base_size); From 530eb39c71d2f42ef5d6c556aff77c322434f4e2 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Mon, 19 Oct 2015 08:41:37 -0600 Subject: [PATCH 249/270] svga: fix incorrect round-down arithmetic Spotted by Roland. Luckily, this code should never really be hit since the const buffer size and offset should already be multiples of 16. I could probably add more assertions to that effect, but let's just fix the arithmetic for now. Reviewed-by: Roland Scheidegger --- src/gallium/drivers/svga/svga_state_constants.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c index 75592d3bf8b..c93d2a5e565 100644 --- a/src/gallium/drivers/svga/svga_state_constants.c +++ b/src/gallium/drivers/svga/svga_state_constants.c @@ -718,7 +718,7 @@ emit_consts_vgpu10(struct svga_context *svga, unsigned shader) /* round down to mulitple of 16 (this may cause rendering problems * but should avoid a device error). 
*/ - size &= ~16; + size &= ~15; } } From b23a4859f4dbbcca7d6a637010167d470211e45b Mon Sep 17 00:00:00 2001 From: Jose Fonseca Date: Mon, 19 Oct 2015 14:29:28 +0100 Subject: [PATCH 250/270] scons: Build nir/glsl_types.cpp once. Undoes early hacks, and ensures nir/glsl_types.cpp is built once, and only once. The root problem is that SCons doesn't know about NIR nor any source file in the NIR_FILES source list. Tested with libgl-gdi and libgl-xlib scons targets. Reviewed-by: Brian Paul --- src/gallium/targets/libgl-gdi/SConscript | 10 +--------- src/gallium/targets/libgl-gdi/glsl_types_hack.cpp | 3 --- src/gallium/targets/libgl-xlib/SConscript | 3 --- src/gallium/targets/libgl-xlib/glsl_types_hack.cpp | 3 --- src/gallium/targets/osmesa/SConscript | 7 +------ src/gallium/targets/osmesa/glsl_types_hack.cpp | 3 --- src/glsl/SConscript | 7 ++++++- src/mesa/drivers/x11/SConscript | 1 - 8 files changed, 8 insertions(+), 29 deletions(-) delete mode 100644 src/gallium/targets/libgl-gdi/glsl_types_hack.cpp delete mode 100644 src/gallium/targets/libgl-xlib/glsl_types_hack.cpp delete mode 100644 src/gallium/targets/osmesa/glsl_types_hack.cpp diff --git a/src/gallium/targets/libgl-gdi/SConscript b/src/gallium/targets/libgl-gdi/SConscript index eb777a86cb2..594f34d7fad 100644 --- a/src/gallium/targets/libgl-gdi/SConscript +++ b/src/gallium/targets/libgl-gdi/SConscript @@ -7,10 +7,6 @@ env = env.Clone() env.Append(CPPPATH = [ '#src', - '#src/mesa', - '#src/mapi', - '#src/glsl', - '#src/glsl/nir', '#src/gallium/state_trackers/wgl', '#src/gallium/winsys/sw', ]) @@ -24,11 +20,7 @@ env.Append(LIBS = [ env.Prepend(LIBS = [mesautil]) -sources = [ - 'libgl_gdi.c', - 'glsl_types_hack.cpp' -] - +sources = ['libgl_gdi.c'] drivers = [] if True: diff --git a/src/gallium/targets/libgl-gdi/glsl_types_hack.cpp b/src/gallium/targets/libgl-gdi/glsl_types_hack.cpp deleted file mode 100644 index 5c042f23e3b..00000000000 --- a/src/gallium/targets/libgl-gdi/glsl_types_hack.cpp +++ /dev/null @@ -1,3 
+0,0 @@ -/* errrg scons.. otherwise "scons: *** Two environments with different actions were specified for the same target: $mesa/build/linux-x86_64-debug/glsl/nir/glsl_types.os" */ -#include "glsl_types.cpp" - diff --git a/src/gallium/targets/libgl-xlib/SConscript b/src/gallium/targets/libgl-xlib/SConscript index fedc522fbdc..df5a220ac25 100644 --- a/src/gallium/targets/libgl-xlib/SConscript +++ b/src/gallium/targets/libgl-xlib/SConscript @@ -6,8 +6,6 @@ Import('*') env = env.Clone() env.Append(CPPPATH = [ - '#/src/glsl', - '#/src/glsl/nir', '#/src/mapi', '#/src/mesa', '#/src/mesa/main', @@ -38,7 +36,6 @@ env.Prepend(LIBS = [ sources = [ 'xlib.c', - 'glsl_types_hack.cpp', ] if True: diff --git a/src/gallium/targets/libgl-xlib/glsl_types_hack.cpp b/src/gallium/targets/libgl-xlib/glsl_types_hack.cpp deleted file mode 100644 index 5c042f23e3b..00000000000 --- a/src/gallium/targets/libgl-xlib/glsl_types_hack.cpp +++ /dev/null @@ -1,3 +0,0 @@ -/* errrg scons.. otherwise "scons: *** Two environments with different actions were specified for the same target: $mesa/build/linux-x86_64-debug/glsl/nir/glsl_types.os" */ -#include "glsl_types.cpp" - diff --git a/src/gallium/targets/osmesa/SConscript b/src/gallium/targets/osmesa/SConscript index 78930a98e03..4a9115ba1cf 100644 --- a/src/gallium/targets/osmesa/SConscript +++ b/src/gallium/targets/osmesa/SConscript @@ -5,8 +5,6 @@ env = env.Clone() env.Prepend(CPPPATH = [ '#src/mapi', '#src/mesa', - '#src/glsl', - '#src/glsl/nir', #Dir('../../../mapi'), # src/mapi build path for python-generated GL API files/headers ]) @@ -24,10 +22,7 @@ env.Prepend(LIBS = [ env.Append(CPPDEFINES = ['GALLIUM_TRACE', 'GALLIUM_SOFTPIPE']) -sources = [ - 'target.c', - 'glsl_types_hack.cpp' -] +sources = ['target.c'] if env['llvm']: env.Append(CPPDEFINES = 'GALLIUM_LLVMPIPE') diff --git a/src/gallium/targets/osmesa/glsl_types_hack.cpp b/src/gallium/targets/osmesa/glsl_types_hack.cpp deleted file mode 100644 index 5c042f23e3b..00000000000 --- 
a/src/gallium/targets/osmesa/glsl_types_hack.cpp +++ /dev/null @@ -1,3 +0,0 @@ -/* errrg scons.. otherwise "scons: *** Two environments with different actions were specified for the same target: $mesa/build/linux-x86_64-debug/glsl/nir/glsl_types.os" */ -#include "glsl_types.cpp" - diff --git a/src/glsl/SConscript b/src/glsl/SConscript index 927cbdcdb78..70bf5b09c3c 100644 --- a/src/glsl/SConscript +++ b/src/glsl/SConscript @@ -61,6 +61,12 @@ source_lists = env.ParseSourceList('Makefile.sources') for l in ('LIBGLCPP_FILES', 'LIBGLSL_FILES'): glsl_sources += source_lists[l] +# add nir/glsl_types.cpp manually, because SCons still doesn't know about NIR. +# XXX: Remove this once we build NIR and NIR_FILES. +glsl_sources += [ + 'nir/glsl_types.cpp', +] + if env['msvc']: env.Prepend(CPPPATH = ['#/src/getopt']) env.PrependUnique(LIBS = [getopt]) @@ -81,7 +87,6 @@ mesa_objs = env.StaticObject([ 'prog_hash_table.c', 'symbol_table.c', 'dummy_errors.c', - 'nir/glsl_types.cpp', ]) compiler_objs += mesa_objs diff --git a/src/mesa/drivers/x11/SConscript b/src/mesa/drivers/x11/SConscript index aa1e73a9d4a..cd5cccda0d1 100644 --- a/src/mesa/drivers/x11/SConscript +++ b/src/mesa/drivers/x11/SConscript @@ -33,7 +33,6 @@ sources = [ 'xm_dd.c', 'xm_line.c', 'xm_tri.c', - '../../../glsl/nir/glsl_types.cpp', ] # Disallow undefined symbols From e00314bc57a59b3f816daba6249e7b7157761f86 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 5 Oct 2015 15:49:34 -0700 Subject: [PATCH 251/270] i965/asm: Explicitly use a nir_instr for IR annotations Now that everything goes through NIR, we don't need this to be a void pointer anymore. 
Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 2 +- .../drivers/dri/i965/brw_vec4_generator.cpp | 2 +- .../drivers/dri/i965/intel_asm_annotation.c | 17 ++--------------- .../drivers/dri/i965/intel_asm_annotation.h | 3 +-- 4 files changed, 5 insertions(+), 19 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 17e19cf807a..49884909f70 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -2187,7 +2187,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) 100.0f * (before_size - after_size) / before_size); dump_assembly(p->store, annotation.ann_count, annotation.ann, - p->devinfo, prog); + p->devinfo); ralloc_free(annotation.ann); } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index dcacc900540..2a1e4159a6b 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -1663,7 +1663,7 @@ vec4_generator::generate_code(const cfg_t *cfg) 100.0f * (before_size - after_size) / before_size); dump_assembly(p->store, annotation.ann_count, annotation.ann, - p->devinfo, prog); + p->devinfo); ralloc_free(annotation.ann); } diff --git a/src/mesa/drivers/dri/i965/intel_asm_annotation.c b/src/mesa/drivers/dri/i965/intel_asm_annotation.c index bb8bb8d38c9..b3d6324a5fe 100644 --- a/src/mesa/drivers/dri/i965/intel_asm_annotation.c +++ b/src/mesa/drivers/dri/i965/intel_asm_annotation.c @@ -33,8 +33,7 @@ void dump_assembly(void *assembly, int num_annotations, struct annotation *annotation, - const struct brw_device_info *devinfo, - const struct gl_program *prog) + const struct brw_device_info *devinfo) { const char *last_annotation_string = NULL; const void *last_annotation_ir = NULL; @@ -57,19 +56,7 @@ dump_assembly(void *assembly, int num_annotations, struct annotation 
*annotation last_annotation_ir = annotation[i].ir; if (last_annotation_ir) { fprintf(stderr, " "); - if (prog->nir) - nir_print_instr(annotation[i].ir, stderr); - else if (!prog->Instructions) - fprint_ir(stderr, annotation[i].ir); - else { - const struct prog_instruction *pi = - (const struct prog_instruction *)annotation[i].ir; - fprintf(stderr, "%d: ", - (int)(pi - prog->Instructions)); - _mesa_fprint_instruction_opt(stderr, - pi, - 0, PROG_PRINT_DEBUG, NULL); - } + nir_print_instr(annotation[i].ir, stderr); fprintf(stderr, "\n"); } } diff --git a/src/mesa/drivers/dri/i965/intel_asm_annotation.h b/src/mesa/drivers/dri/i965/intel_asm_annotation.h index d9c69bc41b0..6c72326f058 100644 --- a/src/mesa/drivers/dri/i965/intel_asm_annotation.h +++ b/src/mesa/drivers/dri/i965/intel_asm_annotation.h @@ -60,8 +60,7 @@ struct annotation_info { void dump_assembly(void *assembly, int num_annotations, struct annotation *annotation, - const struct brw_device_info *devinfo, - const struct gl_program *prog); + const struct brw_device_info *devinfo); void annotate(const struct brw_device_info *devinfo, From 5d4bc5ec1339fcdafae957e3473f3c2c9931bb23 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 5 Oct 2015 16:54:36 -0700 Subject: [PATCH 252/270] nir: Add a label to nir_shader_info Reviewed-by: Topi Pohjolainen --- src/glsl/nir/glsl_to_nir.cpp | 2 ++ src/glsl/nir/nir.h | 3 +++ src/glsl/nir/nir_sweep.c | 2 ++ 3 files changed, 7 insertions(+) diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index cf5bb9360c8..edc6f5bd9b4 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -151,6 +151,8 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, num_textures = i; shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name); + if (shader_prog->Label) + shader->info.label = ralloc_strdup(shader, shader_prog->Label); shader->info.num_textures = num_textures; shader->info.num_ubos = sh->NumUniformBlocks; 
shader->info.num_abos = shader_prog->NumAtomicBuffers; diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index c867e6d9f18..fb8d59038d8 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -1460,6 +1460,9 @@ typedef struct nir_shader_compiler_options { typedef struct nir_shader_info { const char *name; + /* Descriptive name provided by the client; may be NULL */ + const char *label; + /* Number of textures used by this shader */ unsigned num_textures; /* Number of uniform buffers used by this shader */ diff --git a/src/glsl/nir/nir_sweep.c b/src/glsl/nir/nir_sweep.c index b6ce43b5224..5a22f509f50 100644 --- a/src/glsl/nir/nir_sweep.c +++ b/src/glsl/nir/nir_sweep.c @@ -155,6 +155,8 @@ nir_sweep(nir_shader *nir) ralloc_adopt(rubbish, nir); ralloc_steal(nir, (char *)nir->info.name); + if (nir->info.label) + ralloc_steal(nir, (char *)nir->info.label); /* Variables and registers are not dead. Steal them back. */ steal_list(nir, nir_variable, &nir->uniforms); From 16619477bc800d32b5bf2f38dd544960cf66c284 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Oct 2015 15:59:56 -0700 Subject: [PATCH 253/270] mesa: Move gl_frag_depth_layout from mtypes.h to shader_enums.h Reviewed-by: Topi Pohjolainen --- src/glsl/nir/shader_enums.h | 17 +++++++++++++++++ src/mesa/main/mtypes.h | 18 ------------------ 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/glsl/nir/shader_enums.h b/src/glsl/nir/shader_enums.h index 77638ba4e34..d1cf7ca04cc 100644 --- a/src/glsl/nir/shader_enums.h +++ b/src/glsl/nir/shader_enums.h @@ -480,4 +480,21 @@ const char * gl_frag_result_name(gl_frag_result result); #define FRAG_RESULT_MAX (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS) +/** + * \brief Layout qualifiers for gl_FragDepth. + * + * Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with + * a layout qualifier. + * + * \see enum ir_depth_layout + */ +enum gl_frag_depth_layout +{ + FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. 
*/ + FRAG_DEPTH_LAYOUT_ANY, + FRAG_DEPTH_LAYOUT_GREATER, + FRAG_DEPTH_LAYOUT_LESS, + FRAG_DEPTH_LAYOUT_UNCHANGED +}; + #endif /* SHADER_ENUMS_H */ diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index e9d8ea42bce..9ca6deaabb6 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -1858,24 +1858,6 @@ typedef enum } gl_register_file; -/** - * \brief Layout qualifiers for gl_FragDepth. - * - * Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with - * a layout qualifier. - * - * \see enum ir_depth_layout - */ -enum gl_frag_depth_layout -{ - FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */ - FRAG_DEPTH_LAYOUT_ANY, - FRAG_DEPTH_LAYOUT_GREATER, - FRAG_DEPTH_LAYOUT_LESS, - FRAG_DEPTH_LAYOUT_UNCHANGED -}; - - /** * Base class for any kind of program object */ From fe399f3a69689a78ca4a5fb5a6b76435adcd41e5 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Oct 2015 15:36:51 -0700 Subject: [PATCH 254/270] nir/info: Move the GS info into a stage-specific info union This way we can have other stage-specific info without consuming too much extra space. While we're at it, we make sure that the geometry info is only set if we're actually a geometry shader.
Reviewed-by: Topi Pohjolainen --- src/glsl/nir/glsl_to_nir.cpp | 12 ++++++++++-- src/glsl/nir/nir.h | 14 ++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index edc6f5bd9b4..95b2312f30e 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -164,11 +164,19 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, shader->info.uses_texture_gather = sh->Program->UsesGather; shader->info.uses_clip_distance_out = sh->Program->UsesClipDistanceOut; shader->info.separate_shader = shader_prog->SeparateShader; - shader->info.gs.vertices_out = sh->Geom.VerticesOut; - shader->info.gs.invocations = sh->Geom.Invocations; shader->info.has_transform_feedback_varyings = shader_prog->TransformFeedback.NumVarying > 0; + switch (stage) { + case MESA_SHADER_GEOMETRY: + shader->info.gs.vertices_out = sh->Geom.VerticesOut; + shader->info.gs.invocations = sh->Geom.Invocations; + break; + + default: + break; /* No stage-specific info */ + } + return shader; } diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index fb8d59038d8..719e6cebff4 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -1493,13 +1493,15 @@ typedef struct nir_shader_info { /** Was this shader linked with any transform feedback varyings? */ bool has_transform_feedback_varyings; - struct { - /** The maximum number of vertices the geometry shader might write. */ - unsigned vertices_out; + union { + struct { + /** The maximum number of vertices the geometry shader might write. */ + unsigned vertices_out; - /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */ - unsigned invocations; - } gs; + /** 1 .. 
MAX_GEOMETRY_SHADER_INVOCATIONS */ + unsigned invocations; + } gs; + }; } nir_shader_info; typedef struct nir_shader { From 4889c73dd1ed0af7920b950f6810361a6eeabcc2 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Oct 2015 15:02:25 -0700 Subject: [PATCH 255/270] nir/info: Add compute shader local size to nir_shader_info Reviewed-by: Topi Pohjolainen --- src/glsl/nir/glsl_to_nir.cpp | 8 ++++++++ src/glsl/nir/nir.h | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index 95b2312f30e..d230ad4416d 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -173,6 +173,14 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, shader->info.gs.invocations = sh->Geom.Invocations; break; + case MESA_SHADER_COMPUTE: { + struct gl_compute_program *cp = (struct gl_compute_program *)sh->Program; + shader->info.cs.local_size[0] = cp->LocalSize[0]; + shader->info.cs.local_size[1] = cp->LocalSize[1]; + shader->info.cs.local_size[2] = cp->LocalSize[2]; + break; + } + default: break; /* No stage-specific info */ } diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index 719e6cebff4..32259e720d9 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -1501,6 +1501,10 @@ typedef struct nir_shader_info { /** 1 .. 
MAX_GEOMETRY_SHADER_INVOCATIONS */ unsigned invocations; } gs; + + struct { + unsigned local_size[3]; + } cs; }; } nir_shader_info; From 688d2e45855299dcf474791f29d65040ce5cb2dc Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Oct 2015 15:47:09 -0700 Subject: [PATCH 256/270] nir/info: Add a few bits of info for fragment shaders Reviewed-by: Topi Pohjolainen --- src/glsl/nir/glsl_to_nir.cpp | 10 ++++++++++ src/glsl/nir/nir.h | 13 +++++++++++++ src/mesa/program/prog_to_nir.c | 6 ++++++ 3 files changed, 29 insertions(+) diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index d230ad4416d..76e1382c362 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -173,6 +173,16 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, shader->info.gs.invocations = sh->Geom.Invocations; break; + case MESA_SHADER_FRAGMENT: { + struct gl_fragment_program *fp = + (struct gl_fragment_program *)sh->Program; + + shader->info.fs.uses_discard = fp->UsesKill; + shader->info.fs.early_fragment_tests = sh->EarlyFragmentTests; + shader->info.fs.depth_layout = fp->FragDepthLayout; + break; + } + case MESA_SHADER_COMPUTE: { struct gl_compute_program *cp = (struct gl_compute_program *)sh->Program; shader->info.cs.local_size[0] = cp->LocalSize[0]; diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index 32259e720d9..2ab48fb9d9c 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -1502,6 +1502,19 @@ typedef struct nir_shader_info { unsigned invocations; } gs; + struct { + bool uses_discard; + + /** + * Whether early fragment tests are enabled as defined by + * ARB_shader_image_load_store. + */ + bool early_fragment_tests; + + /** gl_FragDepth layout for ARB_conservative_depth. 
*/ + enum gl_frag_depth_layout depth_layout; + } fs; + struct { unsigned local_size[3]; } cs; diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c index da61a2b9bd3..539e3c05312 100644 --- a/src/mesa/program/prog_to_nir.c +++ b/src/mesa/program/prog_to_nir.c @@ -1129,6 +1129,12 @@ prog_to_nir(const struct gl_program *prog, s->info.uses_clip_distance_out = false; s->info.separate_shader = false; + if (stage == MESA_SHADER_FRAGMENT) { + struct gl_fragment_program *fp = (struct gl_fragment_program *)prog; + + s->info.fs.uses_discard = fp->UsesKill; + } + fail: if (c->error) { ralloc_free(s); From 5e86f5b3d21fe8e96676bb0608990d72dbf61b85 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 5 Oct 2015 16:01:33 -0700 Subject: [PATCH 257/270] i965/fs: Remove the gl_program from the generator Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp | 2 +- src/mesa/drivers/dri/i965/brw_fs.cpp | 4 ++-- src/mesa/drivers/dri/i965/brw_fs.h | 3 --- src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 3 +-- src/mesa/drivers/dri/i965/brw_vec4.cpp | 2 +- 5 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp index d458ad846bf..5308d175416 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp @@ -32,7 +32,7 @@ brw_blorp_eu_emitter::brw_blorp_eu_emitter(struct brw_context *brw, generator(brw->intelScreen->compiler, brw, mem_ctx, (void *) rzalloc(mem_ctx, struct brw_wm_prog_key), (struct brw_stage_prog_data *) rzalloc(mem_ctx, struct brw_wm_prog_data), - NULL, 0, false, "BLORP") + 0, false, "BLORP") { if (debug_flag) generator.enable_debug("blorp"); diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index a2fd4411d38..638a3c685b3 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -5145,7 
+5145,7 @@ brw_wm_fs_emit(struct brw_context *brw, fs_generator g(brw->intelScreen->compiler, brw, mem_ctx, (void *) key, &prog_data->base, - &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS"); + v.promoted_constants, v.runtime_check_aads_emit, "FS"); if (unlikely(INTEL_DEBUG & DEBUG_WM)) { char *name; @@ -5301,7 +5301,7 @@ brw_cs_emit(struct brw_context *brw, } fs_generator g(brw->intelScreen->compiler, brw, - mem_ctx, (void*) key, &prog_data->base, &cp->Base, + mem_ctx, (void*) key, &prog_data->base, v8.promoted_constants, v8.runtime_check_aads_emit, "CS"); if (INTEL_DEBUG & DEBUG_CS) { char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d", diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 29a009ed406..0da5a7625a2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -400,7 +400,6 @@ public: void *mem_ctx, const void *key, struct brw_stage_prog_data *prog_data, - struct gl_program *fp, unsigned promoted_constants, bool runtime_check_aads_emit, const char *stage_abbrev); @@ -499,8 +498,6 @@ private: const void * const key; struct brw_stage_prog_data * const prog_data; - const struct gl_program *prog; - unsigned dispatch_width; /**< 8 or 16 */ exec_list discard_halt_patches; diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 49884909f70..13c495cd395 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -131,7 +131,6 @@ fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const void *key, struct brw_stage_prog_data *prog_data, - struct gl_program *prog, unsigned promoted_constants, bool runtime_check_aads_emit, const char *stage_abbrev) @@ -139,7 +138,7 @@ fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data, : compiler(compiler), log_data(log_data), 
devinfo(compiler->devinfo), key(key), prog_data(prog_data), - prog(prog), promoted_constants(promoted_constants), + promoted_constants(promoted_constants), runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false), stage_abbrev(stage_abbrev), mem_ctx(mem_ctx) { diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 08f3e9188ed..b3b76cc9cdf 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1952,7 +1952,7 @@ brw_vs_emit(struct brw_context *brw, fs_generator g(brw->intelScreen->compiler, brw, mem_ctx, (void *) key, &prog_data->base.base, - &vp->Base, v.promoted_constants, + v.promoted_constants, v.runtime_check_aads_emit, "VS"); if (INTEL_DEBUG & DEBUG_VS) { char *name; From 8f1d968704858d78d7e78a6b88db3ea2bc0cf749 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 5 Oct 2015 17:41:46 -0700 Subject: [PATCH 258/270] i965/vec4: Remove gl_program and gl_shader_program from the generator Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/brw_vec4.cpp | 4 ++-- src/mesa/drivers/dri/i965/brw_vec4.h | 10 +++----- .../drivers/dri/i965/brw_vec4_generator.cpp | 24 ++++++++----------- .../drivers/dri/i965/brw_vec4_gs_visitor.cpp | 11 ++++----- 4 files changed, 20 insertions(+), 29 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index b3b76cc9cdf..bcd1f487f0b 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1990,9 +1990,9 @@ brw_vs_emit(struct brw_context *brw, } vec4_generator g(brw->intelScreen->compiler, brw, - prog, &vp->Base, &prog_data->base, + &prog_data->base, mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS"); - assembly = g.generate_assembly(v.cfg, final_assembly_size); + assembly = g.generate_assembly(v.cfg, final_assembly_size, vp->Base.nir); } return assembly; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h 
b/src/mesa/drivers/dri/i965/brw_vec4.h index 5e3500c0c9a..cf9ec0aa2ef 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -391,8 +391,6 @@ class vec4_generator { public: vec4_generator(const struct brw_compiler *compiler, void *log_data, - struct gl_shader_program *shader_prog, - struct gl_program *prog, struct brw_vue_prog_data *prog_data, void *mem_ctx, bool debug_flag, @@ -400,10 +398,11 @@ public: const char *stage_abbrev); ~vec4_generator(); - const unsigned *generate_assembly(const cfg_t *cfg, unsigned *asm_size); + const unsigned *generate_assembly(const cfg_t *cfg, unsigned *asm_size, + const nir_shader *nir); private: - void generate_code(const cfg_t *cfg); + void generate_code(const cfg_t *cfg, const nir_shader *nir); void generate_math1_gen4(vec4_instruction *inst, struct brw_reg dst, @@ -485,9 +484,6 @@ private: struct brw_codegen *p; - struct gl_shader_program *shader_prog; - const struct gl_program *prog; - struct brw_vue_prog_data *prog_data; void *mem_ctx; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index 2a1e4159a6b..a84f6c47471 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -21,6 +21,7 @@ */ #include +#include "glsl/glsl_parser_extras.h" #include "brw_vec4.h" #include "brw_cfg.h" @@ -137,15 +138,13 @@ vec4_instruction::get_src(const struct brw_vue_prog_data *prog_data, int i) vec4_generator::vec4_generator(const struct brw_compiler *compiler, void *log_data, - struct gl_shader_program *shader_prog, - struct gl_program *prog, struct brw_vue_prog_data *prog_data, void *mem_ctx, bool debug_flag, const char *stage_name, const char *stage_abbrev) : compiler(compiler), log_data(log_data), devinfo(compiler->devinfo), - shader_prog(shader_prog), prog(prog), prog_data(prog_data), + prog_data(prog_data), mem_ctx(mem_ctx), stage_name(stage_name), stage_abbrev(stage_abbrev), 
debug_flag(debug_flag) { @@ -1142,7 +1141,7 @@ vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst, } void -vec4_generator::generate_code(const cfg_t *cfg) +vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir) { struct annotation_info annotation; memset(&annotation, 0, sizeof(annotation)); @@ -1648,14 +1647,10 @@ vec4_generator::generate_code(const cfg_t *cfg) int after_size = p->next_insn_offset; if (unlikely(debug_flag)) { - if (shader_prog) { - fprintf(stderr, "Native code for %s %s shader %d:\n", - shader_prog->Label ? shader_prog->Label : "unnamed", - stage_name, shader_prog->Name); - } else { - fprintf(stderr, "Native code for %s program %d:\n", stage_name, - prog->Id); - } + fprintf(stderr, "Native code for %s %s shader %s:\n", + nir->info.label ? nir->info.label : "unnamed", + _mesa_shader_stage_to_string(nir->stage), nir->info.name); + fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. Compacted %d to %d" " bytes (%.0f%%)\n", stage_abbrev, @@ -1676,10 +1671,11 @@ vec4_generator::generate_code(const cfg_t *cfg) const unsigned * vec4_generator::generate_assembly(const cfg_t *cfg, - unsigned *assembly_size) + unsigned *assembly_size, + const nir_shader *nir) { brw_set_default_access_mode(p, BRW_ALIGN_16); - generate_code(cfg); + generate_code(cfg, nir); return brw_get_program(p, assembly_size); } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index 775f64d96bc..5775cd276dc 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -598,17 +598,16 @@ vec4_gs_visitor::gs_end_primitive() static const unsigned * generate_assembly(struct brw_context *brw, - struct gl_shader_program *shader_prog, - struct gl_program *prog, + const nir_shader *nir, struct brw_vue_prog_data *prog_data, void *mem_ctx, const cfg_t *cfg, unsigned *final_assembly_size) { vec4_generator 
g(brw->intelScreen->compiler, brw, - shader_prog, prog, prog_data, mem_ctx, + prog_data, mem_ctx, INTEL_DEBUG & DEBUG_GS, "geometry", "GS"); - return g.generate_assembly(cfg, final_assembly_size); + return g.generate_assembly(cfg, final_assembly_size, nir); } extern "C" const unsigned * @@ -634,7 +633,7 @@ brw_gs_emit(struct brw_context *brw, c, shader->Program->nir, mem_ctx, true /* no_spills */, shader_time_index); if (v.run()) { - return generate_assembly(brw, prog, &c->gp->program.Base, + return generate_assembly(brw, shader->Program->nir, &c->prog_data.base, mem_ctx, v.cfg, final_assembly_size); } @@ -687,7 +686,7 @@ brw_gs_emit(struct brw_context *brw, prog->LinkStatus = false; ralloc_strcat(&prog->InfoLog, gs->fail_msg); } else { - ret = generate_assembly(brw, prog, &c->gp->program.Base, + ret = generate_assembly(brw, shader->Program->nir, &c->prog_data.base, mem_ctx, gs->cfg, final_assembly_size); } From 0ca401327ef9e280b3a8b008f1e41477afec3a35 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 5 Oct 2015 19:26:02 -0700 Subject: [PATCH 259/270] i965: Use a const nir_shader in backend_shader Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/brw_fs.h | 2 +- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 +- src/mesa/drivers/dri/i965/brw_shader.cpp | 2 +- src/mesa/drivers/dri/i965/brw_shader.h | 4 ++-- src/mesa/drivers/dri/i965/brw_vec4.h | 2 +- src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 2 +- src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h | 2 +- src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 2 +- src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp | 2 +- src/mesa/drivers/dri/i965/brw_vs.h | 2 +- src/mesa/drivers/dri/i965/gen6_gs_visitor.h | 2 +- 11 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 0da5a7625a2..171338dcc0b 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -96,7 +96,7 @@ public: const void 
*key, struct brw_stage_prog_data *prog_data, struct gl_program *prog, - nir_shader *shader, + const nir_shader *shader, unsigned dispatch_width, int shader_time_index); diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index eac1ec0c932..23c99b7b912 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -1070,7 +1070,7 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, const void *key, struct brw_stage_prog_data *prog_data, struct gl_program *prog, - nir_shader *shader, + const nir_shader *shader, unsigned dispatch_width, int shader_time_index) : backend_shader(compiler, log_data, mem_ctx, shader, prog_data), diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 7ee0c66468c..2324b56f583 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -657,7 +657,7 @@ brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg) backend_shader::backend_shader(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, - nir_shader *shader, + const nir_shader *shader, struct brw_stage_prog_data *stage_prog_data) : compiler(compiler), log_data(log_data), diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index ad2de5eae2d..6d4cf048390 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -225,7 +225,7 @@ protected: backend_shader(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, - nir_shader *shader, + const nir_shader *shader, struct brw_stage_prog_data *stage_prog_data); public: @@ -234,7 +234,7 @@ public: void *log_data; /* Passed to compiler->*_log functions */ const struct brw_device_info * const devinfo; - nir_shader *nir; + const nir_shader *nir; struct brw_stage_prog_data * const stage_prog_data; /** ralloc context for 
temporary data used during compile */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index cf9ec0aa2ef..d861b2e85df 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -72,7 +72,7 @@ public: void *log_data, const struct brw_sampler_prog_key_data *key, struct brw_vue_prog_data *prog_data, - nir_shader *shader, + const nir_shader *shader, void *mem_ctx, bool no_spills, int shader_time_index); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index 5775cd276dc..b710d64f74e 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -35,7 +35,7 @@ namespace brw { vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler, void *log_data, struct brw_gs_compile *c, - nir_shader *shader, + const nir_shader *shader, void *mem_ctx, bool no_spills, int shader_time_index) diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h index 3ff195c3e68..d75e4802d8d 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h @@ -71,7 +71,7 @@ public: vec4_gs_visitor(const struct brw_compiler *compiler, void *log_data, struct brw_gs_compile *c, - nir_shader *shader, + const nir_shader *shader, void *mem_ctx, bool no_spills, int shader_time_index); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 7bc13fe29d6..f891910ae60 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1814,7 +1814,7 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, void *log_data, const struct brw_sampler_prog_key_data *key_tex, struct brw_vue_prog_data *prog_data, - nir_shader *shader, + const nir_shader *shader, void *mem_ctx, bool no_spills, 
int shader_time_index) diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp index b6e1971c2ee..485a80ee2fc 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp @@ -301,7 +301,7 @@ vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler, void *log_data, const struct brw_vs_prog_key *key, struct brw_vs_prog_data *vs_prog_data, - nir_shader *shader, + const nir_shader *shader, gl_clip_plane *clip_planes, void *mem_ctx, int shader_time_index, diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h index f1242f61b33..c927cacd787 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.h +++ b/src/mesa/drivers/dri/i965/brw_vs.h @@ -88,7 +88,7 @@ public: void *log_data, const struct brw_vs_prog_key *key, struct brw_vs_prog_data *vs_prog_data, - nir_shader *shader, + const nir_shader *shader, gl_clip_plane *clip_planes, void *mem_ctx, int shader_time_index, diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h index e75d6aa10b8..d02c67d8a74 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h +++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h @@ -39,7 +39,7 @@ public: void *log_data, struct brw_gs_compile *c, struct gl_shader_program *prog, - nir_shader *shader, + const nir_shader *shader, void *mem_ctx, bool no_spills, int shader_time_index) : From 22ad44910e993e1acd0b4052722fe786626008b5 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 5 Oct 2015 19:27:28 -0700 Subject: [PATCH 260/270] i965/fs: Rework wm_fs_emit to take a nir_shader and a brw_compiler This commit removes all dependence on GL state by getting rid of the brw_context parameter and the GL data structures. 
Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/brw_fs.cpp | 59 ++++++++++++---------------- src/mesa/drivers/dri/i965/brw_wm.c | 14 ++++++- src/mesa/drivers/dri/i965/brw_wm.h | 13 ++++-- 3 files changed, 47 insertions(+), 39 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 638a3c685b3..d37a9ed0b55 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -5093,40 +5093,39 @@ fs_visitor::run_cs() } const unsigned * -brw_wm_fs_emit(struct brw_context *brw, +brw_wm_fs_emit(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, - struct gl_fragment_program *fp, - struct gl_shader_program *prog, + const nir_shader *shader, + struct gl_program *prog, int shader_time_index8, int shader_time_index16, - unsigned *final_assembly_size) + bool use_rep_send, + unsigned *final_assembly_size, + char **error_str) { - /* Now the main event: Visit the shader IR and generate our FS IR for it. 
- */ - fs_visitor v(brw->intelScreen->compiler, brw, mem_ctx, key, - &prog_data->base, &fp->Base, fp->Base.nir, 8, shader_time_index8); + fs_visitor v(compiler, log_data, mem_ctx, key, + &prog_data->base, prog, shader, 8, + shader_time_index8); if (!v.run_fs(false /* do_rep_send */)) { - if (prog) { - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, v.fail_msg); - } - - _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", - v.fail_msg); + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); return NULL; } cfg_t *simd16_cfg = NULL; - fs_visitor v2(brw->intelScreen->compiler, brw, mem_ctx, key, - &prog_data->base, &fp->Base, fp->Base.nir, 16, shader_time_index16); - if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) { + fs_visitor v2(compiler, log_data, mem_ctx, key, + &prog_data->base, prog, shader, 16, + shader_time_index16); + if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) { if (!v.simd16_unsupported) { /* Try a SIMD16 compile */ v2.import_uniforms(&v); - if (!v2.run_fs(brw->use_rep_send)) { - perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg); + if (!v2.run_fs(use_rep_send)) { + compiler->shader_perf_log(log_data, + "SIMD16 shader failed to compile: %s", + v2.fail_msg); } else { simd16_cfg = v2.cfg; } @@ -5134,8 +5133,8 @@ brw_wm_fs_emit(struct brw_context *brw, } cfg_t *simd8_cfg; - int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8; - if ((no_simd8 || brw->gen < 5) && simd16_cfg) { + int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send; + if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) { simd8_cfg = NULL; prog_data->no_8 = true; } else { @@ -5143,20 +5142,14 @@ brw_wm_fs_emit(struct brw_context *brw, prog_data->no_8 = false; } - fs_generator g(brw->intelScreen->compiler, brw, - mem_ctx, (void *) key, &prog_data->base, + fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base, v.promoted_constants, v.runtime_check_aads_emit, "FS"); if (unlikely(INTEL_DEBUG 
& DEBUG_WM)) { - char *name; - if (prog) - name = ralloc_asprintf(mem_ctx, "%s fragment shader %d", - prog->Label ? prog->Label : "unnamed", - prog->Name); - else - name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id); - - g.enable_debug(name); + g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s", + shader->info.label ? shader->info.label : + "unnamed", + shader->info.name)); } if (simd8_cfg) diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 65de54335e8..c40fb0e4376 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -230,9 +230,19 @@ brw_codegen_wm_prog(struct brw_context *brw, st_index16 = brw_get_shader_time_index(brw, prog, &fp->program.Base, ST_FS16); } - program = brw_wm_fs_emit(brw, mem_ctx, key, &prog_data, - &fp->program, prog, st_index8, st_index16, &program_size); + char *error_str = NULL; + program = brw_wm_fs_emit(brw->intelScreen->compiler, brw, mem_ctx, + key, &prog_data, fp->program.Base.nir, + &fp->program.Base, st_index8, st_index16, + brw->use_rep_send, &program_size, &error_str); if (program == NULL) { + if (prog) { + prog->LinkStatus = false; + ralloc_strcat(&prog->InfoLog, error_str); + } + + _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", error_str); + ralloc_free(mem_ctx); return false; } diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h index 6ee22b2f907..ac22bee11ad 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.h +++ b/src/mesa/drivers/dri/i965/brw_wm.h @@ -61,20 +61,25 @@ extern "C" { #endif +struct nir_shader; + /** * Compile a fragment shader. * * Returns the final assembly and the program's size. 
*/ -const unsigned *brw_wm_fs_emit(struct brw_context *brw, +const unsigned *brw_wm_fs_emit(const struct brw_compiler *compiler, + void *log_data, void *mem_ctx, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, - struct gl_fragment_program *fp, - struct gl_shader_program *prog, + const struct nir_shader *shader, + struct gl_program *prog, int shader_time_index8, int shader_time_index16, - unsigned *final_assembly_size); + bool use_rep_send, + unsigned *final_assembly_size, + char **error_str); GLboolean brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog); struct gl_shader *brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type); From 5d8bf6de6166a686a006478a420bcd373860e9ee Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Oct 2015 13:53:33 -0700 Subject: [PATCH 261/270] i965/vs: Rework vs_emit to take a nir_shader and a brw_compiler This commit removes all dependence on GL state by getting rid of the brw_context parameter and the GL data structures. v2 (Jason Ekstrand): - Patch use_legacy_snorm_formula through as a function argument rather than trying to go through the shader key. Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/brw_vec4.cpp | 70 ++++++++++---------------- src/mesa/drivers/dri/i965/brw_vs.c | 16 +++++- src/mesa/drivers/dri/i965/brw_vs.h | 12 +++-- 3 files changed, 49 insertions(+), 49 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index bcd1f487f0b..1b3bce82097 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1920,51 +1920,42 @@ extern "C" { * Returns the final assembly and the program's size. 
*/ const unsigned * -brw_vs_emit(struct brw_context *brw, +brw_vs_emit(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct brw_vs_prog_key *key, struct brw_vs_prog_data *prog_data, - struct gl_vertex_program *vp, - struct gl_shader_program *prog, + const nir_shader *shader, + gl_clip_plane *clip_planes, + bool use_legacy_snorm_formula, int shader_time_index, - unsigned *final_assembly_size) + unsigned *final_assembly_size, + char **error_str) { const unsigned *assembly = NULL; - if (brw->intelScreen->compiler->scalar_vs) { + if (compiler->scalar_vs) { prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; - fs_visitor v(brw->intelScreen->compiler, brw, - mem_ctx, key, &prog_data->base.base, + fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base, NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */ - vp->Base.nir, 8, shader_time_index); - if (!v.run_vs(brw_select_clip_planes(&brw->ctx))) { - if (prog) { - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, v.fail_msg); - } - - _mesa_problem(NULL, "Failed to compile vertex shader: %s\n", - v.fail_msg); + shader, 8, shader_time_index); + if (!v.run_vs(clip_planes)) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); return NULL; } - fs_generator g(brw->intelScreen->compiler, brw, - mem_ctx, (void *) key, &prog_data->base.base, - v.promoted_constants, + fs_generator g(compiler, log_data, mem_ctx, (void *) key, + &prog_data->base.base, v.promoted_constants, v.runtime_check_aads_emit, "VS"); if (INTEL_DEBUG & DEBUG_VS) { - char *name; - if (prog) { - name = ralloc_asprintf(mem_ctx, "%s vertex shader %d", - prog->Label ? prog->Label : "unnamed", - prog->Name); - } else { - name = ralloc_asprintf(mem_ctx, "vertex program %d", - vp->Base.Id); - } - g.enable_debug(name); + const char *debug_name = + ralloc_asprintf(mem_ctx, "%s vertex shader %s", + shader->info.label ? 
shader->info.label : "unnamed", + shader->info.name); + + g.enable_debug(debug_name); } g.generate_code(v.cfg, 8); assembly = g.get_assembly(final_assembly_size); @@ -1973,26 +1964,19 @@ brw_vs_emit(struct brw_context *brw, if (!assembly) { prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; - vec4_vs_visitor v(brw->intelScreen->compiler, brw, key, prog_data, - vp->Base.nir, brw_select_clip_planes(&brw->ctx), - mem_ctx, shader_time_index, - !_mesa_is_gles3(&brw->ctx)); + vec4_vs_visitor v(compiler, log_data, key, prog_data, + shader, clip_planes, mem_ctx, + shader_time_index, use_legacy_snorm_formula); if (!v.run()) { - if (prog) { - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, v.fail_msg); - } - - _mesa_problem(NULL, "Failed to compile vertex shader: %s\n", - v.fail_msg); + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); return NULL; } - vec4_generator g(brw->intelScreen->compiler, brw, - &prog_data->base, + vec4_generator g(compiler, log_data, &prog_data->base, mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS"); - assembly = g.generate_assembly(v.cfg, final_assembly_size, vp->Base.nir); + assembly = g.generate_assembly(v.cfg, final_assembly_size, shader); } return assembly; diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 725311732ce..353c5a04968 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -31,6 +31,7 @@ #include "main/compiler.h" +#include "main/context.h" #include "brw_context.h" #include "brw_vs.h" #include "brw_util.h" @@ -201,9 +202,20 @@ brw_codegen_vs_prog(struct brw_context *brw, /* Emit GEN4 code. 
*/ - program = brw_vs_emit(brw, mem_ctx, key, &prog_data, - &vp->program, prog, st_index, &program_size); + char *error_str; + program = brw_vs_emit(brw->intelScreen->compiler, brw, mem_ctx, key, + &prog_data, vp->program.Base.nir, + brw_select_clip_planes(&brw->ctx), + !_mesa_is_gles3(&brw->ctx), + st_index, &program_size, &error_str); if (program == NULL) { + if (prog) { + prog->LinkStatus = false; + ralloc_strcat(&prog->InfoLog, error_str); + } + + _mesa_problem(NULL, "Failed to compile vertex shader: %s\n", error_str); + ralloc_free(mem_ctx); return false; } diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h index c927cacd787..b65dd3b6012 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.h +++ b/src/mesa/drivers/dri/i965/brw_vs.h @@ -54,14 +54,18 @@ extern "C" { #endif -const unsigned *brw_vs_emit(struct brw_context *brw, +struct nir_shader; + +const unsigned *brw_vs_emit(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct brw_vs_prog_key *key, struct brw_vs_prog_data *prog_data, - struct gl_vertex_program *vp, - struct gl_shader_program *shader_prog, + const struct nir_shader *shader, + gl_clip_plane *clip_planes, + bool use_legacy_snorm_formula, int shader_time_index, - unsigned *program_size); + unsigned *final_assembly_size, + char **error_str); void brw_vs_debug_recompile(struct brw_context *brw, struct gl_shader_program *prog, const struct brw_vs_prog_key *key); From 657863bb5c895fac7f5e52dfd025d07bf52f94a8 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Oct 2015 14:39:56 -0700 Subject: [PATCH 262/270] i965/gs: Rework gs_emit to take a nir_shader and a brw_compiler This commit removes all dependence on GL state by getting rid of the brw_context parameter and the GL data structures. Unfortunately, we still have to pass in the gl_shader_program for gen6 because it's needed for transform feedback. 
Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/brw_gs.c | 6 +- .../drivers/dri/i965/brw_vec4_gs_visitor.cpp | 55 +++++++------------ .../drivers/dri/i965/brw_vec4_gs_visitor.h | 8 ++- 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c index e0165fb4a23..469a6fbcc7f 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.c +++ b/src/mesa/drivers/dri/i965/brw_gs.c @@ -57,6 +57,7 @@ brw_codegen_gs_prog(struct brw_context *brw, struct brw_geometry_program *gp, struct brw_gs_prog_key *key) { + struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; struct brw_stage_state *stage_state = &brw->gs.base; struct brw_gs_compile c; memset(&c, 0, sizeof(c)); @@ -300,8 +301,11 @@ brw_codegen_gs_prog(struct brw_context *brw, void *mem_ctx = ralloc_context(NULL); unsigned program_size; + char *error_str; const unsigned *program = - brw_gs_emit(brw, prog, &c, mem_ctx, st_index, &program_size); + brw_gs_emit(brw->intelScreen->compiler, brw, &c, + shader->Program->nir, prog, + mem_ctx, st_index, &program_size, &error_str); if (program == NULL) { ralloc_free(mem_ctx); return false; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index b710d64f74e..8fc6a08f386 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -596,31 +596,17 @@ vec4_gs_visitor::gs_end_primitive() emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); } -static const unsigned * -generate_assembly(struct brw_context *brw, - const nir_shader *nir, - struct brw_vue_prog_data *prog_data, - void *mem_ctx, - const cfg_t *cfg, - unsigned *final_assembly_size) -{ - vec4_generator g(brw->intelScreen->compiler, brw, - prog_data, mem_ctx, - INTEL_DEBUG & DEBUG_GS, "geometry", "GS"); - return g.generate_assembly(cfg, final_assembly_size, nir); -} - extern "C" const unsigned 
* -brw_gs_emit(struct brw_context *brw, - struct gl_shader_program *prog, +brw_gs_emit(const struct brw_compiler *compiler, void *log_data, struct brw_gs_compile *c, + const nir_shader *shader, + struct gl_shader_program *shader_prog, void *mem_ctx, int shader_time_index, - unsigned *final_assembly_size) + unsigned *final_assembly_size, + char **error_str) { - struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; - - if (brw->gen >= 7) { + if (compiler->devinfo->gen >= 7) { /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do * so without spilling. If the GS invocations count > 1, then we can't use * dual object mode. @@ -629,13 +615,12 @@ brw_gs_emit(struct brw_context *brw, likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) { c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; - vec4_gs_visitor v(brw->intelScreen->compiler, brw, - c, shader->Program->nir, + vec4_gs_visitor v(compiler, log_data, c, shader, mem_ctx, true /* no_spills */, shader_time_index); if (v.run()) { - return generate_assembly(brw, shader->Program->nir, - &c->prog_data.base, mem_ctx, v.cfg, - final_assembly_size); + vec4_generator g(compiler, log_data, &c->prog_data.base, mem_ctx, + INTEL_DEBUG & DEBUG_GS, "geometry", "GS"); + return g.generate_assembly(v.cfg, final_assembly_size, shader); } } } @@ -663,7 +648,7 @@ brw_gs_emit(struct brw_context *brw, * mode is more performant when invocations > 1. Gen6 only supports * SINGLE mode. 
*/ - if (c->prog_data.invocations <= 1 || brw->gen < 7) + if (c->prog_data.invocations <= 1 || compiler->devinfo->gen < 7) c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE; else c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE; @@ -671,24 +656,22 @@ brw_gs_emit(struct brw_context *brw, vec4_gs_visitor *gs = NULL; const unsigned *ret = NULL; - if (brw->gen >= 7) - gs = new vec4_gs_visitor(brw->intelScreen->compiler, brw, - c, shader->Program->nir, + if (compiler->devinfo->gen >= 7) + gs = new vec4_gs_visitor(compiler, log_data, c, shader, mem_ctx, false /* no_spills */, shader_time_index); else - gs = new gen6_gs_visitor(brw->intelScreen->compiler, brw, - c, prog, shader->Program->nir, + gs = new gen6_gs_visitor(compiler, log_data, c, shader_prog, shader, mem_ctx, false /* no_spills */, shader_time_index); if (!gs->run()) { - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, gs->fail_msg); + if (error_str) + *error_str = ralloc_strdup(mem_ctx, gs->fail_msg); } else { - ret = generate_assembly(brw, shader->Program->nir, - &c->prog_data.base, mem_ctx, gs->cfg, - final_assembly_size); + vec4_generator g(compiler, log_data, &c->prog_data.base, mem_ctx, + INTEL_DEBUG & DEBUG_GS, "geometry", "GS"); + ret = g.generate_assembly(gs->cfg, final_assembly_size, shader); } delete gs; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h index d75e4802d8d..e9ced7f04f8 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h @@ -51,12 +51,14 @@ struct brw_gs_compile extern "C" { #endif -const unsigned *brw_gs_emit(struct brw_context *brw, - struct gl_shader_program *prog, +const unsigned *brw_gs_emit(const struct brw_compiler *compiler, void *log_data, struct brw_gs_compile *c, + const nir_shader *shader, + struct gl_shader_program *shader_prog, void *mem_ctx, int shader_time_index, - unsigned *final_assembly_size); + unsigned 
*final_assembly_size, + char **error_str); #ifdef __cplusplus } /* extern "C" */ From 4e711872d024ce41c8b07b1150d8a393de21e26d Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Oct 2015 15:28:26 -0700 Subject: [PATCH 263/270] i965/cs: Rework cs_emit to take a nir_shader and a brw_compiler This commit removes all dependence on GL state by getting rid of the brw_context parameter and the GL data structures. Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/brw_cs.c | 10 ++++-- src/mesa/drivers/dri/i965/brw_cs.h | 10 +++--- src/mesa/drivers/dri/i965/brw_fs.cpp | 51 +++++++++++++++------------- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c index 45fb816c160..12e75097f8b 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.c +++ b/src/mesa/drivers/dri/i965/brw_cs.c @@ -105,9 +105,15 @@ brw_codegen_cs_prog(struct brw_context *brw, if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, prog, &cp->program.Base, ST_CS); - program = brw_cs_emit(brw, mem_ctx, key, &prog_data, - &cp->program, prog, st_index, &program_size); + char *error_str; + program = brw_cs_emit(brw->intelScreen->compiler, brw, mem_ctx, + key, &prog_data, cp->program.Base.nir, + st_index, &program_size, &error_str); if (program == NULL) { + prog->LinkStatus = false; + ralloc_strcat(&prog->InfoLog, error_str); + _mesa_problem(NULL, "Failed to compile compute shader: %s\n", error_str); + ralloc_free(mem_ctx); return false; } diff --git a/src/mesa/drivers/dri/i965/brw_cs.h b/src/mesa/drivers/dri/i965/brw_cs.h index 17c2ff9871a..1a9613e3039 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.h +++ b/src/mesa/drivers/dri/i965/brw_cs.h @@ -39,15 +39,17 @@ extern "C" { void brw_upload_cs_prog(struct brw_context *brw); +struct nir_shader; + const unsigned * -brw_cs_emit(struct brw_context *brw, +brw_cs_emit(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct 
brw_cs_prog_key *key, struct brw_cs_prog_data *prog_data, - struct gl_compute_program *cp, - struct gl_shader_program *prog, + const struct nir_shader *shader, int shader_time_index, - unsigned *final_assembly_size); + unsigned *final_assembly_size, + char **error_str); void brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index d37a9ed0b55..ce130dffad6 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -5234,29 +5234,32 @@ fs_visitor::emit_cs_work_group_id_setup() } const unsigned * -brw_cs_emit(struct brw_context *brw, +brw_cs_emit(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct brw_cs_prog_key *key, struct brw_cs_prog_data *prog_data, - struct gl_compute_program *cp, - struct gl_shader_program *prog, + const nir_shader *shader, int shader_time_index, - unsigned *final_assembly_size) + unsigned *final_assembly_size, + char **error_str) { - prog_data->local_size[0] = cp->LocalSize[0]; - prog_data->local_size[1] = cp->LocalSize[1]; - prog_data->local_size[2] = cp->LocalSize[2]; + prog_data->local_size[0] = shader->info.cs.local_size[0]; + prog_data->local_size[1] = shader->info.cs.local_size[1]; + prog_data->local_size[2] = shader->info.cs.local_size[2]; unsigned local_workgroup_size = - cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2]; - unsigned max_cs_threads = brw->intelScreen->compiler->devinfo->max_cs_threads; + shader->info.cs.local_size[0] * shader->info.cs.local_size[1] * + shader->info.cs.local_size[2]; + + unsigned max_cs_threads = compiler->devinfo->max_cs_threads; cfg_t *cfg = NULL; const char *fail_msg = NULL; /* Now the main event: Visit the shader IR and generate our CS IR for it. 
*/ - fs_visitor v8(brw->intelScreen->compiler, brw, mem_ctx, key, - &prog_data->base, &cp->Base, cp->Base.nir, 8, shader_time_index); + fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base, + NULL, /* Never used in core profile */ + shader, 8, shader_time_index); if (!v8.run_cs()) { fail_msg = v8.fail_msg; } else if (local_workgroup_size <= 8 * max_cs_threads) { @@ -5264,15 +5267,18 @@ brw_cs_emit(struct brw_context *brw, prog_data->simd_size = 8; } - fs_visitor v16(brw->intelScreen->compiler, brw, mem_ctx, key, - &prog_data->base, &cp->Base, cp->Base.nir, 16, shader_time_index); + fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base, + NULL, /* Never used in core profile */ + shader, 16, shader_time_index); if (likely(!(INTEL_DEBUG & DEBUG_NO16)) && !fail_msg && !v8.simd16_unsupported && local_workgroup_size <= 16 * max_cs_threads) { /* Try a SIMD16 compile */ v16.import_uniforms(&v8); if (!v16.run_cs()) { - perf_debug("SIMD16 shader failed to compile: %s", v16.fail_msg); + compiler->shader_perf_log(log_data, + "SIMD16 shader failed to compile: %s", + v16.fail_msg); if (!cfg) { fail_msg = "Couldn't generate SIMD16 program and not " @@ -5286,20 +5292,19 @@ brw_cs_emit(struct brw_context *brw, if (unlikely(cfg == NULL)) { assert(fail_msg); - prog->LinkStatus = false; - ralloc_strcat(&prog->InfoLog, fail_msg); - _mesa_problem(NULL, "Failed to compile compute shader: %s\n", - fail_msg); + if (error_str) + *error_str = ralloc_strdup(mem_ctx, fail_msg); + return NULL; } - fs_generator g(brw->intelScreen->compiler, brw, - mem_ctx, (void*) key, &prog_data->base, + fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base, v8.promoted_constants, v8.runtime_check_aads_emit, "CS"); if (INTEL_DEBUG & DEBUG_CS) { - char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d", - prog->Label ? prog->Label : "unnamed", - prog->Name); + char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s", + shader->info.label ? 
shader->info.label : + "unnamed", + shader->info.name); g.enable_debug(name); } From 67db9072b9fde74277f74f7303366b8bdd3a711e Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Oct 2015 16:01:44 -0700 Subject: [PATCH 264/270] i965/fs: Move some of the prog_data setup into brw_wm_emit This commit moves the common/modern stuff. Some legacy stuff such as setting use_alt_mode was left because it needs to know whether or not we're an ARB program. Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/brw_fs.cpp | 100 +++++++++++++++++++++++++++ src/mesa/drivers/dri/i965/brw_wm.c | 98 -------------------------- 2 files changed, 100 insertions(+), 98 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index ce130dffad6..2d162b8ae99 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -5092,6 +5092,90 @@ fs_visitor::run_cs() return !failed; } +/** + * Return a bitfield where bit n is set if barycentric interpolation mode n + * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader. + */ +static unsigned +brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo, + bool shade_model_flat, + bool persample_shading, + const nir_shader *shader) +{ + unsigned barycentric_interp_modes = 0; + + nir_foreach_variable(var, &shader->inputs) { + enum glsl_interp_qualifier interp_qualifier = + (enum glsl_interp_qualifier)var->data.interpolation; + bool is_centroid = var->data.centroid && !persample_shading; + bool is_sample = var->data.sample || persample_shading; + bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) || + (var->data.location == VARYING_SLOT_COL1); + + /* Ignore WPOS and FACE, because they don't require interpolation. 
*/ + if (var->data.location == VARYING_SLOT_POS || + var->data.location == VARYING_SLOT_FACE) + continue; + + /* Determine the set (or sets) of barycentric coordinates needed to + * interpolate this variable. Note that when + * brw->needs_unlit_centroid_workaround is set, centroid interpolation + * uses PIXEL interpolation for unlit pixels and CENTROID interpolation + * for lit pixels, so we need both sets of barycentric coordinates. + */ + if (interp_qualifier == INTERP_QUALIFIER_NOPERSPECTIVE) { + if (is_centroid) { + barycentric_interp_modes |= + 1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC; + } else if (is_sample) { + barycentric_interp_modes |= + 1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC; + } + if ((!is_centroid && !is_sample) || + devinfo->needs_unlit_centroid_workaround) { + barycentric_interp_modes |= + 1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; + } + } else if (interp_qualifier == INTERP_QUALIFIER_SMOOTH || + (!(shade_model_flat && is_gl_Color) && + interp_qualifier == INTERP_QUALIFIER_NONE)) { + if (is_centroid) { + barycentric_interp_modes |= + 1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC; + } else if (is_sample) { + barycentric_interp_modes |= + 1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC; + } + if ((!is_centroid && !is_sample) || + devinfo->needs_unlit_centroid_workaround) { + barycentric_interp_modes |= + 1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; + } + } + } + + return barycentric_interp_modes; +} + +static uint8_t +computed_depth_mode(const nir_shader *shader) +{ + if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + switch (shader->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_NONE: + case FRAG_DEPTH_LAYOUT_ANY: + return BRW_PSCDEPTH_ON; + case FRAG_DEPTH_LAYOUT_GREATER: + return BRW_PSCDEPTH_ON_GE; + case FRAG_DEPTH_LAYOUT_LESS: + return BRW_PSCDEPTH_ON_LE; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + return BRW_PSCDEPTH_OFF; + } + } + return BRW_PSCDEPTH_OFF; +} + const unsigned * brw_wm_fs_emit(const struct brw_compiler 
*compiler, void *log_data, void *mem_ctx, @@ -5104,6 +5188,22 @@ brw_wm_fs_emit(const struct brw_compiler *compiler, void *log_data, unsigned *final_assembly_size, char **error_str) { + /* key->alpha_test_func means simulating alpha testing via discards, + * so the shader definitely kills pixels. + */ + prog_data->uses_kill = shader->info.fs.uses_discard || key->alpha_test_func; + prog_data->uses_omask = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); + prog_data->computed_depth_mode = computed_depth_mode(shader); + + prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; + + prog_data->barycentric_interp_modes = + brw_compute_barycentric_interp_modes(compiler->devinfo, + key->flat_shade, + key->persample_shading, + shader); + fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base, prog, shader, 8, shader_time_index8); diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index c40fb0e4376..93f13445349 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -39,89 +39,6 @@ #include "util/ralloc.h" -/** - * Return a bitfield where bit n is set if barycentric interpolation mode n - * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader. - */ -static unsigned -brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo, - bool shade_model_flat, - bool persample_shading, - nir_shader *shader) -{ - unsigned barycentric_interp_modes = 0; - - nir_foreach_variable(var, &shader->inputs) { - enum glsl_interp_qualifier interp_qualifier = var->data.interpolation; - bool is_centroid = var->data.centroid && !persample_shading; - bool is_sample = var->data.sample || persample_shading; - bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) || - (var->data.location == VARYING_SLOT_COL1); - - /* Ignore WPOS and FACE, because they don't require interpolation. 
*/ - if (var->data.location == VARYING_SLOT_POS || - var->data.location == VARYING_SLOT_FACE) - continue; - - /* Determine the set (or sets) of barycentric coordinates needed to - * interpolate this variable. Note that when - * brw->needs_unlit_centroid_workaround is set, centroid interpolation - * uses PIXEL interpolation for unlit pixels and CENTROID interpolation - * for lit pixels, so we need both sets of barycentric coordinates. - */ - if (interp_qualifier == INTERP_QUALIFIER_NOPERSPECTIVE) { - if (is_centroid) { - barycentric_interp_modes |= - 1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC; - } else if (is_sample) { - barycentric_interp_modes |= - 1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC; - } - if ((!is_centroid && !is_sample) || - devinfo->needs_unlit_centroid_workaround) { - barycentric_interp_modes |= - 1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; - } - } else if (interp_qualifier == INTERP_QUALIFIER_SMOOTH || - (!(shade_model_flat && is_gl_Color) && - interp_qualifier == INTERP_QUALIFIER_NONE)) { - if (is_centroid) { - barycentric_interp_modes |= - 1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC; - } else if (is_sample) { - barycentric_interp_modes |= - 1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC; - } - if ((!is_centroid && !is_sample) || - devinfo->needs_unlit_centroid_workaround) { - barycentric_interp_modes |= - 1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; - } - } - } - - return barycentric_interp_modes; -} - -static uint8_t -computed_depth_mode(struct gl_fragment_program *fp) -{ - if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { - switch (fp->FragDepthLayout) { - case FRAG_DEPTH_LAYOUT_NONE: - case FRAG_DEPTH_LAYOUT_ANY: - return BRW_PSCDEPTH_ON; - case FRAG_DEPTH_LAYOUT_GREATER: - return BRW_PSCDEPTH_ON_GE; - case FRAG_DEPTH_LAYOUT_LESS: - return BRW_PSCDEPTH_ON_LE; - case FRAG_DEPTH_LAYOUT_UNCHANGED: - return BRW_PSCDEPTH_OFF; - } - } - return BRW_PSCDEPTH_OFF; -} - static void assign_fs_binding_table_offsets(const struct 
brw_device_info *devinfo, const struct gl_shader_program *shader_prog, @@ -166,15 +83,6 @@ brw_codegen_wm_prog(struct brw_context *brw, fs = (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; memset(&prog_data, 0, sizeof(prog_data)); - /* key->alpha_test_func means simulating alpha testing via discards, - * so the shader definitely kills pixels. - */ - prog_data.uses_kill = fp->program.UsesKill || key->alpha_test_func; - prog_data.uses_omask = - fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); - prog_data.computed_depth_mode = computed_depth_mode(&fp->program); - - prog_data.early_fragment_tests = fs && fs->base.EarlyFragmentTests; /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */ if (!prog) @@ -209,12 +117,6 @@ brw_codegen_wm_prog(struct brw_context *brw, &prog_data.base); } - prog_data.barycentric_interp_modes = - brw_compute_barycentric_interp_modes(brw->intelScreen->devinfo, - key->flat_shade, - key->persample_shading, - fp->program.Base.nir); - if (unlikely(brw->perf_debug)) { start_busy = (brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo)); From 4467344c829f1dccdf74e27bef2c5fda72552be6 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Oct 2015 16:20:34 -0700 Subject: [PATCH 265/270] i965: Rename brw_foo_emit to brw_compile_foo Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/brw_cs.c | 6 +++--- src/mesa/drivers/dri/i965/brw_cs.h | 16 +++++++-------- src/mesa/drivers/dri/i965/brw_fs.cpp | 18 ++++++++--------- src/mesa/drivers/dri/i965/brw_gs.c | 6 +++--- src/mesa/drivers/dri/i965/brw_vec4.cpp | 20 +++++++++---------- .../drivers/dri/i965/brw_vec4_gs_visitor.cpp | 16 +++++++-------- .../drivers/dri/i965/brw_vec4_gs_visitor.h | 17 ++++++++-------- src/mesa/drivers/dri/i965/brw_vs.c | 10 +++++----- src/mesa/drivers/dri/i965/brw_vs.h | 20 +++++++++---------- src/mesa/drivers/dri/i965/brw_wm.c | 2 +- src/mesa/drivers/dri/i965/brw_wm.h | 2 +- 11 files changed, 67 insertions(+), 
66 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c index 12e75097f8b..263d224e882 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.c +++ b/src/mesa/drivers/dri/i965/brw_cs.c @@ -106,9 +106,9 @@ brw_codegen_cs_prog(struct brw_context *brw, st_index = brw_get_shader_time_index(brw, prog, &cp->program.Base, ST_CS); char *error_str; - program = brw_cs_emit(brw->intelScreen->compiler, brw, mem_ctx, - key, &prog_data, cp->program.Base.nir, - st_index, &program_size, &error_str); + program = brw_compile_cs(brw->intelScreen->compiler, brw, mem_ctx, + key, &prog_data, cp->program.Base.nir, + st_index, &program_size, &error_str); if (program == NULL) { prog->LinkStatus = false; ralloc_strcat(&prog->InfoLog, error_str); diff --git a/src/mesa/drivers/dri/i965/brw_cs.h b/src/mesa/drivers/dri/i965/brw_cs.h index 1a9613e3039..aac519f1fd1 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.h +++ b/src/mesa/drivers/dri/i965/brw_cs.h @@ -42,14 +42,14 @@ brw_upload_cs_prog(struct brw_context *brw); struct nir_shader; const unsigned * -brw_cs_emit(const struct brw_compiler *compiler, void *log_data, - void *mem_ctx, - const struct brw_cs_prog_key *key, - struct brw_cs_prog_data *prog_data, - const struct nir_shader *shader, - int shader_time_index, - unsigned *final_assembly_size, - char **error_str); +brw_compile_cs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_cs_prog_key *key, + struct brw_cs_prog_data *prog_data, + const struct nir_shader *shader, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); void brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 2d162b8ae99..3c767ce58f0 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -5177,7 +5177,7 @@ computed_depth_mode(const nir_shader *shader) } const unsigned * 
-brw_wm_fs_emit(const struct brw_compiler *compiler, void *log_data, +brw_compile_fs(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, @@ -5334,14 +5334,14 @@ fs_visitor::emit_cs_work_group_id_setup() } const unsigned * -brw_cs_emit(const struct brw_compiler *compiler, void *log_data, - void *mem_ctx, - const struct brw_cs_prog_key *key, - struct brw_cs_prog_data *prog_data, - const nir_shader *shader, - int shader_time_index, - unsigned *final_assembly_size, - char **error_str) +brw_compile_cs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_cs_prog_key *key, + struct brw_cs_prog_data *prog_data, + const nir_shader *shader, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) { prog_data->local_size[0] = shader->info.cs.local_size[0]; prog_data->local_size[1] = shader->info.cs.local_size[1]; diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c index 469a6fbcc7f..10a7f28fdab 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.c +++ b/src/mesa/drivers/dri/i965/brw_gs.c @@ -303,9 +303,9 @@ brw_codegen_gs_prog(struct brw_context *brw, unsigned program_size; char *error_str; const unsigned *program = - brw_gs_emit(brw->intelScreen->compiler, brw, &c, - shader->Program->nir, prog, - mem_ctx, st_index, &program_size, &error_str); + brw_compile_gs(brw->intelScreen->compiler, brw, &c, + shader->Program->nir, prog, + mem_ctx, st_index, &program_size, &error_str); if (program == NULL) { ralloc_free(mem_ctx); return false; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 1b3bce82097..ca4d23a490c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1920,16 +1920,16 @@ extern "C" { * Returns the final assembly and the program's size. 
*/ const unsigned * -brw_vs_emit(const struct brw_compiler *compiler, void *log_data, - void *mem_ctx, - const struct brw_vs_prog_key *key, - struct brw_vs_prog_data *prog_data, - const nir_shader *shader, - gl_clip_plane *clip_planes, - bool use_legacy_snorm_formula, - int shader_time_index, - unsigned *final_assembly_size, - char **error_str) +brw_compile_vs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + const nir_shader *shader, + gl_clip_plane *clip_planes, + bool use_legacy_snorm_formula, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) { const unsigned *assembly = NULL; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index 8fc6a08f386..a715cf5a6cb 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -597,14 +597,14 @@ vec4_gs_visitor::gs_end_primitive() } extern "C" const unsigned * -brw_gs_emit(const struct brw_compiler *compiler, void *log_data, - struct brw_gs_compile *c, - const nir_shader *shader, - struct gl_shader_program *shader_prog, - void *mem_ctx, - int shader_time_index, - unsigned *final_assembly_size, - char **error_str) +brw_compile_gs(const struct brw_compiler *compiler, void *log_data, + struct brw_gs_compile *c, + const nir_shader *shader, + struct gl_shader_program *shader_prog, + void *mem_ctx, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) { if (compiler->devinfo->gen >= 7) { /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h index e9ced7f04f8..df33e941d24 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h @@ -51,14 +51,15 @@ struct brw_gs_compile 
extern "C" { #endif -const unsigned *brw_gs_emit(const struct brw_compiler *compiler, void *log_data, - struct brw_gs_compile *c, - const nir_shader *shader, - struct gl_shader_program *shader_prog, - void *mem_ctx, - int shader_time_index, - unsigned *final_assembly_size, - char **error_str); +const unsigned *brw_compile_gs(const struct brw_compiler *compiler, + void *log_data, + struct brw_gs_compile *c, + const nir_shader *shader, + struct gl_shader_program *shader_prog, + void *mem_ctx, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); #ifdef __cplusplus } /* extern "C" */ diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 353c5a04968..41290463257 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -203,11 +203,11 @@ brw_codegen_vs_prog(struct brw_context *brw, /* Emit GEN4 code. */ char *error_str; - program = brw_vs_emit(brw->intelScreen->compiler, brw, mem_ctx, key, - &prog_data, vp->program.Base.nir, - brw_select_clip_planes(&brw->ctx), - !_mesa_is_gles3(&brw->ctx), - st_index, &program_size, &error_str); + program = brw_compile_vs(brw->intelScreen->compiler, brw, mem_ctx, key, + &prog_data, vp->program.Base.nir, + brw_select_clip_planes(&brw->ctx), + !_mesa_is_gles3(&brw->ctx), + st_index, &program_size, &error_str); if (program == NULL) { if (prog) { prog->LinkStatus = false; diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h index b65dd3b6012..aebb76f7bd0 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.h +++ b/src/mesa/drivers/dri/i965/brw_vs.h @@ -56,16 +56,16 @@ extern "C" { struct nir_shader; -const unsigned *brw_vs_emit(const struct brw_compiler *compiler, void *log_data, - void *mem_ctx, - const struct brw_vs_prog_key *key, - struct brw_vs_prog_data *prog_data, - const struct nir_shader *shader, - gl_clip_plane *clip_planes, - bool use_legacy_snorm_formula, - int shader_time_index, - unsigned 
*final_assembly_size, - char **error_str); +const unsigned *brw_compile_vs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + const struct nir_shader *shader, + gl_clip_plane *clip_planes, + bool use_legacy_snorm_formula, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); void brw_vs_debug_recompile(struct brw_context *brw, struct gl_shader_program *prog, const struct brw_vs_prog_key *key); diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 93f13445349..5c49db9e63e 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -133,7 +133,7 @@ brw_codegen_wm_prog(struct brw_context *brw, } char *error_str = NULL; - program = brw_wm_fs_emit(brw->intelScreen->compiler, brw, mem_ctx, + program = brw_compile_fs(brw->intelScreen->compiler, brw, mem_ctx, key, &prog_data, fp->program.Base.nir, &fp->program.Base, st_index8, st_index16, brw->use_rep_send, &program_size, &error_str); diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h index ac22bee11ad..7e4bf35d089 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.h +++ b/src/mesa/drivers/dri/i965/brw_wm.h @@ -68,7 +68,7 @@ struct nir_shader; * * Returns the final assembly and the program's size. */ -const unsigned *brw_wm_fs_emit(const struct brw_compiler *compiler, +const unsigned *brw_compile_fs(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct brw_wm_prog_key *key, From 6980372010ad5929c0b4b0a0370d281cbd6f8b2e Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 8 Oct 2015 17:09:54 -0700 Subject: [PATCH 266/270] i965: Move the entire compiler API into a single file At this point, the compiler API has been substantially simplified. 
In the spirit of Kristian's making a compiler library, this commit makes a single header file that contains, more-or-less, the entire compiler API. There's still a bit of cleanup to do particularly in the area of geometry shaders. However, this gets us much closer to having a separate compiler. Reviewed-by: Topi Pohjolainen --- src/mesa/drivers/dri/i965/Makefile.sources | 1 + src/mesa/drivers/dri/i965/brw_compiler.h | 661 ++++++++++++++++++ src/mesa/drivers/dri/i965/brw_context.c | 2 +- src/mesa/drivers/dri/i965/brw_context.h | 356 +--------- src/mesa/drivers/dri/i965/brw_cs.h | 17 - src/mesa/drivers/dri/i965/brw_program.h | 124 +--- src/mesa/drivers/dri/i965/brw_shader.h | 58 -- .../drivers/dri/i965/brw_vec4_gs_visitor.h | 33 - src/mesa/drivers/dri/i965/brw_vs.h | 12 - src/mesa/drivers/dri/i965/brw_wm.h | 20 - 10 files changed, 665 insertions(+), 619 deletions(-) create mode 100644 src/mesa/drivers/dri/i965/brw_compiler.h diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index 81ef6283fa1..c2438bda356 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -1,6 +1,7 @@ i965_compiler_FILES = \ brw_cfg.cpp \ brw_cfg.h \ + brw_compiler.h \ brw_cubemap_normalize.cpp \ brw_dead_control_flow.cpp \ brw_dead_control_flow.h \ diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h new file mode 100644 index 00000000000..11c485d2f08 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -0,0 +1,661 @@ +/* + * Copyright © 2010 - 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to 
whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include "brw_device_info.h" +#include "main/mtypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ra_regs; +struct nir_shader; +struct brw_geometry_program; +union gl_constant_value; + +struct brw_compiler { + const struct brw_device_info *devinfo; + + struct { + struct ra_regs *regs; + + /** + * Array of the ra classes for the unaligned contiguous register + * block sizes used. + */ + int *classes; + + /** + * Mapping for register-allocated objects in *regs to the first + * GRF for that object. + */ + uint8_t *ra_reg_to_grf; + } vec4_reg_set; + + struct { + struct ra_regs *regs; + + /** + * Array of the ra classes for the unaligned contiguous register + * block sizes used, indexed by register size. + */ + int classes[16]; + + /** + * Mapping from classes to ra_reg ranges. Each of the per-size + * classes corresponds to a range of ra_reg nodes. This array stores + * those ranges in the form of first ra_reg in each class and the + * total number of ra_reg elements in the last array element. 
This + * way the range of the i'th class is given by: + * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] ) + */ + int class_to_ra_reg_range[17]; + + /** + * Mapping for register-allocated objects in *regs to the first + * GRF for that object. + */ + uint8_t *ra_reg_to_grf; + + /** + * ra class for the aligned pairs we use for PLN, which doesn't + * appear in *classes. + */ + int aligned_pairs_class; + } fs_reg_sets[2]; + + void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); + void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); + + bool scalar_vs; + struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES]; +}; + + +/** + * Program key structures. + * + * When drawing, we look for the currently bound shaders in the program + * cache. This is essentially a hash table lookup, and these are the keys. + * + * Sometimes OpenGL features specified as state need to be simulated via + * shader code, due to a mismatch between the API and the hardware. This + * is often referred to as "non-orthagonal state" or "NOS". We store NOS + * in the program key so it's considered when searching for a program. If + * we haven't seen a particular combination before, we have to recompile a + * new specialized version. + * + * Shader compilation should not look up state in gl_context directly, but + * instead use the copy in the program key. This guarantees recompiles will + * happen correctly. + * + * @{ + */ + +enum PACKED gen6_gather_sampler_wa { + WA_SIGN = 1, /* whether we need to sign extend */ + WA_8BIT = 2, /* if we have an 8bit format needing wa */ + WA_16BIT = 4, /* if we have a 16bit format needing wa */ +}; + +/** + * Sampler information needed by VS, WM, and GS program cache keys. + */ +struct brw_sampler_prog_key_data { + /** + * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles. + */ + uint16_t swizzles[MAX_SAMPLERS]; + + uint32_t gl_clamp_mask[3]; + + /** + * For RG32F, gather4's channel select is broken. 
+ */ + uint32_t gather_channel_quirk_mask; + + /** + * Whether this sampler uses the compressed multisample surface layout. + */ + uint32_t compressed_multisample_layout_mask; + + /** + * For Sandybridge, which shader w/a we need for gather quirks. + */ + enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS]; +}; + + +/** The program key for Vertex Shaders. */ +struct brw_vs_prog_key { + unsigned program_string_id; + + /* + * Per-attribute workaround flags + */ + uint8_t gl_attrib_wa_flags[VERT_ATTRIB_MAX]; + + bool copy_edgeflag:1; + + bool clamp_vertex_color:1; + + /** + * How many user clipping planes are being uploaded to the vertex shader as + * push constants. + * + * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to + * clip distances. + */ + unsigned nr_userclip_plane_consts:4; + + /** + * For pre-Gen6 hardware, a bitfield indicating which texture coordinates + * are going to be replaced with point coordinates (as a consequence of a + * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)). Because + * our SF thread requires exact matching between VS outputs and FS inputs, + * these texture coordinates will need to be unconditionally included in + * the VUE, even if they aren't written by the vertex shader. + */ + uint8_t point_coord_replace; + + struct brw_sampler_prog_key_data tex; +}; + +/** The program key for Geometry Shaders. */ +struct brw_gs_prog_key +{ + unsigned program_string_id; + + struct brw_sampler_prog_key_data tex; +}; + +/** The program key for Fragment/Pixel Shaders. 
*/ +struct brw_wm_prog_key { + uint8_t iz_lookup; + bool stats_wm:1; + bool flat_shade:1; + bool persample_shading:1; + bool persample_2x:1; + unsigned nr_color_regions:5; + bool replicate_alpha:1; + bool render_to_fbo:1; + bool clamp_fragment_color:1; + bool compute_pos_offset:1; + bool compute_sample_id:1; + unsigned line_aa:2; + bool high_quality_derivatives:1; + + uint16_t drawable_height; + uint64_t input_slots_valid; + unsigned program_string_id; + GLenum alpha_test_func; /* < For Gen4/5 MRT alpha test */ + float alpha_test_ref; + + struct brw_sampler_prog_key_data tex; +}; + +struct brw_cs_prog_key { + uint32_t program_string_id; + struct brw_sampler_prog_key_data tex; +}; + +/* + * Image metadata structure as laid out in the shader parameter + * buffer. Entries have to be 16B-aligned for the vec4 back-end to be + * able to use them. That's okay because the padding and any unused + * entries [most of them except when we're doing untyped surface + * access] will be removed by the uniform packing pass. + */ +#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET 0 +#define BRW_IMAGE_PARAM_OFFSET_OFFSET 4 +#define BRW_IMAGE_PARAM_SIZE_OFFSET 8 +#define BRW_IMAGE_PARAM_STRIDE_OFFSET 12 +#define BRW_IMAGE_PARAM_TILING_OFFSET 16 +#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET 20 +#define BRW_IMAGE_PARAM_SIZE 24 + +struct brw_image_param { + /** Surface binding table index. */ + uint32_t surface_idx; + + /** Offset applied to the X and Y surface coordinates. */ + uint32_t offset[2]; + + /** Surface X, Y and Z dimensions. */ + uint32_t size[3]; + + /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in + * pixels, vertical slice stride in pixels. + */ + uint32_t stride[4]; + + /** Log2 of the tiling modulus in the X, Y and Z dimension. */ + uint32_t tiling[3]; + + /** + * Right shift to apply for bit 6 address swizzling. Two different + * swizzles can be specified and will be applied one after the other. 
The + * resulting address will be: + * + * addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^ + * (addr >> swizzling[1]))) + * + * Use \c 0xff if any of the swizzles is not required. + */ + uint32_t swizzling[2]; +}; + +struct brw_stage_prog_data { + struct { + /** size of our binding table. */ + uint32_t size_bytes; + + /** @{ + * surface indices for the various groups of surfaces + */ + uint32_t pull_constants_start; + uint32_t texture_start; + uint32_t gather_texture_start; + uint32_t ubo_start; + uint32_t ssbo_start; + uint32_t abo_start; + uint32_t image_start; + uint32_t shader_time_start; + /** @} */ + } binding_table; + + GLuint nr_params; /**< number of float params/constants */ + GLuint nr_pull_params; + unsigned nr_image_params; + + unsigned curb_read_length; + unsigned total_scratch; + + /** + * Register where the thread expects to find input data from the URB + * (typically uniforms, followed by vertex or fragment attributes). + */ + unsigned dispatch_grf_start_reg; + + bool use_alt_mode; /**< Use ALT floating point mode? Otherwise, IEEE. */ + + /* Pointers to tracked values (only valid once + * _mesa_load_state_parameters has been called at runtime). + */ + const union gl_constant_value **param; + const union gl_constant_value **pull_param; + + /** Image metadata passed to the shader as uniforms. */ + struct brw_image_param *image_param; +}; + +/* Data about a particular attempt to compile a program. Note that + * there can be many of these, each in a different GL state + * corresponding to a different brw_wm_prog_key struct, with different + * compiled programs. 
+ */ +struct brw_wm_prog_data { + struct brw_stage_prog_data base; + + GLuint num_varying_inputs; + + GLuint dispatch_grf_start_reg_16; + GLuint reg_blocks; + GLuint reg_blocks_16; + + struct { + /** @{ + * surface indices the WM-specific surfaces + */ + uint32_t render_target_start; + /** @} */ + } binding_table; + + uint8_t computed_depth_mode; + + bool early_fragment_tests; + bool no_8; + bool dual_src_blend; + bool uses_pos_offset; + bool uses_omask; + bool uses_kill; + bool pulls_bary; + uint32_t prog_offset_16; + + /** + * Mask of which interpolation modes are required by the fragment shader. + * Used in hardware setup on gen6+. + */ + uint32_t barycentric_interp_modes; + + /** + * Map from gl_varying_slot to the position within the FS setup data + * payload where the varying's attribute vertex deltas should be delivered. + * For varying slots that are not used by the FS, the value is -1. + */ + int urb_setup[VARYING_SLOT_MAX]; +}; + +struct brw_cs_prog_data { + struct brw_stage_prog_data base; + + GLuint dispatch_grf_start_reg_16; + unsigned local_size[3]; + unsigned simd_size; + bool uses_barrier; + bool uses_num_work_groups; + unsigned local_invocation_id_regs; + + struct { + /** @{ + * surface indices the CS-specific surfaces + */ + uint32_t work_groups_start; + /** @} */ + } binding_table; +}; + +/** + * Enum representing the i965-specific vertex results that don't correspond + * exactly to any element of gl_varying_slot. The values of this enum are + * assigned such that they don't conflict with gl_varying_slot. + */ +typedef enum +{ + BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX, + BRW_VARYING_SLOT_PAD, + /** + * Technically this is not a varying but just a placeholder that + * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord + * builtin variable to be compiled correctly. see compile_sf_prog() for + * more info. 
+ */ + BRW_VARYING_SLOT_PNTC, + BRW_VARYING_SLOT_COUNT +} brw_varying_slot; + +/** + * Data structure recording the relationship between the gl_varying_slot enum + * and "slots" within the vertex URB entry (VUE). A "slot" is defined as a + * single octaword within the VUE (128 bits). + * + * Note that each BRW register contains 256 bits (2 octawords), so when + * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two + * consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as + * in a vertex shader), each register corresponds to a single VUE slot, since + * it contains data for two separate vertices. + */ +struct brw_vue_map { + /** + * Bitfield representing all varying slots that are (a) stored in this VUE + * map, and (b) actually written by the shader. Does not include any of + * the additional varying slots defined in brw_varying_slot. + */ + GLbitfield64 slots_valid; + + /** + * Is this VUE map for a separate shader pipeline? + * + * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched + * without the linker having a chance to dead code eliminate unused varyings. + * + * This means that we have to use a fixed slot layout, based on the output's + * location field, rather than assigning slots in a compact contiguous block. + */ + bool separate; + + /** + * Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are + * not stored in a slot (because they are not written, or because + * additional processing is applied before storing them in the VUE), the + * value is -1. + */ + signed char varying_to_slot[BRW_VARYING_SLOT_COUNT]; + + /** + * Map from VUE slot to gl_varying_slot value. For slots that do not + * directly correspond to a gl_varying_slot, the value comes from + * brw_varying_slot. + * + * For slots that are not in use, the value is BRW_VARYING_SLOT_COUNT (this + * simplifies code that uses the value stored in slot_to_varying to + * create a bit mask). 
+ */ + signed char slot_to_varying[BRW_VARYING_SLOT_COUNT]; + + /** + * Total number of VUE slots in use + */ + int num_slots; +}; + +/** + * Convert a VUE slot number into a byte offset within the VUE. + */ +static inline GLuint brw_vue_slot_to_offset(GLuint slot) +{ + return 16*slot; +} + +/** + * Convert a vertex output (brw_varying_slot) into a byte offset within the + * VUE. + */ +static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map, + GLuint varying) +{ + return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]); +} + +void brw_compute_vue_map(const struct brw_device_info *devinfo, + struct brw_vue_map *vue_map, + GLbitfield64 slots_valid, + bool separate_shader); + +enum shader_dispatch_mode { + DISPATCH_MODE_4X1_SINGLE = 0, + DISPATCH_MODE_4X2_DUAL_INSTANCE = 1, + DISPATCH_MODE_4X2_DUAL_OBJECT = 2, + DISPATCH_MODE_SIMD8 = 3, +}; + +struct brw_vue_prog_data { + struct brw_stage_prog_data base; + struct brw_vue_map vue_map; + + GLuint urb_read_length; + GLuint total_grf; + + /* Used for calculating urb partitions. In the VS, this is the size of the + * URB entry used for both input and output to the thread. In the GS, this + * is the size of the URB entry used for output. + */ + GLuint urb_entry_size; + + enum shader_dispatch_mode dispatch_mode; +}; + +struct brw_vs_prog_data { + struct brw_vue_prog_data base; + + GLbitfield64 inputs_read; + + unsigned nr_attributes; + + bool uses_vertexid; + bool uses_instanceid; +}; + +struct brw_gs_prog_data +{ + struct brw_vue_prog_data base; + + /** + * Size of an output vertex, measured in HWORDS (32 bytes). + */ + unsigned output_vertex_size_hwords; + + unsigned output_topology; + + /** + * Size of the control data (cut bits or StreamID bits), in hwords (32 + * bytes). 0 if there is no control data. 
+ */ + unsigned control_data_header_size_hwords; + + /** + * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID + * if the control data is StreamID bits, or + * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits). + * Ignored if control_data_header_size is 0. + */ + unsigned control_data_format; + + bool include_primitive_id; + + /** + * The number of vertices emitted, if constant - otherwise -1. + */ + int static_vertex_count; + + int invocations; + + /** + * Gen6 transform feedback enabled flag. + */ + bool gen6_xfb_enabled; + + /** + * Gen6: Provoking vertex convention for odd-numbered triangles + * in tristrips. + */ + GLuint pv_first:1; + + /** + * Gen6: Number of varyings that are output to transform feedback. + */ + GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ + + /** + * Gen6: Map from the index of a transform feedback binding table entry to the + * gl_varying_slot that should be streamed out through that binding table + * entry. + */ + unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */]; + + /** + * Gen6: Map from the index of a transform feedback binding table entry to the + * swizzles that should be used when streaming out data through that + * binding table entry. + */ + unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */]; +}; + + +/** @} */ + +/** + * Compile a vertex shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_vs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + const struct nir_shader *shader, + gl_clip_plane *clip_planes, + bool use_legacy_snorm_formula, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** + * Scratch data used when compiling a GLSL geometry shader. 
+ */ +struct brw_gs_compile +{ + struct brw_gs_prog_key key; + struct brw_gs_prog_data prog_data; + struct brw_vue_map input_vue_map; + + struct brw_geometry_program *gp; + + unsigned control_data_bits_per_vertex; + unsigned control_data_header_size_bits; +}; + +/** + * Compile a geometry shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_gs(const struct brw_compiler *compiler, void *log_data, + struct brw_gs_compile *c, + const struct nir_shader *shader, + struct gl_shader_program *shader_prog, + void *mem_ctx, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** + * Compile a fragment shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_fs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_wm_prog_key *key, + struct brw_wm_prog_data *prog_data, + const struct nir_shader *shader, + struct gl_program *prog, + int shader_time_index8, + int shader_time_index16, + bool use_rep_send, + unsigned *final_assembly_size, + char **error_str); + +/** + * Compile a compute shader. + * + * Returns the final assembly and the program's size.
+ */ +const unsigned * +brw_compile_cs(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_cs_prog_key *key, + struct brw_cs_prog_data *prog_data, + const struct nir_shader *shader, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +#ifdef __cplusplus +} /* extern "C" */ +#endif diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 6b2bbd21703..3b125448e14 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -51,7 +51,7 @@ #include "brw_context.h" #include "brw_defines.h" -#include "brw_shader.h" +#include "brw_compiler.h" #include "brw_draw.h" #include "brw_state.h" diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 4aba7b814c6..4f503ae4869 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -40,6 +40,7 @@ #include "main/mm.h" #include "main/mtypes.h" #include "brw_structs.h" +#include "brw_compiler.h" #include "intel_aub.h" #include "program/prog_parameter.h" @@ -340,261 +341,6 @@ struct brw_shader { bool compiled_once; }; -struct brw_stage_prog_data { - struct { - /** size of our binding table. */ - uint32_t size_bytes; - - /** @{ - * surface indices for the various groups of surfaces - */ - uint32_t pull_constants_start; - uint32_t texture_start; - uint32_t gather_texture_start; - uint32_t ubo_start; - uint32_t ssbo_start; - uint32_t abo_start; - uint32_t image_start; - uint32_t shader_time_start; - /** @} */ - } binding_table; - - GLuint nr_params; /**< number of float params/constants */ - GLuint nr_pull_params; - unsigned nr_image_params; - - unsigned curb_read_length; - unsigned total_scratch; - - /** - * Register where the thread expects to find input data from the URB - * (typically uniforms, followed by vertex or fragment attributes). 
- */ - unsigned dispatch_grf_start_reg; - - bool use_alt_mode; /**< Use ALT floating point mode? Otherwise, IEEE. */ - - /* Pointers to tracked values (only valid once - * _mesa_load_state_parameters has been called at runtime). - */ - const gl_constant_value **param; - const gl_constant_value **pull_param; - - /** Image metadata passed to the shader as uniforms. */ - struct brw_image_param *image_param; -}; - -/* - * Image metadata structure as laid out in the shader parameter - * buffer. Entries have to be 16B-aligned for the vec4 back-end to be - * able to use them. That's okay because the padding and any unused - * entries [most of them except when we're doing untyped surface - * access] will be removed by the uniform packing pass. - */ -#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET 0 -#define BRW_IMAGE_PARAM_OFFSET_OFFSET 4 -#define BRW_IMAGE_PARAM_SIZE_OFFSET 8 -#define BRW_IMAGE_PARAM_STRIDE_OFFSET 12 -#define BRW_IMAGE_PARAM_TILING_OFFSET 16 -#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET 20 -#define BRW_IMAGE_PARAM_SIZE 24 - -struct brw_image_param { - /** Surface binding table index. */ - uint32_t surface_idx; - - /** Offset applied to the X and Y surface coordinates. */ - uint32_t offset[2]; - - /** Surface X, Y and Z dimensions. */ - uint32_t size[3]; - - /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in - * pixels, vertical slice stride in pixels. - */ - uint32_t stride[4]; - - /** Log2 of the tiling modulus in the X, Y and Z dimension. */ - uint32_t tiling[3]; - - /** - * Right shift to apply for bit 6 address swizzling. Two different - * swizzles can be specified and will be applied one after the other. The - * resulting address will be: - * - * addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^ - * (addr >> swizzling[1]))) - * - * Use \c 0xff if any of the swizzles is not required. - */ - uint32_t swizzling[2]; -}; - -/* Data about a particular attempt to compile a program. 
Note that - * there can be many of these, each in a different GL state - * corresponding to a different brw_wm_prog_key struct, with different - * compiled programs. - */ -struct brw_wm_prog_data { - struct brw_stage_prog_data base; - - GLuint num_varying_inputs; - - GLuint dispatch_grf_start_reg_16; - GLuint reg_blocks; - GLuint reg_blocks_16; - - struct { - /** @{ - * surface indices the WM-specific surfaces - */ - uint32_t render_target_start; - /** @} */ - } binding_table; - - uint8_t computed_depth_mode; - - bool early_fragment_tests; - bool no_8; - bool dual_src_blend; - bool uses_pos_offset; - bool uses_omask; - bool uses_kill; - bool pulls_bary; - uint32_t prog_offset_16; - - /** - * Mask of which interpolation modes are required by the fragment shader. - * Used in hardware setup on gen6+. - */ - uint32_t barycentric_interp_modes; - - /** - * Map from gl_varying_slot to the position within the FS setup data - * payload where the varying's attribute vertex deltas should be delivered. - * For varying slots that are not used by the FS, the value is -1. - */ - int urb_setup[VARYING_SLOT_MAX]; -}; - -struct brw_cs_prog_data { - struct brw_stage_prog_data base; - - GLuint dispatch_grf_start_reg_16; - unsigned local_size[3]; - unsigned simd_size; - bool uses_barrier; - bool uses_num_work_groups; - unsigned local_invocation_id_regs; - - struct { - /** @{ - * surface indices the CS-specific surfaces - */ - uint32_t work_groups_start; - /** @} */ - } binding_table; -}; - -/** - * Enum representing the i965-specific vertex results that don't correspond - * exactly to any element of gl_varying_slot. The values of this enum are - * assigned such that they don't conflict with gl_varying_slot. - */ -typedef enum -{ - BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX, - BRW_VARYING_SLOT_PAD, - /** - * Technically this is not a varying but just a placeholder that - * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord - * builtin variable to be compiled correctly. 
see compile_sf_prog() for - * more info. - */ - BRW_VARYING_SLOT_PNTC, - BRW_VARYING_SLOT_COUNT -} brw_varying_slot; - - -/** - * Data structure recording the relationship between the gl_varying_slot enum - * and "slots" within the vertex URB entry (VUE). A "slot" is defined as a - * single octaword within the VUE (128 bits). - * - * Note that each BRW register contains 256 bits (2 octawords), so when - * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two - * consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as - * in a vertex shader), each register corresponds to a single VUE slot, since - * it contains data for two separate vertices. - */ -struct brw_vue_map { - /** - * Bitfield representing all varying slots that are (a) stored in this VUE - * map, and (b) actually written by the shader. Does not include any of - * the additional varying slots defined in brw_varying_slot. - */ - GLbitfield64 slots_valid; - - /** - * Is this VUE map for a separate shader pipeline? - * - * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched - * without the linker having a chance to dead code eliminate unused varyings. - * - * This means that we have to use a fixed slot layout, based on the output's - * location field, rather than assigning slots in a compact contiguous block. - */ - bool separate; - - /** - * Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are - * not stored in a slot (because they are not written, or because - * additional processing is applied before storing them in the VUE), the - * value is -1. - */ - signed char varying_to_slot[BRW_VARYING_SLOT_COUNT]; - - /** - * Map from VUE slot to gl_varying_slot value. For slots that do not - * directly correspond to a gl_varying_slot, the value comes from - * brw_varying_slot. 
- * - * For slots that are not in use, the value is BRW_VARYING_SLOT_COUNT (this - * simplifies code that uses the value stored in slot_to_varying to - * create a bit mask). - */ - signed char slot_to_varying[BRW_VARYING_SLOT_COUNT]; - - /** - * Total number of VUE slots in use - */ - int num_slots; -}; - -/** - * Convert a VUE slot number into a byte offset within the VUE. - */ -static inline GLuint brw_vue_slot_to_offset(GLuint slot) -{ - return 16*slot; -} - -/** - * Convert a vertex output (brw_varying_slot) into a byte offset within the - * VUE. - */ -static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map, - GLuint varying) -{ - return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]); -} - -void brw_compute_vue_map(const struct brw_device_info *devinfo, - struct brw_vue_map *vue_map, - GLbitfield64 slots_valid, - bool separate_shader); - - /** * Bitmask indicating which fragment shader inputs represent varyings (and * hence have to be delivered to the fragment shader by the SF/SBE stage). @@ -671,41 +417,6 @@ struct brw_ff_gs_prog_data { unsigned svbi_postincrement_value; }; -enum shader_dispatch_mode { - DISPATCH_MODE_4X1_SINGLE = 0, - DISPATCH_MODE_4X2_DUAL_INSTANCE = 1, - DISPATCH_MODE_4X2_DUAL_OBJECT = 2, - DISPATCH_MODE_SIMD8 = 3, -}; - -struct brw_vue_prog_data { - struct brw_stage_prog_data base; - struct brw_vue_map vue_map; - - GLuint urb_read_length; - GLuint total_grf; - - /* Used for calculating urb partitions. In the VS, this is the size of the - * URB entry used for both input and output to the thread. In the GS, this - * is the size of the URB entry used for output. 
- */ - GLuint urb_entry_size; - - enum shader_dispatch_mode dispatch_mode; -}; - - -struct brw_vs_prog_data { - struct brw_vue_prog_data base; - - GLbitfield64 inputs_read; - - unsigned nr_attributes; - - bool uses_vertexid; - bool uses_instanceid; -}; - /** Number of texture sampler units */ #define BRW_MAX_TEX_UNIT 32 @@ -763,71 +474,6 @@ struct brw_vs_prog_data { #define SURF_INDEX_GEN6_SOL_BINDING(t) (t) -struct brw_gs_prog_data -{ - struct brw_vue_prog_data base; - - /** - * Size of an output vertex, measured in HWORDS (32 bytes). - */ - unsigned output_vertex_size_hwords; - - unsigned output_topology; - - /** - * Size of the control data (cut bits or StreamID bits), in hwords (32 - * bytes). 0 if there is no control data. - */ - unsigned control_data_header_size_hwords; - - /** - * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID - * if the control data is StreamID bits, or - * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits). - * Ignored if control_data_header_size is 0. - */ - unsigned control_data_format; - - bool include_primitive_id; - - /** - * The number of vertices emitted, if constant - otherwise -1. - */ - int static_vertex_count; - - int invocations; - - /** - * Gen6 transform feedback enabled flag. - */ - bool gen6_xfb_enabled; - - /** - * Gen6: Provoking vertex convention for odd-numbered triangles - * in tristrips. - */ - GLuint pv_first:1; - - /** - * Gen6: Number of varyings that are output to transform feedback. - */ - GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ - - /** - * Gen6: Map from the index of a transform feedback binding table entry to the - * gl_varying_slot that should be streamed out through that binding table - * entry. 
- */ - unsigned char transform_feedback_bindings[BRW_MAX_SOL_BINDINGS]; - - /** - * Gen6: Map from the index of a transform feedback binding table entry to the - * swizzles that should be used when streaming out data through that - * binding table entry. - */ - unsigned char transform_feedback_swizzles[BRW_MAX_SOL_BINDINGS]; -}; - /** * Stride in bytes between shader_time entries. * diff --git a/src/mesa/drivers/dri/i965/brw_cs.h b/src/mesa/drivers/dri/i965/brw_cs.h index aac519f1fd1..899e340f14e 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.h +++ b/src/mesa/drivers/dri/i965/brw_cs.h @@ -27,11 +27,6 @@ #include "brw_program.h" -struct brw_cs_prog_key { - uint32_t program_string_id; - struct brw_sampler_prog_key_data tex; -}; - #ifdef __cplusplus extern "C" { #endif @@ -39,18 +34,6 @@ extern "C" { void brw_upload_cs_prog(struct brw_context *brw); -struct nir_shader; - -const unsigned * -brw_compile_cs(const struct brw_compiler *compiler, void *log_data, - void *mem_ctx, - const struct brw_cs_prog_key *key, - struct brw_cs_prog_data *prog_data, - const struct nir_shader *shader, - int shader_time_index, - unsigned *final_assembly_size, - char **error_str); - void brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, void *buffer, uint32_t threads, uint32_t stride); diff --git a/src/mesa/drivers/dri/i965/brw_program.h b/src/mesa/drivers/dri/i965/brw_program.h index cf0522a8b10..f8cf2b062c8 100644 --- a/src/mesa/drivers/dri/i965/brw_program.h +++ b/src/mesa/drivers/dri/i965/brw_program.h @@ -24,129 +24,7 @@ #ifndef BRW_PROGRAM_H #define BRW_PROGRAM_H -/** - * Program key structures. - * - * When drawing, we look for the currently bound shaders in the program - * cache. This is essentially a hash table lookup, and these are the keys. - * - * Sometimes OpenGL features specified as state need to be simulated via - * shader code, due to a mismatch between the API and the hardware. This - * is often referred to as "non-orthagonal state" or "NOS". 
We store NOS - * in the program key so it's considered when searching for a program. If - * we haven't seen a particular combination before, we have to recompile a - * new specialized version. - * - * Shader compilation should not look up state in gl_context directly, but - * instead use the copy in the program key. This guarantees recompiles will - * happen correctly. - * - * @{ - */ - -enum PACKED gen6_gather_sampler_wa { - WA_SIGN = 1, /* whether we need to sign extend */ - WA_8BIT = 2, /* if we have an 8bit format needing wa */ - WA_16BIT = 4, /* if we have a 16bit format needing wa */ -}; - -/** - * Sampler information needed by VS, WM, and GS program cache keys. - */ -struct brw_sampler_prog_key_data { - /** - * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles. - */ - uint16_t swizzles[MAX_SAMPLERS]; - - uint32_t gl_clamp_mask[3]; - - /** - * For RG32F, gather4's channel select is broken. - */ - uint32_t gather_channel_quirk_mask; - - /** - * Whether this sampler uses the compressed multisample surface layout. - */ - uint32_t compressed_multisample_layout_mask; - - /** - * For Sandybridge, which shader w/a we need for gather quirks. - */ - enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS]; -}; - - -/** The program key for Vertex Shaders. */ -struct brw_vs_prog_key { - unsigned program_string_id; - - /* - * Per-attribute workaround flags - */ - uint8_t gl_attrib_wa_flags[VERT_ATTRIB_MAX]; - - bool copy_edgeflag:1; - - bool clamp_vertex_color:1; - - /** - * How many user clipping planes are being uploaded to the vertex shader as - * push constants. - * - * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to - * clip distances. - */ - unsigned nr_userclip_plane_consts:4; - - /** - * For pre-Gen6 hardware, a bitfield indicating which texture coordinates - * are going to be replaced with point coordinates (as a consequence of a - * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)). 
Because - * our SF thread requires exact matching between VS outputs and FS inputs, - * these texture coordinates will need to be unconditionally included in - * the VUE, even if they aren't written by the vertex shader. - */ - uint8_t point_coord_replace; - - struct brw_sampler_prog_key_data tex; -}; - -/** The program key for Geometry Shaders. */ -struct brw_gs_prog_key -{ - unsigned program_string_id; - - struct brw_sampler_prog_key_data tex; -}; - -/** The program key for Fragment/Pixel Shaders. */ -struct brw_wm_prog_key { - uint8_t iz_lookup; - bool stats_wm:1; - bool flat_shade:1; - bool persample_shading:1; - bool persample_2x:1; - unsigned nr_color_regions:5; - bool replicate_alpha:1; - bool render_to_fbo:1; - bool clamp_fragment_color:1; - bool compute_pos_offset:1; - bool compute_sample_id:1; - unsigned line_aa:2; - bool high_quality_derivatives:1; - - uint16_t drawable_height; - uint64_t input_slots_valid; - unsigned program_string_id; - GLenum alpha_test_func; /* < For Gen4/5 MRT alpha test */ - float alpha_test_ref; - - struct brw_sampler_prog_key_data tex; -}; - -/** @} */ +#include "brw_compiler.h" #ifdef __cplusplus extern "C" { diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index 6d4cf048390..b33b08f40d7 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -38,64 +38,6 @@ #define MAX_SAMPLER_MESSAGE_SIZE 11 #define MAX_VGRF_SIZE 16 -struct brw_compiler { - const struct brw_device_info *devinfo; - - struct { - struct ra_regs *regs; - - /** - * Array of the ra classes for the unaligned contiguous register - * block sizes used. - */ - int *classes; - - /** - * Mapping for register-allocated objects in *regs to the first - * GRF for that object. - */ - uint8_t *ra_reg_to_grf; - } vec4_reg_set; - - struct { - struct ra_regs *regs; - - /** - * Array of the ra classes for the unaligned contiguous register - * block sizes used, indexed by register size. 
- */ - int classes[16]; - - /** - * Mapping from classes to ra_reg ranges. Each of the per-size - * classes corresponds to a range of ra_reg nodes. This array stores - * those ranges in the form of first ra_reg in each class and the - * total number of ra_reg elements in the last array element. This - * way the range of the i'th class is given by: - * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] ) - */ - int class_to_ra_reg_range[17]; - - /** - * Mapping for register-allocated objects in *regs to the first - * GRF for that object. - */ - uint8_t *ra_reg_to_grf; - - /** - * ra class for the aligned pairs we use for PLN, which doesn't - * appear in *classes. - */ - int aligned_pairs_class; - } fs_reg_sets[2]; - - void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); - void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); - - bool scalar_vs; - struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES]; -}; - enum PACKED register_file { BAD_FILE, GRF, diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h index df33e941d24..c52552768c8 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h @@ -32,39 +32,6 @@ #include "brw_vec4.h" -/** - * Scratch data used when compiling a GLSL geometry shader. 
- */ -struct brw_gs_compile -{ - struct brw_gs_prog_key key; - struct brw_gs_prog_data prog_data; - struct brw_vue_map input_vue_map; - - struct brw_geometry_program *gp; - - unsigned control_data_bits_per_vertex; - unsigned control_data_header_size_bits; -}; - -#ifdef __cplusplus -extern "C" { -#endif - -const unsigned *brw_compile_gs(const struct brw_compiler *compiler, - void *log_data, - struct brw_gs_compile *c, - const nir_shader *shader, - struct gl_shader_program *shader_prog, - void *mem_ctx, - int shader_time_index, - unsigned *final_assembly_size, - char **error_str); - -#ifdef __cplusplus -} /* extern "C" */ -#endif - #ifdef __cplusplus namespace brw { diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h index aebb76f7bd0..bcb5e7b0b2a 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.h +++ b/src/mesa/drivers/dri/i965/brw_vs.h @@ -54,18 +54,6 @@ extern "C" { #endif -struct nir_shader; - -const unsigned *brw_compile_vs(const struct brw_compiler *compiler, void *log_data, - void *mem_ctx, - const struct brw_vs_prog_key *key, - struct brw_vs_prog_data *prog_data, - const struct nir_shader *shader, - gl_clip_plane *clip_planes, - bool use_legacy_snorm_formula, - int shader_time_index, - unsigned *final_assembly_size, - char **error_str); void brw_vs_debug_recompile(struct brw_context *brw, struct gl_shader_program *prog, const struct brw_vs_prog_key *key); diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h index 7e4bf35d089..53a642ee8bb 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.h +++ b/src/mesa/drivers/dri/i965/brw_wm.h @@ -61,26 +61,6 @@ extern "C" { #endif -struct nir_shader; - -/** - * Compile a fragment shader. - * - * Returns the final assembly and the program's size. 
- */ -const unsigned *brw_compile_fs(const struct brw_compiler *compiler, - void *log_data, - void *mem_ctx, - const struct brw_wm_prog_key *key, - struct brw_wm_prog_data *prog_data, - const struct nir_shader *shader, - struct gl_program *prog, - int shader_time_index8, - int shader_time_index16, - bool use_rep_send, - unsigned *final_assembly_size, - char **error_str); - GLboolean brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog); struct gl_shader *brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type); From 41c474df53d9dcd5fd8e24eba5b7acc2b3c32795 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 15 Oct 2015 11:39:06 -0700 Subject: [PATCH 267/270] i965/vs: Move URB entry_size and read_length calculations to compile_vs Reviewed-By: Eduardo Lima Mitev --- src/mesa/drivers/dri/i965/brw_vec4.cpp | 34 ++++++++++++++++++++++++++ src/mesa/drivers/dri/i965/brw_vs.c | 34 -------------------------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index ca4d23a490c..00e2d63804e 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1933,6 +1933,40 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, { const unsigned *assembly = NULL; + unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read); + + /* gl_VertexID and gl_InstanceID are system values, but arrive via an + * incoming vertex attribute. So, add an extra slot. + */ + if (shader->info.system_values_read & + (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | + BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) { + nr_attributes++; + } + + /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry + * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in + * vec4 mode, the hardware appears to wedge unless we read something. 
+ */ + if (compiler->scalar_vs) + prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2); + else + prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2); + + prog_data->nr_attributes = nr_attributes; + + /* Since vertex shaders reuse the same VUE entry for inputs and outputs + * (overwriting the original contents), we need to make sure the size is + * the larger of the two. + */ + const unsigned vue_entries = + MAX2(nr_attributes, (unsigned)prog_data->base.vue_map.num_slots); + + if (compiler->devinfo->gen == 6) + prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8); + else + prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); + if (compiler->scalar_vs) { prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 41290463257..ba680a98f7e 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -148,40 +148,6 @@ brw_codegen_vs_prog(struct brw_context *brw, &prog_data.base.vue_map, outputs_written, prog ? prog->SeparateShader : false); - unsigned nr_attributes = _mesa_bitcount_64(prog_data.inputs_read); - - /* gl_VertexID and gl_InstanceID are system values, but arrive via an - * incoming vertex attribute. So, add an extra slot. - */ - if (vp->program.Base.SystemValuesRead & - (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | - BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) { - nr_attributes++; - } - - /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry - * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in - * vec4 mode, the hardware appears to wedge unless we read something. 
- */ - if (brw->intelScreen->compiler->scalar_vs) - prog_data.base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2); - else - prog_data.base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2); - - prog_data.nr_attributes = nr_attributes; - - /* Since vertex shaders reuse the same VUE entry for inputs and outputs - * (overwriting the original contents), we need to make sure the size is - * the larger of the two. - */ - const unsigned vue_entries = - MAX2(nr_attributes, prog_data.base.vue_map.num_slots); - - if (brw->gen == 6) - prog_data.base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8); - else - prog_data.base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); - if (0) { _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG, true); From 9e17c36b8ba79e688011a5fd293ad5f42da21b66 Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Wed, 14 Oct 2015 02:12:09 -0700 Subject: [PATCH 268/270] i965: Extract can_change_source_types() functions. Make them members of fs_inst/vec4_instruction for use elsewhere. Also fix the fs version to check that dst.type == src[1].type and for !saturate. 
Reviewed-by: Jason Ekstrand --- src/mesa/drivers/dri/i965/brw_fs.cpp | 12 ++++++++++++ .../drivers/dri/i965/brw_fs_copy_propagation.cpp | 15 ++------------- src/mesa/drivers/dri/i965/brw_ir_fs.h | 1 + src/mesa/drivers/dri/i965/brw_ir_vec4.h | 1 + src/mesa/drivers/dri/i965/brw_vec4.cpp | 12 ++++++++++++ .../dri/i965/brw_vec4_copy_propagation.cpp | 16 ++-------------- 6 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 3c767ce58f0..49323eb790d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -337,6 +337,18 @@ fs_inst::can_do_source_mods(const struct brw_device_info *devinfo) return true; } +bool +fs_inst::can_change_types() const +{ + return dst.type == src[0].type && + !src[0].abs && !src[0].negate && !saturate && + (opcode == BRW_OPCODE_MOV || + (opcode == BRW_OPCODE_SEL && + dst.type == src[1].type && + predicate != BRW_PREDICATE_NONE && + !src[1].abs && !src[1].negate)); +} + bool fs_inst::has_side_effects() const { diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index 230b0caec47..5589716239a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -275,17 +275,6 @@ is_logic_op(enum opcode opcode) opcode == BRW_OPCODE_NOT); } -static bool -can_change_source_types(fs_inst *inst) -{ - return !inst->src[0].abs && !inst->src[0].negate && - inst->dst.type == inst->src[0].type && - (inst->opcode == BRW_OPCODE_MOV || - (inst->opcode == BRW_OPCODE_SEL && - inst->predicate != BRW_PREDICATE_NONE && - !inst->src[1].abs && !inst->src[1].negate)); -} - bool fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) { @@ -368,7 +357,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) if (has_source_modifiers && entry->dst.type != inst->src[arg].type && 
- !can_change_source_types(inst)) + !inst->can_change_types()) return false; if (devinfo->gen >= 8 && (entry->src.negate || entry->src.abs) && @@ -438,7 +427,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) * type. If we got here, then we can just change the source and * destination types of the instruction and keep going. */ - assert(can_change_source_types(inst)); + assert(inst->can_change_types()); for (int i = 0; i < inst->sources; i++) { inst->src[i].type = entry->dst.type; } diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h index 97c6f8b2500..7726e4b78a0 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_fs.h +++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h @@ -204,6 +204,7 @@ public: unsigned components_read(unsigned i) const; int regs_read(int arg) const; bool can_do_source_mods(const struct brw_device_info *devinfo); + bool can_change_types() const; bool has_side_effects() const; bool reads_flag() const; diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h index 96dd633e117..1b57b65db27 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h @@ -179,6 +179,7 @@ public: int swizzle, int swizzle_mask); void reswizzle(int dst_writemask, int swizzle); bool can_do_source_mods(const struct brw_device_info *devinfo); + bool can_change_types() const; bool reads_flag() { diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 00e2d63804e..befc92445d3 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -280,6 +280,18 @@ vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo) return true; } +bool +vec4_instruction::can_change_types() const +{ + return dst.type == src[0].type && + !src[0].abs && !src[0].negate && !saturate && + (opcode == BRW_OPCODE_MOV || + (opcode == BRW_OPCODE_SEL && + dst.type == src[1].type && + 
predicate != BRW_PREDICATE_NONE && + !src[1].abs && !src[1].negate)); +} + /** * Returns how many MRFs an opcode will write over. * diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp index 610caef7dce..db99ecba35a 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp @@ -255,18 +255,6 @@ try_constant_propagate(const struct brw_device_info *devinfo, return false; } -static bool -can_change_source_types(vec4_instruction *inst) -{ - return inst->dst.type == inst->src[0].type && - !inst->src[0].abs && !inst->src[0].negate && !inst->saturate && - (inst->opcode == BRW_OPCODE_MOV || - (inst->opcode == BRW_OPCODE_SEL && - inst->dst.type == inst->src[1].type && - inst->predicate != BRW_PREDICATE_NONE && - !inst->src[1].abs && !inst->src[1].negate)); -} - static bool try_copy_propagate(const struct brw_device_info *devinfo, vec4_instruction *inst, @@ -325,7 +313,7 @@ try_copy_propagate(const struct brw_device_info *devinfo, if (has_source_modifiers && value.type != inst->src[arg].type && - !can_change_source_types(inst)) + !inst->can_change_types()) return false; if (has_source_modifiers && @@ -394,7 +382,7 @@ try_copy_propagate(const struct brw_device_info *devinfo, value.swizzle = composed_swizzle; if (has_source_modifiers && value.type != inst->src[arg].type) { - assert(can_change_source_types(inst)); + assert(inst->can_change_types()); for (int i = 0; i < 3; i++) { inst->src[i].type = value.type; } From 35a2d259f27f5b41f29a1112ca48093dac09c364 Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Wed, 14 Oct 2015 02:23:25 -0700 Subject: [PATCH 269/270] i965/fs: Consider type mismatches in saturate propagation. NIR considers bcsel to produce and consume unsigned types, leading to SEL instructions operating on unsigned types when the data is really floating-point. 
Previous to this patch, saturate propagation would happily transform (+f0) sel g20:UD, g30:UD, g40:UD mov.sat g50:F, g20:F into (+f0) sel.sat g20:UD, g30:UD, g40:UD mov g50:F, g20:F But since the meaning of .sat is dependent on the type of the destination register, this is not valid. Instead, allow saturate propagation to change the types of dest/source on instructions that are simply copying data in order to propagate the saturate modifier. Fixes bad code gen in 158 programs. Reviewed-by: Jason Ekstrand --- .../dri/i965/brw_fs_saturate_propagation.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp index e406c2899e8..8792a8c7b1d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp @@ -52,11 +52,12 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block) ip--; if (inst->opcode != BRW_OPCODE_MOV || + !inst->saturate || inst->dst.file != GRF || + inst->dst.type != inst->src[0].type || inst->src[0].file != GRF || inst->src[0].abs || - inst->src[0].negate || - !inst->saturate) + inst->src[0].negate) continue; int src_var = v->live_intervals->var_from_reg(inst->src[0]); @@ -65,7 +66,9 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block) bool interfered = false; foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) { if (scan_inst->overwrites_reg(inst->src[0])) { - if (scan_inst->is_partial_write()) + if (scan_inst->is_partial_write() || + (scan_inst->dst.type != inst->dst.type && + !scan_inst->can_change_types())) break; if (scan_inst->saturate) { @@ -73,6 +76,12 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block) progress = true; } else if (src_end_ip <= ip || inst->dst.equals(inst->src[0])) { if (scan_inst->can_do_saturate()) { + if (scan_inst->dst.type != inst->dst.type) { + 
scan_inst->dst.type = inst->dst.type; + for (int i = 0; i < scan_inst->sources; i++) { + scan_inst->src[i].type = inst->dst.type; + } + } scan_inst->saturate = true; inst->saturate = false; progress = true; From de862f03accb12b044ced60cb98f47a055457223 Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Thu, 15 Oct 2015 16:01:11 -0700 Subject: [PATCH 270/270] i965/fs: Localize variables' scopes. --- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 23c99b7b912..f825fed4daf 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -904,12 +904,9 @@ fs_visitor::emit_urb_writes() urb_offset = 0; flush = false; for (slot = 0; slot < vue_map->num_slots; slot++) { - fs_reg reg, src, zero; - int varying = vue_map->slot_to_varying[slot]; switch (varying) { - case VARYING_SLOT_PSIZ: - + case VARYING_SLOT_PSIZ: { /* The point size varying slot is the vue header and is always in the * vue map. But often none of the special varyings that live there * are written and in that case we can skip writing to the vue @@ -921,7 +918,7 @@ fs_visitor::emit_urb_writes() break; } - zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg zero(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); bld.MOV(zero, fs_reg(0u)); sources[length++] = zero; @@ -940,7 +937,7 @@ fs_visitor::emit_urb_writes() else sources[length++] = zero; break; - + } case BRW_VARYING_SLOT_NDC: case VARYING_SLOT_EDGE: unreachable("unexpected scalar vs output"); @@ -973,8 +970,8 @@ fs_visitor::emit_urb_writes() * temp register and use that for the payload. 
*/ for (int i = 0; i < 4; i++) { - reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type); - src = offset(this->outputs[varying], bld, i); + fs_reg reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type); + fs_reg src = offset(this->outputs[varying], bld, i); set_saturate(true, bld.MOV(reg, src)); sources[length++] = reg; }