main: memcpy larger chunks in _mesa_propagate_uniforms_to_driver_storage
When possible, do the memcpy on larger blocks. This reduces cycles spent in _mesa_propagate_uniforms_to_driver_storage from 1.51% to 0.62% according to perf during the Unigine Heaven benchmark. It did not affect the framerate of the benchmark. The system used for testing was an i5 6600K with a Radeon R9 380. Piglit hangs randomly on this system both with and without the patch, so I could not make a comparison. v2: fixed whitespace Signed-off-by: Nils Wallménius <nils.wallmenius@gmail.com> Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
This commit is contained in:

committed by
Nicolai Hähnle

parent
dd208ea006
commit
a354c389f5
@@ -578,14 +578,31 @@ _mesa_propagate_uniforms_to_driver_storage(struct gl_uniform_storage *uni,
|
||||
unsigned j;
|
||||
unsigned v;
|
||||
|
||||
for (j = 0; j < count; j++) {
|
||||
for (v = 0; v < vectors; v++) {
|
||||
memcpy(dst, src, src_vector_byte_stride);
|
||||
src += src_vector_byte_stride;
|
||||
dst += store->vector_stride;
|
||||
}
|
||||
if (src_vector_byte_stride == store->vector_stride) {
|
||||
if (extra_stride) {
|
||||
for (j = 0; j < count; j++) {
|
||||
memcpy(dst, src, src_vector_byte_stride * vectors);
|
||||
src += src_vector_byte_stride * vectors;
|
||||
dst += store->vector_stride * vectors;
|
||||
|
||||
dst += extra_stride;
|
||||
dst += extra_stride;
|
||||
}
|
||||
} else {
|
||||
/* Unigine Heaven benchmark gets here */
|
||||
memcpy(dst, src, src_vector_byte_stride * vectors * count);
|
||||
src += src_vector_byte_stride * vectors * count;
|
||||
dst += store->vector_stride * vectors * count;
|
||||
}
|
||||
} else {
|
||||
for (j = 0; j < count; j++) {
|
||||
for (v = 0; v < vectors; v++) {
|
||||
memcpy(dst, src, src_vector_byte_stride);
|
||||
src += src_vector_byte_stride;
|
||||
dst += store->vector_stride;
|
||||
}
|
||||
|
||||
dst += extra_stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
Reference in New Issue
Block a user