main: memcpy larger chunks in _mesa_propagate_uniforms_to_driver_storage

When possible, do the memcpy on larger blocks. This reduces the share of
cycles spent in _mesa_propagate_uniforms_to_driver_storage from
1.51% to 0.62% according to perf during the Unigine Heaven benchmark.
It did not affect the framerate of the benchmark. The system used for
testing was an i5 6600K with a Radeon R9 380.

Piglit hangs randomly on this system both with and without the patch,
so I could not make a comparison.

v2: fixed whitespace

Signed-off-by: Nils Wallménius <nils.wallmenius@gmail.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
This commit is contained in:
Nils Wallménius
2016-07-22 13:10:03 +02:00
committed by Nicolai Hähnle
parent dd208ea006
commit a354c389f5

View File

@@ -578,14 +578,31 @@ _mesa_propagate_uniforms_to_driver_storage(struct gl_uniform_storage *uni,
unsigned j;
unsigned v;
for (j = 0; j < count; j++) {
for (v = 0; v < vectors; v++) {
memcpy(dst, src, src_vector_byte_stride);
src += src_vector_byte_stride;
dst += store->vector_stride;
}
if (src_vector_byte_stride == store->vector_stride) {
if (extra_stride) {
for (j = 0; j < count; j++) {
memcpy(dst, src, src_vector_byte_stride * vectors);
src += src_vector_byte_stride * vectors;
dst += store->vector_stride * vectors;
dst += extra_stride;
dst += extra_stride;
}
} else {
/* Unigine Heaven benchmark gets here */
memcpy(dst, src, src_vector_byte_stride * vectors * count);
src += src_vector_byte_stride * vectors * count;
dst += store->vector_stride * vectors * count;
}
} else {
for (j = 0; j < count; j++) {
for (v = 0; v < vectors; v++) {
memcpy(dst, src, src_vector_byte_stride);
src += src_vector_byte_stride;
dst += store->vector_stride;
}
dst += extra_stride;
}
}
break;
}