anv: add gfx9 generated draw support
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Ivan Briano <ivan.briano@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20497>
This commit is contained in:

committed by
Marge Bot

parent
3ac7d5f258
commit
1d9cf8f381
@@ -134,6 +134,7 @@ anv_create_cmd_buffer(struct vk_command_pool *pool,
|
||||
|
||||
cmd_buffer->generation_jump_addr = ANV_NULL_ADDRESS;
|
||||
cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS;
|
||||
cmd_buffer->generation_bt_state = ANV_STATE_NULL;
|
||||
|
||||
anv_cmd_state_init(cmd_buffer);
|
||||
|
||||
@@ -201,6 +202,7 @@ anv_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
|
||||
|
||||
cmd_buffer->generation_jump_addr = ANV_NULL_ADDRESS;
|
||||
cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS;
|
||||
cmd_buffer->generation_bt_state = ANV_STATE_NULL;
|
||||
|
||||
anv_state_stream_finish(&cmd_buffer->surface_state_stream);
|
||||
anv_state_stream_init(&cmd_buffer->surface_state_stream,
|
||||
|
@@ -897,7 +897,6 @@ anv_physical_device_try_create(struct vk_instance *vk_instance,
|
||||
|
||||
|
||||
device->generated_indirect_draws =
|
||||
device->info.ver >= 11 &&
|
||||
debug_get_bool_option("ANV_ENABLE_GENERATED_INDIRECT_DRAWS",
|
||||
true);
|
||||
|
||||
|
@@ -31,6 +31,7 @@
|
||||
|
||||
#include "anv_generated_indirect_draws.h"
|
||||
|
||||
#include "shaders/gfx9_generated_draws_spv.h"
|
||||
#include "shaders/gfx11_generated_draws_spv.h"
|
||||
|
||||
/* This pass takes vulkan descriptor bindings 0 & 1 and turns them into global
|
||||
@@ -101,7 +102,27 @@ lower_vulkan_descriptors_instr(nir_builder *b, nir_instr *instr, void *cb_data)
|
||||
break;
|
||||
}
|
||||
|
||||
case 2:
|
||||
case 2: {
|
||||
desc_value =
|
||||
nir_load_ubo(b, 1, 64,
|
||||
nir_imm_int(b, 2),
|
||||
nir_imm_int(b,
|
||||
offsetof(struct anv_generated_indirect_params,
|
||||
draw_ids_addr)),
|
||||
.align_mul = 8,
|
||||
.align_offset = 0,
|
||||
.range_base = 0,
|
||||
.range = ~0);
|
||||
desc_value =
|
||||
nir_vec4(b,
|
||||
nir_unpack_64_2x32_split_x(b, desc_value),
|
||||
nir_unpack_64_2x32_split_y(b, desc_value),
|
||||
nir_imm_int(b, 0),
|
||||
nir_imm_int(b, 0));
|
||||
break;
|
||||
}
|
||||
|
||||
case 3:
|
||||
desc_value =
|
||||
nir_vec2(b,
|
||||
nir_imm_int(b, 2),
|
||||
@@ -276,9 +297,6 @@ compile_upload_spirv(struct anv_device *device,
|
||||
VkResult
|
||||
anv_device_init_generated_indirect_draws(struct anv_device *device)
|
||||
{
|
||||
if (device->info->ver < 11)
|
||||
return VK_SUCCESS;
|
||||
|
||||
const struct intel_l3_weights w =
|
||||
intel_get_default_l3_weights(device->info,
|
||||
true /* wants_dc_cache */,
|
||||
@@ -298,16 +316,24 @@ anv_device_init_generated_indirect_draws(struct anv_device *device)
|
||||
sizeof(indirect_draws_key),
|
||||
NULL);
|
||||
if (device->generated_draw_kernel == NULL) {
|
||||
const uint32_t *spirv_source =
|
||||
device->info->ver >= 11 ?
|
||||
gfx11_generated_draws_spv_source :
|
||||
gfx9_generated_draws_spv_source;
|
||||
const uint32_t spirv_source_size =
|
||||
device->info->ver >= 11 ?
|
||||
ARRAY_SIZE(gfx11_generated_draws_spv_source) :
|
||||
ARRAY_SIZE(gfx9_generated_draws_spv_source);
|
||||
const uint32_t send_count =
|
||||
device->info->ver >= 11 ?
|
||||
11 /* 2 * (2 loads + 3 stores) + 1 store */ :
|
||||
17 /* 2 * (2 loads + 6 stores) + 1 store */;
|
||||
|
||||
device->generated_draw_kernel =
|
||||
compile_upload_spirv(device,
|
||||
&indirect_draws_key,
|
||||
sizeof(indirect_draws_key),
|
||||
gfx11_generated_draws_spv_source,
|
||||
ARRAY_SIZE(gfx11_generated_draws_spv_source),
|
||||
11 /*
|
||||
* 2 * (2 indirect data loads + 3 3DPRIMITVE stores) +
|
||||
* 1 store (MI_BATCH_BUFFER_START)
|
||||
*/);
|
||||
spirv_source, spirv_source_size, send_count);
|
||||
}
|
||||
if (device->generated_draw_kernel == NULL)
|
||||
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
||||
|
@@ -28,19 +28,40 @@
|
||||
|
||||
#define ANV_GENERATED_FLAG_INDEXED BITFIELD_BIT(0)
|
||||
#define ANV_GENERATED_FLAG_PREDICATED BITFIELD_BIT(1)
|
||||
#define ANV_GENERATED_FLAG_DRAWID BITFIELD_BIT(2)
|
||||
#define ANV_GENERATED_FLAG_BASE BITFIELD_BIT(3)
|
||||
|
||||
/* This needs to match common_generated_draws.glsl :
|
||||
*
|
||||
* layout(set = 0, binding = 3) uniform block
|
||||
*/
|
||||
struct anv_generated_indirect_draw_params {
|
||||
/* Draw ID buffer address (only used on Gfx9) */
|
||||
uint64_t draw_id_addr;
|
||||
/* Indirect data buffer address (only used on Gfx9) */
|
||||
uint64_t indirect_data_addr;
|
||||
/* Stride between consecutive elements of the indirect data buffer */
|
||||
uint32_t indirect_data_stride;
|
||||
uint32_t flags; /* 0-7: bits, 8-15: mocs, 16-23: cmd_dws */
|
||||
/* Base number of the draw ID, it is added to the index computed from the
|
||||
* gl_FragCoord
|
||||
*/
|
||||
uint32_t draw_base;
|
||||
|
||||
/* Number of draws to generate */
|
||||
uint32_t draw_count;
|
||||
|
||||
/* Maximum number of draws (equals to draw_count for indirect draws without
|
||||
* an indirect count)
|
||||
*/
|
||||
uint32_t max_draw_count;
|
||||
|
||||
/* Instance multiplier for multi view */
|
||||
uint32_t instance_multiplier;
|
||||
|
||||
/* Address to jump to after the generated draws (only used with
|
||||
* indirect draw count variants)
|
||||
*/
|
||||
uint64_t end_addr;
|
||||
};
|
||||
|
||||
@@ -53,6 +74,9 @@ struct anv_generated_indirect_params {
|
||||
/* Global address of binding 1 */
|
||||
uint64_t generated_cmds_addr;
|
||||
|
||||
/* Global address of binding 2 */
|
||||
uint64_t draw_ids_addr;
|
||||
|
||||
/* CPU side pointer to the previous item when number of draws has to be
|
||||
* split into smaller chunks, see while loop in
|
||||
* genX(cmd_buffer_emit_indirect_generated_draws)
|
||||
|
@@ -2777,6 +2777,11 @@ struct anv_cmd_buffer {
|
||||
*/
|
||||
struct anv_address generation_return_addr;
|
||||
|
||||
/**
|
||||
* Binding table allocation for generation shaders (only used on Gfx9).
|
||||
*/
|
||||
struct anv_state generation_bt_state;
|
||||
|
||||
/** List of anv_batch_bo used for generation
|
||||
*
|
||||
* We have to keep this separate from the anv_cmd_buffer::batch_bos that is
|
||||
|
@@ -3550,12 +3550,8 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
|
||||
genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
|
||||
}
|
||||
|
||||
#define GFX_HAS_GENERATED_CMDS GFX_VER >= 11
|
||||
#if GFX_HAS_GENERATED_CMDS
|
||||
#include "genX_cmd_draw_generated_indirect.h"
|
||||
#endif
|
||||
|
||||
#if GFX_HAS_GENERATED_CMDS
|
||||
ALWAYS_INLINE static bool
|
||||
anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
|
||||
{
|
||||
@@ -3574,7 +3570,6 @@ anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
|
||||
return device->physical->generated_indirect_draws &&
|
||||
count >= device->physical->instance->generated_indirect_threshold;
|
||||
}
|
||||
#endif
|
||||
|
||||
VkResult
|
||||
genX(BeginCommandBuffer)(
|
||||
@@ -3791,9 +3786,7 @@ genX(EndCommandBuffer)(
|
||||
|
||||
anv_measure_endcommandbuffer(cmd_buffer);
|
||||
|
||||
#if GFX_HAS_GENERATED_CMDS
|
||||
genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
|
||||
#endif
|
||||
|
||||
/* Turn on object level preemption if it is disabled to have it in known
|
||||
* state at the beginning of new command buffer.
|
||||
@@ -3873,9 +3866,7 @@ genX(CmdExecuteCommands)(
|
||||
*/
|
||||
genX(cmd_buffer_apply_pipe_flushes)(primary);
|
||||
|
||||
#if GFX_HAS_GENERATED_CMDS
|
||||
genX(cmd_buffer_flush_generated_draws)(primary);
|
||||
#endif
|
||||
|
||||
for (uint32_t i = 0; i < commandBufferCount; i++) {
|
||||
ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
|
||||
@@ -4066,10 +4057,8 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
|
||||
anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
|
||||
anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
|
||||
|
||||
#if GFX_HAS_GENERATED_CMDS
|
||||
if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)
|
||||
genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
|
||||
#endif
|
||||
|
||||
anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
|
||||
}
|
||||
@@ -4721,7 +4710,6 @@ void genX(CmdDrawIndirect)(
|
||||
drawCount);
|
||||
trace_intel_begin_draw_indirect(&cmd_buffer->trace);
|
||||
|
||||
#if GFX_HAS_GENERATED_CMDS
|
||||
if (anv_use_generated_draws(cmd_buffer, drawCount)) {
|
||||
genX(cmd_buffer_emit_indirect_generated_draws)(
|
||||
cmd_buffer,
|
||||
@@ -4735,11 +4723,6 @@ void genX(CmdDrawIndirect)(
|
||||
anv_address_add(buffer->address, offset),
|
||||
stride, drawCount, false /* indexed */);
|
||||
}
|
||||
#else
|
||||
emit_indirect_draws(cmd_buffer,
|
||||
anv_address_add(buffer->address, offset),
|
||||
stride, drawCount, false /* indexed */);
|
||||
#endif
|
||||
|
||||
trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
|
||||
}
|
||||
@@ -4763,7 +4746,6 @@ void genX(CmdDrawIndexedIndirect)(
|
||||
drawCount);
|
||||
trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
|
||||
|
||||
#if GFX_HAS_GENERATED_CMDS
|
||||
if (anv_use_generated_draws(cmd_buffer, drawCount)) {
|
||||
genX(cmd_buffer_emit_indirect_generated_draws)(
|
||||
cmd_buffer,
|
||||
@@ -4777,11 +4759,6 @@ void genX(CmdDrawIndexedIndirect)(
|
||||
anv_address_add(buffer->address, offset),
|
||||
stride, drawCount, true /* indexed */);
|
||||
}
|
||||
#else
|
||||
emit_indirect_draws(cmd_buffer,
|
||||
anv_address_add(buffer->address, offset),
|
||||
stride, drawCount, true /* indexed */);
|
||||
#endif
|
||||
|
||||
trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
|
||||
}
|
||||
@@ -4966,7 +4943,6 @@ void genX(CmdDrawIndirectCount)(
|
||||
anv_address_add(count_buffer->address, countBufferOffset);
|
||||
stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
|
||||
|
||||
#if GFX_HAS_GENERATED_CMDS
|
||||
if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
|
||||
genX(cmd_buffer_emit_indirect_generated_draws)(
|
||||
cmd_buffer,
|
||||
@@ -4983,14 +4959,6 @@ void genX(CmdDrawIndirectCount)(
|
||||
maxDrawCount,
|
||||
false /* indexed */);
|
||||
}
|
||||
#else
|
||||
emit_indirect_count_draws(cmd_buffer,
|
||||
indirect_data_address,
|
||||
stride,
|
||||
count_address,
|
||||
maxDrawCount,
|
||||
false /* indexed */);
|
||||
#endif
|
||||
|
||||
trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount);
|
||||
}
|
||||
@@ -5023,7 +4991,6 @@ void genX(CmdDrawIndexedIndirectCount)(
|
||||
anv_address_add(count_buffer->address, countBufferOffset);
|
||||
stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
|
||||
|
||||
#if GFX_HAS_GENERATED_CMDS
|
||||
if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
|
||||
genX(cmd_buffer_emit_indirect_generated_draws)(
|
||||
cmd_buffer,
|
||||
@@ -5040,14 +5007,6 @@ void genX(CmdDrawIndexedIndirectCount)(
|
||||
maxDrawCount,
|
||||
true /* indexed */);
|
||||
}
|
||||
#else
|
||||
emit_indirect_count_draws(cmd_buffer,
|
||||
indirect_data_address,
|
||||
stride,
|
||||
count_address,
|
||||
maxDrawCount,
|
||||
true /* indexed */);
|
||||
#endif
|
||||
|
||||
trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount);
|
||||
|
||||
|
@@ -34,10 +34,6 @@
|
||||
#include "anv_private.h"
|
||||
#include "anv_generated_indirect_draws.h"
|
||||
|
||||
#if GFX_VER < 11
|
||||
#error "Generated draws optimization not supported prior to Gfx11"
|
||||
#endif
|
||||
|
||||
/* This is a maximum number of items a fragment shader can generate due to the
|
||||
* viewport size.
|
||||
*/
|
||||
@@ -92,7 +88,9 @@ genX(cmd_buffer_emit_generate_draws_pipeline)(struct anv_cmd_buffer *cmd_buffer)
|
||||
sgvs.InstanceIDComponentNumber = COMP_1;
|
||||
sgvs.InstanceIDElementOffset = 0;
|
||||
}
|
||||
#if GFX_VER >= 11
|
||||
anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
|
||||
#endif
|
||||
anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
|
||||
vfi.InstancingEnable = false;
|
||||
vfi.VertexElementIndex = 0;
|
||||
@@ -183,7 +181,7 @@ genX(cmd_buffer_emit_generate_draws_pipeline)(struct anv_cmd_buffer *cmd_buffer)
|
||||
|
||||
ps.VectorMaskEnable = prog_data->uses_vmask;
|
||||
|
||||
ps.BindingTableEntryCount = 0;
|
||||
ps.BindingTableEntryCount = GFX_VER == 9 ? 1 : 0;
|
||||
ps.PushConstantEnable = prog_data->base.nr_params > 0 ||
|
||||
prog_data->base.ubo_ranges[0].length;
|
||||
|
||||
@@ -254,7 +252,44 @@ genX(cmd_buffer_emit_generate_draws_pipeline)(struct anv_cmd_buffer *cmd_buffer)
|
||||
}
|
||||
#endif
|
||||
|
||||
cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0) | BITFIELD_BIT(1);
|
||||
#if GFX_VER == 9
|
||||
/* Allocate a binding table for Gfx9 for 2 reasons:
|
||||
*
|
||||
* 1. we need to emit a 3DSTATE_BINDING_TABLE_POINTERS_PS to make the
|
||||
* HW apply the preceding 3DSTATE_CONSTANT_PS
|
||||
*
|
||||
* 2. Emitting an empty 3DSTATE_BINDING_TABLE_POINTERS_PS would cause RT
|
||||
* writes (even though they're empty) to disturb later writes
|
||||
* (probably due to RT cache)
|
||||
*
|
||||
* Our binding table only has one entry to the null surface.
|
||||
*/
|
||||
uint32_t bt_offset;
|
||||
cmd_buffer->generation_bt_state =
|
||||
anv_cmd_buffer_alloc_binding_table(cmd_buffer, 1, &bt_offset);
|
||||
if (cmd_buffer->generation_bt_state.map == NULL) {
|
||||
VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
|
||||
if (result != VK_SUCCESS)
|
||||
return;
|
||||
|
||||
/* Re-emit state base addresses so we get the new surface state base
|
||||
* address before we start emitting binding tables etc.
|
||||
*/
|
||||
genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
|
||||
|
||||
cmd_buffer->generation_bt_state =
|
||||
anv_cmd_buffer_alloc_binding_table(cmd_buffer, 1, &bt_offset);
|
||||
assert(cmd_buffer->generation_bt_state.map != NULL);
|
||||
}
|
||||
|
||||
uint32_t *bt_map = cmd_buffer->generation_bt_state.map;
|
||||
bt_map[0] = anv_bindless_state_for_binding_table(
|
||||
cmd_buffer->device->null_surface_state).offset + bt_offset;
|
||||
|
||||
cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
|
||||
#endif
|
||||
|
||||
cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0);
|
||||
cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_INDEX_BUFFER |
|
||||
ANV_CMD_DIRTY_XFB_ENABLE);
|
||||
cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
|
||||
@@ -326,10 +361,20 @@ genX(cmd_buffer_emit_generated_push_data)(struct anv_cmd_buffer *cmd_buffer,
|
||||
.ConstantBufferReadLength = DIV_ROUND_UP(push_data_state.alloc_size, 32),
|
||||
});
|
||||
#else
|
||||
/* The Skylake PRM contains the following restriction:
|
||||
*
|
||||
* "The driver must ensure The following case does not occur
|
||||
* without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
|
||||
* buffer 3 read length equal to zero committed followed by a
|
||||
* 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
|
||||
* zero committed."
|
||||
*
|
||||
* To avoid this, we program the highest slot.
|
||||
*/
|
||||
anv_batch_emit(batch, GENX(3DSTATE_CONSTANT_PS), c) {
|
||||
c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
|
||||
c.ConstantBody.ReadLength[0] = DIV_ROUND_UP(push_data_state.alloc_size, 32);
|
||||
c.ConstantBody.Buffer[0] = push_data_addr;
|
||||
c.ConstantBody.ReadLength[3] = DIV_ROUND_UP(push_data_state.alloc_size, 32);
|
||||
c.ConstantBody.Buffer[3] = push_data_addr;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -340,6 +385,7 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
|
||||
uint32_t generated_cmd_stride,
|
||||
struct anv_address indirect_data_addr,
|
||||
uint32_t indirect_data_stride,
|
||||
struct anv_address draw_id_addr,
|
||||
uint32_t item_base,
|
||||
uint32_t item_count,
|
||||
struct anv_address count_addr,
|
||||
@@ -356,15 +402,21 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
|
||||
ANV_UBO_ALIGNMENT);
|
||||
|
||||
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
|
||||
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
|
||||
|
||||
struct anv_generated_indirect_params *push_data = push_data_state.map;
|
||||
*push_data = (struct anv_generated_indirect_params) {
|
||||
.draw = {
|
||||
.draw_id_addr = anv_address_physical(draw_id_addr),
|
||||
.indirect_data_addr = anv_address_physical(indirect_data_addr),
|
||||
.indirect_data_stride = indirect_data_stride,
|
||||
.flags = (indexed ? ANV_GENERATED_FLAG_INDEXED : 0) |
|
||||
(cmd_buffer->state.conditional_render_enabled ?
|
||||
ANV_GENERATED_FLAG_PREDICATED : 0) |
|
||||
((vs_prog_data->uses_firstvertex ||
|
||||
vs_prog_data->uses_baseinstance) ?
|
||||
ANV_GENERATED_FLAG_BASE : 0) |
|
||||
(vs_prog_data->uses_drawid ? ANV_GENERATED_FLAG_DRAWID : 0) |
|
||||
(anv_mocs(cmd_buffer->device, indirect_data_addr.bo,
|
||||
ISL_SURF_USAGE_VERTEX_BUFFER_BIT) << 8) |
|
||||
((generated_cmd_stride / 4) << 16),
|
||||
@@ -378,6 +430,7 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
|
||||
},
|
||||
.indirect_data_addr = anv_address_physical(indirect_data_addr),
|
||||
.generated_cmds_addr = anv_address_physical(generated_cmds_addr),
|
||||
.draw_ids_addr = anv_address_physical(draw_id_addr),
|
||||
};
|
||||
|
||||
if (!anv_address_is_null(count_addr)) {
|
||||
@@ -405,6 +458,15 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
|
||||
/* Only emit the data after the memcpy above. */
|
||||
genX(cmd_buffer_emit_generated_push_data)(cmd_buffer, push_data_state);
|
||||
|
||||
#if GFX_VER == 9
|
||||
/* Why are the push constants not flushed without a binding table
|
||||
* update??
|
||||
*/
|
||||
anv_batch_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), btp) {
|
||||
btp.PointertoPSBindingTable = cmd_buffer->generation_bt_state.offset;
|
||||
}
|
||||
#endif
|
||||
|
||||
anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) {
|
||||
prim.VertexAccessType = SEQUENTIAL;
|
||||
prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
|
||||
@@ -440,6 +502,58 @@ genX(cmd_buffer_emit_indirect_generated_draws_init)(struct anv_cmd_buffer *cmd_b
|
||||
trace_intel_end_generate_draws(&cmd_buffer->trace);
|
||||
|
||||
genX(cmd_buffer_emit_generate_draws_pipeline)(cmd_buffer);
|
||||
|
||||
}
|
||||
|
||||
static struct anv_address
|
||||
genX(cmd_buffer_get_draw_id_addr)(struct anv_cmd_buffer *cmd_buffer,
|
||||
uint32_t draw_id_count)
|
||||
{
|
||||
#if GFX_VER >= 11
|
||||
return ANV_NULL_ADDRESS;
|
||||
#else
|
||||
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
|
||||
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
|
||||
if (!vs_prog_data->uses_drawid)
|
||||
return ANV_NULL_ADDRESS;
|
||||
|
||||
struct anv_state draw_id_state =
|
||||
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4 * draw_id_count, 4);
|
||||
return anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
|
||||
draw_id_state);
|
||||
#endif
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
genX(cmd_buffer_get_generated_draw_stride)(struct anv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
/* With the extended parameters in 3DPRIMITIVE on Gfx11+ we can emit
|
||||
* everything. Prior to this, we need to emit a couple of
|
||||
* VERTEX_BUFFER_STATE.
|
||||
*/
|
||||
#if GFX_VER >= 11
|
||||
return 4 * GENX(3DPRIMITIVE_EXTENDED_length);
|
||||
#else
|
||||
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
|
||||
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
|
||||
|
||||
uint32_t len = 0;
|
||||
|
||||
if (vs_prog_data->uses_firstvertex ||
|
||||
vs_prog_data->uses_baseinstance ||
|
||||
vs_prog_data->uses_drawid) {
|
||||
len += 4; /* 3DSTATE_VERTEX_BUFFERS */
|
||||
|
||||
if (vs_prog_data->uses_firstvertex ||
|
||||
vs_prog_data->uses_baseinstance)
|
||||
len += 4 * GENX(VERTEX_BUFFER_STATE_length);
|
||||
|
||||
if (vs_prog_data->uses_drawid)
|
||||
len += 4 * GENX(VERTEX_BUFFER_STATE_length);
|
||||
}
|
||||
|
||||
return len + 4 * GENX(3DPRIMITIVE_length);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -465,14 +579,54 @@ genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer
|
||||
uint32_t max_draw_count,
|
||||
bool indexed)
|
||||
{
|
||||
const bool start_generation_batch =
|
||||
anv_address_is_null(cmd_buffer->generation_return_addr);
|
||||
|
||||
genX(flush_pipeline_select_3d)(cmd_buffer);
|
||||
|
||||
struct anv_address draw_id_addr =
|
||||
genX(cmd_buffer_get_draw_id_addr)(cmd_buffer, max_draw_count);
|
||||
|
||||
#if GFX_VER == 9
|
||||
/* Mark the VB-0 as using the entire dynamic state pool area, but only for
|
||||
* the draw call starting the generation batch. All the following ones will
|
||||
* use the same area.
|
||||
*/
|
||||
if (start_generation_batch) {
|
||||
genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 0,
|
||||
(struct anv_address) {
|
||||
.offset = DYNAMIC_STATE_POOL_MIN_ADDRESS,
|
||||
},
|
||||
DYNAMIC_STATE_POOL_SIZE);
|
||||
}
|
||||
|
||||
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
|
||||
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
|
||||
|
||||
if (vs_prog_data->uses_baseinstance ||
|
||||
vs_prog_data->uses_firstvertex) {
|
||||
/* We're using the indirect buffer directly to source base instance &
|
||||
* first vertex values. Mark the entire area as used.
|
||||
*/
|
||||
genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
|
||||
indirect_data_addr,
|
||||
indirect_data_stride * max_draw_count);
|
||||
}
|
||||
|
||||
if (vs_prog_data->uses_drawid) {
|
||||
/* Mark the whole draw id buffer as used. */
|
||||
genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
|
||||
draw_id_addr,
|
||||
sizeof(uint32_t) * max_draw_count);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Apply the pipeline flush here so the indirect data is available for the
|
||||
* generation shader.
|
||||
*/
|
||||
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
|
||||
|
||||
if (anv_address_is_null(cmd_buffer->generation_return_addr))
|
||||
if (start_generation_batch)
|
||||
genX(cmd_buffer_emit_indirect_generated_draws_init)(cmd_buffer);
|
||||
|
||||
/* In order to have the vertex fetch gather the data we need to have a non
|
||||
@@ -493,7 +647,8 @@ genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer
|
||||
/* Emit the 3D state in the main batch. */
|
||||
genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
|
||||
|
||||
const uint32_t draw_cmd_stride = 4 * GENX(3DPRIMITIVE_EXTENDED_length);
|
||||
const uint32_t draw_cmd_stride =
|
||||
genX(cmd_buffer_get_generated_draw_stride)(cmd_buffer);
|
||||
|
||||
struct anv_generated_indirect_params *last_params = NULL;
|
||||
uint32_t item_base = 0;
|
||||
@@ -522,6 +677,7 @@ genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer
|
||||
anv_address_add(indirect_data_addr,
|
||||
item_base * indirect_data_stride),
|
||||
indirect_data_stride,
|
||||
anv_address_add(draw_id_addr, 4 * item_base),
|
||||
item_base,
|
||||
item_count,
|
||||
count_addr,
|
||||
@@ -537,6 +693,10 @@ genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer
|
||||
}
|
||||
|
||||
genX(cmd_buffer_rewrite_forward_end_addr)(cmd_buffer, last_params);
|
||||
|
||||
#if GFX_VER == 9
|
||||
update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, indexed ? RANDOM : SEQUENTIAL);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -552,6 +712,9 @@ genX(cmd_buffer_flush_generated_draws)(struct anv_cmd_buffer *cmd_buffer)
|
||||
genX(emit_apply_pipe_flushes)(batch,
|
||||
cmd_buffer->device,
|
||||
_3D,
|
||||
#if GFX_VER == 9
|
||||
ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
|
||||
#endif
|
||||
ANV_PIPE_DATA_CACHE_FLUSH_BIT |
|
||||
ANV_PIPE_CS_STALL_BIT);
|
||||
|
||||
|
@@ -25,8 +25,10 @@
|
||||
|
||||
#define ANV_GENERATED_FLAG_INDEXED BITFIELD_BIT(0)
|
||||
#define ANV_GENERATED_FLAG_PREDICATED BITFIELD_BIT(1)
|
||||
#define ANV_GENERATED_FLAG_DRAWID BITFIELD_BIT(2)
|
||||
#define ANV_GENERATED_FLAG_BASE BITFIELD_BIT(3)
|
||||
|
||||
/* These 2 bindings will be accessed through A64 messages */
|
||||
/* These 3 bindings will be accessed through A64 messages */
|
||||
layout(set = 0, binding = 0, std430) buffer Storage0 {
|
||||
uint indirect_data[];
|
||||
};
|
||||
@@ -35,8 +37,13 @@ layout(set = 0, binding = 1, std430) buffer Storage1 {
|
||||
uint commands[];
|
||||
};
|
||||
|
||||
layout(set = 0, binding = 2, std430) buffer Storage2 {
|
||||
uint draw_ids[];
|
||||
};
|
||||
|
||||
/* This data will be provided through push constants. */
|
||||
layout(set = 0, binding = 2) uniform block {
|
||||
layout(set = 0, binding = 3) uniform block {
|
||||
uint64_t draw_id_addr;
|
||||
uint64_t indirect_data_addr;
|
||||
uint indirect_data_stride;
|
||||
uint flags;
|
||||
@@ -47,6 +54,44 @@ layout(set = 0, binding = 2) uniform block {
|
||||
uint64_t end_addr;
|
||||
};
|
||||
|
||||
/* Writes a 4-dword VERTEX_BUFFER_STATE entry into the commands buffer,
 * starting at dword index write_offset.
 *
 * The entry is written with a zero pitch, not null, and with Address Modify
 * Enable set; mocs, buffer_idx, address and size fill the remaining fields.
 */
void write_VERTEX_BUFFER_STATE(uint write_offset,
                               uint mocs,
                               uint buffer_idx,
                               uint64_t address,
                               uint size)
{
   uint control_dw = (0 << 0 |           /* Buffer Pitch */
                      0 << 13 |          /* Null Vertex Buffer */
                      1 << 14 |          /* Address Modify Enable */
                      mocs << 16 |       /* MOCS */
                      buffer_idx << 26); /* Vertex Buffer Index */

   /* The 64-bit address is stored as two dwords, low half first. */
   uint address_lo = uint(address & 0xffffffff);
   uint address_hi = uint(address >> 32);

   commands[write_offset + 0] = control_dw;
   commands[write_offset + 1] = address_lo;
   commands[write_offset + 2] = address_hi;
   commands[write_offset + 3] = size;
}
|
||||
|
||||
/* Writes a 7-dword 3DPRIMITIVE instruction into the commands buffer,
 * starting at dword index write_offset.
 *
 * is_predicated sets bit 8 of the header dword and is_indexed sets bit 8 of
 * the second dword; the remaining arguments are copied verbatim into the
 * following five dwords.
 */
void write_3DPRIMITIVE(uint write_offset,
                       bool is_predicated,
                       bool is_indexed,
                       uint vertex_count_per_instance,
                       uint start_vertex_location,
                       uint instance_count,
                       uint start_instance_location,
                       uint base_vertex_location)
{
   uint header_dw = (3 << 29 |                  /* Command Type */
                     3 << 27 |                  /* Command SubType */
                     3 << 24 |                  /* 3D Command Opcode */
                     uint(is_predicated) << 8 |
                     5 << 0);                   /* DWord Length */
   uint access_dw = uint(is_indexed) << 8;

   commands[write_offset + 0] = header_dw;
   commands[write_offset + 1] = access_dw;
   commands[write_offset + 2] = vertex_count_per_instance;
   commands[write_offset + 3] = start_vertex_location;
   commands[write_offset + 4] = instance_count;
   commands[write_offset + 5] = start_instance_location;
   commands[write_offset + 6] = base_vertex_location;
}
|
||||
|
||||
void write_3DPRIMITIVE_EXTENDED(uint write_offset,
|
||||
bool is_predicated,
|
||||
bool is_indexed,
|
||||
|
144
src/intel/vulkan/shaders/gfx9_generated_draws.glsl
Normal file
144
src/intel/vulkan/shaders/gfx9_generated_draws.glsl
Normal file
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
* Copyright © 2022 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#version 450
|
||||
#extension GL_ARB_gpu_shader_int64 : enable
|
||||
#extension GL_GOOGLE_include_directive : enable
|
||||
|
||||
#include "common_generated_draws.glsl"
|
||||
|
||||
/* Gfx9 draw generation shader.
 *
 * Each fragment handles one item: it reads one indirect draw command from
 * indirect_data and writes the matching hardware commands into commands
 * (an optional 3DSTATE_VERTEX_BUFFERS packet followed by a 3DPRIMITIVE).
 * The fragment whose draw ID equals draw_count writes a jump to end_addr
 * instead, when draw_count is below max_draw_count.
 */
void main()
{
   /* Unpack the flags word: bits 0-7 hold the ANV_GENERATED_FLAG_* bits,
    * bits 8-15 the MOCS and bits 16-23 the size in dwords of the commands
    * generated for one draw.
    */
   bool is_indexed = (flags & ANV_GENERATED_FLAG_INDEXED) != 0;
   bool is_predicated = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0;
   bool uses_base = (flags & ANV_GENERATED_FLAG_BASE) != 0;
   bool uses_drawid = (flags & ANV_GENERATED_FLAG_DRAWID) != 0;
   uint mocs = (flags >> 8) & 0xff;
   uint _3dprim_dw_size = (flags >> 16) & 0xff;

   /* item_idx is local to this dispatch (8192 items per row of fragments),
    * draw_id is the global draw number.
    */
   uint item_idx = uint(gl_FragCoord.y) * 8192 + uint(gl_FragCoord.x);
   uint indirect_data_offset = item_idx * indirect_data_stride / 4;
   uint cmd_idx = item_idx * _3dprim_dw_size;
   uint draw_id = draw_base + item_idx;

   if (draw_id < draw_count) {
      if (is_indexed) {
         /* Loading a VkDrawIndexedIndirectCommand */
         uint index_count    = indirect_data[indirect_data_offset + 0];
         uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier;
         uint first_index    = indirect_data[indirect_data_offset + 2];
         uint vertex_offset  = indirect_data[indirect_data_offset + 3];
         uint first_instance = indirect_data[indirect_data_offset + 4];

         if (uses_base || uses_drawid) {
            /* 3DSTATE_VERTEX_BUFFERS: 1 header dword + 4 dwords per
             * VERTEX_BUFFER_STATE entry.
             */
            uint state_vertex_len =
               1 + (uses_base ? 4 : 0) + (uses_drawid ? 4 : 0);
            commands[cmd_idx] =
               (3 << 29 |                     /* Command Type */
                3 << 27 |                     /* Command SubType */
                0 << 24 |                     /* 3D Command Opcode */
                8 << 16 |                     /* 3D Command Sub Opcode */
                (state_vertex_len - 2) << 0); /* DWord Length */
            cmd_idx += 1;
            if (uses_base) {
               /* Point the vertex buffer at the 8 bytes starting at byte
                * offset 12 of the command, i.e. the vertex_offset and
                * first_instance dwords loaded above.
                */
               uint64_t indirect_draw_data_addr =
                  indirect_data_addr + item_idx * indirect_data_stride + 12;
               write_VERTEX_BUFFER_STATE(cmd_idx,
                                         mocs,
                                         31,
                                         indirect_draw_data_addr,
                                         8);
               cmd_idx += 4;
            }
            if (uses_drawid) {
               uint64_t draw_idx_addr = draw_id_addr + 4 * item_idx;
               /* Store the draw ID in the slot the vertex buffer below
                * points at. The slot is indexed by item_idx, not draw_id:
                * indexing by draw_id would add draw_base a second time and
                * write to a different (possibly out of bounds) location
                * whenever draw_base is not zero. This assumes draw_ids is
                * bound at draw_id_addr.
                */
               draw_ids[item_idx] = draw_id;
               write_VERTEX_BUFFER_STATE(cmd_idx,
                                         mocs,
                                         32,
                                         draw_idx_addr,
                                         4);
               cmd_idx += 4;
            }
         }
         write_3DPRIMITIVE(cmd_idx,
                           is_predicated,
                           is_indexed,
                           index_count,
                           first_index,
                           instance_count,
                           first_instance,
                           vertex_offset);
      } else {
         /* Loading a VkDrawIndirectCommand structure */
         uint vertex_count   = indirect_data[indirect_data_offset + 0];
         uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier;
         uint first_vertex   = indirect_data[indirect_data_offset + 2];
         uint first_instance = indirect_data[indirect_data_offset + 3];

         if (uses_base || uses_drawid) {
            /* 3DSTATE_VERTEX_BUFFERS: 1 header dword + 4 dwords per
             * VERTEX_BUFFER_STATE entry.
             */
            uint state_vertex_len =
               1 + (uses_base ? 4 : 0) + (uses_drawid ? 4 : 0);
            commands[cmd_idx] =
               (3 << 29 |                     /* Command Type */
                3 << 27 |                     /* Command SubType */
                0 << 24 |                     /* 3D Command Opcode */
                8 << 16 |                     /* 3D Command Sub Opcode */
                (state_vertex_len - 2) << 0); /* DWord Length */
            cmd_idx += 1;
            if (uses_base) {
               /* Point the vertex buffer at the 8 bytes starting at byte
                * offset 8 of the command, i.e. the first_vertex and
                * first_instance dwords loaded above.
                */
               uint64_t indirect_draw_data_addr =
                  indirect_data_addr + item_idx * indirect_data_stride + 8;
               write_VERTEX_BUFFER_STATE(cmd_idx,
                                         mocs,
                                         31,
                                         indirect_draw_data_addr,
                                         8);
               cmd_idx += 4;
            }
            if (uses_drawid) {
               uint64_t draw_idx_addr = draw_id_addr + 4 * item_idx;
               /* Same slot as the vertex buffer below, see the indexed
                * path: index by item_idx, not draw_id.
                */
               draw_ids[item_idx] = draw_id;
               write_VERTEX_BUFFER_STATE(cmd_idx,
                                         mocs,
                                         32,
                                         draw_idx_addr,
                                         4);
               cmd_idx += 4;
            }
         }
         write_3DPRIMITIVE(cmd_idx,
                           is_predicated,
                           is_indexed,
                           vertex_count,
                           first_vertex,
                           instance_count,
                           first_instance,
                           0 /* base_vertex_location */);
      }
   } else if (draw_id == draw_count && draw_id < max_draw_count) {
      /* Only write a jump forward in the batch if we have fewer elements than
       * the max draw count.
       */
      write_MI_BATCH_BUFFER_START(cmd_idx, end_addr);
   }
}
|
@@ -33,6 +33,7 @@ float64_spv_h = custom_target(
|
||||
)
|
||||
|
||||
generated_draws_shaders = [
|
||||
'gfx9_generated_draws.glsl',
|
||||
'gfx11_generated_draws.glsl',
|
||||
]
|
||||
|
||||
|
Reference in New Issue
Block a user