anv: add gfx9 generated draw support

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20497>
This commit is contained in:
Lionel Landwerlin
2023-01-02 00:58:46 +02:00
committed by Marge Bot
parent 3ac7d5f258
commit 1d9cf8f381
10 changed files with 432 additions and 64 deletions

View File

@@ -134,6 +134,7 @@ anv_create_cmd_buffer(struct vk_command_pool *pool,
cmd_buffer->generation_jump_addr = ANV_NULL_ADDRESS;
cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS;
cmd_buffer->generation_bt_state = ANV_STATE_NULL;
anv_cmd_state_init(cmd_buffer);
@@ -201,6 +202,7 @@ anv_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
cmd_buffer->generation_jump_addr = ANV_NULL_ADDRESS;
cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS;
cmd_buffer->generation_bt_state = ANV_STATE_NULL;
anv_state_stream_finish(&cmd_buffer->surface_state_stream);
anv_state_stream_init(&cmd_buffer->surface_state_stream,

View File

@@ -897,7 +897,6 @@ anv_physical_device_try_create(struct vk_instance *vk_instance,
device->generated_indirect_draws =
device->info.ver >= 11 &&
debug_get_bool_option("ANV_ENABLE_GENERATED_INDIRECT_DRAWS",
true);

View File

@@ -31,6 +31,7 @@
#include "anv_generated_indirect_draws.h"
#include "shaders/gfx9_generated_draws_spv.h"
#include "shaders/gfx11_generated_draws_spv.h"
/* This pass takes vulkan descriptor bindings 0 & 1 and turns them into global
@@ -101,7 +102,27 @@ lower_vulkan_descriptors_instr(nir_builder *b, nir_instr *instr, void *cb_data)
break;
}
case 2:
case 2: {
desc_value =
nir_load_ubo(b, 1, 64,
nir_imm_int(b, 2),
nir_imm_int(b,
offsetof(struct anv_generated_indirect_params,
draw_ids_addr)),
.align_mul = 8,
.align_offset = 0,
.range_base = 0,
.range = ~0);
desc_value =
nir_vec4(b,
nir_unpack_64_2x32_split_x(b, desc_value),
nir_unpack_64_2x32_split_y(b, desc_value),
nir_imm_int(b, 0),
nir_imm_int(b, 0));
break;
}
case 3:
desc_value =
nir_vec2(b,
nir_imm_int(b, 2),
@@ -276,9 +297,6 @@ compile_upload_spirv(struct anv_device *device,
VkResult
anv_device_init_generated_indirect_draws(struct anv_device *device)
{
if (device->info->ver < 11)
return VK_SUCCESS;
const struct intel_l3_weights w =
intel_get_default_l3_weights(device->info,
true /* wants_dc_cache */,
@@ -298,16 +316,24 @@ anv_device_init_generated_indirect_draws(struct anv_device *device)
sizeof(indirect_draws_key),
NULL);
if (device->generated_draw_kernel == NULL) {
const uint32_t *spirv_source =
device->info->ver >= 11 ?
gfx11_generated_draws_spv_source :
gfx9_generated_draws_spv_source;
const uint32_t spirv_source_size =
device->info->ver >= 11 ?
ARRAY_SIZE(gfx11_generated_draws_spv_source) :
ARRAY_SIZE(gfx9_generated_draws_spv_source);
const uint32_t send_count =
device->info->ver >= 11 ?
11 /* 2 * (2 loads + 3 stores) + 1 store */ :
17 /* 2 * (2 loads + 6 stores) + 1 store */;
device->generated_draw_kernel =
compile_upload_spirv(device,
&indirect_draws_key,
sizeof(indirect_draws_key),
gfx11_generated_draws_spv_source,
ARRAY_SIZE(gfx11_generated_draws_spv_source),
11 /*
* 2 * (2 indirect data loads + 3 3DPRIMITVE stores) +
* 1 store (MI_BATCH_BUFFER_START)
*/);
spirv_source, spirv_source_size, send_count);
}
if (device->generated_draw_kernel == NULL)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

View File

@@ -28,19 +28,40 @@
#define ANV_GENERATED_FLAG_INDEXED BITFIELD_BIT(0)
#define ANV_GENERATED_FLAG_PREDICATED BITFIELD_BIT(1)
#define ANV_GENERATED_FLAG_DRAWID BITFIELD_BIT(2)
#define ANV_GENERATED_FLAG_BASE BITFIELD_BIT(3)
/* Parameters consumed by the draw generation shader, provided to it through
* push constants. The field order and sizes need to match the uniform block
* declared in common_generated_draws.glsl :
*
* layout(set = 0, binding = 3) uniform block
*/
struct anv_generated_indirect_draw_params {
/* Draw ID buffer address (only used on Gfx9) */
uint64_t draw_id_addr;
/* Indirect data buffer address (only used on Gfx9) */
uint64_t indirect_data_addr;
/* Stride, in bytes, between consecutive elements of the indirect data
* buffer
*/
uint32_t indirect_data_stride;
/* Packed field: bits 0-7 hold the ANV_GENERATED_FLAG_* bits, bits 8-15 the
* MOCS value, bits 16-23 the size in dwords of the commands generated for
* one draw.
*/
uint32_t flags; /* 0-7: bits, 8-15: mocs, 16-23: cmd_dws */
/* Base number of the draw ID, it is added to the index computed from the
* gl_FragCoord
*/
uint32_t draw_base;
/* Number of draws to generate */
uint32_t draw_count;
/* Maximum number of draws (equal to draw_count for indirect draws without
* an indirect count)
*/
uint32_t max_draw_count;
/* Instance multiplier for multi view */
uint32_t instance_multiplier;
/* Address to jump to after the generated draws (only used with the
* indirect draw count variants)
*/
uint64_t end_addr;
};
@@ -53,6 +74,9 @@ struct anv_generated_indirect_params {
/* Global address of binding 1 */
uint64_t generated_cmds_addr;
/* Global address of binding 2 */
uint64_t draw_ids_addr;
/* CPU side pointer to the previous item when number of draws has to be
* split into smaller chunks, see while loop in
* genX(cmd_buffer_emit_indirect_generated_draws)

View File

@@ -2777,6 +2777,11 @@ struct anv_cmd_buffer {
*/
struct anv_address generation_return_addr;
/**
* Binding table allocation for generation shaders (only used on Gfx9).
*/
struct anv_state generation_bt_state;
/** List of anv_batch_bo used for generation
*
* We have to keep this separated of the anv_cmd_buffer::batch_bos that is

View File

@@ -3550,12 +3550,8 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
}
#define GFX_HAS_GENERATED_CMDS GFX_VER >= 11
#if GFX_HAS_GENERATED_CMDS
#include "genX_cmd_draw_generated_indirect.h"
#endif
#if GFX_HAS_GENERATED_CMDS
ALWAYS_INLINE static bool
anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
{
@@ -3574,7 +3570,6 @@ anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
return device->physical->generated_indirect_draws &&
count >= device->physical->instance->generated_indirect_threshold;
}
#endif
VkResult
genX(BeginCommandBuffer)(
@@ -3791,9 +3786,7 @@ genX(EndCommandBuffer)(
anv_measure_endcommandbuffer(cmd_buffer);
#if GFX_HAS_GENERATED_CMDS
genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
#endif
/* Turn on object level preemption if it is disabled to have it in known
* state at the beginning of new command buffer.
@@ -3873,9 +3866,7 @@ genX(CmdExecuteCommands)(
*/
genX(cmd_buffer_apply_pipe_flushes)(primary);
#if GFX_HAS_GENERATED_CMDS
genX(cmd_buffer_flush_generated_draws)(primary);
#endif
for (uint32_t i = 0; i < commandBufferCount; i++) {
ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
@@ -4066,10 +4057,8 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
#if GFX_HAS_GENERATED_CMDS
if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)
genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
#endif
anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
}
@@ -4721,7 +4710,6 @@ void genX(CmdDrawIndirect)(
drawCount);
trace_intel_begin_draw_indirect(&cmd_buffer->trace);
#if GFX_HAS_GENERATED_CMDS
if (anv_use_generated_draws(cmd_buffer, drawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws)(
cmd_buffer,
@@ -4735,11 +4723,6 @@ void genX(CmdDrawIndirect)(
anv_address_add(buffer->address, offset),
stride, drawCount, false /* indexed */);
}
#else
emit_indirect_draws(cmd_buffer,
anv_address_add(buffer->address, offset),
stride, drawCount, false /* indexed */);
#endif
trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
}
@@ -4763,7 +4746,6 @@ void genX(CmdDrawIndexedIndirect)(
drawCount);
trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
#if GFX_HAS_GENERATED_CMDS
if (anv_use_generated_draws(cmd_buffer, drawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws)(
cmd_buffer,
@@ -4777,11 +4759,6 @@ void genX(CmdDrawIndexedIndirect)(
anv_address_add(buffer->address, offset),
stride, drawCount, true /* indexed */);
}
#else
emit_indirect_draws(cmd_buffer,
anv_address_add(buffer->address, offset),
stride, drawCount, true /* indexed */);
#endif
trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
}
@@ -4966,7 +4943,6 @@ void genX(CmdDrawIndirectCount)(
anv_address_add(count_buffer->address, countBufferOffset);
stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
#if GFX_HAS_GENERATED_CMDS
if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws)(
cmd_buffer,
@@ -4983,14 +4959,6 @@ void genX(CmdDrawIndirectCount)(
maxDrawCount,
false /* indexed */);
}
#else
emit_indirect_count_draws(cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
false /* indexed */);
#endif
trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount);
}
@@ -5023,7 +4991,6 @@ void genX(CmdDrawIndexedIndirectCount)(
anv_address_add(count_buffer->address, countBufferOffset);
stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
#if GFX_HAS_GENERATED_CMDS
if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
genX(cmd_buffer_emit_indirect_generated_draws)(
cmd_buffer,
@@ -5040,14 +5007,6 @@ void genX(CmdDrawIndexedIndirectCount)(
maxDrawCount,
true /* indexed */);
}
#else
emit_indirect_count_draws(cmd_buffer,
indirect_data_address,
stride,
count_address,
maxDrawCount,
true /* indexed */);
#endif
trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount);

View File

@@ -34,10 +34,6 @@
#include "anv_private.h"
#include "anv_generated_indirect_draws.h"
#if GFX_VER < 11
#error "Generated draws optimization not supported prior to Gfx11"
#endif
/* This is a maximum number of items a fragment shader can generate due to the
* viewport size.
*/
@@ -92,7 +88,9 @@ genX(cmd_buffer_emit_generate_draws_pipeline)(struct anv_cmd_buffer *cmd_buffer)
sgvs.InstanceIDComponentNumber = COMP_1;
sgvs.InstanceIDElementOffset = 0;
}
#if GFX_VER >= 11
anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif
anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
vfi.InstancingEnable = false;
vfi.VertexElementIndex = 0;
@@ -183,7 +181,7 @@ genX(cmd_buffer_emit_generate_draws_pipeline)(struct anv_cmd_buffer *cmd_buffer)
ps.VectorMaskEnable = prog_data->uses_vmask;
ps.BindingTableEntryCount = 0;
ps.BindingTableEntryCount = GFX_VER == 9 ? 1 : 0;
ps.PushConstantEnable = prog_data->base.nr_params > 0 ||
prog_data->base.ubo_ranges[0].length;
@@ -254,7 +252,44 @@ genX(cmd_buffer_emit_generate_draws_pipeline)(struct anv_cmd_buffer *cmd_buffer)
}
#endif
cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0) | BITFIELD_BIT(1);
#if GFX_VER == 9
/* Allocate a binding table for Gfx9 for 2 reasons :
*
* 1. we need to emit a 3DSTATE_BINDING_TABLE_POINTERS_PS to make the
* HW apply the preceding 3DSTATE_CONSTANT_PS
*
* 2. Emitting an empty 3DSTATE_BINDING_TABLE_POINTERS_PS would cause RT
* writes (even though they're empty) to disturb later writes
* (probably due to RT cache)
*
* Our binding table only has one entry to the null surface.
*/
uint32_t bt_offset;
cmd_buffer->generation_bt_state =
anv_cmd_buffer_alloc_binding_table(cmd_buffer, 1, &bt_offset);
if (cmd_buffer->generation_bt_state.map == NULL) {
VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
if (result != VK_SUCCESS)
return;
/* Re-emit state base addresses so we get the new surface state base
* address before we start emitting binding tables etc.
*/
genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
cmd_buffer->generation_bt_state =
anv_cmd_buffer_alloc_binding_table(cmd_buffer, 1, &bt_offset);
assert(cmd_buffer->generation_bt_state.map != NULL);
}
uint32_t *bt_map = cmd_buffer->generation_bt_state.map;
bt_map[0] = anv_bindless_state_for_binding_table(
cmd_buffer->device->null_surface_state).offset + bt_offset;
cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
#endif
cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0);
cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_INDEX_BUFFER |
ANV_CMD_DIRTY_XFB_ENABLE);
cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
@@ -326,10 +361,20 @@ genX(cmd_buffer_emit_generated_push_data)(struct anv_cmd_buffer *cmd_buffer,
.ConstantBufferReadLength = DIV_ROUND_UP(push_data_state.alloc_size, 32),
});
#else
/* The Skylake PRM contains the following restriction:
*
* "The driver must ensure The following case does not occur
* without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
* buffer 3 read length equal to zero committed followed by a
* 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
* zero committed."
*
* To avoid this, we program the highest slot.
*/
anv_batch_emit(batch, GENX(3DSTATE_CONSTANT_PS), c) {
c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
c.ConstantBody.ReadLength[0] = DIV_ROUND_UP(push_data_state.alloc_size, 32);
c.ConstantBody.Buffer[0] = push_data_addr;
c.ConstantBody.ReadLength[3] = DIV_ROUND_UP(push_data_state.alloc_size, 32);
c.ConstantBody.Buffer[3] = push_data_addr;
}
#endif
}
@@ -340,6 +385,7 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
uint32_t generated_cmd_stride,
struct anv_address indirect_data_addr,
uint32_t indirect_data_stride,
struct anv_address draw_id_addr,
uint32_t item_base,
uint32_t item_count,
struct anv_address count_addr,
@@ -356,15 +402,21 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
ANV_UBO_ALIGNMENT);
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
struct anv_generated_indirect_params *push_data = push_data_state.map;
*push_data = (struct anv_generated_indirect_params) {
.draw = {
.draw_id_addr = anv_address_physical(draw_id_addr),
.indirect_data_addr = anv_address_physical(indirect_data_addr),
.indirect_data_stride = indirect_data_stride,
.flags = (indexed ? ANV_GENERATED_FLAG_INDEXED : 0) |
(cmd_buffer->state.conditional_render_enabled ?
ANV_GENERATED_FLAG_PREDICATED : 0) |
((vs_prog_data->uses_firstvertex ||
vs_prog_data->uses_baseinstance) ?
ANV_GENERATED_FLAG_BASE : 0) |
(vs_prog_data->uses_drawid ? ANV_GENERATED_FLAG_DRAWID : 0) |
(anv_mocs(cmd_buffer->device, indirect_data_addr.bo,
ISL_SURF_USAGE_VERTEX_BUFFER_BIT) << 8) |
((generated_cmd_stride / 4) << 16),
@@ -378,6 +430,7 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
},
.indirect_data_addr = anv_address_physical(indirect_data_addr),
.generated_cmds_addr = anv_address_physical(generated_cmds_addr),
.draw_ids_addr = anv_address_physical(draw_id_addr),
};
if (!anv_address_is_null(count_addr)) {
@@ -405,6 +458,15 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
/* Only emit the data after the memcpy above. */
genX(cmd_buffer_emit_generated_push_data)(cmd_buffer, push_data_state);
#if GFX_VER == 9
/* Why are the push constants not flushed without a binding table
* update??
*/
anv_batch_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), btp) {
btp.PointertoPSBindingTable = cmd_buffer->generation_bt_state.offset;
}
#endif
anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) {
prim.VertexAccessType = SEQUENTIAL;
prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
@@ -440,6 +502,58 @@ genX(cmd_buffer_emit_indirect_generated_draws_init)(struct anv_cmd_buffer *cmd_b
trace_intel_end_generate_draws(&cmd_buffer->trace);
genX(cmd_buffer_emit_generate_draws_pipeline)(cmd_buffer);
}
/* Returns the address of a buffer able to hold draw_id_count 32bit draw IDs.
 *
 * A buffer is only allocated (from the command buffer dynamic state) prior to
 * Gfx11 and only when the bound vertex shader reads the draw ID. In every
 * other case ANV_NULL_ADDRESS is returned.
 */
static struct anv_address
genX(cmd_buffer_get_draw_id_addr)(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t draw_id_count)
{
#if GFX_VER < 11
   const struct brw_vs_prog_data *vs_prog_data =
      get_vs_prog_data(cmd_buffer->state.gfx.pipeline);

   if (vs_prog_data->uses_drawid) {
      /* One dword per draw, dword aligned. */
      struct anv_state ids_state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4 * draw_id_count, 4);

      return anv_state_pool_state_address(
         &cmd_buffer->device->dynamic_state_pool, ids_state);
   }
#endif

   return ANV_NULL_ADDRESS;
}
static uint32_t
genX(cmd_buffer_get_generated_draw_stride)(struct anv_cmd_buffer *cmd_buffer)
{
/* With the extended parameters in 3DPRIMITIVE on Gfx11+ we can emit
* everything. Prior to this, we need to emit a couple of
* VERTEX_BUFFER_STATE.
*/
#if GFX_VER >= 11
return 4 * GENX(3DPRIMITIVE_EXTENDED_length);
#else
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
uint32_t len = 0;
if (vs_prog_data->uses_firstvertex ||
vs_prog_data->uses_baseinstance ||
vs_prog_data->uses_drawid) {
len += 4; /* 3DSTATE_VERTEX_BUFFERS */
if (vs_prog_data->uses_firstvertex ||
vs_prog_data->uses_baseinstance)
len += 4 * GENX(VERTEX_BUFFER_STATE_length);
if (vs_prog_data->uses_drawid)
len += 4 * GENX(VERTEX_BUFFER_STATE_length);
}
return len + 4 * GENX(3DPRIMITIVE_length);
#endif
}
static void
@@ -465,14 +579,54 @@ genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer
uint32_t max_draw_count,
bool indexed)
{
const bool start_generation_batch =
anv_address_is_null(cmd_buffer->generation_return_addr);
genX(flush_pipeline_select_3d)(cmd_buffer);
struct anv_address draw_id_addr =
genX(cmd_buffer_get_draw_id_addr)(cmd_buffer, max_draw_count);
#if GFX_VER == 9
/* Mark the VB-0 as using the entire dynamic state pool area, but only for
* the draw call starting the generation batch. All the following ones will
* use the same area.
*/
if (start_generation_batch) {
genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 0,
(struct anv_address) {
.offset = DYNAMIC_STATE_POOL_MIN_ADDRESS,
},
DYNAMIC_STATE_POOL_SIZE);
}
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
if (vs_prog_data->uses_baseinstance ||
vs_prog_data->uses_firstvertex) {
/* We're using the indirect buffer directly to source base instance &
* first vertex values. Mark the entire area as used.
*/
genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
indirect_data_addr,
indirect_data_stride * max_draw_count);
}
if (vs_prog_data->uses_drawid) {
/* Mark the whole draw id buffer as used.
*
* NOTE(review): this registers the draw id range under ANV_SVGS_VB_INDEX,
* the same index used just above for the base vertex/instance range, while
* the Gfx9 generation shader programs those two buffers with different
* vertex buffer indices. If the helper keeps a single range per index, the
* range set here replaces the previous one when both are in use — confirm
* whether the draw id buffer should be tracked under its own index.
*/
genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
draw_id_addr,
sizeof(uint32_t) * max_draw_count);
}
#endif
/* Apply the pipeline flush here so the indirect data is available for the
* generation shader.
*/
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
if (anv_address_is_null(cmd_buffer->generation_return_addr))
if (start_generation_batch)
genX(cmd_buffer_emit_indirect_generated_draws_init)(cmd_buffer);
/* In order to have the vertex fetch gather the data we need to have a non
@@ -493,7 +647,8 @@ genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer
/* Emit the 3D state in the main batch. */
genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
const uint32_t draw_cmd_stride = 4 * GENX(3DPRIMITIVE_EXTENDED_length);
const uint32_t draw_cmd_stride =
genX(cmd_buffer_get_generated_draw_stride)(cmd_buffer);
struct anv_generated_indirect_params *last_params = NULL;
uint32_t item_base = 0;
@@ -522,6 +677,7 @@ genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer
anv_address_add(indirect_data_addr,
item_base * indirect_data_stride),
indirect_data_stride,
anv_address_add(draw_id_addr, 4 * item_base),
item_base,
item_count,
count_addr,
@@ -537,6 +693,10 @@ genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer
}
genX(cmd_buffer_rewrite_forward_end_addr)(cmd_buffer, last_params);
#if GFX_VER == 9
update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, indexed ? RANDOM : SEQUENTIAL);
#endif
}
static void
@@ -552,6 +712,9 @@ genX(cmd_buffer_flush_generated_draws)(struct anv_cmd_buffer *cmd_buffer)
genX(emit_apply_pipe_flushes)(batch,
cmd_buffer->device,
_3D,
#if GFX_VER == 9
ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
#endif
ANV_PIPE_DATA_CACHE_FLUSH_BIT |
ANV_PIPE_CS_STALL_BIT);

View File

@@ -25,8 +25,10 @@
#define ANV_GENERATED_FLAG_INDEXED BITFIELD_BIT(0)
#define ANV_GENERATED_FLAG_PREDICATED BITFIELD_BIT(1)
#define ANV_GENERATED_FLAG_DRAWID BITFIELD_BIT(2)
#define ANV_GENERATED_FLAG_BASE BITFIELD_BIT(3)
/* These 2 bindings will be accessed through A64 messages */
/* These 3 bindings will be accessed through A64 messages */
layout(set = 0, binding = 0, std430) buffer Storage0 {
uint indirect_data[];
};
@@ -35,8 +37,13 @@ layout(set = 0, binding = 1, std430) buffer Storage1 {
uint commands[];
};
layout(set = 0, binding = 2, std430) buffer Storage2 {
uint draw_ids[];
};
/* This data will be provided through push constants. */
layout(set = 0, binding = 2) uniform block {
layout(set = 0, binding = 3) uniform block {
uint64_t draw_id_addr;
uint64_t indirect_data_addr;
uint indirect_data_stride;
uint flags;
@@ -47,6 +54,44 @@ layout(set = 0, binding = 2) uniform block {
uint64_t end_addr;
};
/* Writes a 4 dword VERTEX_BUFFER_STATE entry into the generated commands.
 *
 * write_offset: dword index in commands[] of the first dword to write
 * mocs:         MOCS value programmed for the buffer
 * buffer_idx:   vertex buffer index the entry applies to
 * address:      64bit address of the buffer
 * size:         value written in the last dword of the entry (buffer size)
 */
void write_VERTEX_BUFFER_STATE(uint write_offset,
                               uint mocs,
                               uint buffer_idx,
                               uint64_t address,
                               uint size)
{
   /* Buffer Pitch (bit 0) and Null Vertex Buffer (bit 13) are left at 0. */
   uint dw0 = (1 << 14 |           /* Address Modify Enable */
               mocs << 16 |        /* MOCS */
               buffer_idx << 26);  /* Vertex Buffer Index */
   uint address_lo = uint(address & 0xffffffff);
   uint address_hi = uint(address >> 32);

   commands[write_offset + 0] = dw0;
   commands[write_offset + 1] = address_lo;
   commands[write_offset + 2] = address_hi;
   commands[write_offset + 3] = size;
}
/* Writes a 7 dword 3DPRIMITIVE command into the generated commands.
 *
 * write_offset is the dword index in commands[] of the first dword to write.
 * is_predicated and is_indexed each set bit 8 of the first and second dword
 * respectively; the remaining arguments are copied as is into dwords 2 to 6.
 */
void write_3DPRIMITIVE(uint write_offset,
                       bool is_predicated,
                       bool is_indexed,
                       uint vertex_count_per_instance,
                       uint start_vertex_location,
                       uint instance_count,
                       uint start_instance_location,
                       uint base_vertex_location)
{
   uint header = (3 << 29 |                    /* Command Type */
                  3 << 27 |                    /* Command SubType */
                  3 << 24 |                    /* 3D Command Opcode */
                  uint(is_predicated) << 8 |
                  5 << 0);                     /* DWord Length */
   uint access_type = uint(is_indexed) << 8;

   commands[write_offset + 0] = header;
   commands[write_offset + 1] = access_type;
   commands[write_offset + 2] = vertex_count_per_instance;
   commands[write_offset + 3] = start_vertex_location;
   commands[write_offset + 4] = instance_count;
   commands[write_offset + 5] = start_instance_location;
   commands[write_offset + 6] = base_vertex_location;
}
void write_3DPRIMITIVE_EXTENDED(uint write_offset,
bool is_predicated,
bool is_indexed,

View File

@@ -0,0 +1,144 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#version 450
#extension GL_ARB_gpu_shader_int64 : enable
#extension GL_GOOGLE_include_directive : enable
#include "common_generated_draws.glsl"
/* Gfx9 draw generation shader.
 *
 * Each fragment handles one item: it reads one VkDrawIndirectCommand or
 * VkDrawIndexedIndirectCommand from indirect_data[] and writes the matching
 * commands into commands[] :
 *
 *    - an optional 3DSTATE_VERTEX_BUFFERS with one VERTEX_BUFFER_STATE for
 *      the base vertex/instance values and/or one for the draw ID
 *    - a 3DPRIMITIVE
 *
 * The fragment whose draw ID equals draw_count (when lower than
 * max_draw_count) writes an MI_BATCH_BUFFER_START jumping to end_addr
 * instead.
 *
 * The two branches below are intentionally kept as separate straight line
 * code: the driver compiles this shader with a fixed expectation on the
 * number of memory messages it generates.
 */
void main()
{
   /* Unpack the flags field : bits 0-7 flags, 8-15 MOCS, 16-23 size in
    * dwords of the commands generated per item.
    */
   bool is_indexed = (flags & ANV_GENERATED_FLAG_INDEXED) != 0;
   bool is_predicated = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0;
   bool uses_base = (flags & ANV_GENERATED_FLAG_BASE) != 0;
   bool uses_drawid = (flags & ANV_GENERATED_FLAG_DRAWID) != 0;
   uint mocs = (flags >> 8) & 0xff;
   uint _3dprim_dw_size = (flags >> 16) & 0xff;

   /* item_idx is relative to the chunk processed by this dispatch, draw_id
    * is the absolute draw number (draw_base + item_idx).
    */
   uint item_idx = uint(gl_FragCoord.y) * 8192 + uint(gl_FragCoord.x);
   uint indirect_data_offset = item_idx * indirect_data_stride / 4;
   uint cmd_idx = item_idx * _3dprim_dw_size;
   uint draw_id = draw_base + item_idx;

   if (draw_id < draw_count) {
      if (is_indexed) {
         /* Loading a VkDrawIndexedIndirectCommand */
         uint index_count    = indirect_data[indirect_data_offset + 0];
         uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier;
         uint first_index    = indirect_data[indirect_data_offset + 2];
         uint vertex_offset  = indirect_data[indirect_data_offset + 3];
         uint first_instance = indirect_data[indirect_data_offset + 4];

         if (uses_base || uses_drawid) {
            /* 3DSTATE_VERTEX_BUFFERS : 1 header dword + 4 dwords per
             * VERTEX_BUFFER_STATE.
             */
            uint state_vertex_len =
               1 + (uses_base ? 4 : 0) + (uses_drawid ? 4 : 0);
            commands[cmd_idx] =
               (3 << 29 |                     /* Command Type */
                3 << 27 |                     /* Command SubType */
                0 << 24 |                     /* 3D Command Opcode */
                8 << 16 |                     /* 3D Command Sub Opcode */
                (state_vertex_len - 2) << 0); /* DWord Length */
            cmd_idx += 1;

            if (uses_base) {
               /* Point the vertex buffer at the 2 dwords starting at
                * vertexOffset (byte offset 12) in the indirect data.
                */
               uint64_t indirect_draw_data_addr =
                  indirect_data_addr + item_idx * indirect_data_stride + 12;
               write_VERTEX_BUFFER_STATE(cmd_idx,
                                         mocs,
                                         31,
                                         indirect_draw_data_addr,
                                         8);
               cmd_idx += 4;
            }
            if (uses_drawid) {
               uint64_t draw_idx_addr = draw_id_addr + 4 * item_idx;
               /* Store the draw ID in the slot the vertex buffer below
                * points to. draw_ids[] and draw_id_addr refer to the same
                * (already chunk-offset) location, so the slot has to be
                * indexed with item_idx, not with the absolute draw_id,
                * otherwise both only agree when draw_base is 0.
                */
               draw_ids[item_idx] = draw_id;
               write_VERTEX_BUFFER_STATE(cmd_idx,
                                         mocs,
                                         32,
                                         draw_idx_addr,
                                         4);
               cmd_idx += 4;
            }
         }
         write_3DPRIMITIVE(cmd_idx,
                           is_predicated,
                           is_indexed,
                           index_count,
                           first_index,
                           instance_count,
                           first_instance,
                           vertex_offset);
      } else {
         /* Loading a VkDrawIndirectCommand structure */
         uint vertex_count   = indirect_data[indirect_data_offset + 0];
         uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier;
         uint first_vertex   = indirect_data[indirect_data_offset + 2];
         uint first_instance = indirect_data[indirect_data_offset + 3];

         if (uses_base || uses_drawid) {
            /* 3DSTATE_VERTEX_BUFFERS : 1 header dword + 4 dwords per
             * VERTEX_BUFFER_STATE.
             */
            uint state_vertex_len =
               1 + (uses_base ? 4 : 0) + (uses_drawid ? 4 : 0);
            commands[cmd_idx] =
               (3 << 29 |                     /* Command Type */
                3 << 27 |                     /* Command SubType */
                0 << 24 |                     /* 3D Command Opcode */
                8 << 16 |                     /* 3D Command Sub Opcode */
                (state_vertex_len - 2) << 0); /* DWord Length */
            cmd_idx += 1;

            if (uses_base) {
               /* Point the vertex buffer at the 2 dwords starting at
                * firstVertex (byte offset 8) in the indirect data.
                */
               uint64_t indirect_draw_data_addr =
                  indirect_data_addr + item_idx * indirect_data_stride + 8;
               write_VERTEX_BUFFER_STATE(cmd_idx,
                                         mocs,
                                         31,
                                         indirect_draw_data_addr,
                                         8);
               cmd_idx += 4;
            }
            if (uses_drawid) {
               uint64_t draw_idx_addr = draw_id_addr + 4 * item_idx;
               /* Same slot as the one referenced by draw_idx_addr, see the
                * indexed path above.
                */
               draw_ids[item_idx] = draw_id;
               write_VERTEX_BUFFER_STATE(cmd_idx,
                                         mocs,
                                         32,
                                         draw_idx_addr,
                                         4);
               cmd_idx += 4;
            }
         }
         write_3DPRIMITIVE(cmd_idx,
                           is_predicated,
                           is_indexed,
                           vertex_count,
                           first_vertex,
                           instance_count,
                           first_instance,
                           0 /* base_vertex_location */);
      }
   } else if (draw_id == draw_count && draw_id < max_draw_count) {
      /* Only write a jump forward in the batch if we have fewer elements than
       * the max draw count.
       */
      write_MI_BATCH_BUFFER_START(cmd_idx, end_addr);
   }
}

View File

@@ -33,6 +33,7 @@ float64_spv_h = custom_target(
)
generated_draws_shaders = [
'gfx9_generated_draws.glsl',
'gfx11_generated_draws.glsl',
]