/* * Copyright © 2021 Google * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include "radv_meta.h" #include "radv_private.h" #include "nir_builder.h" enum radv_dgc_token_type { RADV_DGC_INDEX_BUFFER, RADV_DGC_DRAW, RADV_DGC_INDEXED_DRAW, }; struct radv_dgc_token { uint16_t type; /* enum radv_dgc_token_type, but making the size explicit */ uint16_t offset; /* offset in the input stream */ union { struct { uint16_t vtx_base_sgpr; } draw; struct { uint16_t index_size; uint16_t vtx_base_sgpr; uint32_t max_index_count; } indexed_draw; }; }; struct radv_dgc_params { uint32_t cmd_buf_stride; uint32_t cmd_buf_size; uint32_t upload_stride; uint32_t upload_addr; uint32_t sequence_count; uint32_t stream_stride; /* draw info */ uint16_t draw_indexed; uint16_t draw_params_offset; uint16_t base_index_size; uint16_t vtx_base_sgpr; uint32_t max_index_count; /* bind index buffer info. Valid if base_index_size == 0 && draw_indexed */ uint16_t index_buffer_offset; /* Top bit is DGC_DYNAMIC_VERTEX_INPUT */ uint8_t vbo_cnt; uint8_t const_copy; /* Which VBOs are set in this indirect layout. */ uint32_t vbo_bind_mask; uint16_t vbo_reg; uint16_t const_copy_size; uint64_t push_constant_mask; uint32_t ibo_type_32; uint32_t ibo_type_8; uint16_t push_constant_shader_cnt; uint16_t emit_state; uint32_t pa_su_sc_mode_cntl_base; uint16_t state_offset; uint16_t scissor_count; uint16_t scissor_offset; /* in parameter buffer. */ }; enum { DGC_USES_DRAWID = 1u << 14, DGC_USES_BASEINSTANCE = 1u << 15, }; enum { DGC_DYNAMIC_STRIDE = 1u << 15, }; enum { DGC_DYNAMIC_VERTEX_INPUT = 1u << 7, }; enum { DGC_DESC_STREAM, DGC_DESC_PREPARE, DGC_DESC_PARAMS, DGC_DESC_COUNT, DGC_NUM_DESCS, }; struct dgc_cmdbuf { nir_ssa_def *descriptor; nir_variable *offset; }; static void dgc_emit(nir_builder *b, struct dgc_cmdbuf *cs, nir_ssa_def *value) { assert(value->bit_size >= 32); nir_ssa_def *offset = nir_load_var(b, cs->offset); nir_store_ssbo(b, value, cs->descriptor, offset,.access = ACCESS_NON_READABLE); nir_store_var(b, cs->offset, nir_iadd_imm(b, offset, value->num_components * value->bit_size / 8), 0x1); } #define load_param32(b, field) \ nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \ .base = offsetof(struct radv_dgc_params, field), .range = 4) #define load_param16(b, field) \ nir_ubfe( \ (b), \ nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \ .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4), \ nir_imm_int((b), (offsetof(struct radv_dgc_params, field) & 2) * 8), nir_imm_int((b), 16)) #define load_param8(b, field) \ nir_ubfe( \ (b), \ nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \ .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4), \ nir_imm_int((b), (offsetof(struct radv_dgc_params, field) & 3) * 8), nir_imm_int((b), 8)) #define load_param64(b, field) \ nir_pack_64_2x32((b), nir_load_push_constant((b), 2, 32, nir_imm_int((b), 0), \ .base = offsetof(struct radv_dgc_params, field), .range = 8)) static nir_ssa_def * nir_pkt3(nir_builder *b, unsigned op, nir_ssa_def *len) { len = nir_iand_imm(b, len, 0x3fff); return nir_ior_imm(b, nir_ishl_imm(b, len, 16), PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op)); } static void dgc_emit_userdata_vertex(nir_builder *b, struct dgc_cmdbuf *cs, nir_ssa_def *vtx_base_sgpr, nir_ssa_def *first_vertex, nir_ssa_def *first_instance, nir_ssa_def *drawid) { vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr); nir_ssa_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID); nir_ssa_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE); nir_ssa_def *pkt_cnt = nir_imm_int(b, 1); pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt); pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt); nir_ssa_def *values[5] = { nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt), nir_iand_imm(b, vtx_base_sgpr, 0x3FFF), first_vertex, nir_imm_int(b, PKT3_NOP_PAD), nir_imm_int(b, PKT3_NOP_PAD), }; values[3] = nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance), nir_bcsel(b, has_drawid, drawid, first_instance), values[4]); values[4] = nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, values[4]); dgc_emit(b, cs, nir_vec(b, values, 5)); } static void dgc_emit_instance_count(nir_builder *b, struct dgc_cmdbuf *cs, nir_ssa_def *instance_count) { nir_ssa_def *values[2] = {nir_imm_int(b, PKT3(PKT3_NUM_INSTANCES, 0, false)), instance_count}; dgc_emit(b, cs, nir_vec(b, values, 2)); } static void dgc_emit_draw_indexed(nir_builder *b, struct dgc_cmdbuf *cs, nir_ssa_def *index_offset, nir_ssa_def *index_count, nir_ssa_def *max_index_count) { nir_ssa_def *values[5] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, false)), max_index_count, index_offset, index_count, nir_imm_int(b, V_0287F0_DI_SRC_SEL_DMA)}; dgc_emit(b, cs, nir_vec(b, values, 5)); } static void dgc_emit_draw(nir_builder *b, struct dgc_cmdbuf *cs, nir_ssa_def *vertex_count) { nir_ssa_def *values[3] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_AUTO, 1, false)), vertex_count, nir_imm_int(b, V_0287F0_DI_SRC_SEL_AUTO_INDEX)}; dgc_emit(b, cs, nir_vec(b, values, 3)); } static void build_dgc_buffer_tail(nir_builder *b, nir_ssa_def *sequence_count) { nir_ssa_def *global_id = get_global_ids(b, 1); nir_ssa_def *cmd_buf_stride = load_param32(b, cmd_buf_stride); nir_ssa_def *cmd_buf_size = load_param32(b, cmd_buf_size); nir_push_if(b, nir_ieq_imm(b, global_id, 0)); { nir_ssa_def *cmd_buf_tail_start = nir_imul(b, cmd_buf_stride, sequence_count); nir_variable *offset = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset"); nir_store_var(b, offset, cmd_buf_tail_start, 0x1); nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, DGC_DESC_PREPARE); nir_push_loop(b); { nir_ssa_def *curr_offset = nir_load_var(b, offset); const unsigned MAX_PACKET_WORDS = 0x3FFC; nir_push_if(b, nir_ieq(b, curr_offset, cmd_buf_size)); { nir_jump(b, nir_jump_break); } nir_pop_if(b, NULL); nir_ssa_def *packet_size = nir_isub(b, cmd_buf_size, curr_offset); packet_size = nir_umin(b, packet_size, nir_imm_int(b, MAX_PACKET_WORDS * 4)); nir_ssa_def *len = nir_ushr_imm(b, packet_size, 2); len = nir_iadd_imm(b, len, -2); nir_ssa_def *packet = nir_pkt3(b, PKT3_NOP, len); nir_store_ssbo(b, packet, dst_buf, curr_offset, .access = ACCESS_NON_READABLE); nir_store_var(b, offset, nir_iadd(b, curr_offset, packet_size), 0x1); } nir_pop_loop(b, NULL); } nir_pop_if(b, NULL); } static nir_shader * build_dgc_prepare_shader(struct radv_device *dev) { nir_builder b = radv_meta_init_shader(dev, MESA_SHADER_COMPUTE, "meta_dgc_prepare"); b.shader->info.workgroup_size[0] = 64; nir_ssa_def *global_id = get_global_ids(&b, 1); nir_ssa_def *sequence_id = global_id; nir_ssa_def *cmd_buf_stride = load_param32(&b, cmd_buf_stride); nir_ssa_def *sequence_count = load_param32(&b, sequence_count); nir_ssa_def *stream_stride = load_param32(&b, stream_stride); nir_variable *count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "sequence_count"); nir_store_var(&b, count_var, sequence_count, 0x1); nir_push_if(&b, nir_ieq_imm(&b, sequence_count, UINT32_MAX)); { nir_ssa_def *count_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_COUNT); nir_ssa_def *cnt = nir_load_ssbo(&b, 1, 32, count_buf, nir_imm_int(&b, 0), .align_mul = 4); nir_store_var(&b, count_var, cnt, 0x1); } nir_pop_if(&b, NULL); sequence_count = nir_load_var(&b, count_var); nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count)); { struct dgc_cmdbuf cmd_buf = { .descriptor = radv_meta_load_descriptor(&b, 0, DGC_DESC_PREPARE), .offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "cmd_buf_offset"), }; nir_store_var(&b, cmd_buf.offset, nir_imul(&b, global_id, cmd_buf_stride), 1); nir_ssa_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_stride); nir_ssa_def *stream_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_STREAM); nir_ssa_def *stream_base = nir_imul(&b, sequence_id, stream_stride); nir_variable *upload_offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "upload_offset"); nir_store_var(&b, upload_offset, nir_iadd(&b, load_param32(&b, cmd_buf_size), nir_imul(&b, load_param32(&b, upload_stride), sequence_id)), 0x1); nir_ssa_def *vbo_bind_mask = load_param32(&b, vbo_bind_mask); nir_ssa_def *vbo_cnt = nir_iand_imm(&b, load_param8(&b, vbo_cnt), 0x7F); nir_push_if(&b, nir_ine_imm(&b, vbo_bind_mask, 0)); { nir_variable *vbo_idx = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "vbo_idx"); nir_store_var(&b, vbo_idx, nir_imm_int(&b, 0), 0x1); nir_push_loop(&b); { nir_push_if(&b, nir_uge(&b, nir_load_var(&b, vbo_idx), vbo_cnt)); { nir_jump(&b, nir_jump_break); } nir_pop_if(&b, NULL); nir_ssa_def *vbo_offset = nir_imul_imm(&b, nir_load_var(&b, vbo_idx), 16); nir_variable *vbo_data = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uvec4_type(), "vbo_data"); nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_PARAMS); nir_store_var(&b, vbo_data, nir_load_ssbo(&b, 4, 32, param_buf, vbo_offset, .align_mul = 4), 0xf); nir_ssa_def *vbo_override = nir_ine_imm(&b, nir_iand(&b, vbo_bind_mask, nir_ishl(&b, nir_imm_int(&b, 1), nir_load_var(&b, vbo_idx))), 0); nir_push_if(&b, vbo_override); { nir_ssa_def *vbo_offset_offset = nir_iadd(&b, nir_imul_imm(&b, vbo_cnt, 16), nir_imul_imm(&b, nir_load_var(&b, vbo_idx), 8)); nir_ssa_def *vbo_over_data = nir_load_ssbo(&b, 2, 32, param_buf, vbo_offset_offset, .align_mul = 4); nir_ssa_def *stream_offset = nir_iadd( &b, stream_base, nir_iand_imm(&b, nir_channel(&b, vbo_over_data, 0), 0x7FFF)); nir_ssa_def *stream_data = nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset, .align_mul = 4); nir_ssa_def *va = nir_pack_64_2x32(&b, nir_channels(&b, stream_data, 0x3)); nir_ssa_def *size = nir_channel(&b, stream_data, 2); nir_ssa_def *stride = nir_channel(&b, stream_data, 3); nir_ssa_def *vs_state_offset = nir_ubfe(&b, nir_channel(&b, vbo_over_data, 0), nir_imm_int(&b, 16), nir_imm_int(&b, 15)); va = nir_iadd(&b, va, nir_u2u64(&b, vs_state_offset)); nir_ssa_def *dyn_stride = nir_test_mask(&b, nir_channel(&b, vbo_over_data, 0), DGC_DYNAMIC_STRIDE); nir_ssa_def *old_stride = nir_ubfe(&b, nir_channel(&b, nir_load_var(&b, vbo_data), 1), nir_imm_int(&b, 16), nir_imm_int(&b, 14)); stride = nir_bcsel(&b, dyn_stride, stride, old_stride); nir_ssa_def *use_per_attribute_vb_descs = nir_test_mask(&b, nir_channel(&b, vbo_over_data, 0), 1u << 31); nir_variable *num_records = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "num_records"); nir_store_var(&b, num_records, size, 0x1); nir_push_if(&b, use_per_attribute_vb_descs); { nir_ssa_def *attrib_end = nir_ubfe(&b, nir_channel(&b, vbo_over_data, 1), nir_imm_int(&b, 16), nir_imm_int(&b, 16)); nir_ssa_def *attrib_index_offset = nir_ubfe(&b, nir_channel(&b, vbo_over_data, 1), nir_imm_int(&b, 0), nir_imm_int(&b, 16)); nir_push_if(&b, nir_ult(&b, nir_load_var(&b, num_records), attrib_end)); { nir_store_var(&b, num_records, nir_imm_int(&b, 0), 0x1); } nir_push_else(&b, NULL); nir_push_if(&b, nir_ieq_imm(&b, stride, 0)); { nir_store_var(&b, num_records, nir_imm_int(&b, 1), 0x1); } nir_push_else(&b, NULL); { nir_ssa_def *r = nir_iadd( &b, nir_iadd_imm( &b, nir_udiv(&b, nir_isub(&b, nir_load_var(&b, num_records), attrib_end), stride), 1), attrib_index_offset); nir_store_var(&b, num_records, r, 0x1); } nir_pop_if(&b, NULL); nir_pop_if(&b, NULL); nir_ssa_def *convert_cond = nir_ine_imm(&b, nir_load_var(&b, num_records), 0); if (dev->physical_device->rad_info.gfx_level == GFX9) convert_cond = nir_imm_bool(&b, false); else if (dev->physical_device->rad_info.gfx_level != GFX8) convert_cond = nir_iand(&b, convert_cond, nir_ieq_imm(&b, stride, 0)); nir_ssa_def *new_records = nir_iadd( &b, nir_imul(&b, nir_iadd_imm(&b, nir_load_var(&b, num_records), -1), stride), attrib_end); new_records = nir_bcsel(&b, convert_cond, new_records, nir_load_var(&b, num_records)); nir_store_var(&b, num_records, new_records, 0x1); } nir_push_else(&b, NULL); { if (dev->physical_device->rad_info.gfx_level != GFX8) { nir_push_if(&b, nir_ine_imm(&b, stride, 0)); { nir_ssa_def *r = nir_iadd(&b, nir_load_var(&b, num_records), nir_iadd_imm(&b, stride, -1)); nir_store_var(&b, num_records, nir_udiv(&b, r, stride), 0x1); } nir_pop_if(&b, NULL); } } nir_pop_if(&b, NULL); nir_ssa_def *rsrc_word3 = nir_channel(&b, nir_load_var(&b, vbo_data), 3); if (dev->physical_device->rad_info.gfx_level >= GFX10) { nir_ssa_def *oob_select = nir_bcsel( &b, nir_ieq_imm(&b, stride, 0), nir_imm_int(&b, V_008F0C_OOB_SELECT_RAW), nir_imm_int(&b, V_008F0C_OOB_SELECT_STRUCTURED)); rsrc_word3 = nir_iand_imm(&b, rsrc_word3, C_008F0C_OOB_SELECT); rsrc_word3 = nir_ior(&b, rsrc_word3, nir_ishl_imm(&b, oob_select, 28)); } nir_ssa_def *va_hi = nir_iand_imm(&b, nir_unpack_64_2x32_split_y(&b, va), 0xFFFF); stride = nir_iand_imm(&b, stride, 0x3FFF); nir_ssa_def *new_vbo_data[4] = {nir_unpack_64_2x32_split_x(&b, va), nir_ior(&b, nir_ishl_imm(&b, stride, 16), va_hi), nir_load_var(&b, num_records), rsrc_word3}; nir_store_var(&b, vbo_data, nir_vec(&b, new_vbo_data, 4), 0xf); } nir_pop_if(&b, NULL); /* On GFX9, it seems bounds checking is disabled if both * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and * GFX10.3 but it doesn't hurt. */ nir_ssa_def *num_records = nir_channel(&b, nir_load_var(&b, vbo_data), 2); nir_ssa_def *buf_va = nir_iand_imm( &b, nir_pack_64_2x32(&b, nir_channels(&b, nir_load_var(&b, vbo_data), 0x3)), (1ull << 48) - 1ull); nir_push_if(&b, nir_ior(&b, nir_ieq_imm(&b, num_records, 0), nir_ieq_imm(&b, buf_va, 0))); { nir_ssa_def *use_dynamic_vertex_input = nir_test_mask(&b, load_param8(&b, vbo_cnt), DGC_DYNAMIC_VERTEX_INPUT); nir_push_if(&b, use_dynamic_vertex_input); { nir_ssa_def *new_vbo_data[4] = { nir_imm_int(&b, 0), nir_imm_int(&b, S_008F04_STRIDE(16)), nir_imm_int(&b, 0), nir_channel(&b, nir_load_var(&b, vbo_data), 3)}; nir_store_var(&b, vbo_data, nir_vec(&b, new_vbo_data, 4), 0xf); } nir_push_else(&b, NULL); { nir_ssa_def *new_vbo_data[4] = {nir_imm_int(&b, 0), nir_imm_int(&b, 0), nir_imm_int(&b, 0), nir_imm_int(&b, 0)}; nir_store_var(&b, vbo_data, nir_vec(&b, new_vbo_data, 4), 0xf); } nir_pop_if(&b, NULL); } nir_pop_if(&b, NULL); nir_ssa_def *upload_off = nir_iadd(&b, nir_load_var(&b, upload_offset), vbo_offset); nir_store_ssbo(&b, nir_load_var(&b, vbo_data), cmd_buf.descriptor, upload_off, .access = ACCESS_NON_READABLE); nir_store_var(&b, vbo_idx, nir_iadd_imm(&b, nir_load_var(&b, vbo_idx), 1), 0x1); } nir_pop_loop(&b, NULL); nir_ssa_def *packet[3] = { nir_imm_int(&b, PKT3(PKT3_SET_SH_REG, 1, 0)), load_param16(&b, vbo_reg), nir_iadd(&b, load_param32(&b, upload_addr), nir_load_var(&b, upload_offset))}; dgc_emit(&b, &cmd_buf, nir_vec(&b, packet, 3)); nir_store_var(&b, upload_offset, nir_iadd(&b, nir_load_var(&b, upload_offset), nir_imul_imm(&b, vbo_cnt, 16)), 0x1); } nir_pop_if(&b, NULL); nir_ssa_def *push_const_mask = load_param64(&b, push_constant_mask); nir_push_if(&b, nir_ine_imm(&b, push_const_mask, 0)); { nir_ssa_def *const_copy = nir_ine_imm(&b, load_param8(&b, const_copy), 0); nir_ssa_def *const_copy_size = load_param16(&b, const_copy_size); nir_ssa_def *const_copy_words = nir_ushr_imm(&b, const_copy_size, 2); const_copy_words = nir_bcsel(&b, const_copy, const_copy_words, nir_imm_int(&b, 0)); nir_variable *idx = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "const_copy_idx"); nir_store_var(&b, idx, nir_imm_int(&b, 0), 0x1); nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_PARAMS); nir_ssa_def *param_offset = nir_imul_imm(&b, vbo_cnt, 24); nir_ssa_def *param_offset_offset = nir_iadd_imm(&b, param_offset, MESA_VULKAN_SHADER_STAGES * 12); nir_ssa_def *param_const_offset = nir_iadd_imm(&b, param_offset, MAX_PUSH_CONSTANTS_SIZE + MESA_VULKAN_SHADER_STAGES * 12); nir_push_loop(&b); { nir_ssa_def *cur_idx = nir_load_var(&b, idx); nir_push_if(&b, nir_uge(&b, cur_idx, const_copy_words)); { nir_jump(&b, nir_jump_break); } nir_pop_if(&b, NULL); nir_variable *data = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "copy_data"); nir_ssa_def *update = nir_iand(&b, push_const_mask, nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx)); update = nir_bcsel( &b, nir_ult(&b, cur_idx, nir_imm_int(&b, 64 /* bits in push_const_mask */)), update, nir_imm_int64(&b, 0)); nir_push_if(&b, nir_ine_imm(&b, update, 0)); { nir_ssa_def *stream_offset = nir_load_ssbo( &b, 1, 32, param_buf, nir_iadd(&b, param_offset_offset, nir_ishl_imm(&b, cur_idx, 2)), .align_mul = 4); nir_ssa_def *new_data = nir_load_ssbo(&b, 1, 32, stream_buf, nir_iadd(&b, stream_base, stream_offset), .align_mul = 4); nir_store_var(&b, data, new_data, 0x1); } nir_push_else(&b, NULL); { nir_store_var( &b, data, nir_load_ssbo(&b, 1, 32, param_buf, nir_iadd(&b, param_const_offset, nir_ishl_imm(&b, cur_idx, 2)), .align_mul = 4), 0x1); } nir_pop_if(&b, NULL); nir_store_ssbo( &b, nir_load_var(&b, data), cmd_buf.descriptor, nir_iadd(&b, nir_load_var(&b, upload_offset), nir_ishl_imm(&b, cur_idx, 2)), .access = ACCESS_NON_READABLE); nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1); } nir_pop_loop(&b, NULL); nir_variable *shader_idx = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "shader_idx"); nir_store_var(&b, shader_idx, nir_imm_int(&b, 0), 0x1); nir_ssa_def *shader_cnt = load_param16(&b, push_constant_shader_cnt); nir_push_loop(&b); { nir_ssa_def *cur_shader_idx = nir_load_var(&b, shader_idx); nir_push_if(&b, nir_uge(&b, cur_shader_idx, shader_cnt)); { nir_jump(&b, nir_jump_break); } nir_pop_if(&b, NULL); nir_ssa_def *reg_info = nir_load_ssbo(&b, 3, 32, param_buf, nir_iadd(&b, param_offset, nir_imul_imm(&b, cur_shader_idx, 12)), .align_mul = 4); nir_ssa_def *upload_sgpr = nir_ubfe(&b, nir_channel(&b, reg_info, 0), nir_imm_int(&b, 0), nir_imm_int(&b, 16)); nir_ssa_def *inline_sgpr = nir_ubfe(&b, nir_channel(&b, reg_info, 0), nir_imm_int(&b, 16), nir_imm_int(&b, 16)); nir_ssa_def *inline_mask = nir_pack_64_2x32(&b, nir_channels(&b, reg_info, 0x6)); nir_push_if(&b, nir_ine_imm(&b, upload_sgpr, 0)); { nir_ssa_def *pkt[3] = { nir_imm_int(&b, PKT3(PKT3_SET_SH_REG, 1, 0)), upload_sgpr, nir_iadd(&b, load_param32(&b, upload_addr), nir_load_var(&b, upload_offset)) }; dgc_emit(&b, &cmd_buf, nir_vec(&b, pkt, 3)); } nir_pop_if(&b, NULL); nir_push_if(&b, nir_ine_imm(&b, inline_sgpr, 0)); { nir_ssa_def *inline_len = nir_bit_count(&b, inline_mask); nir_store_var(&b, idx, nir_imm_int(&b, 0), 0x1); nir_ssa_def *pkt[2] = { nir_pkt3(&b, PKT3_SET_SH_REG, inline_len), inline_sgpr }; dgc_emit(&b, &cmd_buf, nir_vec(&b, pkt, 2)); nir_push_loop(&b); { nir_ssa_def *cur_idx = nir_load_var(&b, idx); nir_push_if(&b, nir_uge(&b, cur_idx, nir_imm_int(&b, 64 /* bits in inline_mask */))); { nir_jump(&b, nir_jump_break); } nir_pop_if(&b, NULL); nir_ssa_def *l = nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx); nir_push_if(&b, nir_ieq_imm(&b, nir_iand(&b, l, inline_mask), 0)); { nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1); nir_jump(&b, nir_jump_continue); } nir_pop_if(&b, NULL); nir_variable *data = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "copy_data"); nir_ssa_def *update = nir_iand(&b, push_const_mask, nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx)); update = nir_bcsel( &b, nir_ult(&b, cur_idx, nir_imm_int(&b, 64 /* bits in push_const_mask */)), update, nir_imm_int64(&b, 0)); nir_push_if(&b, nir_ine_imm(&b, update, 0)); { nir_ssa_def *stream_offset = nir_load_ssbo( &b, 1, 32, param_buf, nir_iadd(&b, param_offset_offset, nir_ishl_imm(&b, cur_idx, 2)), .align_mul = 4); nir_ssa_def *new_data = nir_load_ssbo(&b, 1, 32, stream_buf, nir_iadd(&b, stream_base, stream_offset), .align_mul = 4); nir_store_var(&b, data, new_data, 0x1); } nir_push_else(&b, NULL); { nir_store_var(&b, data, nir_load_ssbo(&b, 1, 32, param_buf, nir_iadd(&b, param_const_offset, nir_ishl_imm(&b, cur_idx, 2)), .align_mul = 4), 0x1); } nir_pop_if(&b, NULL); dgc_emit(&b, &cmd_buf, nir_load_var(&b, data)); nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1); } nir_pop_loop(&b, NULL); } nir_pop_if(&b, NULL); nir_store_var(&b, shader_idx, nir_iadd_imm(&b, cur_shader_idx, 1), 0x1); } nir_pop_loop(&b, NULL); } nir_pop_if(&b, 0); nir_push_if(&b, nir_ieq_imm(&b, load_param16(&b, emit_state), 1)); { nir_ssa_def *stream_offset = nir_iadd(&b, load_param16(&b, state_offset), stream_base); nir_ssa_def *state = nir_load_ssbo(&b, 1, 32, stream_buf, stream_offset, .align_mul = 4); state = nir_iand_imm(&b, state, 1); nir_ssa_def *reg = nir_ior(&b, load_param32(&b, pa_su_sc_mode_cntl_base), nir_ishl_imm(&b, state, 2)); nir_ssa_def *cmd_values[3] = { nir_imm_int(&b, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)), nir_imm_int(&b, (R_028814_PA_SU_SC_MODE_CNTL - SI_CONTEXT_REG_OFFSET) >> 2), reg}; dgc_emit(&b, &cmd_buf, nir_vec(&b, cmd_values, 3)); } nir_pop_if(&b, NULL); nir_ssa_def *scissor_count = load_param16(&b, scissor_count); nir_push_if(&b, nir_ine_imm(&b, scissor_count, 0)); { nir_ssa_def *scissor_offset = load_param16(&b, scissor_offset); nir_variable *idx = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "scissor_copy_idx"); nir_store_var(&b, idx, nir_imm_int(&b, 0), 1); nir_push_loop(&b); { nir_ssa_def *cur_idx = nir_load_var(&b, idx); nir_push_if(&b, nir_uge(&b, cur_idx, scissor_count)); { nir_jump(&b, nir_jump_break); } nir_pop_if(&b, NULL); nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_PARAMS); nir_ssa_def *param_offset = nir_iadd(&b, scissor_offset, nir_imul_imm(&b, cur_idx, 4)); nir_ssa_def *value = nir_load_ssbo(&b, 1, 32, param_buf, param_offset, .align_mul = 4); dgc_emit(&b, &cmd_buf, value); nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 1); } nir_pop_loop(&b, NULL); } nir_pop_if(&b, NULL); nir_push_if(&b, nir_ieq_imm(&b, load_param16(&b, draw_indexed), 0)); { nir_ssa_def *vtx_base_sgpr = load_param16(&b, vtx_base_sgpr); nir_ssa_def *stream_offset = nir_iadd(&b, load_param16(&b, draw_params_offset), stream_base); nir_ssa_def *draw_data0 = nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset, .align_mul = 4); nir_ssa_def *vertex_count = nir_channel(&b, draw_data0, 0); nir_ssa_def *instance_count = nir_channel(&b, draw_data0, 1); nir_ssa_def *vertex_offset = nir_channel(&b, draw_data0, 2); nir_ssa_def *first_instance = nir_channel(&b, draw_data0, 3); nir_push_if(&b, nir_iand(&b, nir_ine_imm(&b, vertex_count, 0), nir_ine_imm(&b, instance_count, 0))); { dgc_emit_userdata_vertex(&b, &cmd_buf, vtx_base_sgpr, vertex_offset, first_instance, sequence_id); dgc_emit_instance_count(&b, &cmd_buf, instance_count); dgc_emit_draw(&b, &cmd_buf, vertex_count); } nir_pop_if(&b, 0); } nir_push_else(&b, NULL); { nir_variable *index_size_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "index_size"); nir_store_var(&b, index_size_var, load_param16(&b, base_index_size), 0x1); nir_variable *max_index_count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "max_index_count"); nir_store_var(&b, max_index_count_var, load_param32(&b, max_index_count), 0x1); nir_ssa_def *bind_index_buffer = nir_ieq_imm(&b, nir_load_var(&b, index_size_var), 0); nir_push_if(&b, bind_index_buffer); { nir_ssa_def *index_stream_offset = nir_iadd(&b, load_param16(&b, index_buffer_offset), stream_base); nir_ssa_def *data = nir_load_ssbo(&b, 4, 32, stream_buf, index_stream_offset, .align_mul = 4); nir_ssa_def *vk_index_type = nir_channel(&b, data, 3); nir_ssa_def *index_type = nir_bcsel( &b, nir_ieq(&b, vk_index_type, load_param32(&b, ibo_type_32)), nir_imm_int(&b, V_028A7C_VGT_INDEX_32), nir_imm_int(&b, V_028A7C_VGT_INDEX_16)); index_type = nir_bcsel(&b, nir_ieq(&b, vk_index_type, load_param32(&b, ibo_type_8)), nir_imm_int(&b, V_028A7C_VGT_INDEX_8), index_type); nir_ssa_def *index_size = nir_iand_imm( &b, nir_ushr(&b, nir_imm_int(&b, 0x142), nir_imul_imm(&b, index_type, 4)), 0xf); nir_store_var(&b, index_size_var, index_size, 0x1); nir_ssa_def *max_index_count = nir_udiv(&b, nir_channel(&b, data, 2), index_size); nir_store_var(&b, max_index_count_var, max_index_count, 0x1); nir_ssa_def *cmd_values[3 + 2 + 3]; if (dev->physical_device->rad_info.gfx_level >= GFX9) { unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX; if (dev->physical_device->rad_info.gfx_level < GFX9 || (dev->physical_device->rad_info.gfx_level == GFX9 && dev->physical_device->rad_info.me_fw_version < 26)) opcode = PKT3_SET_UCONFIG_REG; cmd_values[0] = nir_imm_int(&b, PKT3(opcode, 1, 0)); cmd_values[1] = nir_imm_int( &b, (R_03090C_VGT_INDEX_TYPE - CIK_UCONFIG_REG_OFFSET) >> 2 | (2u << 28)); cmd_values[2] = index_type; } else { cmd_values[0] = nir_imm_int(&b, PKT3(PKT3_INDEX_TYPE, 0, 0)); cmd_values[1] = index_type; cmd_values[2] = nir_imm_int(&b, PKT3_NOP_PAD); } nir_ssa_def *addr_upper = nir_channel(&b, data, 1); addr_upper = nir_ishr_imm(&b, nir_ishl_imm(&b, addr_upper, 16), 16); cmd_values[3] = nir_imm_int(&b, PKT3(PKT3_INDEX_BASE, 1, 0)); cmd_values[4] = nir_channel(&b, data, 0); cmd_values[5] = addr_upper; cmd_values[6] = nir_imm_int(&b, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); cmd_values[7] = max_index_count; dgc_emit(&b, &cmd_buf, nir_vec(&b, cmd_values, 8)); } nir_pop_if(&b, NULL); nir_ssa_def *index_size = nir_load_var(&b, index_size_var); nir_ssa_def *max_index_count = nir_load_var(&b, max_index_count_var); nir_ssa_def *vtx_base_sgpr = load_param16(&b, vtx_base_sgpr); nir_ssa_def *stream_offset = nir_iadd(&b, load_param16(&b, draw_params_offset), stream_base); index_size = nir_bcsel(&b, bind_index_buffer, nir_load_var(&b, index_size_var), index_size); max_index_count = nir_bcsel(&b, bind_index_buffer, nir_load_var(&b, max_index_count_var), max_index_count); nir_ssa_def *draw_data0 = nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset, .align_mul = 4); nir_ssa_def *draw_data1 = nir_load_ssbo( &b, 1, 32, stream_buf, nir_iadd_imm(&b, stream_offset, 16), .align_mul = 4); nir_ssa_def *index_count = nir_channel(&b, draw_data0, 0); nir_ssa_def *instance_count = nir_channel(&b, draw_data0, 1); nir_ssa_def *first_index = nir_channel(&b, draw_data0, 2); nir_ssa_def *vertex_offset = nir_channel(&b, draw_data0, 3); nir_ssa_def *first_instance = nir_channel(&b, draw_data1, 0); nir_push_if(&b, nir_iand(&b, nir_ine_imm(&b, index_count, 0), nir_ine_imm(&b, instance_count, 0))); { dgc_emit_userdata_vertex(&b, &cmd_buf, vtx_base_sgpr, vertex_offset, first_instance, sequence_id); dgc_emit_instance_count(&b, &cmd_buf, instance_count); dgc_emit_draw_indexed(&b, &cmd_buf, first_index, index_count, max_index_count); } nir_pop_if(&b, 0); } nir_pop_if(&b, NULL); /* Pad the cmdbuffer if we did not use the whole stride */ nir_push_if(&b, nir_ine(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_end)); { nir_ssa_def *cnt = nir_isub(&b, cmd_buf_end, nir_load_var(&b, cmd_buf.offset)); cnt = nir_ushr_imm(&b, cnt, 2); cnt = nir_iadd_imm(&b, cnt, -2); nir_ssa_def *pkt = nir_pkt3(&b, PKT3_NOP, cnt); dgc_emit(&b, &cmd_buf, pkt); } nir_pop_if(&b, NULL); } nir_pop_if(&b, NULL); build_dgc_buffer_tail(&b, sequence_count); return b.shader; } void radv_device_finish_dgc_prepare_state(struct radv_device *device) { radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.dgc_prepare.pipeline, &device->meta_state.alloc); radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.dgc_prepare.p_layout, &device->meta_state.alloc); radv_DestroyDescriptorSetLayout(radv_device_to_handle(device), device->meta_state.dgc_prepare.ds_layout, &device->meta_state.alloc); } VkResult radv_device_init_dgc_prepare_state(struct radv_device *device) { VkResult result; nir_shader *cs = build_dgc_prepare_shader(device); VkDescriptorSetLayoutCreateInfo ds_create_info = { .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, .bindingCount = DGC_NUM_DESCS, .pBindings = (VkDescriptorSetLayoutBinding[]){ {.binding = DGC_DESC_STREAM, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = NULL}, {.binding = DGC_DESC_PREPARE, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = NULL}, {.binding = DGC_DESC_PARAMS, .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = NULL}, {.binding = DGC_DESC_COUNT, .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = NULL}, }}; result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &ds_create_info, &device->meta_state.alloc, &device->meta_state.dgc_prepare.ds_layout); if (result != VK_SUCCESS) goto fail; const VkPipelineLayoutCreateInfo leaf_pl_create_info = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, .setLayoutCount = 1, .pSetLayouts = &device->meta_state.dgc_prepare.ds_layout, .pushConstantRangeCount = 1, .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct radv_dgc_params)}, }; result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info, &device->meta_state.alloc, &device->meta_state.dgc_prepare.p_layout); if (result != VK_SUCCESS) goto fail; VkPipelineShaderStageCreateInfo shader_stage = { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, .stage = VK_SHADER_STAGE_COMPUTE_BIT, .module = vk_shader_module_handle_from_nir(cs), .pName = "main", .pSpecializationInfo = NULL, }; VkComputePipelineCreateInfo pipeline_info = { .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .stage = shader_stage, .flags = 0, .layout = device->meta_state.dgc_prepare.p_layout, }; result = radv_CreateComputePipelines( radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1, &pipeline_info, &device->meta_state.alloc, &device->meta_state.dgc_prepare.pipeline); if (result != VK_SUCCESS) goto fail; ralloc_free(cs); return VK_SUCCESS; fail: radv_device_finish_dgc_prepare_state(device); ralloc_free(cs); return result; }