intel/nir: use a single intel intrinsic to deal with ray traversal
In the future we'll want to reuse this intrinsic to deal with ray queries. Ray queries will use a different global pointer and programmatically change the control/level arguments of the trace send instruction.

v2: Comment on barrier after sync trace instruction (Caio)
    Generalize lsc helper (Caio)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13719>
This commit is contained in:

committed by
Marge Bot

parent
39f6cd5d79
commit
bb40e999d1
@@ -1649,7 +1649,7 @@ typedef struct {
|
||||
|
||||
#include "nir_intrinsics.h"
|
||||
|
||||
#define NIR_INTRINSIC_MAX_CONST_INDEX 5
|
||||
#define NIR_INTRINSIC_MAX_CONST_INDEX 6
|
||||
|
||||
/** Represents an intrinsic
|
||||
*
|
||||
|
@@ -254,6 +254,9 @@ index("nir_rounding_mode", "rounding_mode")
|
||||
# Whether or not to saturate in conversions
|
||||
index("unsigned", "saturate")
|
||||
|
||||
# Whether or not trace_ray_intel is synchronous
|
||||
index("bool", "synchronous")
|
||||
|
||||
intrinsic("nop", flags=[CAN_ELIMINATE])
|
||||
|
||||
intrinsic("convert_alu_types", dest_comp=0, src_comp=[0],
|
||||
@@ -1366,10 +1369,9 @@ intrinsic("btd_stack_push_intel", indices=[STACK_SIZE])
|
||||
# src[] = { }
|
||||
intrinsic("btd_retire_intel")
|
||||
|
||||
# Intel-specific ray-tracing intrinsics
|
||||
intrinsic("trace_ray_initial_intel")
|
||||
intrinsic("trace_ray_commit_intel")
|
||||
intrinsic("trace_ray_continue_intel")
|
||||
# Intel-specific ray-tracing intrinsic
|
||||
# src[] = { globals, level, operation } SYNCHRONOUS=synchronous
|
||||
intrinsic("trace_ray_intel", src_comp=[1, 1, 1], indices=[SYNCHRONOUS])
|
||||
|
||||
# System values used for ray-tracing on Intel
|
||||
system_value("ray_base_mem_addr_intel", 1, bit_sizes=[64])
|
||||
|
@@ -955,6 +955,19 @@ enum a64_logical_srcs {
|
||||
A64_LOGICAL_NUM_SRCS
|
||||
};
|
||||
|
||||
enum rt_logical_srcs {
|
||||
/** Address of the globals */
|
||||
RT_LOGICAL_SRC_GLOBALS,
|
||||
/** Level at which the tracing should start */
|
||||
RT_LOGICAL_SRC_BVH_LEVEL,
|
||||
/** Type of tracing operation */
|
||||
RT_LOGICAL_SRC_TRACE_RAY_CONTROL,
|
||||
/** Synchronous tracing (ray query) */
|
||||
RT_LOGICAL_SRC_SYNCHRONOUS,
|
||||
|
||||
RT_LOGICAL_NUM_SRCS
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
/**
|
||||
* Allow brw_urb_write_flags enums to be ORed together.
|
||||
|
@@ -6663,31 +6663,53 @@ static void
|
||||
lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
|
||||
{
|
||||
const intel_device_info *devinfo = bld.shader->devinfo;
|
||||
const fs_reg &bvh_level = inst->src[0];
|
||||
assert(inst->src[1].file == BRW_IMMEDIATE_VALUE);
|
||||
const uint32_t trace_ray_control = inst->src[1].ud;
|
||||
const fs_reg &globals_addr = inst->src[RT_LOGICAL_SRC_GLOBALS];
|
||||
const fs_reg &bvh_level =
|
||||
inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ?
|
||||
inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
|
||||
bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
|
||||
inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
|
||||
const fs_reg &trace_ray_control =
|
||||
inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ?
|
||||
inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
|
||||
bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
|
||||
inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
|
||||
const fs_reg &synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
|
||||
assert(synchronous_src.file == BRW_IMMEDIATE_VALUE);
|
||||
const bool synchronous = synchronous_src.ud;
|
||||
|
||||
const unsigned mlen = 1;
|
||||
const fs_builder ubld = bld.exec_all().group(8, 0);
|
||||
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
ubld.MOV(header, brw_imm_ud(0));
|
||||
ubld.group(2, 0).MOV(header,
|
||||
retype(brw_vec2_grf(2, 0), BRW_REGISTER_TYPE_UD));
|
||||
/* TODO: Bit 128 is ray_query */
|
||||
ubld.group(2, 0).MOV(header, retype(globals_addr, BRW_REGISTER_TYPE_UD));
|
||||
if (synchronous)
|
||||
ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));
|
||||
|
||||
const unsigned ex_mlen = inst->exec_size / 8;
|
||||
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
const uint32_t trc_bits = SET_BITS(trace_ray_control, 9, 8);
|
||||
if (bvh_level.file == BRW_IMMEDIATE_VALUE) {
|
||||
bld.MOV(payload, brw_imm_ud(trc_bits | (bvh_level.ud & 0x7)));
|
||||
if (bvh_level.file == BRW_IMMEDIATE_VALUE &&
|
||||
trace_ray_control.file == BRW_IMMEDIATE_VALUE) {
|
||||
bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) |
|
||||
(bvh_level.ud & 0x7)));
|
||||
} else {
|
||||
bld.AND(payload, bvh_level, brw_imm_ud(0x7));
|
||||
if (trc_bits != 0)
|
||||
bld.OR(payload, payload, brw_imm_ud(trc_bits));
|
||||
bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
|
||||
bld.OR(payload, payload, bvh_level);
|
||||
}
|
||||
|
||||
/* When doing synchronous traversal, the HW implicitly computes the
|
||||
* stack_id using the following formula :
|
||||
*
|
||||
* EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
|
||||
*
|
||||
* Only in the asynchronous case we need to set the stack_id given from the
|
||||
* payload register.
|
||||
*/
|
||||
if (!synchronous) {
|
||||
bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
|
||||
retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
|
||||
brw_imm_uw(0x7ff));
|
||||
}
|
||||
bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
|
||||
retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
|
||||
brw_imm_uw(0x7ff));
|
||||
|
||||
/* Update the original instruction. */
|
||||
inst->opcode = SHADER_OPCODE_SEND;
|
||||
|
@@ -3997,6 +3997,29 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
emit_rt_lsc_fence(const fs_builder &bld, enum lsc_flush_type flush_type)
|
||||
{
|
||||
const intel_device_info *devinfo = bld.shader->devinfo;
|
||||
|
||||
const fs_builder ubld = bld.exec_all().group(8, 0);
|
||||
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp,
|
||||
brw_imm_ud(0) /* desc */,
|
||||
brw_imm_ud(0) /* ex_desc */,
|
||||
brw_vec8_grf(0, 0) /* payload */);
|
||||
send->sfid = GFX12_SFID_UGM;
|
||||
send->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
|
||||
flush_type, true);
|
||||
send->mlen = 1; /* g0 header */
|
||||
send->ex_mlen = 0;
|
||||
send->size_written = REG_SIZE; /* Temp write for scheduling */
|
||||
send->send_has_side_effects = true;
|
||||
|
||||
ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,
|
||||
nir_intrinsic_instr *instr)
|
||||
@@ -4016,27 +4039,6 @@ fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,
|
||||
bld.MOV(dest, retype(brw_vec1_grf(2, 2), dest.type));
|
||||
break;
|
||||
|
||||
case nir_intrinsic_trace_ray_initial_intel:
|
||||
bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
|
||||
bld.null_reg_ud(),
|
||||
brw_imm_ud(BRW_RT_BVH_LEVEL_WORLD),
|
||||
brw_imm_ud(GEN_RT_TRACE_RAY_INITAL));
|
||||
break;
|
||||
|
||||
case nir_intrinsic_trace_ray_commit_intel:
|
||||
bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
|
||||
bld.null_reg_ud(),
|
||||
brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),
|
||||
brw_imm_ud(GEN_RT_TRACE_RAY_COMMIT));
|
||||
break;
|
||||
|
||||
case nir_intrinsic_trace_ray_continue_intel:
|
||||
bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
|
||||
bld.null_reg_ud(),
|
||||
brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),
|
||||
brw_imm_ud(GEN_RT_TRACE_RAY_CONTINUE));
|
||||
break;
|
||||
|
||||
default:
|
||||
nir_emit_intrinsic(bld, instr);
|
||||
break;
|
||||
@@ -5869,6 +5871,32 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
|
||||
bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_trace_ray_intel: {
|
||||
const bool synchronous = nir_intrinsic_synchronous(instr);
|
||||
assert(brw_shader_stage_is_bindless(stage) || synchronous);
|
||||
if (synchronous)
|
||||
emit_rt_lsc_fence(bld, LSC_FLUSH_TYPE_EVICT);
|
||||
fs_reg srcs[RT_LOGICAL_NUM_SRCS];
|
||||
srcs[RT_LOGICAL_SRC_GLOBALS] = get_nir_src(instr->src[0]);
|
||||
srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(instr->src[1]);
|
||||
srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(instr->src[2]);
|
||||
srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous);
|
||||
bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, bld.null_reg_ud(),
|
||||
srcs, RT_LOGICAL_NUM_SRCS);
|
||||
|
||||
/* There is no actual value to use in the destination register of the
|
||||
* synchronous trace instruction. All of the communication with the HW
|
||||
* unit happens through memory reads/writes. So to ensure that the
|
||||
* operation has completed before we go read the results in memory, we
|
||||
* need a barrier followed by an invalidate before accessing memory.
|
||||
*/
|
||||
if (synchronous) {
|
||||
bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR));
|
||||
emit_rt_lsc_fence(bld, LSC_FLUSH_TYPE_INVALIDATE);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
unreachable("unknown intrinsic");
|
||||
}
|
||||
|
@@ -214,7 +214,11 @@ lower_shader_calls_instr(struct nir_builder *b, nir_instr *instr, void *data)
|
||||
.shader_index_multiplier = sbt_stride,
|
||||
};
|
||||
brw_nir_rt_store_mem_ray(b, &ray_defs, BRW_RT_BVH_LEVEL_WORLD);
|
||||
nir_trace_ray_initial_intel(b);
|
||||
nir_trace_ray_intel(b,
|
||||
nir_load_btd_global_arg_addr_intel(b),
|
||||
nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD),
|
||||
nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
|
||||
.synchronous = false);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -294,7 +294,11 @@ lower_ray_walk_intrinsics(nir_shader *shader,
|
||||
* optimization passes.
|
||||
*/
|
||||
nir_push_if(&b, nir_imm_true(&b));
|
||||
nir_trace_ray_continue_intel(&b);
|
||||
nir_trace_ray_intel(&b,
|
||||
nir_load_btd_global_arg_addr_intel(&b),
|
||||
nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
|
||||
nir_imm_int(&b, GEN_RT_TRACE_RAY_CONTINUE),
|
||||
.synchronous = false);
|
||||
nir_jump(&b, nir_jump_halt);
|
||||
nir_pop_if(&b, NULL);
|
||||
progress = true;
|
||||
@@ -313,7 +317,11 @@ lower_ray_walk_intrinsics(nir_shader *shader,
|
||||
}
|
||||
nir_push_else(&b, NULL);
|
||||
{
|
||||
nir_trace_ray_commit_intel(&b);
|
||||
nir_trace_ray_intel(&b,
|
||||
nir_load_btd_global_arg_addr_intel(&b),
|
||||
nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
|
||||
nir_imm_int(&b, GEN_RT_TRACE_RAY_COMMIT),
|
||||
.synchronous = false);
|
||||
nir_jump(&b, nir_jump_halt);
|
||||
}
|
||||
nir_pop_if(&b, NULL);
|
||||
|
Reference in New Issue
Block a user