intel/nir: use a single intel intrinsic to deal with ray traversal

In the future we'll want to reuse this intrinsic for ray queries. Ray
queries will use a different global pointer and will set the
control/level arguments of the trace send instruction
programmatically.

v2: Comment on barrier after sync trace instruction (Caio)
    Generalize lsc helper (Caio)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13719>
Author:    Lionel Landwerlin
Date:      2021-06-14 17:30:31 +03:00
Committer: Marge Bot
Commit:    bb40e999d1 (parent 39f6cd5d79)

7 changed files with 121 additions and 44 deletions
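
To make the intended reuse concrete, here is a minimal sketch of how a ray-query
lowering pass could emit the same intrinsic with the synchronous flag set. This
is an illustration only: rq_globals_addr is a hypothetical stand-in for the
ray-query global pointer, and the level/control immediates would in practice be
selected per query operation rather than hard-coded.

   /* Sketch only, not part of this patch. */
   static void
   emit_ray_query_trace_sketch(nir_builder *b, nir_ssa_def *rq_globals_addr)
   {
      /* Same intrinsic as the bindless paths below, but with a ray-query
       * globals pointer and SYNCHRONOUS=true so the EU waits on the RT unit.
       */
      nir_trace_ray_intel(b,
                          rq_globals_addr,
                          nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD),
                          nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
                          .synchronous = true);
   }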


@@ -1649,7 +1649,7 @@ typedef struct {
 
 #include "nir_intrinsics.h"
 
-#define NIR_INTRINSIC_MAX_CONST_INDEX 5
+#define NIR_INTRINSIC_MAX_CONST_INDEX 6
 
 /** Represents an intrinsic
  *


@@ -254,6 +254,9 @@ index("nir_rounding_mode", "rounding_mode")
 # Whether or not to saturate in conversions
 index("unsigned", "saturate")
 
+# Whether or not trace_ray_intel is synchronous
+index("bool", "synchronous")
+
 intrinsic("nop", flags=[CAN_ELIMINATE])
 
 intrinsic("convert_alu_types", dest_comp=0, src_comp=[0],
@@ -1366,10 +1369,9 @@ intrinsic("btd_stack_push_intel", indices=[STACK_SIZE])
 # src[] = { }
 intrinsic("btd_retire_intel")
 
-# Intel-specific ray-tracing intrinsics
-intrinsic("trace_ray_initial_intel")
-intrinsic("trace_ray_commit_intel")
-intrinsic("trace_ray_continue_intel")
+# Intel-specific ray-tracing intrinsic
+# src[] = { globals, level, operation } SYNCHRONOUS=synchronous
+intrinsic("trace_ray_intel", src_comp=[1, 1, 1], indices=[SYNCHRONOUS])
 
 # System values used for ray-tracing on Intel
 system_value("ray_base_mem_addr_intel", 1, bit_sizes=[64])


@@ -955,6 +955,19 @@ enum a64_logical_srcs {
    A64_LOGICAL_NUM_SRCS
 };
 
+enum rt_logical_srcs {
+   /** Address of the globals */
+   RT_LOGICAL_SRC_GLOBALS,
+   /** Level at which the tracing should start */
+   RT_LOGICAL_SRC_BVH_LEVEL,
+   /** Type of tracing operation */
+   RT_LOGICAL_SRC_TRACE_RAY_CONTROL,
+   /** Synchronous tracing (ray query) */
+   RT_LOGICAL_SRC_SYNCHRONOUS,
+   RT_LOGICAL_NUM_SRCS
+};
+
 #ifdef __cplusplus
 /**
  * Allow brw_urb_write_flags enums to be ORed together.


@@ -6663,31 +6663,53 @@ static void
 lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
-   const fs_reg &bvh_level = inst->src[0];
-   assert(inst->src[1].file == BRW_IMMEDIATE_VALUE);
-   const uint32_t trace_ray_control = inst->src[1].ud;
+   const fs_reg &globals_addr = inst->src[RT_LOGICAL_SRC_GLOBALS];
+   const fs_reg &bvh_level =
+      inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ?
+      inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
+      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
+                       inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
+   const fs_reg &trace_ray_control =
+      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ?
+      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
+      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
+                       inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
+   const fs_reg &synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
+   assert(synchronous_src.file == BRW_IMMEDIATE_VALUE);
+   const bool synchronous = synchronous_src.ud;
 
    const unsigned mlen = 1;
    const fs_builder ubld = bld.exec_all().group(8, 0);
    fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
    ubld.MOV(header, brw_imm_ud(0));
-   ubld.group(2, 0).MOV(header,
-                        retype(brw_vec2_grf(2, 0), BRW_REGISTER_TYPE_UD));
-   /* TODO: Bit 128 is ray_query */
+   ubld.group(2, 0).MOV(header, retype(globals_addr, BRW_REGISTER_TYPE_UD));
+   if (synchronous)
+      ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));
 
    const unsigned ex_mlen = inst->exec_size / 8;
    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
-   const uint32_t trc_bits = SET_BITS(trace_ray_control, 9, 8);
-   if (bvh_level.file == BRW_IMMEDIATE_VALUE) {
-      bld.MOV(payload, brw_imm_ud(trc_bits | (bvh_level.ud & 0x7)));
+   if (bvh_level.file == BRW_IMMEDIATE_VALUE &&
+       trace_ray_control.file == BRW_IMMEDIATE_VALUE) {
+      bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) |
+                                  (bvh_level.ud & 0x7)));
    } else {
-      bld.AND(payload, bvh_level, brw_imm_ud(0x7));
-      if (trc_bits != 0)
-         bld.OR(payload, payload, brw_imm_ud(trc_bits));
+      bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
+      bld.OR(payload, payload, bvh_level);
    }
 
+   /* When doing synchronous traversal, the HW implicitly computes the
+    * stack_id using the following formula :
+    *
+    *    EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
+    *
+    * Only in the asynchronous case we need to set the stack_id given from the
+    * payload register.
+    */
+   if (!synchronous) {
+      bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
+              retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
+              brw_imm_uw(0x7ff));
+   }
-   bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
-           retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
-           brw_imm_uw(0x7ff));
 
    /* Update the original instruction. */
    inst->opcode = SHADER_OPCODE_SEND;
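
For reference, the payload dword assembled above packs the BVH level into bits
2:0, the trace-ray control into bits 9:8 and, for asynchronous traces only, an
11-bit stack ID into bits 26:16. A minimal sketch of that layout (hypothetical
helper, not part of the patch):

   #include <stdbool.h>
   #include <stdint.h>

   /* Illustration of the payload dword built by lower_trace_ray_logical_send();
    * hypothetical helper, not part of the patch.
    */
   static inline uint32_t
   rt_trace_payload_dword(uint32_t bvh_level, uint32_t trace_ray_control,
                          bool synchronous, uint32_t stack_id)
   {
      uint32_t dw = (bvh_level & 0x7) |               /* bits  2:0  */
                    ((trace_ray_control & 0x3) << 8); /* bits  9:8  */
      if (!synchronous)
         dw |= (stack_id & 0x7ff) << 16;              /* bits 26:16 */
      /* Synchronous traces leave the stack ID at zero; the HW derives it from
       * EUID/THREAD_ID/SIMD_LANE_ID as described in the comment above.
       */
      return dw;
   }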


@@ -3997,6 +3997,29 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
    }
 }
 
+static void
+emit_rt_lsc_fence(const fs_builder &bld, enum lsc_flush_type flush_type)
+{
+   const intel_device_info *devinfo = bld.shader->devinfo;
+
+   const fs_builder ubld = bld.exec_all().group(8, 0);
+   fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+   fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp,
+                             brw_imm_ud(0) /* desc */,
+                             brw_imm_ud(0) /* ex_desc */,
+                             brw_vec8_grf(0, 0) /* payload */);
+   send->sfid = GFX12_SFID_UGM;
+   send->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
+                                   flush_type, true);
+   send->mlen = 1; /* g0 header */
+   send->ex_mlen = 0;
+   send->size_written = REG_SIZE; /* Temp write for scheduling */
+   send->send_has_side_effects = true;
+
+   ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp);
+}
+
 void
 fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,
                                   nir_intrinsic_instr *instr)
@@ -4016,27 +4039,6 @@ fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,
       bld.MOV(dest, retype(brw_vec1_grf(2, 2), dest.type));
       break;
 
-   case nir_intrinsic_trace_ray_initial_intel:
-      bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
-               bld.null_reg_ud(),
-               brw_imm_ud(BRW_RT_BVH_LEVEL_WORLD),
-               brw_imm_ud(GEN_RT_TRACE_RAY_INITAL));
-      break;
-
-   case nir_intrinsic_trace_ray_commit_intel:
-      bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
-               bld.null_reg_ud(),
-               brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),
-               brw_imm_ud(GEN_RT_TRACE_RAY_COMMIT));
-      break;
-
-   case nir_intrinsic_trace_ray_continue_intel:
-      bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
-               bld.null_reg_ud(),
-               brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),
-               brw_imm_ud(GEN_RT_TRACE_RAY_CONTINUE));
-      break;
-
    default:
       nir_emit_intrinsic(bld, instr);
       break;
@@ -5869,6 +5871,32 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);
       break;
 
+   case nir_intrinsic_trace_ray_intel: {
+      const bool synchronous = nir_intrinsic_synchronous(instr);
+      assert(brw_shader_stage_is_bindless(stage) || synchronous);
+      if (synchronous)
+         emit_rt_lsc_fence(bld, LSC_FLUSH_TYPE_EVICT);
+      fs_reg srcs[RT_LOGICAL_NUM_SRCS];
+      srcs[RT_LOGICAL_SRC_GLOBALS] = get_nir_src(instr->src[0]);
+      srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(instr->src[1]);
+      srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(instr->src[2]);
+      srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous);
+      bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, bld.null_reg_ud(),
+               srcs, RT_LOGICAL_NUM_SRCS);
+
+      /* There is no actual value to use in the destination register of the
+       * synchronous trace instruction. All of the communication with the HW
+       * unit happens through memory reads/writes. So to ensure that the
+       * operation has completed before we go read the results in memory, we
+       * need a barrier followed by an invalidate before accessing memory.
+       */
+      if (synchronous) {
+         bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR));
+         emit_rt_lsc_fence(bld, LSC_FLUSH_TYPE_INVALIDATE);
+      }
+      break;
+   }
+
    default:
       unreachable("unknown intrinsic");
    }
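
Condensing the synchronous path above: the evict fence makes the ray/query data
globally visible before the trace message, and the SYNC.ALLWR plus invalidate
fence order the memory reads of the results after the RT unit has finished. A
rough sketch of the emitted sequence (a recap of the hunk above, not additional
code from the patch):

   emit_rt_lsc_fence(bld, LSC_FLUSH_TYPE_EVICT);      /* flush ray/query data to memory */
   bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
            bld.null_reg_ud(), srcs, RT_LOGICAL_NUM_SRCS);
   bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(),
            brw_imm_ud(TGL_SYNC_ALLWR));              /* barrier: wait for outstanding writes */
   emit_rt_lsc_fence(bld, LSC_FLUSH_TYPE_INVALIDATE); /* drop stale lines before reading results */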


@@ -214,7 +214,11 @@ lower_shader_calls_instr(struct nir_builder *b, nir_instr *instr, void *data)
          .shader_index_multiplier = sbt_stride,
       };
       brw_nir_rt_store_mem_ray(b, &ray_defs, BRW_RT_BVH_LEVEL_WORLD);
-      nir_trace_ray_initial_intel(b);
+      nir_trace_ray_intel(b,
+                          nir_load_btd_global_arg_addr_intel(b),
+                          nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD),
+                          nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
+                          .synchronous = false);
       return true;
    }
 


@@ -294,7 +294,11 @@ lower_ray_walk_intrinsics(nir_shader *shader,
           * optimization passes.
           */
          nir_push_if(&b, nir_imm_true(&b));
-         nir_trace_ray_continue_intel(&b);
+         nir_trace_ray_intel(&b,
+                             nir_load_btd_global_arg_addr_intel(&b),
+                             nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
+                             nir_imm_int(&b, GEN_RT_TRACE_RAY_CONTINUE),
+                             .synchronous = false);
          nir_jump(&b, nir_jump_halt);
          nir_pop_if(&b, NULL);
          progress = true;
@@ -313,7 +317,11 @@ lower_ray_walk_intrinsics(nir_shader *shader,
          }
          nir_push_else(&b, NULL);
          {
-            nir_trace_ray_commit_intel(&b);
+            nir_trace_ray_intel(&b,
+                                nir_load_btd_global_arg_addr_intel(&b),
+                                nir_imm_int(&b, BRW_RT_BVH_LEVEL_OBJECT),
+                                nir_imm_int(&b, GEN_RT_TRACE_RAY_COMMIT),
+                                .synchronous = false);
             nir_jump(&b, nir_jump_halt);
          }
          nir_pop_if(&b, NULL);