Files
third_party_mesa3d/src/intel/compiler/brw_vec4.h

385 lines
13 KiB
C
Raw Normal View History

/*
* Copyright © 2011 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_VEC4_H
#define BRW_VEC4_H
#include "brw_shader.h"
#ifdef __cplusplus
#include "brw_ir_vec4.h"
intel/ir: Import shader performance analysis pass. This introduces an analysis pass intended to estimate several performance statistics of the shader, including cycle count latency and throughput values, based on static modeling. It has instruction performance information more comprehensive than the current scheduling pass for all platforms between Gen4-11, and works on both the FS and VEC4 back-end. The most immediate purpose of this pass is to implement a heuristic meant to determine whether using SIMD32 dispatch for a fragment shader can be expected to help more than it hurts. In addition this will allow the effect of passes run after scheduling (e.g. the TGL software scoreboard pass and the VEC4 dependency control pass) to be visible in shader-db statistics. But that isn't the end of the story, other potential applications of this pass (not part of this MR) I've been playing around with are: - Implement a similar SIMD16 heuristic allowing the identification of inefficient SIMD16 fragment shaders. - Implement similar SIMD16 and SIMD32 heuristics for the compute shader stage -- Currently compute shader builds always use the SIMD16 shader if available and never use the SIMD32 shader unless strictly necessary, which is suboptimal under certain conditions. - Hook up to the instruction scheduler in order to improve the accuracy of its timing information. - Use as heuristic in order to drive the selection of scheduling modes (Matt was experimenting with that). - Plug to the TGL software scoreboard pass in order to implement a more effective SBID token allocation algorithm, since in general the optimal token allocation depends on the timings of all instructions in the program. - Use its bottleneck detection functionality in order to implement a heuristic computing a more optimal bound for the number of fragment shader threads executed in parallel (by adjusting the MaximumNumberofThreadsPerPSD control of 3DSTATE_PS). As a follow-up I'm planning to submit updated timing information for Gen12 platforms -- Everything else required to support Gen12 like SWSB handling is already included in this patch, but there were some IP concerns regarding the TGL timing parameters since they cannot currently be obtained with the documentation and hardware which is publicly available. The timing parameters for any previous Gen7-11 platforms can be obtained by anyone by sampling the timestamp register using e.g. shader_time, though I have some more convenient instrumentation coming up. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2020-03-26 14:59:02 -07:00
#include "brw_ir_performance.h"
#include "brw_vec4_builder.h"
#include "brw_vec4_live_variables.h"
#endif
#include "compiler/glsl/ir.h"
#include "compiler/nir/nir.h"
#ifdef __cplusplus
extern "C" {
#endif
const unsigned *
brw_vec4_generate_assembly(const struct brw_compiler *compiler,
void *log_data,
void *mem_ctx,
const nir_shader *nir,
struct brw_vue_prog_data *prog_data,
const struct cfg_t *cfg,
const brw::performance &perf,
struct brw_compile_stats *stats);
#ifdef __cplusplus
} /* extern "C" */
namespace brw {
/**
* The vertex shader front-end.
*
* Translates either GLSL IR or Mesa IR (for ARB_vertex_program and
* fixed-function) into VS IR.
*/
class vec4_visitor : public backend_shader
{
public:
vec4_visitor(const struct brw_compiler *compiler,
void *log_data,
const struct brw_sampler_prog_key_data *key,
struct brw_vue_prog_data *prog_data,
const nir_shader *shader,
void *mem_ctx,
bool no_spills,
int shader_time_index);
dst_reg dst_null_f()
{
return dst_reg(brw_null_reg());
}
dst_reg dst_null_df()
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
}
dst_reg dst_null_d()
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
}
dst_reg dst_null_ud()
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
}
const struct brw_sampler_prog_key_data * const key_tex;
struct brw_vue_prog_data * const prog_data;
char *fail_msg;
bool failed;
/**
* GLSL IR currently being processed, which is associated with our
* driver IR instructions for debugging purposes.
*/
const void *base_ir;
const char *current_annotation;
int first_non_payload_grf;
unsigned int max_grf;
BRW_ANALYSIS(live_analysis, brw::vec4_live_variables,
backend_shader *) live_analysis;
intel/ir: Import shader performance analysis pass. This introduces an analysis pass intended to estimate several performance statistics of the shader, including cycle count latency and throughput values, based on static modeling. It has instruction performance information more comprehensive than the current scheduling pass for all platforms between Gen4-11, and works on both the FS and VEC4 back-end. The most immediate purpose of this pass is to implement a heuristic meant to determine whether using SIMD32 dispatch for a fragment shader can be expected to help more than it hurts. In addition this will allow the effect of passes run after scheduling (e.g. the TGL software scoreboard pass and the VEC4 dependency control pass) to be visible in shader-db statistics. But that isn't the end of the story, other potential applications of this pass (not part of this MR) I've been playing around with are: - Implement a similar SIMD16 heuristic allowing the identification of inefficient SIMD16 fragment shaders. - Implement similar SIMD16 and SIMD32 heuristics for the compute shader stage -- Currently compute shader builds always use the SIMD16 shader if available and never use the SIMD32 shader unless strictly necessary, which is suboptimal under certain conditions. - Hook up to the instruction scheduler in order to improve the accuracy of its timing information. - Use as heuristic in order to drive the selection of scheduling modes (Matt was experimenting with that). - Plug to the TGL software scoreboard pass in order to implement a more effective SBID token allocation algorithm, since in general the optimal token allocation depends on the timings of all instructions in the program. - Use its bottleneck detection functionality in order to implement a heuristic computing a more optimal bound for the number of fragment shader threads executed in parallel (by adjusting the MaximumNumberofThreadsPerPSD control of 3DSTATE_PS). As a follow-up I'm planning to submit updated timing information for Gen12 platforms -- Everything else required to support Gen12 like SWSB handling is already included in this patch, but there were some IP concerns regarding the TGL timing parameters since they cannot currently be obtained with the documentation and hardware which is publicly available. The timing parameters for any previous Gen7-11 platforms can be obtained by anyone by sampling the timestamp register using e.g. shader_time, though I have some more convenient instrumentation coming up. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2020-03-26 14:59:02 -07:00
BRW_ANALYSIS(performance_analysis, brw::performance,
vec4_visitor *) performance_analysis;
bool need_all_constants_in_pull_buffer;
/* Regs for vertex results. Generated at ir_variable visiting time
* for the ir->location's used.
*/
dst_reg output_reg[VARYING_SLOT_TESS_MAX][4];
unsigned output_num_components[VARYING_SLOT_TESS_MAX][4];
const char *output_reg_annotation[VARYING_SLOT_TESS_MAX];
int uniforms;
src_reg shader_start_time;
bool run();
void fail(const char *msg, ...);
int setup_uniforms(int payload_reg);
bool reg_allocate_trivial();
bool reg_allocate();
void evaluate_spill_costs(float *spill_costs, bool *no_spill);
int choose_spill_reg(struct ra_graph *g);
void spill_reg(unsigned spill_reg);
void move_grf_array_access_to_scratch();
void move_uniform_array_access_to_pull_constants();
void move_push_constants_to_pull_constants();
void split_uniform_registers();
void pack_uniform_registers();
virtual void invalidate_analysis(brw::analysis_dependency_class c);
void split_virtual_grfs();
bool opt_vector_float();
bool opt_reduce_swizzle();
bool dead_code_eliminate();
bool opt_cmod_propagation();
bool opt_copy_propagation(bool do_constant_prop = true);
bool opt_cse_local(bblock_t *block, const vec4_live_variables &live);
bool opt_cse();
bool opt_algebraic();
bool opt_register_coalesce();
bool eliminate_find_live_channel();
bool is_dep_ctrl_unsafe(const vec4_instruction *inst);
void opt_set_dependency_control();
void opt_schedule_instructions();
void convert_to_hw_regs();
void fixup_3src_null_dest();
bool is_supported_64bit_region(vec4_instruction *inst, unsigned arg);
i965/vec4: add a SIMD lowering pass Generally, instructions in Align16 mode only ever write to a single register and don't need any form of SIMD splitting, that's why we have never had a SIMD splitting pass in the vec4 backend. However, double-precision instructions typically write 2 registers and in some cases they run into certain hardware bugs and limitations that we need to work around by splitting the instructions so we only write to 1 register at a time. This patch implements a SIMD splitting pass similar to the one in the scalar backend. Because we only use double-precision instructions in Align16 mode in gen7 (gen8+ is fully scalar and gens < 7 do not implement fp64) the pass should be a no-op on any other generation. For now the pass only handles the gen7 restriction where any instruction that writes 2 registers also needs to read 2 registers. This affects double-precision instructions reading uniforms, for example. Later patches will extend the lowering pass adding a few more cases. v2: - Move the simd lowering pass after the main optimization loop and run copy-propagation and dce if it reports progress (Curro) - Compute number of registers written instead of fixing it to 1 (Iago) - Use group from backend_instruction (Iago) - Drop assertion that checked that we only split 8-wide instructions into 4-wide. (Curro) - Don't assume that instructions can only be 8-wide, we might want to use 16-wide instructions in the future too (Curro) - Wrap gen7 workarounds in a conditional to ease adding workarounds for other gens in the future (Curro) - Handle dst/src overlap hazard (Curro) - Use the horiz_offset() helper to simplify the implementation (Curro) - Drop the assertion that checks that each split instruction writes exactly one register (Curro) - Use the copy constructor to generate split instructions with all the relevant fields initialized to the values in the original instruction instead of copying only a handful of them manually (Curro) v3 (Iago): - When copying to a temporary, allocate the number of registers required for the copy based on the size written of the lowered instruction instead of assuming that all lowered instructions produce single-register writes - Adapt to changes in offset() Reviewed-by: Matt Turner <mattst88@gmail.com>
2016-08-29 10:41:45 +02:00
bool lower_simd_width();
i965/vec4: add a scalarization pass for double-precision instructions The hardware only supports 32-bit swizzles, which means that we can only access directly channels XY of a DF making access to channels ZW more difficult, specially considering the various regioning restrictions imposed by the hardware. The combination of both things makes handling ramdom swizzles on DF operands rather difficult, as there are many combinations that can't be represented at all, at least not without some work and some level of instruction splitting depending on the case. Writemasks are 64-bit in general, however XY and ZW writemasks also work in 32-bit, which means these writemasks can't be represented natively, adding to the complexity. For now, we decided to try and simplify things as much as possible to avoid dealing with all this from the get go by adding a scalarization pass that runs after the main optimization loop. By fully scalarizing DF instructions in align16 we avoid most of the complexity introduced by the aforementioned hardware restrictions and we have an easier path to an initial fully functional version for the vector backend in Haswell and IvyBridge. Later, we can improve the implementation so we don't necessarily scalarize everything, iteratively adding more complexity and building on top of a framework that is already working. Curro drafted some ideas for how this could be done here: https://bugs.freedesktop.org/show_bug.cgi?id=92760#c82 v2: - Use a copy constructor for the scalar instructions so we copy all relevant instructions fields from the original instruction. v3: Fix indention in one switch (Matt) Reviewed-by: Matt Turner <mattst88@gmail.com>
2016-05-24 09:20:51 +02:00
bool scalarize_df();
bool lower_64bit_mad_to_mul_add();
void apply_logical_swizzle(struct brw_reg *hw_reg,
vec4_instruction *inst, int arg);
i965/vec4: add a SIMD lowering pass Generally, instructions in Align16 mode only ever write to a single register and don't need any form of SIMD splitting, that's why we have never had a SIMD splitting pass in the vec4 backend. However, double-precision instructions typically write 2 registers and in some cases they run into certain hardware bugs and limitations that we need to work around by splitting the instructions so we only write to 1 register at a time. This patch implements a SIMD splitting pass similar to the one in the scalar backend. Because we only use double-precision instructions in Align16 mode in gen7 (gen8+ is fully scalar and gens < 7 do not implement fp64) the pass should be a no-op on any other generation. For now the pass only handles the gen7 restriction where any instruction that writes 2 registers also needs to read 2 registers. This affects double-precision instructions reading uniforms, for example. Later patches will extend the lowering pass adding a few more cases. v2: - Move the simd lowering pass after the main optimization loop and run copy-propagation and dce if it reports progress (Curro) - Compute number of registers written instead of fixing it to 1 (Iago) - Use group from backend_instruction (Iago) - Drop assertion that checked that we only split 8-wide instructions into 4-wide. (Curro) - Don't assume that instructions can only be 8-wide, we might want to use 16-wide instructions in the future too (Curro) - Wrap gen7 workarounds in a conditional to ease adding workarounds for other gens in the future (Curro) - Handle dst/src overlap hazard (Curro) - Use the horiz_offset() helper to simplify the implementation (Curro) - Drop the assertion that checks that each split instruction writes exactly one register (Curro) - Use the copy constructor to generate split instructions with all the relevant fields initialized to the values in the original instruction instead of copying only a handful of them manually (Curro) v3 (Iago): - When copying to a temporary, allocate the number of registers required for the copy based on the size written of the lowered instruction instead of assuming that all lowered instructions produce single-register writes - Adapt to changes in offset() Reviewed-by: Matt Turner <mattst88@gmail.com>
2016-08-29 10:41:45 +02:00
vec4_instruction *emit(vec4_instruction *inst);
vec4_instruction *emit(enum opcode opcode);
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst);
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
const src_reg &src0);
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
const src_reg &src0, const src_reg &src1);
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
const src_reg &src0, const src_reg &src1,
const src_reg &src2);
vec4_instruction *emit_before(bblock_t *block,
vec4_instruction *inst,
vec4_instruction *new_inst);
#define EMIT1(op) vec4_instruction *op(const dst_reg &, const src_reg &);
#define EMIT2(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &);
#define EMIT3(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &, const src_reg &);
EMIT1(MOV)
EMIT1(NOT)
EMIT1(RNDD)
EMIT1(RNDE)
EMIT1(RNDZ)
EMIT1(FRC)
EMIT1(F32TO16)
EMIT1(F16TO32)
EMIT2(ADD)
EMIT2(MUL)
EMIT2(MACH)
EMIT2(MAC)
EMIT2(AND)
EMIT2(OR)
EMIT2(XOR)
EMIT2(DP3)
EMIT2(DP4)
EMIT2(DPH)
EMIT2(SHL)
EMIT2(SHR)
EMIT2(ASR)
vec4_instruction *CMP(dst_reg dst, src_reg src0, src_reg src1,
enum brw_conditional_mod condition);
vec4_instruction *IF(src_reg src0, src_reg src1,
enum brw_conditional_mod condition);
vec4_instruction *IF(enum brw_predicate predicate);
EMIT1(SCRATCH_READ)
EMIT2(SCRATCH_WRITE)
EMIT3(LRP)
EMIT1(BFREV)
EMIT3(BFE)
EMIT2(BFI1)
EMIT3(BFI2)
EMIT1(FBH)
EMIT1(FBL)
EMIT1(CBIT)
EMIT3(MAD)
EMIT2(ADDC)
EMIT2(SUBB)
EMIT1(DIM)
#undef EMIT1
#undef EMIT2
#undef EMIT3
vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
src_reg src0, src_reg src1);
/**
* Copy any live channel from \p src to the first channel of the
* result.
*/
src_reg emit_uniformize(const src_reg &src);
/** Fix all float operands of a 3-source instruction. */
intel/vec4: Try to emit a single load for multiple 3-src instruction operands If a 3-source instruction uses immediate values 1.0 and -1.0, just load 1.0 into a register. Use the negation source modifier to get -1.0. This has trivial impact now, but it prevents a few thousand regressions on vec4 platforms with "nir/algebraic: Recognize open-coded flrp(-1, 1, a) and flrp(1, -1, a)" All Gen6 and Gen7 platforms had similar results. (Haswell shown) total instructions in shared programs: 13487412 -> 13487406 (<.01%) instructions in affected programs: 541 -> 535 (-1.11%) helped: 6 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.36% max: 2.08% x̄: 1.65% x̃: 1.80% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -2.33% -0.97% Instructions are helped. total cycles in shared programs: 376402564 -> 376402500 (<.01%) cycles in affected programs: 10348 -> 10284 (-0.62%) helped: 10 HURT: 1 helped stats (abs) min: 2 max: 26 x̄: 7.00 x̃: 2 helped stats (rel) min: 0.13% max: 2.05% x̄: 0.89% x̃: 0.79% HURT stats (abs) min: 6 max: 6 x̄: 6.00 x̃: 6 HURT stats (rel) min: 0.29% max: 0.29% x̄: 0.29% x̃: 0.29% 95% mean confidence interval for cycles value: -11.72 0.08 95% mean confidence interval for cycles %-change: -1.20% -0.36% Inconclusive result (value mean confidence interval includes 0). No shader-db changes on any other Intel platform. Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-06-06 11:21:15 -07:00
void fix_float_operands(src_reg op[3], nir_alu_instr *instr);
src_reg fix_3src_operand(const src_reg &src);
vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
const src_reg &src1 = src_reg());
src_reg fix_math_operand(const src_reg &src);
void emit_pack_half_2x16(dst_reg dst, src_reg src0);
void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
void emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0);
void emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0);
void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0);
void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0);
void emit_texture(ir_texture_opcode op,
dst_reg dest,
const glsl_type *dest_type,
src_reg coordinate,
int coord_components,
src_reg shadow_comparator,
src_reg lod, src_reg lod2,
src_reg sample_index,
uint32_t constant_offset,
src_reg offset_value,
src_reg mcs,
uint32_t surface, src_reg surface_reg,
src_reg sampler_reg);
src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
src_reg surface);
void emit_gen6_gather_wa(uint8_t wa, dst_reg dst);
void emit_ndc_computation();
void emit_psiz_and_flags(dst_reg reg);
vec4_instruction *emit_generic_urb_slot(dst_reg reg, int varying, int comp);
virtual void emit_urb_slot(dst_reg reg, int varying);
void emit_shader_time_begin();
void emit_shader_time_end();
void emit_shader_time_write(int shader_time_subindex, src_reg value);
src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst,
src_reg *reladdr, int reg_offset);
void emit_scratch_read(bblock_t *block, vec4_instruction *inst,
dst_reg dst,
src_reg orig_src,
int base_offset);
void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
int base_offset);
void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
dst_reg dst,
src_reg orig_src,
int base_offset,
src_reg indirect);
void emit_pull_constant_load_reg(dst_reg dst,
src_reg surf_index,
src_reg offset,
bblock_t *before_block,
vec4_instruction *before_inst);
src_reg emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
vec4_instruction *inst, src_reg src);
void resolve_ud_negate(src_reg *reg);
bool lower_minmax();
src_reg get_timestamp();
void dump_instruction(const backend_instruction *inst) const;
void dump_instruction(const backend_instruction *inst, FILE *file) const;
bool is_high_sampler(src_reg sampler);
bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate);
intel/compiler: Drop nir_lower_to_source_mods() and related handling. I think we're unanimous in wanting to drop nir_lower_to_source_mods. It's a bit of complexity to handle in the backend, but perhaps more importantly, would be even more complexity to handle in nir_search. And, it turns out that since we made other compiler improvements in the last few years, they no longer appear to buy us anything of value. Summarizing the results from shader-db from this patch: - Icelake (scalar mode) Instruction counts: - 411 helped, 598 hurt (out of 139,470 shaders) - 99.2% of shaders remain unaffected. The average increase in instruction count in hurt programs is 1.78 instructions. - total instructions in shared programs: 17214951 -> 17215206 (<.01%) - instructions in affected programs: 1143879 -> 1144134 (0.02%) Cycles: - 1042 helped, 1357 hurt - total cycles in shared programs: 365613294 -> 365882263 (0.07%) - cycles in affected programs: 138155497 -> 138424466 (0.19%) - Haswell (both scalar and vector modes) Instruction counts: - 73 helped, 1680 hurt (out of 139,470 shaders) - 98.7% of shaders remain unaffected. The average increase in instruction count in hurt programs is 1.9 instructions. - total instructions in shared programs: 14199527 -> 14202262 (0.02%) - instructions in affected programs: 446499 -> 449234 (0.61%) Cycles: - 5253 helped, 5559 hurt - total cycles in shared programs: 359996545 -> 360038731 (0.01%) - cycles in affected programs: 155897127 -> 155939313 (0.03%) Given that ~99% of shader-db remains unaffected, and the affected programs are hurt by about 1-2 instructions - which are all cheap ALU instructions - this is unlikely to be measurable in terms of any real performance impact that would affect users. So, drop them and simplify the backend, and hopefully enable other future simplifications in NIR. Reviewed-by: Eric Anholt <eric@anholt.net> [v1] Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4616>
2020-04-18 01:20:42 -07:00
void emit_conversion_from_double(dst_reg dst, src_reg src);
void emit_conversion_to_double(dst_reg dst, src_reg src);
vec4_instruction *shuffle_64bit_data(dst_reg dst, src_reg src,
bool for_write,
bblock_t *block = NULL,
vec4_instruction *ref = NULL);
virtual void emit_nir_code();
virtual void nir_setup_uniforms();
virtual void nir_emit_impl(nir_function_impl *impl);
virtual void nir_emit_cf_list(exec_list *list);
virtual void nir_emit_if(nir_if *if_stmt);
virtual void nir_emit_loop(nir_loop *loop);
virtual void nir_emit_block(nir_block *block);
virtual void nir_emit_instr(nir_instr *instr);
virtual void nir_emit_load_const(nir_load_const_instr *instr);
src_reg get_nir_ssbo_intrinsic_index(nir_intrinsic_instr *instr);
virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
virtual void nir_emit_alu(nir_alu_instr *instr);
virtual void nir_emit_jump(nir_jump_instr *instr);
virtual void nir_emit_texture(nir_tex_instr *instr);
virtual void nir_emit_undef(nir_ssa_undef_instr *instr);
virtual void nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr);
dst_reg get_nir_dest(const nir_dest &dest, enum brw_reg_type type);
dst_reg get_nir_dest(const nir_dest &dest, nir_alu_type type);
dst_reg get_nir_dest(const nir_dest &dest);
src_reg get_nir_src(const nir_src &src, enum brw_reg_type type,
unsigned num_components = 4);
src_reg get_nir_src(const nir_src &src, nir_alu_type type,
unsigned num_components = 4);
src_reg get_nir_src(const nir_src &src,
unsigned num_components = 4);
src_reg get_nir_src_imm(const nir_src &src);
src_reg get_indirect_offset(nir_intrinsic_instr *instr);
dst_reg *nir_locals;
dst_reg *nir_ssa_values;
protected:
void emit_vertex();
void setup_payload_interference(struct ra_graph *g, int first_payload_node,
int reg_node_count);
virtual void setup_payload() = 0;
virtual void emit_prolog() = 0;
virtual void emit_thread_end() = 0;
virtual void emit_urb_write_header(int mrf) = 0;
virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0;
virtual void gs_emit_vertex(int stream_id);
virtual void gs_end_primitive();
private:
/**
* If true, then register allocation should fail instead of spilling.
*/
const bool no_spills;
int shader_time_index;
unsigned last_scratch; /**< measured in 32-byte (register size) units */
};
} /* namespace brw */
#endif /* __cplusplus */
#endif /* BRW_VEC4_H */