diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 16d2dfc04e1..242966b1bba 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -1180,8 +1180,6 @@ setup_isel_context(Program* program, } calc_min_waves(program); - program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); - program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); unsigned scratch_size = 0; if (program->stage == gs_copy_vs) { diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index a156d109b31..ef25b1794ad 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -115,10 +115,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->physical_sgprs = 800; program->sgpr_alloc_granule = 16; program->sgpr_limit = 102; - if (family == CHIP_TONGA || family == CHIP_ICELAND) { - program->sgpr_alloc_granule = 96; - program->sgpr_limit = 94; /* workaround hardware bug */ - } + if (family == CHIP_TONGA || family == CHIP_ICELAND) + program->sgpr_alloc_granule = 96; /* workaround hardware bug */ } else { program->physical_sgprs = 512; program->sgpr_alloc_granule = 8; diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index 5f6c5b00a29..56d88e0f4f8 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -285,17 +285,23 @@ uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs) return align(std::max(addressable_vgprs, granule), granule); } -uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves) +unsigned round_down(unsigned a, unsigned b) { - uint16_t sgprs = (program->physical_sgprs / max_waves) - program->sgpr_alloc_granule + 1; - sgprs = get_sgpr_alloc(program, sgprs); + return a - (a % b); +} + +uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves) +{ + /* it's not possible to allocate more than 128 SGPRs */ + uint16_t sgprs = std::min(program->physical_sgprs / waves, 128); + sgprs = round_down(sgprs, program->sgpr_alloc_granule); sgprs -= get_extra_sgprs(program); return std::min(sgprs, program->sgpr_limit); } -uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves) +uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves) { - uint16_t vgprs = program->physical_vgprs / max_waves & ~(program->vgpr_alloc_granule - 1); + uint16_t vgprs = program->physical_vgprs / waves & ~(program->vgpr_alloc_granule - 1); return std::min(vgprs, program->vgpr_limit); } @@ -326,8 +332,12 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu; unsigned lds_limit = wgp ? program->lds_limit * 2 : program->lds_limit; + assert(program->min_waves >= 1); + uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + /* this won't compile, register pressure reduction necessary */ - if (new_demand.vgpr > program->vgpr_limit || new_demand.sgpr > program->sgpr_limit) { + if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) { program->num_waves = 0; program->max_reg_demand = new_demand; } else { diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 16c341a55b1..5b151d9429d 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -73,8 +73,10 @@ struct ra_ctx { std::unordered_map vectors; std::unordered_map split_vectors; aco_ptr pseudo_dummy; - unsigned max_used_sgpr = 0; - unsigned max_used_vgpr = 0; + uint16_t max_used_sgpr = 0; + uint16_t max_used_vgpr = 0; + uint16_t sgpr_limit; + uint16_t vgpr_limit; std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */ ra_test_policy policy; @@ -89,6 +91,8 @@ struct ra_ctx { policy(policy_) { pseudo_dummy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0)); + sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); } }; @@ -650,14 +654,14 @@ void add_subdword_definition(Program *program, aco_ptr& instr, unsi void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) { - unsigned max_addressible_sgpr = ctx.program->sgpr_limit; + uint16_t max_addressible_sgpr = ctx.sgpr_limit; unsigned size = rc.size(); if (rc.type() == RegType::vgpr) { assert(reg >= 256); - unsigned hi = reg - 256 + size - 1; + uint16_t hi = reg - 256 + size - 1; ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi); } else if (reg + rc.size() <= max_addressible_sgpr) { - unsigned hi = reg + size - 1; + uint16_t hi = reg + size - 1; ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr)); } } @@ -1241,11 +1245,9 @@ bool get_reg_specified(ra_ctx& ctx, } bool increase_register_file(ra_ctx& ctx, RegType type) { - uint16_t max_addressible_sgpr = ctx.program->sgpr_limit; - uint16_t max_addressible_vgpr = ctx.program->vgpr_limit; - if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) { + if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) { update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr)); - } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) { + } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) { update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1)); } else { return false; @@ -2677,11 +2679,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc } /* num_gpr = rnd_up(max_used_gpr + 1) */ - program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4); - if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */ - program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit); - else - program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8); + program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1); + program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1); } } diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 02e16c05f5d..39c53ea8f19 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -1774,14 +1774,16 @@ void spill(Program* program, live& live_vars) /* calculate target register demand */ RegisterDemand register_target = program->max_reg_demand; - if (register_target.sgpr > program->sgpr_limit) - register_target.vgpr += (register_target.sgpr - program->sgpr_limit + program->wave_size - 1 + 32) / program->wave_size; - register_target.sgpr = program->sgpr_limit; + uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + if (register_target.sgpr > sgpr_limit) + register_target.vgpr += (register_target.sgpr - sgpr_limit + program->wave_size - 1 + 32) / program->wave_size; + register_target.sgpr = sgpr_limit; - if (register_target.vgpr > program->vgpr_limit) - register_target.sgpr = program->sgpr_limit - 5; + if (register_target.vgpr > vgpr_limit) + register_target.sgpr = sgpr_limit - 5; int spills_to_vgpr = (program->max_reg_demand.sgpr - register_target.sgpr + program->wave_size - 1 + 32) / program->wave_size; - register_target.vgpr = program->vgpr_limit - spills_to_vgpr; + register_target.vgpr = vgpr_limit - spills_to_vgpr; /* initialize ctx */ spill_ctx ctx(register_target, program, live_vars.register_demand); diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 3b21741fbd4..72d8db15009 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -679,6 +679,7 @@ bool validate_ra(Program *program) { bool err = false; aco::live live_vars = aco::live_var_analysis(program); std::vector> phi_sgpr_ops(program->blocks.size()); + uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->num_waves); std::map assignments; for (Block& block : program->blocks) { @@ -704,7 +705,7 @@ bool validate_ra(Program *program) { if (assignments.count(op.tempId()) && assignments[op.tempId()].reg != op.physReg()) err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an inconsistent register assignment with instruction", i); if ((op.getTemp().type() == RegType::vgpr && op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) || - (op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < program->sgpr_limit)) + (op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < sgpr_limit)) err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an out-of-bounds register assignment", i); if (op.physReg() == vcc && !program->needs_vcc) err |= ra_fail(program, loc, Location(), "Operand %d fixed to vcc but needs_vcc=false", i); @@ -725,7 +726,7 @@ bool validate_ra(Program *program) { if (assignments[def.tempId()].defloc.block) err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc, "Temporary %%%d also defined by instruction", def.tempId()); if ((def.getTemp().type() == RegType::vgpr && def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) || - (def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < program->sgpr_limit)) + (def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < sgpr_limit)) err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc, "Definition %d has an out-of-bounds register assignment", i); if (def.physReg() == vcc && !program->needs_vcc) err |= ra_fail(program, loc, Location(), "Definition %d fixed to vcc but needs_vcc=false", i); diff --git a/src/amd/compiler/tests/helpers.cpp b/src/amd/compiler/tests/helpers.cpp index c7df8f2e9fc..bbb83ee9b65 100644 --- a/src/amd/compiler/tests/helpers.cpp +++ b/src/amd/compiler/tests/helpers.cpp @@ -80,6 +80,8 @@ void create_program(enum chip_class chip_class, Stage stage, unsigned wave_size, program.reset(new Program); aco::init_program(program.get(), stage, &info, chip_class, family, &config); + program->workgroup_size = UINT_MAX; + calc_min_waves(program.get()); program->debug.func = nullptr; program->debug.private_data = nullptr;