aco: use bit vectors for liveness sets

This seems to be much faster than hash sets. When compiling pipelines from
5 games, live_var_analysis takes about a third the time it used to and
fossilize-replay is ~1.77% faster.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Tony Wasserka <tony.wasserka@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6733>
This commit is contained in:
Rhys Perry
2020-09-14 16:45:55 +01:00
committed by Marge Bot
parent ec2185c598
commit d2c18b7bf3
6 changed files with 175 additions and 34 deletions

View File

@@ -1647,16 +1647,9 @@ private:
uint32_t allocationID = 1;
};
struct TempHash {
std::size_t operator()(Temp t) const {
return t.id();
}
};
using TempSet = std::unordered_set<Temp, TempHash>;
struct live {
/* live temps out per block */
std::vector<TempSet> live_out;
std::vector<IDSet> live_out;
/* register demand (sgpr/vgpr) per instruction per block */
std::vector<std::vector<RegisterDemand>> register_demand;
};
@@ -1692,7 +1685,7 @@ void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void register_allocation(Program *program, std::vector<IDSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);

View File

@@ -91,18 +91,18 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
register_demand.resize(block->instructions.size());
block->register_demand = RegisterDemand();
TempSet live = lives.live_out[block->index];
IDSet live = lives.live_out[block->index];
/* add the live_out_exec to live */
bool exec_live = false;
if (block->live_out_exec != Temp()) {
live.insert(block->live_out_exec);
live.insert(block->live_out_exec.id());
exec_live = true;
}
/* initialize register demand */
for (Temp t : live)
new_demand += t;
for (unsigned t : live)
new_demand += Temp(t, program->temp_rc[t]);
new_demand.sgpr -= phi_sgpr_ops[block->index];
/* traverse the instructions backwards */
@@ -126,7 +126,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
program->needs_vcc = true;
const Temp temp = definition.getTemp();
const size_t n = live.erase(temp);
const size_t n = live.erase(temp.id());
if (n) {
new_demand -= temp;
@@ -158,7 +158,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
if (operand.isFixed() && operand.physReg() == vcc)
program->needs_vcc = true;
const Temp temp = operand.getTemp();
const bool inserted = live.insert(temp).second;
const bool inserted = live.insert(temp.id()).second;
if (inserted) {
operand.setFirstKill(true);
for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
@@ -198,7 +198,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
program->needs_vcc = true;
const Temp temp = definition.getTemp();
const size_t n = live.erase(temp);
const size_t n = live.erase(temp.id());
if (n)
definition.setKill(false);
@@ -209,12 +209,13 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
}
/* now, we need to merge the live-ins into the live-out sets */
for (Temp t : live) {
std::vector<unsigned>& preds = t.is_linear() ? block->linear_preds : block->logical_preds;
for (unsigned t : live) {
RegClass rc = program->temp_rc[t];
std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;
#ifndef NDEBUG
if (preds.empty())
aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t.id(), block->index);
aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, block->index);
#endif
for (unsigned pred_idx : preds) {
@@ -240,7 +241,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
if (operand.isFixed() && operand.physReg() == vcc)
program->needs_vcc = true;
/* check if we changed an already processed block */
const bool inserted = lives.live_out[preds[i]].insert(operand.getTemp()).second;
const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second;
if (inserted) {
operand.setKill(true);
worklist.insert(preds[i]);

View File

@@ -88,7 +88,7 @@ bool collect_phi_info(cssa_ctx& ctx)
ctx.program->blocks[pred].logical_idom :
ctx.program->blocks[pred].linear_idom;
} while (def_points[i] != pred &&
ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end());
ctx.live_vars.live_out[pred].count(op.tempId()));
}
}
@@ -118,7 +118,7 @@ bool collect_phi_info(cssa_ctx& ctx)
} else if (def_points[i] == block.index) {
interferes = true;
/* operand might interfere with phi-def */
} else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].getTemp())) {
} else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].tempId())) {
interferes = true;
/* else check for interferences with other operands */
} else {

View File

@@ -1701,7 +1701,7 @@ void try_remove_trivial_phi(ra_ctx& ctx, Temp temp)
} /* end namespace */
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block)
void register_allocation(Program *program, std::vector<IDSet>& live_out_per_block)
{
ra_ctx ctx(program);
std::vector<std::vector<Temp>> phi_ressources;
@@ -1711,14 +1711,14 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
Block& block = *it;
/* first, compute the death points of all live vars within the block */
TempSet& live = live_out_per_block[block.index];
IDSet& live = live_out_per_block[block.index];
std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
aco_ptr<Instruction>& instr = *rit;
if (is_phi(instr)) {
if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) {
live.erase(instr->definitions[0].getTemp());
live.erase(instr->definitions[0].tempId());
continue;
}
/* collect information about affinity-related temporaries */
@@ -1748,7 +1748,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
/* add operands to live variables */
for (const Operand& op : instr->operands) {
if (op.isTemp())
live.emplace(op.getTemp());
live.insert(op.tempId());
}
}
@@ -1757,7 +1757,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
const Definition& def = instr->definitions[i];
if (!def.isTemp())
continue;
live.erase(def.getTemp());
live.erase(def.tempId());
/* mark last-seen phi operand */
std::unordered_map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
@@ -1793,14 +1793,14 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
for (Block& block : program->blocks) {
TempSet& live = live_out_per_block[block.index];
IDSet& live = live_out_per_block[block.index];
/* initialize register file */
assert(block.index != 0 || live.empty());
RegisterFile register_file;
ctx.war_hint.reset();
for (Temp t : live) {
Temp renamed = handle_live_in(ctx, t, &block);
for (unsigned t : live) {
Temp renamed = handle_live_in(ctx, Temp(t, program->temp_rc[t]), &block);
assignment& var = ctx.assignments[renamed.id()];
/* due to live-range splits, the live-in might be a phi, now */
if (var.assigned)
@@ -1953,7 +1953,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
register_file.fill(definition);
ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
}
live.emplace(definition.getTemp());
live.insert(definition.tempId());
/* update phi affinities */
for (const Operand& op : phi->operands) {
@@ -2154,7 +2154,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
/* set live if it has a kill point */
if (!definition.isKill())
live.emplace(definition.getTemp());
live.insert(definition.tempId());
ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
register_file.fill(definition);
@@ -2215,7 +2215,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
/* set live if it has a kill point */
if (!definition->isKill())
live.emplace(definition->getTemp());
live.insert(definition->tempId());
ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()};
register_file.fill(*definition);

View File

@@ -27,6 +27,7 @@
#include <cassert>
#include <iterator>
#include <vector>
namespace aco {
@@ -234,6 +235,151 @@ private:
size_type length{ 0 }; //!> Size of the span
};
/*
* Cache-friendly set of 32-bit IDs with O(1) insert/erase/lookup and
* the ability to efficiently iterate over contained elements.
*
* Internally implemented as a bit vector: If the set contains an ID, the
* corresponding bit is set. It doesn't use std::vector<bool> since we then
* couldn't efficiently iterate over the elements.
*
* The interface resembles a subset of std::set/std::unordered_set.
*/
struct IDSet {
struct Iterator {
const IDSet *set;
union {
struct {
uint32_t bit:6;
uint32_t word:26;
};
uint32_t id;
};
Iterator& operator ++();
bool operator != (const Iterator& other) const;
uint32_t operator * () const;
};
size_t count(uint32_t id) const {
if (id >= words.size() * 64)
return 0;
return words[id / 64u] & (1ull << (id % 64u)) ? 1 : 0;
}
Iterator find(uint32_t id) const {
if (!count(id))
return end();
Iterator it;
it.set = this;
it.bit = id % 64u;
it.word = id / 64u;
return it;
}
std::pair<Iterator, bool> insert(uint32_t id) {
if (words.size() * 64u <= id)
words.resize(DIV_ROUND_UP(id + 1, 64u));
Iterator it;
it.set = this;
it.bit = id % 64u;
it.word = id / 64u;
uint64_t mask = 1ull << it.bit;
if (words[it.word] & mask)
return std::make_pair(it, false);
words[it.word] |= mask;
bits_set++;
return std::make_pair(it, true);
}
size_t erase(uint32_t id) {
if (!count(id))
return 0;
words[id / 64u] ^= 1ull << (id % 64u);
bits_set--;
return 1;
}
Iterator cbegin() const {
Iterator it;
it.set = this;
for (size_t i = 0; i < words.size(); i++) {
if (words[i]) {
it.word = i;
it.bit = ffsll(words[i]) - 1;
return it;
}
}
return end();
}
Iterator cend() const {
Iterator it;
it.set = this;
it.word = words.size();
it.bit = 0;
return it;
}
Iterator begin() const {
return cbegin();
}
Iterator end() const {
return cend();
}
bool empty() const {
return bits_set == 0;
}
size_t size() const {
return bits_set;
}
std::vector<uint64_t> words;
uint32_t bits_set;
};
inline IDSet::Iterator& IDSet::Iterator::operator ++() {
uint64_t m = set->words[word];
m &= ~((2ull << bit) - 1ull);
if (!m) {
/* don't continue past the end */
if (word == set->words.size())
return *this;
word++;
for (; word < set->words.size(); word++) {
if (set->words[word]) {
bit = ffsll(set->words[word]) - 1;
return *this;
}
}
bit = 0;
} else {
bit = ffsll(m) - 1;
}
return *this;
}
inline bool IDSet::Iterator::operator != (const IDSet::Iterator& other) const {
assert(set == other.set);
return id != other.id;
}
inline uint32_t IDSet::Iterator::operator * () const {
return (word << 6) | bit;
}
} // namespace aco
#endif // ACO_UTIL_H

View File

@@ -739,7 +739,8 @@ bool validate_ra(Program *program, const struct radv_nir_compiler_options *optio
regs.fill(0);
std::set<Temp> live;
live.insert(live_vars.live_out[block.index].begin(), live_vars.live_out[block.index].end());
for (unsigned id : live_vars.live_out[block.index])
live.insert(Temp(id, program->temp_rc[id]));
/* remove killed p_phi sgpr operands */
for (Temp tmp : phi_sgpr_ops[block.index])
live.erase(tmp);