aco: use bit vectors for liveness sets
This seems to be much faster than hash sets. When compiling pipelines from 5 games, live_var_analysis takes about a third of the time it used to and fossilize-replay is ~1.77% faster.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Tony Wasserka <tony.wasserka@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6733>
This commit is contained in:
@@ -1647,16 +1647,9 @@ private:
|
||||
uint32_t allocationID = 1;
|
||||
};
|
||||
|
||||
struct TempHash {
|
||||
std::size_t operator()(Temp t) const {
|
||||
return t.id();
|
||||
}
|
||||
};
|
||||
using TempSet = std::unordered_set<Temp, TempHash>;
|
||||
|
||||
struct live {
|
||||
/* live temps out per block */
|
||||
std::vector<TempSet> live_out;
|
||||
std::vector<IDSet> live_out;
|
||||
/* register demand (sgpr/vgpr) per instruction per block */
|
||||
std::vector<std::vector<RegisterDemand>> register_demand;
|
||||
};
|
||||
@@ -1692,7 +1685,7 @@ void value_numbering(Program* program);
|
||||
void optimize(Program* program);
|
||||
void setup_reduce_temp(Program* program);
|
||||
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
|
||||
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
|
||||
void register_allocation(Program *program, std::vector<IDSet>& live_out_per_block);
|
||||
void ssa_elimination(Program* program);
|
||||
void lower_to_hw_instr(Program* program);
|
||||
void schedule_program(Program* program, live& live_vars);
|
||||
|
@@ -91,18 +91,18 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
||||
|
||||
register_demand.resize(block->instructions.size());
|
||||
block->register_demand = RegisterDemand();
|
||||
TempSet live = lives.live_out[block->index];
|
||||
IDSet live = lives.live_out[block->index];
|
||||
|
||||
/* add the live_out_exec to live */
|
||||
bool exec_live = false;
|
||||
if (block->live_out_exec != Temp()) {
|
||||
live.insert(block->live_out_exec);
|
||||
live.insert(block->live_out_exec.id());
|
||||
exec_live = true;
|
||||
}
|
||||
|
||||
/* initialize register demand */
|
||||
for (Temp t : live)
|
||||
new_demand += t;
|
||||
for (unsigned t : live)
|
||||
new_demand += Temp(t, program->temp_rc[t]);
|
||||
new_demand.sgpr -= phi_sgpr_ops[block->index];
|
||||
|
||||
/* traverse the instructions backwards */
|
||||
@@ -126,7 +126,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
||||
program->needs_vcc = true;
|
||||
|
||||
const Temp temp = definition.getTemp();
|
||||
const size_t n = live.erase(temp);
|
||||
const size_t n = live.erase(temp.id());
|
||||
|
||||
if (n) {
|
||||
new_demand -= temp;
|
||||
@@ -158,7 +158,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
||||
if (operand.isFixed() && operand.physReg() == vcc)
|
||||
program->needs_vcc = true;
|
||||
const Temp temp = operand.getTemp();
|
||||
const bool inserted = live.insert(temp).second;
|
||||
const bool inserted = live.insert(temp.id()).second;
|
||||
if (inserted) {
|
||||
operand.setFirstKill(true);
|
||||
for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
|
||||
@@ -198,7 +198,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
||||
if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
|
||||
program->needs_vcc = true;
|
||||
const Temp temp = definition.getTemp();
|
||||
const size_t n = live.erase(temp);
|
||||
const size_t n = live.erase(temp.id());
|
||||
|
||||
if (n)
|
||||
definition.setKill(false);
|
||||
@@ -209,12 +209,13 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
||||
}
|
||||
|
||||
/* now, we need to merge the live-ins into the live-out sets */
|
||||
for (Temp t : live) {
|
||||
std::vector<unsigned>& preds = t.is_linear() ? block->linear_preds : block->logical_preds;
|
||||
for (unsigned t : live) {
|
||||
RegClass rc = program->temp_rc[t];
|
||||
std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (preds.empty())
|
||||
aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t.id(), block->index);
|
||||
aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, block->index);
|
||||
#endif
|
||||
|
||||
for (unsigned pred_idx : preds) {
|
||||
@@ -240,7 +241,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
||||
if (operand.isFixed() && operand.physReg() == vcc)
|
||||
program->needs_vcc = true;
|
||||
/* check if we changed an already processed block */
|
||||
const bool inserted = lives.live_out[preds[i]].insert(operand.getTemp()).second;
|
||||
const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second;
|
||||
if (inserted) {
|
||||
operand.setKill(true);
|
||||
worklist.insert(preds[i]);
|
||||
|
@@ -88,7 +88,7 @@ bool collect_phi_info(cssa_ctx& ctx)
|
||||
ctx.program->blocks[pred].logical_idom :
|
||||
ctx.program->blocks[pred].linear_idom;
|
||||
} while (def_points[i] != pred &&
|
||||
ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end());
|
||||
ctx.live_vars.live_out[pred].count(op.tempId()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ bool collect_phi_info(cssa_ctx& ctx)
|
||||
} else if (def_points[i] == block.index) {
|
||||
interferes = true;
|
||||
/* operand might interfere with phi-def */
|
||||
} else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].getTemp())) {
|
||||
} else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].tempId())) {
|
||||
interferes = true;
|
||||
/* else check for interferences with other operands */
|
||||
} else {
|
||||
|
@@ -1701,7 +1701,7 @@ void try_remove_trivial_phi(ra_ctx& ctx, Temp temp)
|
||||
} /* end namespace */
|
||||
|
||||
|
||||
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block)
|
||||
void register_allocation(Program *program, std::vector<IDSet>& live_out_per_block)
|
||||
{
|
||||
ra_ctx ctx(program);
|
||||
std::vector<std::vector<Temp>> phi_ressources;
|
||||
@@ -1711,14 +1711,14 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
|
||||
Block& block = *it;
|
||||
|
||||
/* first, compute the death points of all live vars within the block */
|
||||
TempSet& live = live_out_per_block[block.index];
|
||||
IDSet& live = live_out_per_block[block.index];
|
||||
|
||||
std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
|
||||
for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
|
||||
aco_ptr<Instruction>& instr = *rit;
|
||||
if (is_phi(instr)) {
|
||||
if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) {
|
||||
live.erase(instr->definitions[0].getTemp());
|
||||
live.erase(instr->definitions[0].tempId());
|
||||
continue;
|
||||
}
|
||||
/* collect information about affinity-related temporaries */
|
||||
@@ -1748,7 +1748,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
|
||||
/* add operands to live variables */
|
||||
for (const Operand& op : instr->operands) {
|
||||
if (op.isTemp())
|
||||
live.emplace(op.getTemp());
|
||||
live.insert(op.tempId());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1757,7 +1757,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
|
||||
const Definition& def = instr->definitions[i];
|
||||
if (!def.isTemp())
|
||||
continue;
|
||||
live.erase(def.getTemp());
|
||||
live.erase(def.tempId());
|
||||
/* mark last-seen phi operand */
|
||||
std::unordered_map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
|
||||
if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
|
||||
@@ -1793,14 +1793,14 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
|
||||
std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
|
||||
|
||||
for (Block& block : program->blocks) {
|
||||
TempSet& live = live_out_per_block[block.index];
|
||||
IDSet& live = live_out_per_block[block.index];
|
||||
/* initialize register file */
|
||||
assert(block.index != 0 || live.empty());
|
||||
RegisterFile register_file;
|
||||
ctx.war_hint.reset();
|
||||
|
||||
for (Temp t : live) {
|
||||
Temp renamed = handle_live_in(ctx, t, &block);
|
||||
for (unsigned t : live) {
|
||||
Temp renamed = handle_live_in(ctx, Temp(t, program->temp_rc[t]), &block);
|
||||
assignment& var = ctx.assignments[renamed.id()];
|
||||
/* due to live-range splits, the live-in might be a phi, now */
|
||||
if (var.assigned)
|
||||
@@ -1953,7 +1953,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
|
||||
register_file.fill(definition);
|
||||
ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
|
||||
}
|
||||
live.emplace(definition.getTemp());
|
||||
live.insert(definition.tempId());
|
||||
|
||||
/* update phi affinities */
|
||||
for (const Operand& op : phi->operands) {
|
||||
@@ -2154,7 +2154,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
|
||||
|
||||
/* set live if it has a kill point */
|
||||
if (!definition.isKill())
|
||||
live.emplace(definition.getTemp());
|
||||
live.insert(definition.tempId());
|
||||
|
||||
ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
|
||||
register_file.fill(definition);
|
||||
@@ -2215,7 +2215,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
|
||||
|
||||
/* set live if it has a kill point */
|
||||
if (!definition->isKill())
|
||||
live.emplace(definition->getTemp());
|
||||
live.insert(definition->tempId());
|
||||
|
||||
ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()};
|
||||
register_file.fill(*definition);
|
||||
|
@@ -27,6 +27,7 @@
|
||||
|
||||
#include <cassert>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
|
||||
namespace aco {
|
||||
|
||||
@@ -234,6 +235,151 @@ private:
|
||||
size_type length{ 0 }; //!> Size of the span
|
||||
};
|
||||
|
||||
/*
 * Cache-friendly set of 32-bit IDs with O(1) erase/lookup, O(1) insert
 * (apart from growing the underlying storage) and the ability to
 * efficiently iterate over contained elements.
 *
 * Internally implemented as a bit vector: If the set contains an ID, the
 * corresponding bit is set. It doesn't use std::vector<bool> since we then
 * couldn't efficiently iterate over the elements.
 *
 * The interface resembles a subset of std::set/std::unordered_set.
 */
struct IDSet {
   /*
    * Forward iterator yielding the contained IDs in increasing order.
    *
    * The position is stored as a (word, bit) pair which aliases a single
    * 32-bit value, so two iterators of the same set can be compared with
    * one integer comparison. The end iterator is (words.size(), 0).
    */
   struct Iterator {
      const IDSet *set;
      union {
         struct {
            uint32_t bit:6;   /* bit index inside words[word] */
            uint32_t word:26; /* index into set->words */
         };
         uint32_t id;
      };

      Iterator& operator ++();

      bool operator != (const Iterator& other) const;

      uint32_t operator * () const;
   };

   /* Returns 1 if the ID is contained in the set, 0 otherwise. */
   size_t count(uint32_t id) const {
      if (id >= words.size() * 64)
         return 0;

      return words[id / 64u] & (1ull << (id % 64u)) ? 1 : 0;
   }

   /* Returns an iterator to the ID, or end() if it isn't contained. */
   Iterator find(uint32_t id) const {
      if (!count(id))
         return end();

      Iterator it;
      it.set = this;
      it.bit = id % 64u;
      it.word = id / 64u;
      return it;
   }

   /*
    * Adds the ID to the set, growing the bit vector if necessary.
    * Returns an iterator to the ID and whether it was newly inserted.
    */
   std::pair<Iterator, bool> insert(uint32_t id) {
      /* "id / 64 + 1" words are needed to hold bit "id". Computing it this
       * way (instead of rounding up "id + 1") can't wrap around for IDs
       * close to UINT32_MAX.
       */
      if (words.size() * 64u <= id)
         words.resize(id / 64u + 1);

      Iterator it;
      it.set = this;
      it.bit = id % 64u;
      it.word = id / 64u;

      uint64_t mask = 1ull << it.bit;
      if (words[it.word] & mask)
         return std::make_pair(it, false);

      words[it.word] |= mask;
      bits_set++;
      return std::make_pair(it, true);
   }

   /* Removes the ID from the set. Returns the number of removed IDs (0 or 1). */
   size_t erase(uint32_t id) {
      if (!count(id))
         return 0;

      /* the bit is known to be set, so xor clears it */
      words[id / 64u] ^= 1ull << (id % 64u);
      bits_set--;
      return 1;
   }

   /* Returns an iterator to the smallest contained ID, or end() if empty. */
   Iterator cbegin() const {
      Iterator it;
      it.set = this;
      for (size_t i = 0; i < words.size(); i++) {
         if (words[i]) {
            it.word = i;
            it.bit = ffsll(words[i]) - 1;
            return it;
         }
      }
      return end();
   }

   /* Returns the past-the-end iterator: (words.size(), 0). */
   Iterator cend() const {
      Iterator it;
      it.set = this;
      it.word = words.size();
      it.bit = 0;
      return it;
   }

   Iterator begin() const {
      return cbegin();
   }

   Iterator end() const {
      return cend();
   }

   bool empty() const {
      return bits_set == 0;
   }

   size_t size() const {
      return bits_set;
   }

   std::vector<uint64_t> words;
   /* Number of set bits in "words". Explicitly initialized so that a
    * default-initialized IDSet (e.g. a local "IDSet s;") is a valid
    * empty set.
    */
   uint32_t bits_set = 0;
};

/* Advances to the next contained ID, or to end() if there is none. */
inline IDSet::Iterator& IDSet::Iterator::operator ++() {
   /* don't continue past the end; this has to be checked before indexing
    * into words, since the end iterator's word is words.size()
    */
   if (word >= set->words.size())
      return *this;

   /* mask out the current bit and all bits below it */
   uint64_t m = set->words[word];
   m &= ~((2ull << bit) - 1ull);
   if (m) {
      bit = ffsll(m) - 1;
      return *this;
   }

   /* nothing left in this word: look for the next non-empty word */
   for (word++; word < set->words.size(); word++) {
      if (set->words[word]) {
         bit = ffsll(set->words[word]) - 1;
         return *this;
      }
   }

   /* no further elements: word == words.size() now, so this is end() */
   bit = 0;
   return *this;
}

/* Compares the positions of two iterators of the same set. */
inline bool IDSet::Iterator::operator != (const IDSet::Iterator& other) const {
   assert(set == other.set);
   return id != other.id;
}

/* Returns the ID the iterator points at. */
inline uint32_t IDSet::Iterator::operator * () const {
   return (word << 6) | bit;
}
|
||||
|
||||
} // namespace aco
|
||||
|
||||
#endif // ACO_UTIL_H
|
||||
|
@@ -739,7 +739,8 @@ bool validate_ra(Program *program, const struct radv_nir_compiler_options *optio
|
||||
regs.fill(0);
|
||||
|
||||
std::set<Temp> live;
|
||||
live.insert(live_vars.live_out[block.index].begin(), live_vars.live_out[block.index].end());
|
||||
for (unsigned id : live_vars.live_out[block.index])
|
||||
live.insert(Temp(id, program->temp_rc[id]));
|
||||
/* remove killed p_phi sgpr operands */
|
||||
for (Temp tmp : phi_sgpr_ops[block.index])
|
||||
live.erase(tmp);
|
||||
|
Reference in New Issue
Block a user