r300/compiler: Handle loops in deadcode analysis.

This also allows us to split the loop emulation into two phases: a
transformation phase, which either unrolls loops or prepares them to be
emulated, and an emulation phase, which unrolls the remaining loops until the
instruction limit is reached.  The second phase is completed after the
deadcode analysis in order to get a more accurate count of the number of
instructions in the body of loops.
This commit is contained in:
Tom Stellard
2010-06-11 23:09:36 -07:00
committed by Marek Olšák
parent 91c37599f6
commit 697d666d78
5 changed files with 111 additions and 76 deletions

View File

@@ -97,6 +97,8 @@ static void debug_program_log(struct r300_fragment_program_compiler* c, const ch
void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
{
struct emulate_loop_state loop_state;
rewrite_depth_out(c);
debug_program_log(c, "before compilation");
@@ -104,14 +106,11 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
/* XXX Ideally this should be done only for r3xx, but since
* we don't have branching support for r5xx, we use the emulation
* on all chipsets. */
if (c->Base.is_r500) {
rc_emulate_loops(&c->Base, R500_PFS_MAX_INST);
} else {
rc_emulate_loops(&c->Base, R300_PFS_MAX_ALU_INST);
}
debug_program_log(c, "after emulate loops");
rc_transform_unroll_loops(&c->Base, &loop_state);
debug_program_log(c, "after transform loops");
rc_emulate_branches(&c->Base);
debug_program_log(c, "after emulate branches");
@@ -161,6 +160,15 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
debug_program_log(c, "after deadcode");
if(c->Base.is_r500){
rc_emulate_loops(&loop_state, R500_PFS_MAX_INST);
}
else{
rc_emulate_loops(&loop_state, R300_PFS_MAX_ALU_INST);
}
debug_program_log(c, "after emulate looops");
rc_optimize(&c->Base);
debug_program_log(c, "after dataflow optimize");

View File

@@ -593,6 +593,8 @@ static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
{
struct emulate_loop_state loop_state;
compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
addArtificialOutputs(compiler);
@@ -602,10 +604,14 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
/* XXX Ideally this should be done only for r3xx, but since
* we don't have branching support for r5xx, we use the emulation
* on all chipsets. */
rc_transform_unroll_loops(&compiler->Base, &loop_state);
debug_program_log(compiler, "after transform loops");
if (compiler->Base.is_r500){
rc_emulate_loops(&compiler->Base, R500_VS_MAX_ALU);
rc_emulate_loops(&loop_state, R500_VS_MAX_ALU);
} else {
rc_emulate_loops(&compiler->Base, R300_VS_MAX_ALU);
rc_emulate_loops(&loop_state, R300_VS_MAX_ALU);
}
debug_program_log(compiler, "after emulate loops");

View File

@@ -202,32 +202,61 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f
inst = inst->Prev) {
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
if (opcode->IsFlowControl) {
if (opcode->Opcode == RC_OPCODE_ENDIF) {
push_branch(&s);
} else {
if (s.BranchStackSize) {
struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1];
switch(opcode->Opcode){
/* Mark all sources in the loop body as used before doing
* normal deadcode analysis. This is probably not optimal.
*/
case RC_OPCODE_ENDLOOP:
{
int endloops = 1;
struct rc_instruction *ptr;
for(ptr = inst->Prev; endloops > 0; ptr = ptr->Prev){
opcode = rc_get_opcode_info(ptr->U.I.Opcode);
if(ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){
endloops--;
continue;
}
if(ptr->U.I.Opcode == RC_OPCODE_ENDLOOP){
endloops++;
continue;
}
if(opcode->HasDstReg){
int src = 0;
unsigned int srcmasks[3];
rc_compute_sources_for_writemask(ptr,
ptr->U.I.DstReg.WriteMask, srcmasks);
for(src=0; src < opcode->NumSrcRegs; src++){
mark_used(&s,
ptr->U.I.SrcReg[src].File,
ptr->U.I.SrcReg[src].Index,
srcmasks[src]);
}
}
}
break;
}
case RC_OPCODE_ENDIF:
push_branch(&s);
break;
default:
if (opcode->IsFlowControl && s.BranchStackSize) {
struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1];
if (opcode->Opcode == RC_OPCODE_IF) {
or_updatemasks(&s.R,
&s.R,
branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif);
if (opcode->Opcode == RC_OPCODE_IF) {
or_updatemasks(&s.R,
&s.R,
branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif);
s.BranchStackSize--;
} else if (opcode->Opcode == RC_OPCODE_ELSE) {
if (branch->HaveElse) {
rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__);
} else {
memcpy(&branch->StoreElse, &s.R, sizeof(s.R));
memcpy(&s.R, &branch->StoreEndif, sizeof(s.R));
branch->HaveElse = 1;
}
s.BranchStackSize--;
} else if (opcode->Opcode == RC_OPCODE_ELSE) {
if (branch->HaveElse) {
rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__);
} else {
rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name);
memcpy(&branch->StoreElse, &s.R, sizeof(s.R));
memcpy(&s.R, &branch->StoreEndif, sizeof(s.R));
branch->HaveElse = 1;
}
} else {
rc_error(c, "%s: Unexpected control flow instruction\n", __FUNCTION__);
rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name);
}
}
}

View File

@@ -38,22 +38,6 @@
#define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
/* State carried through the loop transformation and loop emulation steps. */
struct emulate_loop_state {
/* Compiler whose instruction list (C->Program.Instructions) is processed. */
struct radeon_compiler * C;
/* Array of loop records; the emulation step walks it by index. */
struct loop_info * Loops;
/* NOTE(review): presumably the number of valid entries in Loops - confirm. */
unsigned int LoopCount;
/* NOTE(review): presumably the allocated capacity of Loops - confirm. */
unsigned int LoopReserved;
};
/* Pointers to the instructions that delimit and control one loop. */
struct loop_info {
/* Start of the loop; transform_loop() resumes at BeginLoop->Next after a
 * constant-count unroll. */
struct rc_instruction * BeginLoop;
/* Loop condition instruction; its SrcReg[0]/SrcReg[1] are inspected to find
 * an immediate limit and the loop counter. */
struct rc_instruction * Cond;
/* NOTE(review): If, Brk and EndIf presumably point at the IF / BRK / ENDIF
 * instructions forming the loop exit test; their use is not visible here. */
struct rc_instruction * If;
struct rc_instruction * Brk;
struct rc_instruction * EndIf;
/* End of the loop; transform_loop() returns this when the loop was not
 * unrolled as a constant loop. */
struct rc_instruction * EndLoop;
};
struct const_value {
struct radeon_compiler * C;
@@ -214,8 +198,7 @@ static void get_incr_amount(void * data, struct rc_instruction * inst,
}
static int transform_const_loop(struct emulate_loop_state * s,
struct loop_info * loop,
struct rc_instruction * cond)
struct loop_info * loop)
{
int end_loops = 1;
int iterations;
@@ -228,13 +211,13 @@ static int transform_const_loop(struct emulate_loop_state * s,
/* Find the counter and the upper limit */
if(src_reg_is_immediate(&cond->U.I.SrcReg[0], s->C)){
limit = &cond->U.I.SrcReg[0];
counter = &cond->U.I.SrcReg[1];
if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], s->C)){
limit = &loop->Cond->U.I.SrcReg[0];
counter = &loop->Cond->U.I.SrcReg[1];
}
else if(src_reg_is_immediate(&cond->U.I.SrcReg[1], s->C)){
limit = &cond->U.I.SrcReg[1];
counter = &cond->U.I.SrcReg[0];
else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], s->C)){
limit = &loop->Cond->U.I.SrcReg[1];
counter = &loop->Cond->U.I.SrcReg[0];
}
else{
DBG("No constant limit.\n");
@@ -414,7 +397,7 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
}
/* Check if the number of loops is known at compile time. */
if(transform_const_loop(s, loop, ptr)){
if(transform_const_loop(s, loop)){
return loop->BeginLoop->Next;
}
@@ -425,9 +408,14 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
return loop->EndLoop;
}
static void rc_transform_loops(struct emulate_loop_state * s)
void rc_transform_unroll_loops(struct radeon_compiler *c,
struct emulate_loop_state * s)
{
struct rc_instruction * ptr = s->C->Program.Instructions.Next;
struct rc_instruction * ptr;
memset(s, 0, sizeof(struct emulate_loop_state));
s->C = c;
ptr = s->C->Program.Instructions.Next;
while(ptr != &s->C->Program.Instructions) {
if(ptr->Type == RC_INSTRUCTION_NORMAL &&
ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){
@@ -440,7 +428,7 @@ static void rc_transform_loops(struct emulate_loop_state * s)
}
}
static void rc_unroll_loops(struct emulate_loop_state *s,
void rc_emulate_loops(struct emulate_loop_state *s,
unsigned int max_instructions)
{
int i;
@@ -456,19 +444,3 @@ static void rc_unroll_loops(struct emulate_loop_state *s,
loop_unroll(s, &s->Loops[i], iterations);
}
}
void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions)
{
struct emulate_loop_state s;
memset(&s, 0, sizeof(struct emulate_loop_state));
s.C = c;
/* We may need to move these two operations to r3xx_(vert|frag)prog.c
* and run the optimization passes between them in order to increase
* the number of unrolls we can do for each loop.
*/
rc_transform_loops(&s);
rc_unroll_loops(&s, max_instructions);
}

View File

@@ -7,6 +7,26 @@
struct radeon_compiler;
void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions);
/* Pointers to the instructions that delimit and control one loop. */
struct loop_info {
/* Start of the loop; transform_loop() resumes at BeginLoop->Next after a
 * constant-count unroll. */
struct rc_instruction * BeginLoop;
/* Loop condition instruction; its SrcReg[0]/SrcReg[1] are inspected to find
 * an immediate limit and the loop counter. */
struct rc_instruction * Cond;
/* NOTE(review): If, Brk and EndIf presumably point at the IF / BRK / ENDIF
 * instructions forming the loop exit test; their use is not visible here. */
struct rc_instruction * If;
struct rc_instruction * Brk;
struct rc_instruction * EndIf;
/* End of the loop; transform_loop() returns this when the loop was not
 * unrolled as a constant loop. */
struct rc_instruction * EndLoop;
};
/* State shared between rc_transform_unroll_loops() and rc_emulate_loops();
 * the caller owns the storage and passes the same object to both. */
struct emulate_loop_state {
/* Compiler whose instruction list (C->Program.Instructions) is processed. */
struct radeon_compiler * C;
/* Array of loop records; rc_emulate_loops() walks it by index. */
struct loop_info * Loops;
/* NOTE(review): presumably the number of valid entries in Loops - confirm. */
unsigned int LoopCount;
/* NOTE(review): presumably the allocated capacity of Loops - confirm. */
unsigned int LoopReserved;
};
/* Transformation phase: zeroes *s, records c in s->C, then scans c's
 * instruction list for BGNLOOP instructions.  Each loop found is either
 * unrolled or prepared to be emulated later. */
void rc_transform_unroll_loops(struct radeon_compiler *c,
struct emulate_loop_state * s);
/* Emulation phase: unrolls the loops recorded in s->Loops, with
 * max_instructions as the instruction limit.  s must have been initialized by
 * rc_transform_unroll_loops(). */
void rc_emulate_loops(struct emulate_loop_state *s,
unsigned int max_instructions);
#endif /* RADEON_EMULATE_LOOPS_H */