Replace on-trace GC frame syncing with interpreter exit.

GC objects need to be synced to the Lua stack only during the atomic GC phase.
A proper frame structure needs to be set up only for calling finalizers.
Force an exit to the interpreter and let it handle these uncommon cases.
Finally solves the "NYI: gcstep sync with frames" issue.
Mike Pall 2010-04-18 13:41:30 +02:00
parent ff82df797a
commit 932cda0fe3
11 changed files with 1887 additions and 1908 deletions
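
For orientation before the hunks that follow: a schematic C sketch of the control flow this commit establishes, condensed from the changes below. The helper name exit_to_interpreter() is illustrative only, and nsteps stands for the compile-time constant as->gcsteps; everything else is taken from the diff.

/* Trace side: shape of the code emitted by asm_gc_check() (schematic). */
if (g->gc.total >= g->gc.threshold) {      /* Slow path, off the hot path. */
  if (lj_gc_step_jit(L, nsteps))           /* Nonzero in GCSatomic/GCSfinalize. */
    exit_to_interpreter();                 /* Guard fails, trace exits. */
}

/* Interpreter side: lj_trace_exit() after the exit (schematic). */
if (G(L)->gc.state == GCSatomic || G(L)->gc.state == GCSfinalize)
  lj_gc_step(L);          /* Exited because of GC: drive the GC forward. */
else
  trace_hotside(J, pc);   /* Normal exit: count towards a side trace. */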

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -2613,6 +2613,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
| mov L:RB, [DISPATCH+DISPATCH_GL(jit_L)]
| mov BASE, [DISPATCH+DISPATCH_GL(jit_base)]
| mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
| mov dword [DISPATCH+DISPATCH_GL(jit_L)], 0
| mov L:RB->base, BASE
|.if X64WIN
| lea CARG2, [rsp+4*8]

File diff suppressed because it is too large.

@@ -2752,67 +2752,32 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
/* -- GC handling --------------------------------------------------------- */
/* Sync all live GC values to Lua stack slots. */
static void asm_gc_sync(ASMState *as, SnapShot *snap, Reg base)
{
/* Some care must be taken when allocating registers here, since this is
** not part of the fast path. All scratch registers are evicted in the
** fast path, so it's easiest to force allocation from scratch registers
** only. This avoids register allocation state unification.
*/
RegSet allow = rset_exclude(RSET_SCRATCH & RSET_GPR, base);
SnapEntry *map = &as->T->snapmap[snap->mapofs];
MSize n, nent = snap->nent;
for (n = 0; n < nent; n++) {
SnapEntry sn = map[n];
IRRef ref = snap_ref(sn);
/* NYI: sync the frame, bump base, set topslot, clear new slots. */
if ((sn & (SNAP_CONT|SNAP_FRAME)))
lj_trace_err(as->J, LJ_TRERR_NYIGCF);
if (!irref_isk(ref)) {
IRIns *ir = IR(ref);
if (irt_isgcv(ir->t)) {
int32_t ofs = 8*(int32_t)(snap_slot(sn)-1);
Reg src = ra_alloc1(as, ref, allow);
emit_movtomro(as, src, base, ofs);
emit_movmroi(as, base, ofs+4, irt_toitype(ir->t));
checkmclim(as);
}
}
}
}
/* Check GC threshold and do one or more GC steps. */
static void asm_gc_check(ASMState *as, SnapShot *snap)
static void asm_gc_check(ASMState *as)
{
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
IRRef args[2];
MCLabel l_end;
Reg base, lstate, tmp;
RegSet drop = RSET_SCRATCH;
if (ra_hasreg(IR(REF_BASE)->r)) /* Stack may be reallocated by the GC. */
drop |= RID2RSET(IR(REF_BASE)->r); /* Need to evict BASE, too. */
ra_evictset(as, drop);
ra_evictset(as, RSET_SCRATCH);
l_end = emit_label(as);
/* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
asm_guardcc(as, CC_NE); /* Assumes asm_snap_prep() already done. */
emit_rr(as, XO_TEST, RID_RET, RID_RET);
args[0] = ASMREF_L;
args[1] = ASMREF_TMP1;
asm_gencall(as, ci, args);
tmp = ra_releasetmp(as, ASMREF_TMP1);
emit_loadi(as, tmp, (int32_t)as->gcsteps);
/* We don't know spadj yet, so get the C frame from L->cframe. */
emit_movmroi(as, tmp, CFRAME_OFS_PC,
(int32_t)as->T->snapmap[snap->mapofs+snap->nent]);
emit_gri(as, XG_ARITHi(XOg_AND), tmp|REX_64, CFRAME_RAWMASK);
lstate = IR(ASMREF_L)->r;
emit_rmro(as, XO_MOV, tmp|REX_64, lstate, offsetof(lua_State, cframe));
/* It's ok if lstate is already in a non-scratch reg. But all allocations
** in the non-fast path must use a scratch reg. See comment above.
** in the non-fast path must use a scratch reg (avoids unification).
*/
lstate = IR(ASMREF_L)->r;
base = ra_alloc1(as, REF_BASE, rset_exclude(RSET_SCRATCH & RSET_GPR, lstate));
emit_movtomro(as, base|REX_64, lstate, offsetof(lua_State, base));
asm_gc_sync(as, snap, base);
/* BASE/L get restored anyway, better do it inside the slow path. */
if (as->parent || as->curins == as->loopref) ra_restore(as, REF_BASE);
if (rset_test(RSET_SCRATCH, base) && (as->parent || as->snapno != 0))
ra_restore(as, REF_BASE);
if (rset_test(RSET_SCRATCH, lstate) && ra_hasreg(IR(ASMREF_L)->r))
ra_restore(as, ASMREF_L);
/* Jump around GC step if GC total < GC threshold. */
@@ -3034,7 +2999,7 @@ static void asm_loop(ASMState *as)
/* LOOP is a guard, so the snapno is up to date. */
as->loopsnapno = as->snapno;
if (as->gcsteps)
asm_gc_check(as, &as->T->snap[as->loopsnapno]);
asm_gc_check(as);
/* LOOP marks the transition from the variant to the invariant part. */
as->testmcp = as->invmcp = NULL;
as->sectref = 0;
@@ -3126,7 +3091,7 @@ static void asm_head_side(ASMState *as)
allow = asm_head_side_base(as, pbase, allow);
/* Scan all parent SLOADs and collect register dependencies. */
for (i = as->curins; i > REF_BASE; i--) {
for (i = as->stopins; i > REF_BASE; i--) {
IRIns *ir = IR(i);
RegSP rs;
lua_assert(ir->o == IR_SLOAD && (ir->op2 & IRSLOAD_PARENT));
@@ -3161,7 +3126,7 @@ static void asm_head_side(ASMState *as)
/* Reload spilled target registers. */
if (pass2) {
for (i = as->curins; i > REF_BASE; i--) {
for (i = as->stopins; i > REF_BASE; i--) {
IRIns *ir = IR(i);
if (irt_ismarked(ir->t)) {
RegSet mask;
@@ -3686,8 +3651,11 @@ void lj_asm_trace(jit_State *J, Trace *T)
RA_DBG_REF();
checkmclim(as);
if (as->gcsteps)
asm_gc_check(as, &as->T->snap[0]);
if (as->gcsteps) {
as->curins = as->T->snap[0].ref;
asm_snap_prep(as); /* The GC check is a guard. */
asm_gc_check(as);
}
ra_evictk(as);
if (as->parent)
asm_head_side(as);

@@ -442,6 +442,7 @@ static void gc_finalize(lua_State *L)
GCobj *o = gcnext(gcref(g->gc.mmudata));
GCudata *ud = gco2ud(o);
cTValue *mo;
lua_assert(gcref(g->jit_L) == NULL); /* Must not be called on trace. */
/* Unchain from list of userdata to be finalized. */
if (o == gcref(g->gc.mmudata))
setgcrefnull(g->gc.mmudata);
@@ -457,16 +458,8 @@ static void gc_finalize(lua_State *L)
/* Save and restore lots of state around the __gc callback. */
uint8_t oldh = hook_save(g);
MSize oldt = g->gc.threshold;
GCobj *oldjl = gcref(g->jit_L);
MSize oldjs = 0;
ptrdiff_t oldjb = 0;
int errcode;
TValue *top;
if (oldjl) {
oldjs = gco2th(oldjl)->stacksize;
oldjb = savestack(gco2th(oldjl), mref(g->jit_base, TValue ));
setgcrefnull(g->jit_L);
}
lj_trace_abort(g);
top = L->top;
L->top = top+2;
@@ -477,12 +470,6 @@ static void gc_finalize(lua_State *L)
errcode = lj_vm_pcall(L, top+1, 1+0, -1); /* Stack: |mo|ud| -> | */
hook_restore(g, oldh);
g->gc.threshold = oldt; /* Restore GC threshold. */
if (oldjl) {
if (gco2th(oldjl)->stacksize < oldjs)
lj_state_growstack(gco2th(oldjl), oldjs - gco2th(oldjl)->stacksize);
setgcref(g->jit_L, oldjl);
setmref(g->jit_base, restorestack(gco2th(oldjl), oldjb));
}
if (errcode)
lj_err_throw(L, errcode); /* Propagate errors. */
}
@@ -514,7 +501,6 @@ static void atomic(global_State *g, lua_State *L)
{
size_t udsize;
g->gc.state = GCSatomic;
gc_mark_uv(g); /* Need to remark open upvalues (the thread may be dead). */
gc_propagate_gray(g); /* Propagate any left-overs. */
@@ -539,9 +525,7 @@
/* Prepare for sweep phase. */
g->gc.currentwhite = cast_byte(otherwhite(g)); /* Flip current white. */
g->gc.sweepstr = 0;
setmref(g->gc.sweep, &g->gc.root);
g->gc.state = GCSsweepstring;
g->gc.estimate = g->gc.total - (MSize)udsize; /* Initial estimate. */
}
@@ -556,7 +540,14 @@ static size_t gc_onestep(lua_State *L)
case GCSpropagate:
if (gcref(g->gc.gray) != NULL)
return propagatemark(g); /* Propagate one gray object. */
atomic(g, L); /* End of mark phase. */
g->gc.state = GCSatomic; /* End of mark phase. */
return 0;
case GCSatomic:
if (gcref(g->jit_L)) /* Don't run atomic phase on trace. */
return LJ_MAX_MEM;
atomic(g, L);
g->gc.state = GCSsweepstring; /* Start of sweep phase. */
g->gc.sweepstr = 0;
return 0;
case GCSsweepstring: {
MSize old = g->gc.total;
@@ -572,7 +563,12 @@ static size_t gc_onestep(lua_State *L)
setmref(g->gc.sweep, gc_sweep(g, mref(g->gc.sweep, GCRef), GCSWEEPMAX));
if (gcref(*mref(g->gc.sweep, GCRef)) == NULL) {
gc_shrink(g, L);
g->gc.state = GCSfinalize; /* End of sweep phase. */
if (gcref(g->gc.mmudata)) { /* Need any finalizations? */
g->gc.state = GCSfinalize;
} else { /* Otherwise skip this phase to help the JIT. */
g->gc.state = GCSpause; /* End of GC cycle. */
g->gc.debt = 0;
}
}
lua_assert(old >= g->gc.total);
g->gc.estimate -= old - g->gc.total;
@@ -580,6 +576,8 @@ static size_t gc_onestep(lua_State *L)
}
case GCSfinalize:
if (gcref(g->gc.mmudata) != NULL) {
if (gcref(g->jit_L)) /* Don't call finalizers on trace. */
return LJ_MAX_MEM;
gc_finalize(L); /* Finalize one userdata object. */
if (g->gc.estimate > GCFINALIZECOST)
g->gc.estimate -= GCFINALIZECOST;
@@ -633,11 +631,13 @@ void LJ_FASTCALL lj_gc_step_fixtop(lua_State *L)
#if LJ_HASJIT
/* Perform multiple GC steps. Called from JIT-compiled code. */
void LJ_FASTCALL lj_gc_step_jit(lua_State *L, MSize steps)
int LJ_FASTCALL lj_gc_step_jit(lua_State *L, MSize steps)
{
L->top = curr_topL(L);
while (steps-- > 0 && lj_gc_step(L) == 0)
;
/* Return 1 to force a trace exit. */
return (G(L)->gc.state == GCSatomic || G(L)->gc.state == GCSfinalize);
}
#endif
@@ -647,23 +647,20 @@ void lj_gc_fullgc(lua_State *L)
global_State *g = G(L);
int32_t ostate = g->vmstate;
setvmstate(g, GC);
if (g->gc.state <= GCSpropagate) { /* Caught somewhere in the middle. */
g->gc.sweepstr = 0;
if (g->gc.state <= GCSatomic) { /* Caught somewhere in the middle. */
setmref(g->gc.sweep, &g->gc.root); /* Sweep everything (preserving it). */
setgcrefnull(g->gc.gray); /* Reset lists from partial propagation. */
setgcrefnull(g->gc.grayagain);
setgcrefnull(g->gc.weak);
g->gc.state = GCSsweepstring; /* Fast forward to the sweep phase. */
g->gc.sweepstr = 0;
}
lua_assert(g->gc.state != GCSpause && g->gc.state != GCSpropagate);
while (g->gc.state != GCSfinalize) { /* Finish sweep. */
lua_assert(g->gc.state == GCSsweepstring || g->gc.state == GCSsweep);
gc_onestep(L);
}
while (g->gc.state == GCSsweepstring || g->gc.state == GCSsweep)
gc_onestep(L); /* Finish sweep. */
lua_assert(g->gc.state == GCSfinalize || g->gc.state == GCSpause);
/* Now perform a full GC. */
gc_mark_start(g);
while (g->gc.state != GCSpause)
gc_onestep(L);
g->gc.state = GCSpause;
do { gc_onestep(L); } while (g->gc.state != GCSpause);
g->gc.threshold = (g->gc.estimate/100) * g->gc.pause;
g->vmstate = ostate;
}
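For readability, the gc_onestep() phase gates added above, condensed into one schematic (not the literal function body; sweep bookkeeping and the estimate/debt updates are omitted):

switch (g->gc.state) {
case GCSpropagate:
  if (gcref(g->gc.gray) != NULL)
    return propagatemark(g);     /* Propagate one gray object. */
  g->gc.state = GCSatomic;       /* Mark phase done; atomic is now a separate, resumable state. */
  return 0;
case GCSatomic:
  if (gcref(g->jit_L))           /* On trace: refuse and force a trace exit. */
    return LJ_MAX_MEM;
  atomic(g, L);
  g->gc.state = GCSsweepstring;  /* Start of sweep phase. */
  return 0;
/* GCSsweepstring/GCSsweep proceed as before, except that GCSfinalize is
** skipped entirely when there is nothing to finalize. */
case GCSfinalize:
  if (gcref(g->gc.mmudata) != NULL) {
    if (gcref(g->jit_L))         /* On trace: refuse, finalizers need a proper frame. */
      return LJ_MAX_MEM;
    gc_finalize(L);              /* Finalize one userdata object. */
    return GCFINALIZECOST;
  }
  g->gc.state = GCSpause;        /* End of GC cycle. */
  return 0;
}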

@@ -47,7 +47,7 @@ LJ_FUNC void lj_gc_freeall(global_State *g);
LJ_FUNCA int LJ_FASTCALL lj_gc_step(lua_State *L);
LJ_FUNCA void LJ_FASTCALL lj_gc_step_fixtop(lua_State *L);
#if LJ_HASJIT
LJ_FUNC void LJ_FASTCALL lj_gc_step_jit(lua_State *L, MSize steps);
LJ_FUNC int LJ_FASTCALL lj_gc_step_jit(lua_State *L, MSize steps);
#endif
LJ_FUNC void lj_gc_fullgc(lua_State *L);

@@ -1228,7 +1228,7 @@ static void fs_init(LexState *ls, FuncState *fs)
fs->flags = 0;
fs->framesize = 2; /* Minimum frame size. */
fs->kt = lj_tab_new(L, 0, 0);
/* Anchor table of constants and prototype (to avoid being collected). */
/* Anchor table of constants in stack to avoid being collected. */
settabV(L, L->top, fs->kt);
incr_top(L);
}

@@ -1885,13 +1885,22 @@ static TRef rec_tnew(jit_State *J, uint32_t ah)
/* -- Record bytecode ops ------------------------------------------------- */
/* Optimize state after comparison. */
static void optstate_comp(jit_State *J, int cond)
/* Prepare for comparison. */
static void rec_comp_prep(jit_State *J)
{
/* Prevent merging with snapshot #0 (GC exit) since we fixup the PC. */
if (J->cur.nsnap == 1 && J->cur.snap[0].ref == J->cur.nins)
emitir_raw(IRT(IR_NOP, IRT_NIL), 0, 0);
lj_snap_add(J);
}
/* Fixup comparison. */
static void rec_comp_fixup(jit_State *J, int cond)
{
BCIns jmpins = J->pc[1];
const BCIns *npc = J->pc + 2 + (cond ? bc_j(jmpins) : 0);
SnapShot *snap = &J->cur.snap[J->cur.nsnap-1];
/* Avoid re-recording the comparison in side traces. */
/* Set PC to opposite target to avoid re-recording the comp. in side trace. */
J->cur.snapmap[snap->mapofs + snap->nent] = SNAP_MKPC(npc);
J->needsnap = 1;
/* Shrink last snapshot if possible. */
@@ -1987,7 +1996,7 @@ void lj_record_ins(jit_State *J)
break; /* Interpreter will throw for two different types. */
}
}
lj_snap_add(J);
rec_comp_prep(J);
irop = (int)op - (int)BC_ISLT + (int)IR_LT;
if (ta == IRT_NUM) {
if ((irop & 1)) irop ^= 4; /* ISGE/ISGT are unordered. */
@@ -2004,7 +2013,7 @@ void lj_record_ins(jit_State *J)
break;
}
emitir(IRTG(irop, ta), ra, rc);
optstate_comp(J, ((int)op ^ irop) & 1);
rec_comp_fixup(J, ((int)op ^ irop) & 1);
}
break;
@@ -2015,14 +2024,14 @@ void lj_record_ins(jit_State *J)
/* Emit nothing for two non-table, non-udata consts. */
if (!(tref_isk2(ra, rc) && !(tref_istab(ra) || tref_isudata(ra)))) {
int diff;
lj_snap_add(J);
rec_comp_prep(J);
diff = rec_objcmp(J, ra, rc, rav, rcv);
if (diff == 1 && (tref_istab(ra) || tref_isudata(ra))) {
/* Only check __eq if different, but the same type (table or udata). */
rec_mm_equal(J, &ix, (int)op);
break;
}
optstate_comp(J, ((int)op & 1) == !diff);
rec_comp_fixup(J, ((int)op & 1) == !diff);
}
break;
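One note on the rec_comp_prep()/rec_comp_fixup() split above, spelling out the reasoning hinted at in the comments (this relies on lj_snap_add() reusing the previous snapshot when no IR has been emitted since it was taken, which is not shown in this diff):

/* rec_comp_prep(), schematic: give the comparison a snapshot of its own. */
if (J->cur.nsnap == 1 && J->cur.snap[0].ref == J->cur.nins)
  emitir_raw(IRT(IR_NOP, IRT_NIL), 0, 0);   /* Keep it separate from snapshot #0. */
lj_snap_add(J);

/* rec_comp_fixup(), schematic: after emitting the guarded comparison,
** point the snapshot's PC at the opposite branch target, so a side
** trace resumes past the comparison instead of re-recording it.
*/
J->cur.snapmap[snap->mapofs + snap->nent] = SNAP_MKPC(npc);

Snapshot #0 matters here because it is now the snapshot the GC-check guard at the trace head exits through (see the lj_asm_trace() change above); rewriting its PC for the comparison would corrupt that exit.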

@@ -692,8 +692,12 @@ int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr)
);
pc = exd.pc;
trace_hotside(J, pc);
cf = cframe_raw(L->cframe);
setcframe_pc(cf, pc);
if (G(L)->gc.state == GCSatomic || G(L)->gc.state == GCSfinalize)
lj_gc_step(L); /* Exited because of GC: drive GC forward. */
else
trace_hotside(J, pc);
if (bc_op(*pc) == BC_JLOOP) {
BCIns *retpc = &J->trace[bc_d(*pc)]->startins;
if (bc_isret(bc_op(*retpc))) {
@@ -703,10 +707,10 @@ int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr)
*J->patchpc = *retpc;
} else {
pc = retpc;
setcframe_pc(cf, pc);
}
}
}
setcframe_pc(cf, pc);
/* Return MULTRES or 0. */
switch (bc_op(*pc)) {
case BC_CALLM: case BC_CALLMT:

@@ -49,7 +49,6 @@ TREDEF(BADRA, "inconsistent register allocation")
TREDEF(NYIIR, "NYI: cannot assemble IR instruction %d")
TREDEF(NYIPHI, "NYI: PHI shuffling too complex")
TREDEF(NYICOAL, "NYI: register coalescing too complex")
TREDEF(NYIGCF, "NYI: gcstep sync with frames")
#undef TREDEF