diff --git a/src/Makefile b/src/Makefile index 6e0c7463..13344a77 100644 --- a/src/Makefile +++ b/src/Makefile @@ -443,7 +443,7 @@ LJCORE_O= lj_gc.o lj_err.o lj_char.o lj_bc.o lj_obj.o \ lj_state.o lj_dispatch.o lj_vmevent.o lj_vmmath.o lj_api.o \ lj_lex.o lj_parse.o lj_bcread.o lj_bcwrite.o \ lj_ir.o lj_opt_mem.o lj_opt_fold.o lj_opt_narrow.o \ - lj_opt_dce.o lj_opt_loop.o lj_opt_split.o \ + lj_opt_dce.o lj_opt_loop.o lj_opt_split.o lj_opt_sink.o \ lj_mcode.o lj_snap.o lj_record.o lj_crecord.o lj_ffrecord.o \ lj_asm.o lj_trace.o lj_gdbjit.o \ lj_ctype.o lj_cdata.o lj_cconv.o lj_ccall.o lj_ccallback.o \ diff --git a/src/Makefile.dep b/src/Makefile.dep index ff4492fb..1c7e5dc0 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -142,6 +142,8 @@ lj_opt_mem.o: lj_opt_mem.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \ lj_arch.h lj_str.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \ lj_dispatch.h lj_traceerr.h lj_vm.h +lj_opt_sink.o: lj_opt_sink.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ + lj_ir.h lj_jit.h lj_iropt.h lj_target.h lj_target_*.h lj_opt_split.o: lj_opt_split.c lj_obj.h lua.h luaconf.h lj_def.h \ lj_arch.h lj_err.h lj_errmsg.h lj_str.h lj_ir.h lj_jit.h lj_ircall.h \ lj_iropt.h lj_vm.h @@ -153,8 +155,9 @@ lj_record.o: lj_record.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_ff.h lj_ffdef.h lj_ir.h lj_jit.h lj_ircall.h lj_iropt.h lj_trace.h \ lj_dispatch.h lj_traceerr.h lj_record.h lj_ffrecord.h lj_snap.h lj_vm.h lj_snap.o: lj_snap.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \ - lj_state.h lj_frame.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \ - lj_dispatch.h lj_traceerr.h lj_snap.h lj_target.h lj_target_*.h + lj_tab.h lj_state.h lj_frame.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h \ + lj_trace.h lj_dispatch.h lj_traceerr.h lj_snap.h lj_target.h \ + lj_target_*.h lj_ctype.h lj_cdata.h lj_state.o: lj_state.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_meta.h \ lj_state.h lj_frame.h lj_bc.h lj_ctype.h lj_trace.h lj_jit.h lj_ir.h \ @@ -188,12 +191,13 @@ ljamalg.o: ljamalg.c lua.h luaconf.h lauxlib.h lj_gc.c lj_obj.h lj_def.h \ lj_target.h lj_target_*.h lj_mcode.h lj_carith.c lj_carith.h lj_clib.c \ lj_clib.h lj_cparse.c lj_cparse.h lj_lib.c lj_lib.h lj_ir.c lj_ircall.h \ lj_iropt.h lj_opt_mem.c lj_opt_fold.c lj_folddef.h lj_opt_narrow.c \ - lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_opt_split.c lj_mcode.c lj_snap.c \ - lj_record.c lj_record.h lj_ffrecord.h lj_crecord.c lj_crecord.h \ - lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h lj_emit_*.h lj_asm_*.h \ - lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c lib_aux.c lib_base.c \ - lj_libdef.h lib_math.c lib_string.c lib_table.c lib_io.c lib_os.c \ - lib_package.c lib_debug.c lib_bit.c lib_jit.c lib_ffi.c lib_init.c + lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_opt_split.c lj_opt_sink.c \ + lj_mcode.c lj_snap.c lj_record.c lj_record.h lj_ffrecord.h lj_crecord.c \ + lj_crecord.h lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h lj_emit_*.h \ + lj_asm_*.h lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c lib_aux.c \ + lib_base.c lj_libdef.h lib_math.c lib_string.c lib_table.c lib_io.c \ + lib_os.c lib_package.c lib_debug.c lib_bit.c lib_jit.c lib_ffi.c \ + lib_init.c luajit.o: luajit.c lua.h luaconf.h lauxlib.h lualib.h luajit.h lj_arch.h host/buildvm.o: host/buildvm.c host/buildvm.h lj_def.h lua.h luaconf.h \ lj_arch.h lj_obj.h lj_def.h lj_arch.h lj_gc.h lj_obj.h lj_bc.h lj_ir.h \ diff --git a/src/jit/dump.lua b/src/jit/dump.lua index 3d62c4ea..98933971 100644 --- a/src/jit/dump.lua +++ b/src/jit/dump.lua @@ -374,10 +374,13 @@ local function dump_snap(tr) end -- Return a register name or stack slot for a rid/sp location. -local function ridsp_name(ridsp) +local function ridsp_name(ridsp, ins) if not disass then disass = require("jit.dis_"..jit.arch) end - local rid = band(ridsp, 0xff) - if ridsp > 255 then return format("[%x]", shr(ridsp, 8)*4) end + local rid, slot = band(ridsp, 0xff), shr(ridsp, 8) + if rid == 253 or rid == 254 then + return slot == 0 and " {sink" or format(" {%04d", ins-slot) + end + if ridsp > 255 then return format("[%x]", slot*4) end if rid < 128 then return disass.regname(rid) end return "" end @@ -458,13 +461,15 @@ local function dump_ir(tr, dumpsnap, dumpreg) end elseif op ~= "NOP " and op ~= "CARG " and (dumpreg or op ~= "RENAME") then + local rid = band(ridsp, 255) if dumpreg then - out:write(format("%04d %-5s ", ins, ridsp_name(ridsp))) + out:write(format("%04d %-6s", ins, ridsp_name(ridsp, ins))) else out:write(format("%04d ", ins)) end out:write(format("%s%s %s %s ", - band(ot, 128) == 0 and " " or ">", + (rid == 254 or rid == 253) and "}" or + (band(ot, 128) == 0 and " " or ">"), band(ot, 64) == 0 and " " or "+", irtype[t], op)) local m1, m2 = band(m, 3), band(m, 3*4) diff --git a/src/lj_asm.c b/src/lj_asm.c index 9bce9292..8ff3eaf7 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -782,19 +782,44 @@ static int asm_snap_canremat(ASMState *as) static void asm_snap_alloc1(ASMState *as, IRRef ref) { IRIns *ir = IR(ref); - if (!ra_used(ir)) { - RegSet allow = (!LJ_SOFTFP && irt_isnum(ir->t)) ? RSET_FPR : RSET_GPR; - /* Get a weak register if we have a free one or can rematerialize. */ - if ((as->freeset & allow) || - (allow == RSET_FPR && asm_snap_canremat(as))) { - Reg r = ra_allocref(as, ref, allow); /* Allocate a register. */ - if (!irt_isphi(ir->t)) - ra_weak(as, r); /* But mark it as weakly referenced. */ - checkmclim(as); - RA_DBGX((as, "snapreg $f $r", ref, ir->r)); + if (!(ra_used(ir) || ir->r == RID_SUNK)) { + if (ir->r == RID_SINK) { + ir->r = RID_SUNK; +#if LJ_HASFFI + if (ir->o == IR_CNEWI) { /* Allocate CNEWI value. */ + asm_snap_alloc1(as, ir->op2); + if (LJ_32 && (ir+1)->o == IR_HIOP) + asm_snap_alloc1(as, (ir+1)->op2); + } +#endif + else { /* Allocate stored values for TNEW, TDUP and CNEW. */ + IRIns *irs; + lua_assert(ir->o == IR_TNEW || ir->o == IR_TDUP || ir->o == IR_CNEW); + for (irs = IR(as->curins); irs > ir; irs--) + if (irs->r == RID_SINK && ir + irs->s == irs) { + lua_assert(irs->o == IR_ASTORE || irs->o == IR_HSTORE || + irs->o == IR_FSTORE || irs->o == IR_XSTORE); + asm_snap_alloc1(as, irs->op2); + if (LJ_32 && (irs+1)->o == IR_HIOP) + asm_snap_alloc1(as, (irs+1)->op2); + } + } + } else if (ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT) { + asm_snap_alloc1(as, ir->op1); } else { - ra_spill(as, ir); /* Otherwise force a spill slot. */ - RA_DBGX((as, "snapspill $f $s", ref, ir->s)); + RegSet allow = (!LJ_SOFTFP && irt_isfp(ir->t)) ? RSET_FPR : RSET_GPR; + if ((as->freeset & allow) || + (allow == RSET_FPR && asm_snap_canremat(as))) { + /* Get a weak register if we have a free one or can rematerialize. */ + Reg r = ra_allocref(as, ref, allow); /* Allocate a register. */ + if (!irt_isphi(ir->t)) + ra_weak(as, r); /* But mark it as weakly referenced. */ + checkmclim(as); + RA_DBGX((as, "snapreg $f $r", ref, ir->r)); + } else { + ra_spill(as, ir); /* Otherwise force a spill slot. */ + RA_DBGX((as, "snapspill $f $s", ref, ir->s)); + } } } } @@ -848,7 +873,7 @@ static void asm_snap_prep(ASMState *as) { if (as->curins < as->snapref) { do { - lua_assert(as->snapno != 0); + if (as->snapno == 0) return; /* Called by sunk stores before snap #0. */ as->snapno--; as->snapref = as->T->snap[as->snapno].ref; } while (as->curins < as->snapref); @@ -1180,6 +1205,8 @@ static void asm_phi(ASMState *as, IRIns *ir) RegSet afree = (as->freeset & allow); IRIns *irl = IR(ir->op1); IRIns *irr = IR(ir->op2); + if (ir->r == RID_SINK) /* Sink PHI. */ + return; /* Spill slot shuffling is not implemented yet (but rarely needed). */ if (ra_hasspill(irl->s) || ra_hasspill(irr->s)) lj_trace_err(as->J, LJ_TRERR_NYIPHI); @@ -1494,7 +1521,7 @@ static void asm_tail_link(ASMState *as) /* -- Trace setup --------------------------------------------------------- */ /* Clear reg/sp for all instructions and add register hints. */ -static void asm_setup_regsp(ASMState *as) +static void asm_setup_regsp(ASMState *as, int sink) { GCtrace *T = as->T; IRRef nins = T->nins; @@ -1545,6 +1572,14 @@ static void asm_setup_regsp(ASMState *as) inloop = 0; as->evenspill = SPS_FIRST; for (lastir = IR(nins); ir < lastir; ir++) { + if (sink) { + if (ir->r == RID_SINK) + continue; + if (ir->r == RID_SUNK) { /* Revert after ASM restart. */ + ir->r = RID_SINK; + continue; + } + } switch (ir->o) { case IR_LOOP: inloop = 1; @@ -1716,6 +1751,7 @@ void lj_asm_trace(jit_State *J, GCtrace *T) ASMState as_; ASMState *as = &as_; MCode *origtop; + int sink; /* Ensure an initialized instruction beyond the last one for HIOP checks. */ J->cur.nins = lj_ir_nextins(J); @@ -1736,6 +1772,7 @@ void lj_asm_trace(jit_State *J, GCtrace *T) as->mcp = as->mctop; as->mclim = as->mcbot + MCLIM_REDZONE; asm_setup_target(as); + sink = (IR(REF_BASE)->prev == 1); do { as->mcp = as->mctop; @@ -1751,7 +1788,7 @@ void lj_asm_trace(jit_State *J, GCtrace *T) as->gcsteps = 0; as->sectref = as->loopref; as->fuseref = (as->flags & JIT_F_OPT_FUSE) ? as->loopref : FUSE_DISABLED; - asm_setup_regsp(as); + asm_setup_regsp(as, sink); if (!as->loopref) asm_tail_link(as); diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h index e6ab3573..19250254 100644 --- a/src/lj_asm_arm.h +++ b/src/lj_asm_arm.h @@ -693,6 +693,8 @@ static void asm_newref(ASMState *as, IRIns *ir) { const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey]; IRRef args[3]; + if (ir->r == RID_SINK) /* Sink newref. */ + return; args[0] = ASMREF_L; /* lua_State *L */ args[1] = ir->op1; /* GCtab *t */ args[2] = ASMREF_TMP1; /* cTValue *key */ @@ -836,9 +838,13 @@ static void asm_xload(ASMState *as, IRIns *ir) static void asm_xstore(ASMState *as, IRIns *ir, int32_t ofs) { - Reg src = ra_alloc1(as, ir->op2, RSET_GPR); - asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1, - rset_exclude(RSET_GPR, src), ofs); + if (ir->r == RID_SINK) { /* Sink store. */ + asm_snap_prep(as); + } else { + Reg src = ra_alloc1(as, ir->op2, RSET_GPR); + asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1, + rset_exclude(RSET_GPR, src), ofs); + } } static void asm_ahuvload(ASMState *as, IRIns *ir) @@ -876,21 +882,25 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) static void asm_ahustore(ASMState *as, IRIns *ir) { - RegSet allow = RSET_GPR; - Reg idx, src = RID_NONE, type = RID_NONE; - int32_t ofs = 0; - int hiop = ((ir+1)->o == IR_HIOP); - if (!irt_ispri(ir->t)) { - src = ra_alloc1(as, ir->op2, allow); - rset_clear(allow, src); + if (ir->r == RID_SINK) { /* Sink store. */ + asm_snap_prep(as); + } else { + RegSet allow = RSET_GPR; + Reg idx, src = RID_NONE, type = RID_NONE; + int32_t ofs = 0; + int hiop = ((ir+1)->o == IR_HIOP); + if (!irt_ispri(ir->t)) { + src = ra_alloc1(as, ir->op2, allow); + rset_clear(allow, src); + } + if (hiop) + type = ra_alloc1(as, (ir+1)->op2, allow); + else + type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow); + idx = asm_fuseahuref(as, ir->op1, &ofs, rset_exclude(allow, type)); + if (ra_hasreg(src)) emit_lso(as, ARMI_STR, src, idx, ofs); + emit_lso(as, ARMI_STR, type, idx, ofs+4); } - if (hiop) - type = ra_alloc1(as, (ir+1)->op2, allow); - else - type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow); - idx = asm_fuseahuref(as, ir->op1, &ofs, rset_exclude(allow, type)); - if (ra_hasreg(src)) emit_lso(as, ARMI_STR, src, idx, ofs); - emit_lso(as, ARMI_STR, type, idx, ofs+4); } static void asm_sload(ASMState *as, IRIns *ir) @@ -1382,7 +1392,10 @@ static void asm_hiop(ASMState *as, IRIns *ir) asm_fpmin_max(as, ir-1, (ir-1)->o == IR_MIN ? CC_HI : CC_LO); return; } else if ((ir-1)->o == IR_XSTORE) { - asm_xstore(as, ir, 4); + if ((ir-1)->r == RID_SINK) + asm_snap_prep(as); + else + asm_xstore(as, ir, 4); return; } if (!usehi) return; /* Skip unused hiword op for all remaining ops. */ diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h index b42f9f9a..def3eb2a 100644 --- a/src/lj_asm_mips.h +++ b/src/lj_asm_mips.h @@ -769,14 +769,18 @@ nolo: static void asm_newref(ASMState *as, IRIns *ir) { - const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey]; - IRRef args[3]; - args[0] = ASMREF_L; /* lua_State *L */ - args[1] = ir->op1; /* GCtab *t */ - args[2] = ASMREF_TMP1; /* cTValue *key */ - asm_setupresult(as, ir, ci); /* TValue * */ - asm_gencall(as, ci, args); - asm_tvptr(as, ra_releasetmp(as, ASMREF_TMP1), ir->op2); + if (ir->r == RID_SINK) { /* Sink newref. */ + return; + } else { + const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey]; + IRRef args[3]; + args[0] = ASMREF_L; /* lua_State *L */ + args[1] = ir->op1; /* GCtab *t */ + args[2] = ASMREF_TMP1; /* cTValue *key */ + asm_setupresult(as, ir, ci); /* TValue * */ + asm_gencall(as, ci, args); + asm_tvptr(as, ra_releasetmp(as, ASMREF_TMP1), ir->op2); + } } static void asm_uref(ASMState *as, IRIns *ir) @@ -912,9 +916,14 @@ static void asm_xload(ASMState *as, IRIns *ir) static void asm_xstore(ASMState *as, IRIns *ir, int32_t ofs) { - Reg src = ra_alloc1z(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR); - asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1, - rset_exclude(RSET_GPR, src), ofs); + if (ir->r == RID_SINK) { /* Sink store. */ + asm_snap_prep(as); + return; + } else { + Reg src = ra_alloc1z(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR); + asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1, + rset_exclude(RSET_GPR, src), ofs); + } } static void asm_ahuvload(ASMState *as, IRIns *ir) @@ -947,6 +956,10 @@ static void asm_ahustore(ASMState *as, IRIns *ir) RegSet allow = RSET_GPR; Reg idx, src = RID_NONE, type = RID_NONE; int32_t ofs = 0; + if (ir->r == RID_SINK) { /* Sink store. */ + asm_snap_prep(as); + return; + } if (irt_isnum(ir->t)) { src = ra_alloc1(as, ir->op2, RSET_FPR); } else { @@ -1561,8 +1574,12 @@ static void asm_hiop(ASMState *as, IRIns *ir) return; } else if ((ir-1)->o == IR_XSTORE) { as->curins--; /* Handle both stores here. */ - asm_xstore(as, ir, LJ_LE ? 4 : 0); - asm_xstore(as, ir-1, LJ_LE ? 0 : 4); + if ((ir-1)->r == RID_SINK) { + asm_snap_prep(as); + } else { + asm_xstore(as, ir, LJ_LE ? 4 : 0); + asm_xstore(as, ir-1, LJ_LE ? 0 : 4); + } return; } if (!usehi) return; /* Skip unused hiword op for all remaining ops. */ diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h index 5d538fc8..142ef212 100644 --- a/src/lj_asm_ppc.h +++ b/src/lj_asm_ppc.h @@ -773,6 +773,8 @@ static void asm_newref(ASMState *as, IRIns *ir) { const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey]; IRRef args[3]; + if (ir->r == RID_SINK) /* Sink newref. */ + return; args[0] = ASMREF_L; /* lua_State *L */ args[1] = ir->op1; /* GCtab *t */ args[2] = ASMREF_TMP1; /* cTValue *key */ @@ -892,12 +894,16 @@ static void asm_fload(ASMState *as, IRIns *ir) static void asm_fstore(ASMState *as, IRIns *ir) { - Reg src = ra_alloc1(as, ir->op2, RSET_GPR); - IRIns *irf = IR(ir->op1); - Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src)); - int32_t ofs = field_ofs[irf->op2]; - PPCIns pi = asm_fxstoreins(ir); - emit_tai(as, pi, src, idx, ofs); + if (ir->r == RID_SINK) { /* Sink store. */ + asm_snap_prep(as); + } else { + Reg src = ra_alloc1(as, ir->op2, RSET_GPR); + IRIns *irf = IR(ir->op1); + Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src)); + int32_t ofs = field_ofs[irf->op2]; + PPCIns pi = asm_fxstoreins(ir); + emit_tai(as, pi, src, idx, ofs); + } } static void asm_xload(ASMState *as, IRIns *ir) @@ -912,6 +918,10 @@ static void asm_xload(ASMState *as, IRIns *ir) static void asm_xstore(ASMState *as, IRIns *ir, int32_t ofs) { IRIns *irb; + if (ir->r == RID_SINK) { /* Sink store. */ + asm_snap_prep(as); + return; + } if (ofs == 0 && mayfuse(as, ir->op2) && (irb = IR(ir->op2))->o == IR_BSWAP && ra_noreg(irb->r) && (irt_isint(ir->t) || irt_isu32(ir->t))) { /* Fuse BSWAP with XSTORE to stwbrx. */ @@ -968,6 +978,10 @@ static void asm_ahustore(ASMState *as, IRIns *ir) RegSet allow = RSET_GPR; Reg idx, src = RID_NONE, type = RID_NONE; int32_t ofs = AHUREF_LSX; + if (ir->r == RID_SINK) { /* Sink store. */ + asm_snap_prep(as); + return; + } if (irt_isnum(ir->t)) { src = ra_alloc1(as, ir->op2, RSET_FPR); } else { @@ -1747,8 +1761,12 @@ static void asm_hiop(ASMState *as, IRIns *ir) return; } else if ((ir-1)->o == IR_XSTORE) { as->curins--; /* Handle both stores here. */ - asm_xstore(as, ir, 0); - asm_xstore(as, ir-1, 4); + if ((ir-1)->r == RID_SINK) { + asm_snap_prep(as); + } else { + asm_xstore(as, ir, 0); + asm_xstore(as, ir-1, 4); + } return; } if (!usehi) return; /* Skip unused hiword op for all remaining ops. */ diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 4537e1d5..ae14b3b6 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -1155,6 +1155,8 @@ static void asm_newref(ASMState *as, IRIns *ir) IRRef args[3]; IRIns *irkey; Reg tmp; + if (ir->r == RID_SINK) /* Sink newref. */ + return; args[0] = ASMREF_L; /* lua_State *L */ args[1] = ir->op1; /* GCtab *t */ args[2] = ASMREF_TMP1; /* cTValue *key */ @@ -1259,6 +1261,10 @@ static void asm_fxstore(ASMState *as, IRIns *ir) RegSet allow = RSET_GPR; Reg src = RID_NONE, osrc = RID_NONE; int32_t k = 0; + if (ir->r == RID_SINK) { /* Sink store. */ + asm_snap_prep(as); + return; + } /* The IRT_I16/IRT_U16 stores should never be simplified for constant ** values since mov word [mem], imm16 has a length-changing prefix. */ @@ -1372,6 +1378,10 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) static void asm_ahustore(ASMState *as, IRIns *ir) { + if (ir->r == RID_SINK) { /* Sink store. */ + asm_snap_prep(as); + return; + } if (irt_isnum(ir->t)) { Reg src = ra_alloc1(as, ir->op2, RSET_FPR); asm_fuseahuref(as, ir->op1, RSET_GPR); @@ -2251,7 +2261,10 @@ static void asm_hiop(ASMState *as, IRIns *ir) asm_comp_int64(as, ir); return; } else if ((ir-1)->o == IR_XSTORE) { - asm_fxstore(as, ir); + if ((ir-1)->r == RID_SINK) + asm_snap_prep(as); + else + asm_fxstore(as, ir); return; } if (!usehi) return; /* Skip unused hiword op for all remaining ops. */ diff --git a/src/lj_iropt.h b/src/lj_iropt.h index 81d522e8..a17e2065 100644 --- a/src/lj_iropt.h +++ b/src/lj_iropt.h @@ -154,6 +154,7 @@ LJ_FUNC void lj_opt_split(jit_State *J); #else #define lj_opt_split(J) UNUSED(J) #endif +LJ_FUNC void lj_opt_sink(jit_State *J); #endif diff --git a/src/lj_jit.h b/src/lj_jit.h index 28cdd17a..517b3264 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -63,19 +63,20 @@ #define JIT_F_OPT_NARROW 0x00200000 #define JIT_F_OPT_LOOP 0x00400000 #define JIT_F_OPT_ABC 0x00800000 -#define JIT_F_OPT_FUSE 0x01000000 +#define JIT_F_OPT_SINK 0x01000000 +#define JIT_F_OPT_FUSE 0x02000000 /* Optimizations names for -O. Must match the order above. */ #define JIT_F_OPT_FIRST JIT_F_OPT_FOLD #define JIT_F_OPTSTRING \ - "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4fuse" + "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse" /* Optimization levels set a fixed combination of flags. */ #define JIT_F_OPT_0 0 #define JIT_F_OPT_1 (JIT_F_OPT_FOLD|JIT_F_OPT_CSE|JIT_F_OPT_DCE) #define JIT_F_OPT_2 (JIT_F_OPT_1|JIT_F_OPT_NARROW|JIT_F_OPT_LOOP) -#define JIT_F_OPT_3 \ - (JIT_F_OPT_2|JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_FUSE) +#define JIT_F_OPT_3 (JIT_F_OPT_2|\ + JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE) #define JIT_F_OPT_DEFAULT JIT_F_OPT_3 #if LJ_TARGET_WINDOWS || LJ_64 diff --git a/src/lj_opt_sink.c b/src/lj_opt_sink.c new file mode 100644 index 00000000..80ab5b6e --- /dev/null +++ b/src/lj_opt_sink.c @@ -0,0 +1,244 @@ +/* +** SINK: Allocation Sinking and Store Sinking. +** Copyright (C) 2005-2012 Mike Pall. See Copyright Notice in luajit.h +*/ + +#define lj_opt_sink_c +#define LUA_CORE + +#include "lj_obj.h" + +#if LJ_HASJIT + +#include "lj_ir.h" +#include "lj_jit.h" +#include "lj_iropt.h" +#include "lj_target.h" + +/* Some local macros to save typing. Undef'd at the end. */ +#define IR(ref) (&J->cur.ir[(ref)]) + +/* Check whether the store ref points to an eligible allocation. */ +static IRIns *sink_checkalloc(jit_State *J, IRIns *irs) +{ + IRIns *ir = IR(irs->op1); + if (!irref_isk(ir->op2)) + return NULL; /* Non-constant key. */ + if (ir->o == IR_HREFK || ir->o == IR_AREF) + ir = IR(ir->op1); + else if (!(ir->o == IR_HREF || ir->o == IR_NEWREF || + ir->o == IR_FREF || ir->o == IR_ADD)) + return NULL; /* Unhandled reference type (for XSTORE). */ + ir = IR(ir->op1); + if (!(ir->o == IR_TNEW || ir->o == IR_TDUP || ir->o == IR_CNEW)) + return NULL; /* Not an allocation. */ + if (ir + 255 < irs) + return NULL; /* Out of range. */ + return ir; /* Return allocation. */ +} + +/* Recursively check whether a value depends on a PHI. */ +static int sink_phidep(jit_State *J, IRRef ref) +{ + IRIns *ir = IR(ref); + if (irt_isphi(ir->t)) return 1; + if (ir->op1 >= REF_FIRST && sink_phidep(J, ir->op1)) return 1; + if (ir->op2 >= REF_FIRST && sink_phidep(J, ir->op2)) return 1; + return 0; +} + +/* Check whether a value is a sinkable PHI or a non-PHI. */ +static int sink_checkphi(jit_State *J, IRIns *ira, IRRef ref) +{ + if (ref >= REF_FIRST) { + IRIns *ir = IR(ref); + if (irt_isphi(ir->t) || (ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT && + irt_isphi(IR(ir->op1)->t))) { + ira->prev++; + return 1; /* Sinkable PHI. */ + } + return !sink_phidep(J, ref); /* Must be a non-PHI then. */ + } + return 1; /* Constant (non-PHI). */ +} + +/* Mark non-sinkable allocations using single-pass backward propagation. +** +** Roots for the marking process are: +** - Some PHIs or snapshots (see below). +** - Non-PHI, non-constant values stored to PHI allocations. +** - All guards. +** - Any remaining loads not eliminated by store-to-load forwarding. +** - Stores with non-constant keys. +** - All stored values. +*/ +static void sink_mark_ins(jit_State *J) +{ + IRIns *ir, *irlast = IR(J->cur.nins-1); + for (ir = irlast ; ; ir--) { + switch (ir->o) { + case IR_BASE: + return; /* Finished. */ + case IR_CALLL: /* IRCALL_lj_tab_len */ + case IR_ALOAD: case IR_HLOAD: case IR_XLOAD: + irt_setmark(IR(ir->op1)->t); /* Mark ref for remaining loads. */ + break; + case IR_FLOAD: + if (irt_ismarked(ir->t) || ir->op2 == IRFL_TAB_META) + irt_setmark(IR(ir->op1)->t); /* Mark table for remaining loads. */ + break; + case IR_ASTORE: case IR_HSTORE: case IR_FSTORE: case IR_XSTORE: { + IRIns *ira = sink_checkalloc(J, ir); + if (!ira || (irt_isphi(ira->t) && !sink_checkphi(J, ira, ir->op2))) + irt_setmark(IR(ir->op1)->t); /* Mark ineligible ref. */ + irt_setmark(IR(ir->op2)->t); /* Mark stored value. */ + break; + } +#if LJ_HASFFI + case IR_CNEWI: + if (irt_isphi(ir->t) && + (!sink_checkphi(J, ir, ir->op2) || + (LJ_32 && ir+1 < irlast && (ir+1)->o == IR_HIOP && + !sink_checkphi(J, ir, (ir+1)->op2)))) + irt_setmark(ir->t); /* Mark ineligible allocation. */ + /* fallthrough */ +#endif + case IR_USTORE: + irt_setmark(IR(ir->op2)->t); /* Mark stored value. */ + break; +#if LJ_HASFFI + case IR_CALLXS: +#endif + case IR_CALLS: + irt_setmark(IR(ir->op1)->t); /* Mark (potentially) stored values. */ + break; + case IR_PHI: { + IRIns *irl = IR(ir->op1), *irr = IR(ir->op2); + irl->prev = irr->prev = 0; /* Clear PHI value counts. */ + if (irl->o == irr->o && + (irl->o == IR_TNEW || irl->o == IR_TDUP || + (LJ_HASFFI && (irl->o == IR_CNEW || irl->o == IR_CNEWI)))) + break; + irt_setmark(irl->t); + irt_setmark(irr->t); + break; + } + default: + if (irt_ismarked(ir->t) || irt_isguard(ir->t)) { /* Propagate mark. */ + if (ir->op1 >= REF_FIRST) irt_setmark(IR(ir->op1)->t); + if (ir->op2 >= REF_FIRST) irt_setmark(IR(ir->op2)->t); + } + break; + } + } +} + +/* Mark all instructions referenced by a snapshot. */ +static void sink_mark_snap(jit_State *J, SnapShot *snap) +{ + SnapEntry *map = &J->cur.snapmap[snap->mapofs]; + MSize n, nent = snap->nent; + for (n = 0; n < nent; n++) { + IRRef ref = snap_ref(map[n]); + if (!irref_isk(ref)) + irt_setmark(IR(ref)->t); + } +} + +/* Iteratively remark PHI refs with differing marks or PHI value counts. */ +static void sink_remark_phi(jit_State *J) +{ + IRIns *ir; + int remark; + do { + remark = 0; + for (ir = IR(J->cur.nins-1); ir->o == IR_PHI; ir--) { + IRIns *irl = IR(ir->op1), *irr = IR(ir->op2); + if (((irl->t.irt ^ irr->t.irt) & IRT_MARK)) + remark = 1; + else if (irl->prev == irr->prev) + continue; + irt_setmark(IR(ir->op1)->t); + irt_setmark(IR(ir->op2)->t); + } + } while (remark); +} + +/* Sweep instructions and mark sunken allocations and stores. */ +static void sink_sweep_ins(jit_State *J) +{ + IRIns *ir, *irfirst = IR(J->cur.nk); + for (ir = IR(J->cur.nins-1) ; ir >= irfirst; ir--) { + switch (ir->o) { + case IR_ASTORE: case IR_HSTORE: case IR_FSTORE: case IR_XSTORE: { + IRIns *ira = sink_checkalloc(J, ir); + if (ira && !irt_ismarked(ira->t)) + ir->prev = REGSP(RID_SINK, (int)(ir - ira)); + else + ir->prev = REGSP_INIT; + break; + } + case IR_NEWREF: + if (!irt_ismarked(ir->t)) { + ir->prev = REGSP(RID_SINK, 0); + } else { + irt_clearmark(ir->t); + ir->prev = REGSP_INIT; + } + break; +#if LJ_HASFFI + case IR_CNEW: case IR_CNEWI: +#endif + case IR_TNEW: case IR_TDUP: + if (!irt_ismarked(ir->t)) { + ir->t.irt &= ~IRT_GUARD; + ir->prev = REGSP(RID_SINK, 0); + } else { + irt_clearmark(ir->t); + ir->prev = REGSP_INIT; + } + break; + case IR_PHI: { + IRIns *ira = IR(ir->op2); + if (!irt_ismarked(ira->t) && + (ira->o == IR_TNEW || ira->o == IR_TDUP || + (LJ_HASFFI && (ira->o == IR_CNEW || ira->o == IR_CNEWI)))) { + ir->prev = REGSP(RID_SINK, 0); + } else { + ir->prev = REGSP_INIT; + } + break; + } + default: + irt_clearmark(ir->t); + ir->prev = REGSP_INIT; + break; + } + } + IR(REF_BASE)->prev = 1; /* Signal SINK flags to assembler. */ +} + +/* Allocation sinking and store sinking. +** +** 1. Mark all non-sinkable allocations. +** 2. Then sink all remaining allocations and the related stores. +*/ +void lj_opt_sink(jit_State *J) +{ + const uint32_t need = (JIT_F_OPT_SINK|JIT_F_OPT_FWD| + JIT_F_OPT_DCE|JIT_F_OPT_CSE|JIT_F_OPT_FOLD); + if ((J->flags & need) == need && + (J->chain[IR_TNEW] || J->chain[IR_TDUP] || + (LJ_HASFFI && (J->chain[IR_CNEW] || J->chain[IR_CNEWI])))) { + if (!J->loopref) + sink_mark_snap(J, &J->cur.snap[J->cur.nsnap-1]); + sink_mark_ins(J); + if (J->loopref) + sink_remark_phi(J); + sink_sweep_ins(J); + } +} + +#undef IR + +#endif diff --git a/src/lj_snap.c b/src/lj_snap.c index 33edc8a6..1e6f10d0 100644 --- a/src/lj_snap.c +++ b/src/lj_snap.c @@ -11,6 +11,7 @@ #if LJ_HASJIT #include "lj_gc.h" +#include "lj_tab.h" #include "lj_state.h" #include "lj_frame.h" #include "lj_bc.h" @@ -20,10 +21,17 @@ #include "lj_trace.h" #include "lj_snap.h" #include "lj_target.h" +#if LJ_HASFFI +#include "lj_ctype.h" +#include "lj_cdata.h" +#endif /* Some local macros to save typing. Undef'd at the end. */ #define IR(ref) (&J->cur.ir[(ref)]) +/* Pass IR on to next optimization in chain (FOLD). */ +#define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J)) + /* Emit raw IR without passing through optimizations. */ #define emitir_raw(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J)) @@ -370,6 +378,31 @@ static TRef snap_replay_const(jit_State *J, IRIns *ir) } } +/* De-duplicate parent reference. */ +static TRef snap_dedup(jit_State *J, SnapEntry *map, MSize nmax, IRRef ref) +{ + MSize j; + for (j = 0; j < nmax; j++) + if (snap_ref(map[j]) == ref) + return J->slot[snap_slot(map[j])]; + return 0; +} + +/* Emit parent reference with de-duplication. */ +static TRef snap_pref(jit_State *J, GCtrace *T, SnapEntry *map, MSize nmax, + BloomFilter seen, IRRef ref) +{ + IRIns *ir = &T->ir[ref]; + TRef tr; + if (irref_isk(ref)) + tr = snap_replay_const(J, ir); + else if (!regsp_used(ir->prev)) + tr = 0; + else if (!bloomtest(seen, ref) || (tr = snap_dedup(J, map, nmax, ref)) == 0) + tr = emitir(IRT(IR_PVAL, irt_type(ir->t)), ref - REF_BIAS, 0); + return tr; +} + /* Replay snapshot state to setup side trace. */ void lj_snap_replay(jit_State *J, GCtrace *T) { @@ -377,6 +410,7 @@ void lj_snap_replay(jit_State *J, GCtrace *T) SnapEntry *map = &T->snapmap[snap->mapofs]; MSize n, nent = snap->nent; BloomFilter seen = 0; + int pass23 = 0; J->framedepth = 0; /* Emit IR for slots inherited from parent snapshot. */ for (n = 0; n < nent; n++) { @@ -386,21 +420,18 @@ void lj_snap_replay(jit_State *J, GCtrace *T) IRIns *ir = &T->ir[ref]; TRef tr; /* The bloom filter avoids O(nent^2) overhead for de-duping slots. */ - if (bloomtest(seen, ref)) { - MSize j; - for (j = 0; j < n; j++) - if (snap_ref(map[j]) == ref) { - tr = J->slot[snap_slot(map[j])]; - goto setslot; - } - } + if (bloomtest(seen, ref) && (tr = snap_dedup(J, map, n, ref)) != 0) + goto setslot; bloomset(seen, ref); if (irref_isk(ref)) { tr = snap_replay_const(J, ir); + } else if (!regsp_used(ir->prev)) { + pass23 = 1; + lua_assert(s != 0); + tr = s; } else { IRType t = irt_type(ir->t); uint32_t mode = IRSLOAD_INHERIT|IRSLOAD_PARENT; - lua_assert(regsp_used(ir->prev)); if (LJ_SOFTFP && (sn & SNAP_SOFTFPNUM)) t = IRT_NUM; if (ir->o == IR_SLOAD) mode |= (ir->op2 & IRSLOAD_READONLY); tr = emitir_raw(IRT(IR_SLOAD, t), s, mode); @@ -411,13 +442,126 @@ void lj_snap_replay(jit_State *J, GCtrace *T) if ((sn & SNAP_FRAME)) J->baseslot = s+1; } + if (pass23) { + IRIns *irlast = &T->ir[(snap+1)->ref]; + lua_assert(J->exitno+1 < T->nsnap); + pass23 = 0; + /* Emit dependent PVALs. */ + for (n = 0; n < nent; n++) { + SnapEntry sn = map[n]; + IRRef refp = snap_ref(sn); + IRIns *ir = &T->ir[refp]; + if (regsp_reg(ir->r) == RID_SUNK) { + if (J->slot[snap_slot(sn)] != snap_slot(sn)) continue; + pass23 = 1; + lua_assert(ir->o == IR_TNEW || ir->o == IR_TDUP || + ir->o == IR_CNEW || ir->o == IR_CNEWI); + if (ir->op1 >= T->nk) snap_pref(J, T, map, nent, seen, ir->op1); + if (ir->op2 >= T->nk) snap_pref(J, T, map, nent, seen, ir->op2); + if (LJ_HASFFI && ir->o == IR_CNEWI) { + if (LJ_32 && refp+1 < T->nins && (ir+1)->o == IR_HIOP) + snap_pref(J, T, map, nent, seen, (ir+1)->op2); + } else { + IRIns *irs; + for (irs = ir+1; irs < irlast; irs++) + if (irs->r == RID_SINK && ir + irs->s == irs) { + if (snap_pref(J, T, map, nent, seen, irs->op2) == 0) + snap_pref(J, T, map, nent, seen, T->ir[irs->op2].op1); + else if ((LJ_SOFTFP || (LJ_32 && LJ_HASFFI)) && + irs+1 < irlast && (irs+1)->o == IR_HIOP) + snap_pref(J, T, map, nent, seen, (irs+1)->op2); + } + } + } else if (!irref_isk(refp) && !regsp_used(ir->prev)) { + lua_assert(ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT); + J->slot[snap_slot(sn)] = snap_pref(J, T, map, nent, seen, ir->op1); + } + } + /* Replay sunk instructions. */ + for (n = 0; pass23 && n < nent; n++) { + SnapEntry sn = map[n]; + IRRef refp = snap_ref(sn); + IRIns *ir = &T->ir[refp]; + if (regsp_reg(ir->r) == RID_SUNK) { + TRef op1, op2; + if (J->slot[snap_slot(sn)] != snap_slot(sn)) { /* De-dup allocs. */ + J->slot[snap_slot(sn)] = J->slot[J->slot[snap_slot(sn)]]; + continue; + } + op1 = ir->op1; + if (op1 >= T->nk) op1 = snap_pref(J, T, map, nent, seen, op1); + op2 = ir->op2; + if (op2 >= T->nk) op2 = snap_pref(J, T, map, nent, seen, op2); + if (LJ_HASFFI && ir->o == IR_CNEWI) { + if (LJ_32 && refp+1 < T->nins && (ir+1)->o == IR_HIOP) { + lj_needsplit(J); /* Emit joining HIOP. */ + op2 = emitir_raw(IRT(IR_HIOP, IRT_I64), op2, + snap_pref(J, T, map, nent, seen, (ir+1)->op2)); + } + J->slot[snap_slot(sn)] = emitir(ir->ot, op1, op2); + } else { + IRIns *irs; + TRef tr = emitir(ir->ot, op1, op2); + J->slot[snap_slot(sn)] = tr; + for (irs = ir+1; irs < irlast; irs++) + if (irs->r == RID_SINK && ir + irs->s == irs) { + IRIns *irr = &T->ir[irs->op1]; + TRef val, key = irr->op2, tmp = tr; + if (irr->o != IR_FREF) { + IRIns *irk = &T->ir[key]; + if (irr->o == IR_HREFK) + key = lj_ir_kslot(J, snap_replay_const(J, &T->ir[irk->op1]), + irk->op2); + else + key = snap_replay_const(J, irk); + if (irr->o == IR_HREFK || irr->o == IR_AREF) { + IRIns *irf = &T->ir[irr->op1]; + tmp = emitir(irf->ot, tmp, irf->op2); + } + } + tmp = emitir(irr->ot, tmp, key); + val = snap_pref(J, T, map, nent, seen, irs->op2); + if (val == 0) { + IRIns *irc = &T->ir[irs->op2]; + lua_assert(irc->o == IR_CONV && irc->op2 == IRCONV_NUM_INT); + val = snap_pref(J, T, map, nent, seen, irc->op1); + val = emitir(IRTN(IR_CONV), val, IRCONV_NUM_INT); + } else if ((LJ_SOFTFP || (LJ_32 && LJ_HASFFI)) && + irs+1 < irlast && (irs+1)->o == IR_HIOP) { + IRType t = IRT_I64; + if (LJ_SOFTFP && irt_type((irs+1)->t) == IRT_SOFTFP) + t = IRT_NUM; + if (irref_isk(irs->op2) && irref_isk((irs+1)->op2)) { + uint64_t k = (uint32_t)T->ir[irs->op2].i + + ((uint64_t)T->ir[(irs+1)->op2].i << 32); + val = lj_ir_k64(J, t == IRT_I64 ? IR_KINT64 : IR_KNUM, + lj_ir_k64_find(J, k)); + } else { + val = emitir_raw(IRT(IR_HIOP, t), val, + snap_pref(J, T, map, nent, seen, (irs+1)->op2)); + } + tmp = emitir(IRT(irs->o, t), tmp, val); + continue; + } + tmp = emitir(irs->ot, tmp, val); + } + } + } + } + } J->base = J->slot + J->baseslot; J->maxslot = snap->nslots - J->baseslot; lj_snap_add(J); + if (pass23) /* Need explicit GC step _after_ initial snapshot. */ + emitir_raw(IRTG(IR_GCSTEP, IRT_NIL), 0, 0); } /* -- Snapshot restore ---------------------------------------------------- */ +static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex, + SnapNo snapno, BloomFilter rfilt, + IRIns *ir, TValue *o); + /* Restore a value from the trace exit state. */ static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex, SnapNo snapno, BloomFilter rfilt, @@ -450,8 +594,12 @@ static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex, } } else { /* Restore from register. */ Reg r = regsp_reg(rs); - lua_assert(ra_hasreg(r)); - if (irt_isinteger(t)) { + if (ra_noreg(r)) { + lua_assert(ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT); + snap_restoreval(J, T, ex, snapno, rfilt, ir->op1, o); + if (LJ_DUALNUM) setnumV(o, (lua_Number)intV(o)); + return; + } else if (irt_isinteger(t)) { setintV(o, (int32_t)ex->gpr[r-RID_MIN_GPR]); #if !LJ_SOFTFP } else if (irt_isnum(t)) { @@ -468,6 +616,148 @@ static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex, } } +#if LJ_HASFFI +/* Restore raw data from the trace exit state. */ +static void snap_restoredata(GCtrace *T, ExitState *ex, + SnapNo snapno, BloomFilter rfilt, + IRRef ref, void *dst, CTSize sz) +{ + IRIns *ir = &T->ir[ref]; + RegSP rs = ir->prev; + int32_t *src; + union { uint64_t u64; float f; } tmp; + if (irref_isk(ref)) { + if (ir->o == IR_KNUM || ir->o == IR_KINT64) { + src = mref(ir->ptr, int32_t); + } else if (sz == 8) { + tmp.u64 = (uint64_t)(uint32_t)ir->i; + src = (int32_t *)&tmp.u64; + } else { + src = &ir->i; + } + } else { + if (LJ_UNLIKELY(bloomtest(rfilt, ref))) + rs = snap_renameref(T, snapno, ref, rs); + if (ra_hasspill(regsp_spill(rs))) { + src = &ex->spill[regsp_spill(rs)]; + } else { + Reg r = regsp_reg(rs); + if (ra_noreg(r)) { + /* Note: this assumes CNEWI is never used for SOFTFP split numbers. */ + lua_assert(sz == 8 && ir->o == IR_CONV && ir->op2 == IRCONV_NUM_INT); + snap_restoredata(T, ex, snapno, rfilt, ir->op1, dst, 4); + *(lua_Number *)dst = (lua_Number)*(int32_t *)dst; + return; + } + src = (int32_t *)&ex->gpr[r-RID_MIN_GPR]; +#if !LJ_SOFTFP + if (r >= RID_MAX_GPR) { + src = (int32_t *)&ex->fpr[r-RID_MIN_FPR]; +#if LJ_TARGET_PPC + if (sz == 4) { /* PPC FPRs are always doubles. */ + tmp.f = (float)*(double *)src; + src = (int32_t *)&tmp.f; + } +#else + if (LJ_BE && sz == 4) src++; +#endif + } +#endif + } + } + lua_assert(sz == 1 || sz == 2 || sz == 4 || sz == 8); + if (sz == 4) *(int32_t *)dst = *src; + else if (sz == 8) *(int64_t *)dst = *(int64_t *)src; + else if (sz == 1) *(int8_t *)dst = (int8_t)*src; + else *(int16_t *)dst = (int16_t)*src; +} +#endif + +/* Unsink allocation from the trace exit state. Unsink sunk stores. */ +static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex, + SnapNo snapno, BloomFilter rfilt, + IRIns *ir, TValue *o) +{ + lua_assert(ir->o == IR_TNEW || ir->o == IR_TDUP || + ir->o == IR_CNEW || ir->o == IR_CNEWI); +#if LJ_HASFFI + if (ir->o == IR_CNEW || ir->o == IR_CNEWI) { + CTState *cts = ctype_ctsG(J2G(J)); + CTypeID id = (CTypeID)T->ir[ir->op1].i; + CTSize sz = lj_ctype_size(cts, id); + GCcdata *cd = lj_cdata_new(cts, id, sz); + setcdataV(J->L, o, cd); + if (ir->o == IR_CNEWI) { + uint8_t *p = (uint8_t *)cdataptr(cd); + lua_assert(sz == 4 || sz == 8); + if (LJ_32 && sz == 8 && ir+1 < T->ir + T->nins && (ir+1)->o == IR_HIOP) { + snap_restoredata(T, ex, snapno, rfilt, (ir+1)->op2, LJ_LE?p+4:p, 4); + if (LJ_BE) p += 4; + sz = 4; + } + snap_restoredata(T, ex, snapno, rfilt, ir->op2, p, sz); + } else { + IRIns *irs, *irlast = &T->ir[T->snap[snapno].ref]; + for (irs = ir+1; irs < irlast; irs++) + if (irs->r == RID_SINK && ir + irs->s == irs) { + IRIns *iro = &T->ir[T->ir[irs->op1].op2]; + uint8_t *p = (uint8_t *)cd; + CTSize szs; + lua_assert(irs->o == IR_XSTORE && T->ir[irs->op1].o == IR_ADD); + lua_assert(iro->o == IR_KINT || iro->o == IR_KINT64); + if (irt_is64(irs->t)) szs = 8; + else if (irt_isi8(irs->t) || irt_isu8(irs->t)) szs = 1; + else if (irt_isi16(irs->t) || irt_isu16(irs->t)) szs = 2; + else szs = 4; + if (LJ_64 && iro->o == IR_KINT64) + p += (int64_t)ir_k64(iro)->u64; + else + p += iro->i; + lua_assert(p >= (uint8_t *)cdataptr(cd) && + p + szs <= (uint8_t *)cdataptr(cd) + sz); + if (LJ_32 && irs+1 < T->ir + T->nins && (irs+1)->o == IR_HIOP) { + lua_assert(szs == 4); + snap_restoredata(T, ex, snapno, rfilt, (irs+1)->op2, LJ_LE?p+4:p,4); + if (LJ_BE) p += 4; + } + snap_restoredata(T, ex, snapno, rfilt, irs->op2, p, szs); + } + } + } else +#endif + { + IRIns *irs, *irlast; + GCtab *t = ir->o == IR_TNEW ? lj_tab_new(J->L, ir->op1, ir->op2) : + lj_tab_dup(J->L, ir_ktab(&T->ir[ir->op1])); + settabV(J->L, o, t); + irlast = &T->ir[T->snap[snapno].ref]; + for (irs = ir+1; irs < irlast; irs++) + if (irs->r == RID_SINK && ir + irs->s == irs) { + IRIns *irk = &T->ir[irs->op1]; + TValue tmp, *val; + lua_assert(irs->o == IR_ASTORE || irs->o == IR_HSTORE || + irs->o == IR_FSTORE); + if (irk->o == IR_FREF) { + lua_assert(irk->op2 == IRFL_TAB_META); + snap_restoreval(J, T, ex, snapno, rfilt, irs->op2, &tmp); + /* NOBARRIER: The table is new (marked white). */ + setgcref(t->metatable, obj2gco(tabV(&tmp))); + } else { + irk = &T->ir[irk->op2]; + if (irk->o == IR_KSLOT) irk = &T->ir[irk->op1]; + lj_ir_kvalue(J->L, &tmp, irk); + val = lj_tab_set(J->L, t, &tmp); + /* NOBARRIER: The table is new (marked white). */ + snap_restoreval(J, T, ex, snapno, rfilt, irs->op2, val); + if (LJ_SOFTFP && irs+1 < T->ir + T->nins && (irs+1)->o == IR_HIOP) { + snap_restoreval(J, T, ex, snapno, rfilt, (irs+1)->op2, &tmp); + val->u32.hi = tmp.u32.lo; + } + } + } + } +} + /* Restore interpreter state from exit state with the help of a snapshot. */ const BCIns *lj_snap_restore(jit_State *J, void *exptr) { @@ -500,10 +790,23 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr) SnapEntry sn = map[n]; if (!(sn & SNAP_NORESTORE)) { TValue *o = &frame[snap_slot(sn)]; - snap_restoreval(J, T, ex, snapno, rfilt, snap_ref(sn), o); + IRRef ref = snap_ref(sn); + IRIns *ir = &T->ir[ref]; + if (ir->r == RID_SUNK) { + MSize j; + for (j = 0; j < n; j++) + if (snap_ref(map[j]) == ref) { /* De-duplicate sunk allocations. */ + copyTV(L, o, &frame[snap_slot(map[j])]); + goto dupslot; + } + snap_unsink(J, T, ex, snapno, rfilt, ir, o); + dupslot: + continue; + } + snap_restoreval(J, T, ex, snapno, rfilt, ref, o); if (LJ_SOFTFP && (sn & SNAP_SOFTFPNUM) && tvisint(o)) { TValue tmp; - snap_restoreval(J, T, ex, snapno, rfilt, snap_ref(sn)+1, &tmp); + snap_restoreval(J, T, ex, snapno, rfilt, ref+1, &tmp); o->u32.hi = tmp.u32.lo; } else if ((sn & (SNAP_CONT|SNAP_FRAME))) { /* Overwrite tag with frame link. */ @@ -528,5 +831,6 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr) #undef IR #undef emitir_raw +#undef emitir #endif diff --git a/src/lj_target.h b/src/lj_target.h index 13de8fc6..4808a38c 100644 --- a/src/lj_target.h +++ b/src/lj_target.h @@ -16,17 +16,19 @@ typedef uint32_t Reg; /* The hi-bit is NOT set for an allocated register. This means the value ** can be directly used without masking. The hi-bit is set for a register -** allocation hint or for RID_INIT. +** allocation hint or for RID_INIT, RID_SINK or RID_SUNK. */ #define RID_NONE 0x80 #define RID_MASK 0x7f #define RID_INIT (RID_NONE|RID_MASK) +#define RID_SINK (RID_INIT-1) +#define RID_SUNK (RID_INIT-2) #define ra_noreg(r) ((r) & RID_NONE) #define ra_hasreg(r) (!((r) & RID_NONE)) /* The ra_hashint() macro assumes a previous test for ra_noreg(). */ -#define ra_hashint(r) ((r) != RID_INIT) +#define ra_hashint(r) ((r) < RID_SUNK) #define ra_gethint(r) ((Reg)((r) & RID_MASK)) #define ra_sethint(rr, r) rr = (uint8_t)((r)|RID_NONE) #define ra_samehint(r1, r2) (ra_gethint((r1)^(r2)) == 0) diff --git a/src/lj_trace.c b/src/lj_trace.c index ad00dc67..240e7fc8 100644 --- a/src/lj_trace.c +++ b/src/lj_trace.c @@ -606,6 +606,7 @@ static TValue *trace_state(lua_State *L, lua_CFunction dummy, void *ud) J->loopref = J->chain[IR_LOOP]; /* Needed by assembler. */ } lj_opt_split(J); + lj_opt_sink(J); J->state = LJ_TRACE_ASM; break; diff --git a/src/ljamalg.c b/src/ljamalg.c index 1b58ceb4..b1124464 100644 --- a/src/ljamalg.c +++ b/src/ljamalg.c @@ -64,6 +64,7 @@ #include "lj_opt_dce.c" #include "lj_opt_loop.c" #include "lj_opt_split.c" +#include "lj_opt_sink.c" #include "lj_mcode.c" #include "lj_snap.c" #include "lj_record.c"