From 81259898ea177bb7b4becebf3d7686603f6b373b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 24 Nov 2016 18:56:19 +0100 Subject: [PATCH] ARM64: Emit more efficient trace exits. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. --- src/lj_asm_arm64.h | 73 +++++++++++++++++++------------------------ src/lj_target_arm64.h | 14 ++++++--- src/vm_arm64.dasc | 31 +++++++++--------- 3 files changed, 57 insertions(+), 61 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 0a2f5306..19b3331d 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -47,53 +47,41 @@ static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow) /* -- Guard handling ------------------------------------------------------ */ -/* Generate an exit stub group at the bottom of the reserved MCode memory. */ -static MCode *asm_exitstub_gen(ASMState *as, ExitNo group) -{ - MCode *mxp = as->mcbot; - int i; - if (mxp + 3*4+4*EXITSTUBS_PER_GROUP >= as->mctop) - asm_mclimit(as); - /* str lr, [sp]; bl ->vm_exit_handler; .long group. */ - *mxp++ = A64I_STRx | A64F_D(RID_LR) | A64F_N(RID_SP); - *mxp = A64I_BL | (((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu); - mxp++; - *mxp++ = group*EXITSTUBS_PER_GROUP; - for (i = 0; i < EXITSTUBS_PER_GROUP; i++) - *mxp++ = A64I_B | ((-3-i)&0x03ffffffu); - lj_mcode_sync(as->mcbot, mxp); - lj_mcode_commitbot(as->J, mxp); - as->mcbot = mxp; - as->mclim = as->mcbot + MCLIM_REDZONE; - return mxp - EXITSTUBS_PER_GROUP; -} - /* Setup all needed exit stubs. */ static void asm_exitstub_setup(ASMState *as, ExitNo nexits) { ExitNo i; - if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR) - lj_trace_err(as->J, LJ_TRERR_SNAPOV); - for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++) - if (as->J->exitstubgroup[i] == NULL) - as->J->exitstubgroup[i] = asm_exitstub_gen(as, i); + MCode *mxp = as->mctop; + if (mxp - (nexits + 3 + MCLIM_REDZONE) < as->mclim) + asm_mclimit(as); + /* 1: str lr,[sp]; bl ->vm_exit_handler; movz w0,traceno; bl <1; bl <1; ... */ + for (i = nexits-1; (int32_t)i >= 0; i--) + *--mxp = A64I_BL|((-3-i)&0x03ffffffu); + *--mxp = A64I_MOVZw|A64F_U16(as->T->traceno); + mxp--; + *mxp = A64I_BL|(((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu); + *--mxp = A64I_STRx|A64F_D(RID_LR)|A64F_N(RID_SP); + as->mctop = mxp; +} + +static MCode *asm_exitstub_addr(ASMState *as, ExitNo exitno) +{ + /* Keep this in-sync with exitstub_trace_addr(). */ + return as->mctop + exitno + 3; } /* Emit conditional branch to exit for guard. */ static void asm_guardcc(ASMState *as, A64CC cc) { - MCode *target = exitstub_addr(as->J, as->snapno); + MCode *target = asm_exitstub_addr(as, as->snapno); MCode *p = as->mcp; if (LJ_UNLIKELY(p == as->invmcp)) { as->loopinv = 1; - *p = A64I_BL | ((target-p) & 0x03ffffffu); + *p = A64I_B | ((target-p) & 0x03ffffffu); emit_cond_branch(as, cc^1, p-1); return; } - /* No conditional calls. Emit b.cc/bl instead. */ - /* That's a bad idea. NYI: emit per-trace exit stubs instead, see PPC. */ - emit_branch(as, A64I_BL, target); - emit_cond_branch(as, cc^1, p); + emit_cond_branch(as, cc, target); } /* -- Operand fusion ------------------------------------------------------ */ @@ -1568,8 +1556,7 @@ static void asm_stack_check(ASMState *as, BCReg topslot, } else { pbase = RID_BASE; } - emit_branch(as, A64I_BL, exitstub_addr(as->J, exitno)); - emit_cond_branch(as, CC_LS^1, as->mcp+1); + emit_cond_branch(as, CC_LS, asm_exitstub_addr(as, exitno)); k = emit_isk12((8*topslot)); lua_assert(k); emit_n(as, A64I_CMPx^k, RID_TMP); @@ -1744,7 +1731,8 @@ static void asm_tail_fixup(ASMState *as, TraceNo lnk) /* Undo the sp adjustment in BC_JLOOP when exiting to the interpreter. */ int32_t spadj = as->T->spadjust + (lnk ? 0 : sps_scale(SPS_FIXED)); if (spadj == 0) { - as->mctop = --p; + *--p = A64I_NOP; + as->mctop = p; } else { /* Patch stack adjustment. */ uint32_t k = emit_isk12(spadj); @@ -1805,13 +1793,18 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) MCode *pe = (MCode *)((char *)p + T->szmcode); MCode *cstart = NULL, *cend = p; MCode *mcarea = lj_mcode_patch(J, p, 0); - MCode *px = exitstub_addr(J, exitno); + MCode *px = exitstub_trace_addr(T, exitno); for (; p < pe; p++) { - /* Look for bl exitstub, replace with b target. */ + /* Look for bcc/b exitstub, replace with bcc/b target. */ uint32_t ins = *p; - if ((ins & 0xfc000000u) == 0x94000000u && - ((ins ^ (px-p)) & 0x03ffffffu) == 0) { - *p = (ins & 0x7c000000u) | ((target-p) & 0x03ffffffu); + if ((ins & 0xff000000u) == 0x54000000u && + ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) { + *p = (ins & 0xff00001fu) | (((target-p)<<5) & 0x00ffffe0u); + cend = p+1; + if (!cstart) cstart = p; + } else if ((ins & 0xfc000000u) == 0x14000000u && + ((ins ^ (px-p)) & 0x03ffffffu) == 0) { + *p = (ins & 0xfc000000u) | ((target-p) & 0x03ffffffu); cend = p+1; if (!cstart) cstart = p; } diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index 0cef06d5..1cd02fe8 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -101,14 +101,18 @@ typedef struct { int32_t spill[256]; /* Spill slots. */ } ExitState; -/* PC after instruction that caused an exit. Used to find the trace number. */ -#define EXITSTATE_PCREG RID_LR /* Highest exit + 1 indicates stack check. */ #define EXITSTATE_CHECKEXIT 1 -#define EXITSTUB_SPACING 4 -#define EXITSTUBS_PER_GROUP 32 - +/* Return the address of a per-trace exit stub. */ +static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno) +{ + while (*p == 0xd503201f) p++; /* Skip A64I_NOP. */ + return p + 3 + exitno; +} +/* Avoid dependence on lj_jit.h if only including lj_target.h. */ +#define exitstub_trace_addr(T, exitno) \ + exitstub_trace_addr_((MCode *)((char *)(T)->mcode + (T)->szmcode), (exitno)) /* -- Instructions -------------------------------------------------------- */ diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index a6227bf7..86c78fa5 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -1927,22 +1927,21 @@ static void build_subroutines(BuildCtx *ctx) | stp d30, d31, [sp, #30*8] | ldr CARG1, [sp, #64*8] // Load original value of lr. | add CARG3, sp, #64*8 // Recompute original value of sp. - | mv_vmstate CARG4, EXIT - | ldr CARG2w, [CARG1, #-4]! // Get exit instruction. - | stp CARG1, CARG3, [sp, #62*8] // Store exit pc/sp in RID_LR/RID_SP. - | lsl CARG2, CARG2, #38 - | add CARG1, CARG1, CARG2, asr #36 - | ldr CARG2w, [lr] // Load exit stub group offset. - | sub CARG1, CARG1, lr - | sub CARG1, CARG1, #4 - | ldr L, GL->cur_L - | add CARG1, CARG2, CARG1, lsr #2 // Compute exit number. - | ldr BASE, GL->jit_base - | st_vmstate CARG4 - | str CARG1w, [GL, #GL_J(exitno)] - | str BASE, L->base - | str L, [GL, #GL_J(L)] - | str xzr, GL->jit_base + | mv_vmstate CARG4, EXIT + | stp xzr, CARG3, [sp, #62*8] // Store 0/sp in RID_LR/RID_SP. + | sub CARG1, CARG1, lr + | ldr L, GL->cur_L + | lsr CARG1, CARG1, #2 + | ldr BASE, GL->jit_base + | sub CARG1, CARG1, #2 + | ldr CARG2w, [lr] // Load trace number. + | st_vmstate CARG4 + | str BASE, L->base + | ubfx CARG2w, CARG2w, #5, #16 + | str CARG1w, [GL, #GL_J(exitno)] + | str CARG2w, [GL, #GL_J(parent)] + | str L, [GL, #GL_J(L)] + | str xzr, GL->jit_base | add CARG1, GL, #GG_G2J | mov CARG2, sp | bl extern lj_trace_exit // (jit_State *J, ExitState *ex)