From 1a4ff1311740aa6c85f7a9101b6aa9bfaafa3f8e Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 27 May 2020 19:20:44 +0200 Subject: [PATCH] Optimize table length computation with hinting. 10x faster on loop with t[#t+1] = x idiom. Also used by table.insert. --- src/lj_asm.c | 7 ++++ src/lj_ffrecord.c | 6 +-- src/lj_ir.h | 1 + src/lj_ircall.h | 1 + src/lj_iropt.h | 2 +- src/lj_opt_fold.c | 4 +- src/lj_opt_loop.c | 10 +++-- src/lj_opt_mem.c | 97 ++++++++++++++++++++++++++++++----------------- src/lj_opt_sink.c | 3 +- src/lj_record.c | 4 +- src/lj_tab.c | 79 ++++++++++++++++++++++---------------- src/lj_tab.h | 3 ++ 12 files changed, 135 insertions(+), 82 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index dd84a4f2..90373f27 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -1634,6 +1634,12 @@ static void asm_fuseequal(ASMState *as, IRIns *ir) } } +static void asm_alen(ASMState *as, IRIns *ir) +{ + asm_callid(as, ir, ir->op2 == REF_NIL ? IRCALL_lj_tab_len : + IRCALL_lj_tab_len_hint); +} + /* -- Instruction dispatch ------------------------------------------------ */ /* Assemble a single instruction. */ @@ -1716,6 +1722,7 @@ static void asm_ir(ASMState *as, IRIns *ir) case IR_FLOAD: asm_fload(as, ir); break; case IR_XLOAD: asm_xload(as, ir); break; case IR_SLOAD: asm_sload(as, ir); break; + case IR_ALEN: asm_alen(as, ir); break; case IR_ASTORE: case IR_HSTORE: case IR_USTORE: asm_ahustore(as, ir); break; case IR_FSTORE: asm_fstore(as, ir); break; diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index 42049511..2557cadf 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -281,7 +281,7 @@ static void LJ_FASTCALL recff_rawlen(jit_State *J, RecordFFData *rd) if (tref_isstr(tr)) J->base[0] = emitir(IRTI(IR_FLOAD), tr, IRFL_STR_LEN); else if (tref_istab(tr)) - J->base[0] = lj_ir_call(J, IRCALL_lj_tab_len, tr); + J->base[0] = emitir(IRTI(IR_ALEN), tr, TREF_NIL); /* else: Interpreter will throw. 
*/ UNUSED(rd); } @@ -1026,7 +1026,7 @@ static void LJ_FASTCALL recff_table_insert(jit_State *J, RecordFFData *rd) rd->nres = 0; if (tref_istab(ix.tab) && ix.val) { if (!J->base[2]) { /* Simple push: t[#t+1] = v */ - TRef trlen = lj_ir_call(J, IRCALL_lj_tab_len, ix.tab); + TRef trlen = emitir(IRTI(IR_ALEN), ix.tab, TREF_NIL); GCtab *t = tabV(&rd->argv[0]); ix.key = emitir(IRTI(IR_ADD), trlen, lj_ir_kint(J, 1)); settabV(J->L, &ix.tabv, t); @@ -1050,7 +1050,7 @@ static void LJ_FASTCALL recff_table_concat(jit_State *J, RecordFFData *rd) lj_opt_narrow_toint(J, J->base[2]) : lj_ir_kint(J, 1); TRef tre = (J->base[1] && J->base[2] && !tref_isnil(J->base[3])) ? lj_opt_narrow_toint(J, J->base[3]) : - lj_ir_call(J, IRCALL_lj_tab_len, tab); + emitir(IRTI(IR_ALEN), tab, TREF_NIL); TRef hdr = recff_bufhdr(J); TRef tr = lj_ir_call(J, IRCALL_lj_buf_puttab, hdr, tab, sep, tri, tre); emitir(IRTG(IR_NE, IRT_PTR), tr, lj_ir_kptr(J, NULL)); diff --git a/src/lj_ir.h b/src/lj_ir.h index 1a9a89a3..a801d5d0 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -106,6 +106,7 @@ _(XLOAD, L , ref, lit) \ _(SLOAD, L , lit, lit) \ _(VLOAD, L , ref, ___) \ + _(ALEN, L , ref, ref) \ \ _(ASTORE, S , ref, ref) \ _(HSTORE, S , ref, ref) \ diff --git a/src/lj_ircall.h b/src/lj_ircall.h index 5c72478b..dbc8c0db 100644 --- a/src/lj_ircall.h +++ b/src/lj_ircall.h @@ -168,6 +168,7 @@ typedef struct CCallInfo { _(ANY, lj_tab_clear, 1, FS, NIL, 0) \ _(ANY, lj_tab_newkey, 3, S, PGC, CCI_L) \ _(ANY, lj_tab_len, 1, FL, INT, 0) \ + _(ANY, lj_tab_len_hint, 2, FL, INT, 0) \ _(ANY, lj_gc_step_jit, 2, FS, NIL, CCI_L) \ _(ANY, lj_gc_barrieruv, 2, FS, NIL, 0) \ _(ANY, lj_mem_newgco, 2, FS, PGC, CCI_L) \ diff --git a/src/lj_iropt.h b/src/lj_iropt.h index 02d6b946..8333483f 100644 --- a/src/lj_iropt.h +++ b/src/lj_iropt.h @@ -120,7 +120,7 @@ LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_hload(jit_State *J); LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_uload(jit_State *J); LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_fload(jit_State *J); LJ_FUNC TRef 
LJ_FASTCALL lj_opt_fwd_xload(jit_State *J); -LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_tab_len(jit_State *J); +LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_alen(jit_State *J); LJ_FUNC TRef LJ_FASTCALL lj_opt_fwd_hrefk(jit_State *J); LJ_FUNC int LJ_FASTCALL lj_opt_fwd_href_nokey(jit_State *J); LJ_FUNC int LJ_FASTCALL lj_opt_fwd_tptr(jit_State *J, IRRef lim); diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 7a02c6ff..42c57c9b 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -2132,8 +2132,8 @@ LJFOLDX(lj_opt_fwd_hload) LJFOLD(ULOAD any) LJFOLDX(lj_opt_fwd_uload) -LJFOLD(CALLL any IRCALL_lj_tab_len) -LJFOLDX(lj_opt_fwd_tab_len) +LJFOLD(ALEN any any) +LJFOLDX(lj_opt_fwd_alen) /* Upvalue refs are really loads, but there are no corresponding stores. ** So CSE is ok for them, except for UREFO across a GC step (see below). diff --git a/src/lj_opt_loop.c b/src/lj_opt_loop.c index c5919ca0..2eacb7d7 100644 --- a/src/lj_opt_loop.c +++ b/src/lj_opt_loop.c @@ -352,10 +352,12 @@ static void loop_unroll(LoopState *lps) irr = IR(ref); goto phiconv; } - } else if (ref != REF_DROP && irr->o == IR_CONV && - ref > invar && irr->op1 < invar) { - /* May need an extra PHI for a CONV. */ - ref = irr->op1; + } else if (ref != REF_DROP && ref > invar && + ((irr->o == IR_CONV && irr->op1 < invar) || + (irr->o == IR_ALEN && irr->op2 < invar && + irr->op2 != REF_NIL))) { + /* May need an extra PHI for a CONV or ALEN hint. */ + ref = irr->o == IR_CONV ? irr->op1 : irr->op2; irr = IR(ref); phiconv: if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) { diff --git a/src/lj_opt_mem.c b/src/lj_opt_mem.c index 079f7cfe..4c2c05fe 100644 --- a/src/lj_opt_mem.c +++ b/src/lj_opt_mem.c @@ -363,7 +363,7 @@ TRef LJ_FASTCALL lj_opt_dse_ahstore(jit_State *J) IRIns *ir; /* Check for any intervening guards (includes conflicting loads). 
*/ for (ir = IR(J->cur.nins-1); ir > store; ir--) - if (irt_isguard(ir->t) || ir->o == IR_CALLL) + if (irt_isguard(ir->t) || ir->o == IR_ALEN) goto doemit; /* No elimination possible. */ /* Remove redundant store from chain and replace with NOP. */ *refp = store->prev; @@ -381,6 +381,67 @@ doemit: return EMITFOLD; /* Otherwise we have a conflict or simply no match. */ } +/* ALEN forwarding. */ +TRef LJ_FASTCALL lj_opt_fwd_alen(jit_State *J) +{ + IRRef tab = fins->op1; /* Table reference. */ + IRRef lim = tab; /* Search limit. */ + IRRef ref; + + /* Search for conflicting HSTORE with numeric key. */ + ref = J->chain[IR_HSTORE]; + while (ref > lim) { + IRIns *store = IR(ref); + IRIns *href = IR(store->op1); + IRIns *key = IR(href->op2); + if (irt_isnum(key->o == IR_KSLOT ? IR(key->op1)->t : key->t)) { + lim = ref; /* Conflicting store found, limits search for ALEN. */ + break; + } + ref = store->prev; + } + + /* Try to find a matching ALEN. */ + ref = J->chain[IR_ALEN]; + while (ref > lim) { + /* CSE for ALEN only depends on the table, not the hint. */ + if (IR(ref)->op1 == tab) { + IRRef sref; + + /* Search for aliasing table.clear. */ + if (!fwd_aa_tab_clear(J, ref, tab)) + break; + + /* Search for hint-forwarding or conflicting store. */ + sref = J->chain[IR_ASTORE]; + while (sref > ref) { + IRIns *store = IR(sref); + IRIns *aref = IR(store->op1); + IRIns *fref = IR(aref->op1); + if (tab == fref->op1) { /* ASTORE to the same table. */ + /* Detect t[#t+1] = x idiom for push. */ + IRIns *idx = IR(aref->op2); + if (!irt_isnil(store->t) && + idx->o == IR_ADD && idx->op1 == ref && + IR(idx->op2)->o == IR_KINT && IR(idx->op2)->i == 1) { + /* Note: this requires an extra PHI check in loop unroll. */ + fins->op2 = aref->op2; /* Set ALEN hint. */ + } + goto doemit; /* Conflicting store, possibly giving a hint. */ + } else if (aa_table(J, tab, fref->op1) != ALIAS_NO) { + goto doemit; /* Possibly aliasing store conflicts. 
*/ + } + sref = store->prev; + } + + return ref; /* Plain ALEN forwarding. */ + } + ref = IR(ref)->prev; + } +doemit: + return EMITFOLD; +} + /* -- ULOAD forwarding ---------------------------------------------------- */ /* The current alias analysis for upvalues is very simplistic. It only @@ -430,7 +491,6 @@ TRef LJ_FASTCALL lj_opt_fwd_uload(jit_State *J) cselim: /* Try to find a matching load. Below the conflicting store, if any. */ - ref = J->chain[IR_ULOAD]; while (ref > lim) { IRIns *ir = IR(ref); @@ -845,39 +905,6 @@ doemit: return EMITFOLD; /* Otherwise we have a conflict or simply no match. */ } -/* -- Forwarding of lj_tab_len -------------------------------------------- */ - -/* This is rather simplistic right now, but better than nothing. */ -TRef LJ_FASTCALL lj_opt_fwd_tab_len(jit_State *J) -{ - IRRef tab = fins->op1; /* Table reference. */ - IRRef lim = tab; /* Search limit. */ - IRRef ref; - - /* Any ASTORE is a conflict and limits the search. */ - if (J->chain[IR_ASTORE] > lim) lim = J->chain[IR_ASTORE]; - - /* Search for conflicting HSTORE with numeric key. */ - ref = J->chain[IR_HSTORE]; - while (ref > lim) { - IRIns *store = IR(ref); - IRIns *href = IR(store->op1); - IRIns *key = IR(href->op2); - if (irt_isnum(key->o == IR_KSLOT ? IR(key->op1)->t : key->t)) { - lim = ref; /* Conflicting store found, limits search for TLEN. */ - break; - } - ref = store->prev; - } - - /* Search for aliasing table.clear. */ - if (!fwd_aa_tab_clear(J, lim, tab)) - return lj_ir_emit(J); - - /* Try to find a matching load. Below the conflicting store, if any. */ - return lj_opt_cselim(J, lim); -} - /* -- ASTORE/HSTORE previous type analysis -------------------------------- */ /* Check whether the previous value for a table store is non-nil. 
diff --git a/src/lj_opt_sink.c b/src/lj_opt_sink.c index c5323b11..11101702 100644 --- a/src/lj_opt_sink.c +++ b/src/lj_opt_sink.c @@ -78,8 +78,7 @@ static void sink_mark_ins(jit_State *J) switch (ir->o) { case IR_BASE: return; /* Finished. */ - case IR_CALLL: /* IRCALL_lj_tab_len */ - case IR_ALOAD: case IR_HLOAD: case IR_XLOAD: case IR_TBAR: + case IR_ALOAD: case IR_HLOAD: case IR_XLOAD: case IR_TBAR: case IR_ALEN: irt_setmark(IR(ir->op1)->t); /* Mark ref for remaining loads. */ break; case IR_FLOAD: diff --git a/src/lj_record.c b/src/lj_record.c index 8eec0071..4fc22742 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -1058,7 +1058,7 @@ static TRef rec_mm_len(jit_State *J, TRef tr, TValue *tv) lj_record_call(J, func, 2); } else { if (LJ_52 && tref_istab(tr)) - return lj_ir_call(J, IRCALL_lj_tab_len, tr); + return emitir(IRTI(IR_ALEN), tr, TREF_NIL); lj_trace_err(J, LJ_TRERR_NOMM); } return 0; /* No result yet. */ @@ -2191,7 +2191,7 @@ void lj_record_ins(jit_State *J) if (tref_isstr(rc)) rc = emitir(IRTI(IR_FLOAD), rc, IRFL_STR_LEN); else if (!LJ_52 && tref_istab(rc)) - rc = lj_ir_call(J, IRCALL_lj_tab_len, rc); + rc = emitir(IRTI(IR_ALEN), rc, TREF_NIL); else rc = rec_mm_len(J, rc, rcv); break; diff --git a/src/lj_tab.c b/src/lj_tab.c index dcd24d31..eb9ef4af 100644 --- a/src/lj_tab.c +++ b/src/lj_tab.c @@ -639,49 +639,62 @@ int lj_tab_next(lua_State *L, GCtab *t, TValue *key) /* -- Table length calculation -------------------------------------------- */ -static MSize unbound_search(GCtab *t, MSize j) +/* Compute table length. Slow path with mixed array/hash lookups. */ +LJ_NOINLINE static MSize tab_len_slow(GCtab *t, size_t hi) { cTValue *tv; - MSize i = j; /* i is zero or a present index */ - j++; - /* find `i' and `j' such that i is present and j is not */ - while ((tv = lj_tab_getint(t, (int32_t)j)) && !tvisnil(tv)) { - i = j; - j *= 2; - if (j > (MSize)(INT_MAX-2)) { /* overflow? 
*/ - /* table was built with bad purposes: resort to linear search */ - i = 1; - while ((tv = lj_tab_getint(t, (int32_t)i)) && !tvisnil(tv)) i++; - return i - 1; + size_t lo = hi; + hi++; + /* Widening search for an upper bound. */ + while ((tv = lj_tab_getint(t, (int32_t)hi)) && !tvisnil(tv)) { + lo = hi; + hi += hi; + if (hi > (size_t)(INT_MAX-2)) { /* Punt and do a linear search. */ + lo = 1; + while ((tv = lj_tab_getint(t, (int32_t)lo)) && !tvisnil(tv)) lo++; + return (MSize)(lo - 1); } } - /* now do a binary search between them */ - while (j - i > 1) { - MSize m = (i+j)/2; - cTValue *tvb = lj_tab_getint(t, (int32_t)m); - if (tvb && !tvisnil(tvb)) i = m; else j = m; + /* Binary search to find a non-nil to nil transition. */ + while (hi - lo > 1) { + size_t mid = (lo+hi) >> 1; + cTValue *tvb = lj_tab_getint(t, (int32_t)mid); + if (tvb && !tvisnil(tvb)) lo = mid; else hi = mid; } - return i; + return (MSize)lo; } -/* -** Try to find a boundary in table `t'. A `boundary' is an integer index -** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil). -*/ +/* Compute table length. Fast path. */ MSize LJ_FASTCALL lj_tab_len(GCtab *t) { - MSize j = (MSize)t->asize; - if (j > 1 && tvisnil(arrayslot(t, j-1))) { - MSize i = 1; - while (j - i > 1) { - MSize m = (i+j)/2; - if (tvisnil(arrayslot(t, m-1))) j = m; else i = m; + size_t hi = (size_t)t->asize; + if (hi) hi--; + /* In a growing array the last array element is very likely nil. */ + if (hi > 0 && LJ_LIKELY(tvisnil(arrayslot(t, hi)))) { + /* Binary search to find a non-nil to nil transition in the array. */ + size_t lo = 0; + while (hi - lo > 1) { + size_t mid = (lo+hi) >> 1; + if (tvisnil(arrayslot(t, mid))) hi = mid; else lo = mid; } - return i-1; + return (MSize)lo; } - if (j) j--; - if (t->hmask <= 0) - return j; - return unbound_search(t, j); + /* Without a hash part, there's an implicit nil after the last element. */ + return t->hmask ? 
tab_len_slow(t, hi) : (MSize)hi; } +#if LJ_HASJIT +/* Verify hinted table length or compute it. */ +MSize LJ_FASTCALL lj_tab_len_hint(GCtab *t, size_t hint) +{ + size_t asize = (size_t)t->asize; + cTValue *tv = arrayslot(t, hint); + if (LJ_LIKELY(hint+1 < asize)) { + if (LJ_LIKELY(!tvisnil(tv) && tvisnil(tv+1))) return (MSize)hint; + } else if (hint+1 <= asize && LJ_LIKELY(t->hmask == 0) && !tvisnil(tv)) { + return (MSize)hint; + } + return lj_tab_len(t); +} +#endif + diff --git a/src/lj_tab.h b/src/lj_tab.h index 597c94b2..f31590cd 100644 --- a/src/lj_tab.h +++ b/src/lj_tab.h @@ -69,5 +69,8 @@ LJ_FUNC TValue *lj_tab_set(lua_State *L, GCtab *t, cTValue *key); LJ_FUNCA int lj_tab_next(lua_State *L, GCtab *t, TValue *key); LJ_FUNCA MSize LJ_FASTCALL lj_tab_len(GCtab *t); +#if LJ_HASJIT +LJ_FUNC MSize LJ_FASTCALL lj_tab_len_hint(GCtab *t, size_t hint); +#endif #endif