From 19707009bfb8d1fe59a5c328034e8e8ad1b56232 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 12:41:47 +0200 Subject: [PATCH 01/95] Fix native MinGW build. Thanks to Victor Bombi. #1071 --- src/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 29c01747..dad90138 100644 --- a/src/Makefile +++ b/src/Makefile @@ -446,7 +446,11 @@ DASM_FLAGS= $(DASM_XFLAGS) $(DASM_AFLAGS) DASM_DASC= vm_$(DASM_ARCH).dasc GIT= git -GIT_RELVER= [ -e ../.git ] && $(GIT) show -s --format=%ct >luajit_relver.txt 2>/dev/null || cat ../.relver >luajit_relver.txt 2>/dev/null || : +ifeq (Windows,$(HOST_SYS)$(HOST_MSYS)) + GIT_RELVER= if exist ..\.git ( $(GIT) show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt ) +else + GIT_RELVER= [ -e ../.git ] && $(GIT) show -s --format=%ct >luajit_relver.txt 2>/dev/null || cat ../.relver >luajit_relver.txt 2>/dev/null || : +endif GIT_DEP= $(wildcard ../.git/HEAD ../.git/refs/heads/*) BUILDVM_O= host/buildvm.o host/buildvm_asm.o host/buildvm_peobj.o \ From 4d05806ae046838826f9bab3b3b804eae26cd017 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 12:47:27 +0200 Subject: [PATCH 02/95] Allow override of paths for genversion.lua. Thanks to arch1t3cht. #1067 --- src/host/genversion.lua | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/host/genversion.lua b/src/host/genversion.lua index 42b5e6fe..5ead4c2b 100644 --- a/src/host/genversion.lua +++ b/src/host/genversion.lua @@ -5,9 +5,9 @@ -- Released under the MIT license. 
See Copyright Notice in luajit.h ---------------------------------------------------------------------------- -local FILE_ROLLING_H = "luajit_rolling.h" -local FILE_RELVER_TXT = "luajit_relver.txt" -local FILE_LUAJIT_H = "luajit.h" +local FILE_ROLLING_H = arg[1] or "luajit_rolling.h" +local FILE_RELVER_TXT = arg[2] or "luajit_relver.txt" +local FILE_LUAJIT_H = arg[3] or "luajit.h" local function file_read(file) local fp = assert(io.open(file, "rb"), "run from the wrong directory") From 7f9907b4ed0870ba64342bcc4b26cff0a94540da Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 13:37:31 +0200 Subject: [PATCH 03/95] Add NaN check to IR_NEWREF. Thanks to Peter Cawley. #1069 --- src/lj_opt_fold.c | 5 ++++- src/lj_record.c | 12 +++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index ab158143..b437d672 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -1739,7 +1739,10 @@ LJFOLD(NE any any) LJFOLDF(comm_equal) { /* For non-numbers only: x == x ==> drop; x ~= x ==> fail */ - if (fins->op1 == fins->op2 && !irt_isnum(fins->t)) + if (fins->op1 == fins->op2 && + (!irt_isnum(fins->t) || + (fleft->o == IR_CONV && /* Converted integers cannot be NaN. */ + (uint32_t)(fleft->op2 & IRCONV_SRCMASK) - (uint32_t)IRT_I8 <= (uint32_t)(IRT_U64 - IRT_U8)))) return CONDFOLD(fins->o == IR_EQ); return fold_comm_swap(J); } diff --git a/src/lj_record.c b/src/lj_record.c index 0e14382c..dfcc3f65 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -1254,10 +1254,16 @@ TRef lj_record_idx(jit_State *J, RecordIndex *ix) lua_assert(!hasmm); if (oldv == niltvg(J2G(J))) { /* Need to insert a new key. */ TRef key = ix->key; - if (tref_isinteger(key)) /* NEWREF needs a TValue as a key. */ + if (tref_isinteger(key)) { /* NEWREF needs a TValue as a key. 
*/ key = emitir(IRTN(IR_CONV), key, IRCONV_NUM_INT); - else if (tref_isnumber(key) && tref_isk(key) && tvismzero(&ix->keyv)) - key = lj_ir_knum_zero(J); /* Canonicalize -0.0 to +0.0. */ + } else if (tref_isnum(key)) { + if (tref_isk(key)) { + if (tvismzero(&ix->keyv)) + key = lj_ir_knum_zero(J); /* Canonicalize -0.0 to +0.0. */ + } else { + emitir(IRTG(IR_EQ, IRT_NUM), key, key); /* Check for !NaN. */ + } + } xref = emitir(IRT(IR_NEWREF, IRT_P32), ix->tab, key); keybarrier = 0; /* NEWREF already takes care of the key barrier. */ } From 9daf9f9003ff29551ef3b6fe19f4abf868bfd414 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 14:11:25 +0200 Subject: [PATCH 04/95] ARM64: Improve K13 constant rematerialization. Algorithm by Dougall Johnson: https://dougallj.wordpress.com/2021/10/30/ Thanks to Peter Cawley. #1065 --- src/lj_emit_arm64.h | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 6926c71a..7205ce78 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -43,26 +43,18 @@ static uint32_t emit_isk12(int64_t n) /* Encode constant in K13 format for logical data processing instructions. */ static uint32_t emit_isk13(uint64_t n, int is64) { - int inv = 0, w = 128, lz, tz; - if (n & 1) { n = ~n; w = 64; inv = 1; } /* Avoid wrap-around of ones. */ - if (!n) return 0; /* Neither all-zero nor all-ones are allowed. */ - do { /* Find the repeat width. */ - if (is64 && (uint32_t)(n^(n>>32))) break; - n = (uint32_t)n; - if (!n) return 0; /* Ditto when passing n=0xffffffff and is64=0. */ - w = 32; if ((n^(n>>16)) & 0xffff) break; - n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break; - n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break; - n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break; - n = n & 0x3; w = 2; - } while (0); - lz = emit_clz64(n); - tz = emit_ctz64(n); - if ((int64_t)(n << lz) >> (lz+tz) != -1ll) return 0; /* Non-contiguous? 
*/ - if (inv) - return A64I_K13 | (((lz-w) & 127) << 16) | (((lz+tz-w-1) & 63) << 10); - else - return A64I_K13 | ((w-tz) << 16) | (((63-lz-tz-w-w) & 63) << 10); + /* Thanks to: https://dougallj.wordpress.com/2021/10/30/ */ + int rot, ones, size, immr, imms; + if (!is64) n = ((uint64_t)n << 32) | (uint32_t)n; + if ((n+1u) <= 1u) return 0; /* Neither all-zero nor all-ones are allowed. */ + rot = (n & (n+1u)) ? emit_ctz64(n & (n+1u)) : 64; + n = lj_ror(n, rot & 63); + ones = emit_ctz64(~n); + size = emit_clz64(n) + ones; + if (lj_ror(n, size & 63) != n) return 0; /* Non-repeating? */ + immr = -rot & (size - 1); + imms = (-(size << 1) | (ones - 1)) & 63; + return A64I_K13 | A64F_IMMR(immr | (size & 64)) | A64F_IMMS(imms); } static uint32_t emit_isfpk64(uint64_t n) From 4651ff2fbc30a8326bcfc8e9d719fbf30856f5e2 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 14:15:18 +0200 Subject: [PATCH 05/95] ARM64: Inline only use of emit_loada. Thanks to Peter Cawley. #1065 --- src/lj_asm_arm64.h | 9 ++++++--- src/lj_emit_arm64.h | 2 -- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 5e690308..a575269b 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -831,10 +831,13 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) /* Key not found in chain: jump to exit (if merged) or load niltv. */ l_end = emit_label(as); as->invmcp = NULL; - if (merge == IR_NE) + if (merge == IR_NE) { asm_guardcc(as, CC_AL); - else if (destused) - emit_loada(as, dest, niltvg(J2G(as->J))); + } else if (destused) { + uint32_t k12 = emit_isk12(offsetof(global_State, nilnode.val)); + lj_assertA(k12 != 0, "Cannot k12 encode niltv(L)"); + emit_dn(as, A64I_ADDx^k12, dest, RID_GL); + } /* Follow hash chain until the end. 
*/ l_loop = --as->mcp; diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 7205ce78..73df508c 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -230,8 +230,6 @@ static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64) /* Load a 64 bit constant into a GPR. */ #define emit_loadu64(as, rd, i) emit_loadk(as, rd, i, A64I_X) -#define emit_loada(as, r, addr) emit_loadu64(as, (r), (uintptr_t)(addr)) - #define glofs(as, k) \ ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g)) #define mcpofs(as, k) \ From dfc122e45ce0dd76a47794789b413aeaa4cc3773 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 14:20:39 +0200 Subject: [PATCH 06/95] ARM64: Tune emit_lsptr. Avoid wrong load for asm_prof. Thanks to Peter Cawley. #1065 --- src/lj_emit_arm64.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 73df508c..86626177 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -242,19 +242,20 @@ static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow); /* Get/set from constant pointer. */ static void emit_lsptr(ASMState *as, A64Ins ai, Reg r, void *p) { - /* First, check if ip + offset is in range. */ - if ((ai & 0x00400000) && checkmcpofs(as, p)) { + Reg base = RID_GL; + int64_t ofs = glofs(as, p); + if (emit_checkofs(ai, ofs)) { + /* GL + offset, might subsequently fuse to LDP/STP. */ + } else if (ai == A64I_LDRx && checkmcpofs(as, p)) { + /* IP + offset is cheaper than allock, but address must be in range. */ emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, p)>>2), r); - } else { - Reg base = RID_GL; /* Next, try GL + offset. */ - int64_t ofs = glofs(as, p); - if (!emit_checkofs(ai, ofs)) { /* Else split up into base reg + offset. 
*/ - int64_t i64 = i64ptr(p); - base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r)); - ofs = i64 & 0x7fffull; - } - emit_lso(as, ai, r, base, ofs); + return; + } else { /* Split up into base reg + offset. */ + int64_t i64 = i64ptr(p); + base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r)); + ofs = i64 & 0x7fffull; } + emit_lso(as, ai, r, base, ofs); } /* Load 64 bit IR constant into register. */ From 5149b0a3a2809fef155ff2b2f01c667d920db3c2 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 16:30:14 +0200 Subject: [PATCH 07/95] ARM64: Consolidate 32/64-bit constant handling in assembler. Thanks to Peter Cawley. #1065 --- src/lj_asm.c | 4 +++ src/lj_emit_arm64.h | 75 +++++++++++++++++++++++---------------------- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index c02a1b9e..844910ad 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -606,7 +606,11 @@ static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow) IRIns *ir = IR(ref); if ((ir->o == IR_KINT64 && k == (int64_t)ir_kint64(ir)->u64) || #if LJ_GC64 +#if LJ_TARGET_ARM64 + (ir->o == IR_KINT && (uint64_t)k == (uint32_t)ir->i) || +#else (ir->o == IR_KINT && k == ir->i) || +#endif (ir->o == IR_KGC && k == (intptr_t)ir_kgc(ir)) || ((ir->o == IR_KPTR || ir->o == IR_KKPTR) && k == (intptr_t)ir_kptr(ir)) diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 86626177..50e658dd 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -20,7 +20,7 @@ static uint64_t get_k64val(ASMState *as, IRRef ref) } else { lj_assertA(ir->o == IR_KINT || ir->o == IR_KNULL, "bad 64 bit const IR op %d", ir->o); - return ir->i; /* Sign-extended. */ + return (uint32_t)ir->i; /* Zero-extended. */ } } @@ -152,11 +152,10 @@ nopair: /* Prefer rematerialization of BASE/L from global_State over spills. */ #define emit_canremat(ref) ((ref) <= ASMREF_L) -/* Try to find an N-step delta relative to other consts with N < lim. 
*/ -static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim) +/* Try to find a one-step delta relative to other consts. */ +static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int is64) { RegSet work = (~as->freeset & RSET_GPR) | RID2RSET(RID_GL); - if (lim <= 1) return 0; /* Can't beat that. */ while (work) { Reg r = rset_picktop(work); IRRef ref = regcost_ref(as->cost[r]); @@ -165,13 +164,14 @@ static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim) uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) : get_k64val(as, ref); int64_t delta = (int64_t)(k - kx); + if (!is64) delta = (int64_t)(int32_t)delta; /* Sign-extend. */ if (delta == 0) { - emit_dm(as, A64I_MOVx, rd, r); + emit_dm(as, is64|A64I_MOVw, rd, r); return 1; } else { uint32_t k12 = emit_isk12(delta < 0 ? (int64_t)(~(uint64_t)delta+1u) : delta); if (k12) { - emit_dn(as, (delta < 0 ? A64I_SUBx : A64I_ADDx)^k12, rd, r); + emit_dn(as, (delta < 0 ? A64I_SUBw : A64I_ADDw)^is64^k12, rd, r); return 1; } /* Do other ops or multi-step deltas pay off? Probably not. @@ -184,51 +184,52 @@ static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim) return 0; /* Failed. */ } -static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64) +static void emit_loadk(ASMState *as, Reg rd, uint64_t u64) { - int i, zeros = 0, ones = 0, neg; - if (!is64) u64 = (int64_t)(int32_t)u64; /* Sign-extend. */ - /* Count homogeneous 16 bit fragments. */ - for (i = 0; i < 4; i++) { - uint64_t frag = (u64 >> i*16) & 0xffff; - zeros += (frag == 0); - ones += (frag == 0xffff); + int zeros = 0, ones = 0, neg, lshift = 0; + int is64 = (u64 >> 32) ? A64I_X : 0, i = is64 ? 4 : 2; + /* Count non-homogeneous 16 bit fragments. */ + while (--i >= 0) { + uint32_t frag = (u64 >> i*16) & 0xffff; + zeros += (frag != 0); + ones += (frag != 0xffff); } - neg = ones > zeros; /* Use MOVN if it pays off. */ - if ((neg ? ones : zeros) < 3) { /* Need 2+ ins. Try shorter K13 encoding. 
*/ + neg = ones < zeros; /* Use MOVN if it pays off. */ + if ((neg ? ones : zeros) > 1) { /* Need 2+ ins. Try 1 ins encodings. */ uint32_t k13 = emit_isk13(u64, is64); if (k13) { emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO); return; } - } - if (!emit_kdelta(as, rd, u64, 4 - (neg ? ones : zeros))) { - int shift = 0, lshift = 0; - uint64_t n64 = neg ? ~u64 : u64; - if (n64 != 0) { - /* Find first/last fragment to be filled. */ - shift = (63-emit_clz64(n64)) & ~15; - lshift = emit_ctz64(n64) & ~15; + if (emit_kdelta(as, rd, u64, is64)) { + return; } - /* MOVK requires the original value (u64). */ - while (shift > lshift) { - uint32_t u16 = (u64 >> shift) & 0xffff; - /* Skip fragments that are correctly filled by MOVN/MOVZ. */ - if (u16 != (neg ? 0xffff : 0)) - emit_d(as, is64 | A64I_MOVKw | A64F_U16(u16) | A64F_LSL16(shift), rd); - shift -= 16; - } - /* But MOVN needs an inverted value (n64). */ - emit_d(as, (neg ? A64I_MOVNx : A64I_MOVZx) | - A64F_U16((n64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd); } + if (neg) { + u64 = ~u64; + if (!is64) u64 = (uint32_t)u64; + } + if (u64) { + /* Find first/last fragment to be filled. */ + int shift = (63-emit_clz64(u64)) & ~15; + lshift = emit_ctz64(u64) & ~15; + for (; shift > lshift; shift -= 16) { + uint32_t frag = (u64 >> shift) & 0xffff; + if (frag == 0) continue; /* Will be correctly filled by MOVN/MOVZ. */ + if (neg) frag ^= 0xffff; /* MOVK requires the original value. */ + emit_d(as, is64 | A64I_MOVKw | A64F_U16(frag) | A64F_LSL16(shift), rd); + } + } + /* But MOVN needs an inverted value. */ + emit_d(as, is64 | (neg ? A64I_MOVNw : A64I_MOVZw) | + A64F_U16((u64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd); } /* Load a 32 bit constant into a GPR. */ -#define emit_loadi(as, rd, i) emit_loadk(as, rd, i, 0) +#define emit_loadi(as, rd, i) emit_loadk(as, rd, (uint32_t)i) /* Load a 64 bit constant into a GPR. 
*/ -#define emit_loadu64(as, rd, i) emit_loadk(as, rd, i, A64I_X) +#define emit_loadu64(as, rd, i) emit_loadk(as, rd, i) #define glofs(as, k) \ ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g)) From 315dc3e776d3199269a464b17d07c48064d3fd09 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 16:56:16 +0200 Subject: [PATCH 08/95] ARM64: Reload BASE via GL instead of spilling it. Thanks to Peter Cawley. #1068. --- src/lj_asm_arm64.h | 91 ++++++++++++++++++--------------------------- src/lj_emit_arm64.h | 2 +- 2 files changed, 38 insertions(+), 55 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index a575269b..b8fbf69b 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -541,8 +541,6 @@ static void asm_retf(ASMState *as, IRIns *ir) as->topslot -= (BCReg)delta; if ((int32_t)as->topslot < 0) as->topslot = 0; irt_setmark(IR(REF_BASE)->t); /* Children must not coalesce with BASE reg. */ - /* Need to force a spill on REF_BASE now to update the stack slot. */ - emit_lso(as, A64I_STRx, base, RID_SP, ra_spill(as, IR(REF_BASE))); emit_setgl(as, base, jit_base); emit_addptr(as, base, -8*delta); asm_guardcc(as, CC_NE); @@ -1794,37 +1792,28 @@ static void asm_prof(ASMState *as, IRIns *ir) static void asm_stack_check(ASMState *as, BCReg topslot, IRIns *irp, RegSet allow, ExitNo exitno) { - Reg pbase; uint32_t k; + Reg pbase = RID_BASE; if (irp) { - if (!ra_hasspill(irp->s)) { - pbase = irp->r; - lj_assertA(ra_hasreg(pbase), "base reg lost"); - } else if (allow) { - pbase = rset_pickbot(allow); - } else { - pbase = RID_RET; - emit_lso(as, A64I_LDRx, RID_RET, RID_SP, 0); /* Restore temp register. */ - } - } else { - pbase = RID_BASE; + pbase = irp->r; + if (!ra_hasreg(pbase)) + pbase = allow ? (0x40 | rset_pickbot(allow)) : (0xC0 | RID_RET); } emit_cond_branch(as, CC_LS, asm_exitstub_addr(as, exitno)); + if (pbase & 0x80) /* Restore temp. register. 
*/ + emit_lso(as, A64I_LDRx, (pbase & 31), RID_SP, 0); k = emit_isk12((8*topslot)); lj_assertA(k, "slot offset %d does not fit in K12", 8*topslot); emit_n(as, A64I_CMPx^k, RID_TMP); - emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, pbase); + emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, (pbase & 31)); emit_lso(as, A64I_LDRx, RID_TMP, RID_TMP, (int32_t)offsetof(lua_State, maxstack)); - if (irp) { /* Must not spill arbitrary registers in head of side trace. */ - if (ra_hasspill(irp->s)) - emit_lso(as, A64I_LDRx, pbase, RID_SP, sps_scale(irp->s)); - emit_lso(as, A64I_LDRx, RID_TMP, RID_GL, glofs(as, &J2G(as->J)->cur_L)); - if (ra_hasspill(irp->s) && !allow) - emit_lso(as, A64I_STRx, RID_RET, RID_SP, 0); /* Save temp register. */ - } else { - emit_getgl(as, RID_TMP, cur_L); + if (pbase & 0x40) { + emit_getgl(as, (pbase & 31), jit_base); + if (pbase & 0x80) /* Save temp register. */ + emit_lso(as, A64I_STRx, (pbase & 31), RID_SP, 0); } + emit_getgl(as, RID_TMP, cur_L); } /* Restore Lua stack from on-trace state. */ @@ -1921,46 +1910,40 @@ static void asm_loop_tail_fixup(ASMState *as) /* -- Head of trace ------------------------------------------------------- */ -/* Reload L register from g->cur_L. */ -static void asm_head_lreg(ASMState *as) -{ - IRIns *ir = IR(ASMREF_L); - if (ra_used(ir)) { - Reg r = ra_dest(as, ir, RSET_GPR); - emit_getgl(as, r, cur_L); - ra_evictk(as); - } -} - /* Coalesce BASE register for a root trace. */ static void asm_head_root_base(ASMState *as) { - IRIns *ir; - asm_head_lreg(as); - ir = IR(REF_BASE); - if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t))) - ra_spill(as, ir); - ra_destreg(as, ir, RID_BASE); + IRIns *ir = IR(REF_BASE); + Reg r = ir->r; + if (ra_hasreg(r)) { + ra_free(as, r); + if (rset_test(as->modset, r) || irt_ismarked(ir->t)) + ir->r = RID_INIT; /* No inheritance for modified BASE register. */ + if (r != RID_BASE) + emit_movrr(as, ir, r, RID_BASE); + } } /* Coalesce BASE register for a side trace. 
*/ static Reg asm_head_side_base(ASMState *as, IRIns *irp) { - IRIns *ir; - asm_head_lreg(as); - ir = IR(REF_BASE); - if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t))) - ra_spill(as, ir); - if (ra_hasspill(irp->s)) { - return ra_dest(as, ir, RSET_GPR); - } else { - Reg r = irp->r; - lj_assertA(ra_hasreg(r), "base reg lost"); - if (r != ir->r && !rset_test(as->freeset, r)) - ra_restore(as, regcost_ref(as->cost[r])); - ra_destreg(as, ir, r); - return r; + IRIns *ir = IR(REF_BASE); + Reg r = ir->r; + if (ra_hasreg(r)) { + ra_free(as, r); + if (rset_test(as->modset, r) || irt_ismarked(ir->t)) + ir->r = RID_INIT; /* No inheritance for modified BASE register. */ + if (irp->r == r) { + return r; /* Same BASE register already coalesced. */ + } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) { + /* Move from coalesced parent reg. */ + emit_movrr(as, ir, r, irp->r); + return irp->r; + } else { + emit_getgl(as, r, jit_base); /* Otherwise reload BASE. */ + } } + return RID_NONE; } /* -- Tail of trace ------------------------------------------------------- */ diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 50e658dd..d4c54255 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -150,7 +150,7 @@ nopair: /* -- Emit loads/stores --------------------------------------------------- */ /* Prefer rematerialization of BASE/L from global_State over spills. */ -#define emit_canremat(ref) ((ref) <= ASMREF_L) +#define emit_canremat(ref) ((ref) <= REF_BASE) /* Try to find a one-step delta relative to other consts. */ static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int is64) From 435d8c630135d4f6a54f2ecf7be2d7e805652f80 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 17:15:26 +0200 Subject: [PATCH 09/95] ARM64: Improve IR_HREF code generation. Thanks to Peter Cawley. 
#1070 --- src/lj_asm_arm64.h | 128 +++++++++++++++------------------------------ 1 file changed, 41 insertions(+), 87 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index b8fbf69b..c5ebd324 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -773,57 +773,36 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) int destused = ra_used(ir); Reg dest = ra_dest(as, ir, allow); Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest)); - Reg key = 0, tmp = RID_TMP; - Reg ftmp = RID_NONE, type = RID_NONE, scr = RID_NONE, tisnum = RID_NONE; + Reg key = 0, tmp = RID_TMP, type = RID_NONE, tkey; IRRef refkey = ir->op2; IRIns *irkey = IR(refkey); - int isk = irref_isk(ir->op2); + int isk = irref_isk(refkey); IRType1 kt = irkey->t; uint32_t k = 0; uint32_t khash; - MCLabel l_end, l_loop, l_next; + MCLabel l_end, l_loop; rset_clear(allow, tab); - if (!isk) { - key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow); + /* Allocate registers outside of the loop. */ + if (irkey->o != IR_KNUM || !(k = emit_isk12((int64_t)ir_knum(irkey)->u64))) { + key = ra_alloc1(as, refkey, irt_isnum(kt) ? RSET_FPR : allow); rset_clear(allow, key); - if (!irt_isstr(kt)) { - tmp = ra_scratch(as, allow); - rset_clear(allow, tmp); - } - } else if (irt_isnum(kt)) { - int64_t val = (int64_t)ir_knum(irkey)->u64; - if (!(k = emit_isk12(val))) { - key = ra_allock(as, val, allow); - rset_clear(allow, key); - } - } else if (!irt_ispri(kt)) { - if (!(k = emit_isk12(irkey->i))) { - key = ra_alloc1(as, refkey, allow); - rset_clear(allow, key); - } } - - /* Allocate constants early. 
*/ - if (irt_isnum(kt)) { - if (!isk) { - tisnum = ra_allock(as, LJ_TISNUM << 15, allow); - ftmp = ra_scratch(as, rset_exclude(RSET_FPR, key)); - rset_clear(allow, tisnum); - } - } else if (irt_isaddr(kt)) { - if (isk) { - int64_t kk = ((int64_t)irt_toitype(kt) << 47) | irkey[1].tv.u64; - scr = ra_allock(as, kk, allow); - } else { - scr = ra_scratch(as, allow); - } - rset_clear(allow, scr); + if (!isk) { + tkey = ra_scratch(as, allow); + rset_clear(allow, tkey); + } else if (irt_isnum(kt)) { + tkey = key; /* Assumes -0.0 is already canonicalized to +0.0. */ } else { - lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type"); - type = ra_allock(as, ~((int64_t)~irt_toitype(kt) << 47), allow); - scr = ra_scratch(as, rset_clear(allow, type)); - rset_clear(allow, scr); + int64_t kk; + if (irt_isaddr(kt)) { + kk = ((int64_t)irt_toitype(kt) << 47) | irkey[1].tv.u64; + } else { + lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type"); + kk = ~((int64_t)~irt_toitype(kt) << 47); + } + tkey = ra_allock(as, kk, allow); + rset_clear(allow, tkey); } /* Key not found in chain: jump to exit (if merged) or load niltv. */ @@ -839,50 +818,31 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) /* Follow hash chain until the end. */ l_loop = --as->mcp; - emit_n(as, A64I_CMPx^A64I_K12^0, dest); - emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next)); - l_next = emit_label(as); + if (destused) + emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next)); /* Type and value comparison. */ if (merge == IR_EQ) asm_guardcc(as, CC_EQ); else emit_cond_branch(as, CC_EQ, l_end); + emit_nm(as, A64I_CMPx^k, tmp, tkey); + if (!destused) + emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next)); + emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key)); + *l_loop = A64I_X | A64I_CBNZ | A64F_S19(as->mcp - l_loop) | dest; - if (irt_isnum(kt)) { - if (isk) { - /* Assumes -0.0 is already canonicalized to +0.0. 
*/ - if (k) - emit_n(as, A64I_CMPx^k, tmp); - else - emit_nm(as, A64I_CMPx, key, tmp); - emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64)); + /* Construct tkey as canonicalized or tagged key. */ + if (!isk) { + if (irt_isnum(kt)) { + emit_dnm(as, A64I_CSELx | A64F_CC(CC_EQ), tkey, RID_ZERO, tkey); } else { - emit_nm(as, A64I_FCMPd, key, ftmp); - emit_dn(as, A64I_FMOV_D_R, (ftmp & 31), (tmp & 31)); - emit_cond_branch(as, CC_LO, l_next); - emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), tisnum, tmp); - emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.n)); + lj_assertA(irt_isaddr(kt), "bad HREF key type"); + type = ra_allock(as, irt_toitype(kt) << 15, allow); + emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 32), tkey, key, type); } - } else if (irt_isaddr(kt)) { - if (isk) { - emit_nm(as, A64I_CMPx, scr, tmp); - emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64)); - } else { - emit_nm(as, A64I_CMPx, tmp, scr); - emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key.u64)); - } - } else { - emit_nm(as, A64I_CMPx, scr, type); - emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key)); } - *l_loop = A64I_BCC | A64F_S19(as->mcp - l_loop) | CC_NE; - if (!isk && irt_isaddr(kt)) { - type = ra_allock(as, (int32_t)irt_toitype(kt), allow); - emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, key, type); - rset_clear(allow, type); - } /* Load main position relative to tab->node into dest. */ khash = isk ? ir_khash(as, irkey) : 1; if (khash == 0) { @@ -896,7 +856,6 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_dnm(as, A64I_ANDw, dest, dest, tmphash); emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask)); } else if (irt_isstr(kt)) { - /* Fetch of str->sid is cheaper than ra_allock. 
*/ emit_dnm(as, A64I_ANDw, dest, dest, tmp); emit_lso(as, A64I_LDRw, tmp, key, offsetof(GCstr, sid)); emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask)); @@ -905,23 +864,18 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_lso(as, A64I_LDRw, tmp, tab, offsetof(GCtab, hmask)); emit_dnm(as, A64I_SUBw, dest, dest, tmp); emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT3)), tmp, tmp, tmp); - emit_dnm(as, A64I_EORw, dest, dest, tmp); - emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT2)), dest, dest, dest); + emit_dnm(as, A64I_EORw | A64F_SH(A64SH_ROR, 32-HASH_ROT2), dest, tmp, dest); emit_dnm(as, A64I_SUBw, tmp, tmp, dest); emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT1)), dest, dest, dest); - emit_dnm(as, A64I_EORw, tmp, tmp, dest); if (irt_isnum(kt)) { + emit_dnm(as, A64I_EORw, tmp, tkey, dest); emit_dnm(as, A64I_ADDw, dest, dest, dest); - emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest); - emit_dm(as, A64I_MOVw, tmp, dest); - emit_dn(as, A64I_FMOV_R_D, dest, (key & 31)); + emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, tkey); + emit_nm(as, A64I_FCMPZd, (key & 31), 0); + emit_dn(as, A64I_FMOV_R_D, tkey, (key & 31)); } else { - checkmclim(as); - emit_dm(as, A64I_MOVw, tmp, key); - emit_dnm(as, A64I_EORw, dest, dest, - ra_allock(as, irt_toitype(kt) << 15, allow)); - emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest); - emit_dm(as, A64I_MOVx, dest, key); + emit_dnm(as, A64I_EORw, tmp, key, dest); + emit_dnm(as, A64I_EORx | A64F_SH(A64SH_LSR, 32), dest, type, key); } } } From c2bdce399ebd89a171d2622dd6e0b738aa94a3f5 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 17:19:02 +0200 Subject: [PATCH 10/95] ARM64: Improve IR_UREF code generation. Thanks to Peter Cawley. 
#1070 --- src/lj_asm_arm64.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index c5ebd324..68749284 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -924,18 +924,16 @@ static void asm_uref(ASMState *as, IRIns *ir) MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v; emit_lsptr(as, A64I_LDRx, dest, v); } else { - Reg uv = ra_scratch(as, RSET_GPR); - Reg func = ra_alloc1(as, ir->op1, RSET_GPR); if (ir->o == IR_UREFC) { - asm_guardcc(as, CC_NE); - emit_n(as, (A64I_CMPx^A64I_K12) | A64F_U12(1), RID_TMP); - emit_opk(as, A64I_ADDx, dest, uv, + asm_guardcnb(as, A64I_CBZ, RID_TMP); + emit_opk(as, A64I_ADDx, dest, dest, (int32_t)offsetof(GCupval, tv), RSET_GPR); - emit_lso(as, A64I_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed)); + emit_lso(as, A64I_LDRB, RID_TMP, dest, + (int32_t)offsetof(GCupval, closed)); } else { - emit_lso(as, A64I_LDRx, dest, uv, (int32_t)offsetof(GCupval, v)); + emit_lso(as, A64I_LDRx, dest, dest, (int32_t)offsetof(GCupval, v)); } - emit_lso(as, A64I_LDRx, uv, func, + emit_lso(as, A64I_LDRx, dest, ra_alloc1(as, ir->op1, RSET_GPR), (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8)); } } From c1877e648a5eeb96deda7080c6a43aed1b1a35ea Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 17:21:32 +0200 Subject: [PATCH 11/95] ARM64: Improve IR_OBAR code generation. Thanks to Peter Cawley. #1070 --- src/lj_asm_arm64.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 68749284..5fd7bf07 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -1283,7 +1283,6 @@ static void asm_obar(ASMState *as, IRIns *ir) const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv]; IRRef args[2]; MCLabel l_end; - RegSet allow = RSET_GPR; Reg obj, val, tmp; /* No need for other object barriers (yet). 
*/ lj_assertA(IR(ir->op1)->o == IR_UREFC, "bad OBAR type"); @@ -1294,14 +1293,13 @@ static void asm_obar(ASMState *as, IRIns *ir) asm_gencall(as, ci, args); emit_dm(as, A64I_MOVx, ra_releasetmp(as, ASMREF_TMP1), RID_GL); obj = IR(ir->op1)->r; - tmp = ra_scratch(as, rset_exclude(allow, obj)); - emit_cond_branch(as, CC_EQ, l_end); - emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), tmp); + tmp = ra_scratch(as, rset_exclude(RSET_GPR, obj)); + emit_tnb(as, A64I_TBZ, tmp, lj_ffs(LJ_GC_BLACK), l_end); emit_cond_branch(as, CC_EQ, l_end); emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_WHITES, 0), RID_TMP); val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj)); emit_lso(as, A64I_LDRB, tmp, obj, - (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv)); + (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv)); emit_lso(as, A64I_LDRB, RID_TMP, val, (int32_t)offsetof(GChead, marked)); } From a5ee35867c6dd359a04f58913e9a21f1649d68b3 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 17:31:06 +0200 Subject: [PATCH 12/95] ARM64: Use RID_TMP instead of scratch register in more places. Thanks to Peter Cawley. #1070 --- src/lj_asm_arm64.h | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 5fd7bf07..9ea2d405 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -890,7 +890,7 @@ static void asm_hrefk(ASMState *as, IRIns *ir) int bigofs = !emit_checkofs(A64I_LDRx, kofs); Reg dest = (ra_used(ir) || bigofs) ? 
ra_dest(as, ir, RSET_GPR) : RID_NONE; Reg node = ra_alloc1(as, ir->op1, RSET_GPR); - Reg key, idx = node; + Reg idx = node; RegSet allow = rset_exclude(RSET_GPR, node); uint64_t k; lj_assertA(ofs % sizeof(Node) == 0, "unaligned HREFK slot"); @@ -909,9 +909,8 @@ static void asm_hrefk(ASMState *as, IRIns *ir) } else { k = ((uint64_t)irt_toitype(irkey->t) << 47) | (uint64_t)ir_kgc(irkey); } - key = ra_scratch(as, allow); - emit_nm(as, A64I_CMPx, key, ra_allock(as, k, rset_exclude(allow, key))); - emit_lso(as, A64I_LDRx, key, idx, kofs); + emit_nm(as, A64I_CMPx, RID_TMP, ra_allock(as, k, allow)); + emit_lso(as, A64I_LDRx, RID_TMP, idx, kofs); if (bigofs) emit_opk(as, A64I_ADDx, dest, node, ofs, rset_exclude(RSET_GPR, node)); } @@ -1039,7 +1038,7 @@ static void asm_xstore(ASMState *as, IRIns *ir) static void asm_ahuvload(ASMState *as, IRIns *ir) { - Reg idx, tmp, type; + Reg idx, tmp; int32_t ofs = 0; RegSet gpr = RSET_GPR, allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR; lj_assertA(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) || @@ -1058,8 +1057,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) } else { tmp = ra_scratch(as, gpr); } - type = ra_scratch(as, rset_clear(gpr, tmp)); - idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, type), A64I_LDRx); + idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, tmp), A64I_LDRx); rset_clear(gpr, idx); if (ofs & FUSE_REG) rset_clear(gpr, ofs & 31); if (ir->o == IR_VLOAD) ofs += 8 * ir->op2; @@ -1071,8 +1069,8 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), ra_allock(as, LJ_TISNUM << 15, gpr), tmp); } else if (irt_isaddr(ir->t)) { - emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), type); - emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp); + emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), RID_TMP); + emit_dn(as, A64I_ASRx | A64F_IMMR(47), RID_TMP, tmp); } else if (irt_isnil(ir->t)) { emit_n(as, (A64I_CMNx^A64I_K12) | 
A64F_U12(1), tmp); } else { @@ -1195,9 +1193,8 @@ dotypecheck: emit_nm(as, A64I_CMPx, ra_allock(as, ~((int64_t)~irt_toitype(t) << 47) , allow), tmp); } else { - Reg type = ra_scratch(as, allow); - emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), type); - emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp); + emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), RID_TMP); + emit_dn(as, A64I_ASRx | A64F_IMMR(47), RID_TMP, tmp); } emit_lso(as, A64I_LDRx, tmp, base, ofs); return; @@ -1805,7 +1802,7 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap) /* Marker to prevent patching the GC check exit. */ #define ARM64_NOPATCH_GC_CHECK \ - (A64I_ORRx|A64F_D(RID_TMP)|A64F_M(RID_TMP)|A64F_N(RID_TMP)) + (A64I_ORRx|A64F_D(RID_ZERO)|A64F_M(RID_ZERO)|A64F_N(RID_ZERO)) /* Check GC threshold and do one or more GC steps. */ static void asm_gc_check(ASMState *as) From 4ed83bd990cbe2062a2a7392d7f5d65bc7c2ba04 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 17:34:28 +0200 Subject: [PATCH 13/95] ARM64: Simplify code generation for IR_STRTO. Thanks to Peter Cawley. 
#1070 --- src/lj_asm_arm64.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 9ea2d405..d2cb7823 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -664,25 +664,22 @@ static void asm_strto(ASMState *as, IRIns *ir) { const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num]; IRRef args[2]; - Reg dest = 0, tmp; - int destused = ra_used(ir); + Reg tmp; int32_t ofs = 0; ra_evictset(as, RSET_SCRATCH); - if (destused) { + if (ra_used(ir)) { if (ra_hasspill(ir->s)) { ofs = sps_scale(ir->s); - destused = 0; if (ra_hasreg(ir->r)) { ra_free(as, ir->r); ra_modified(as, ir->r); emit_spload(as, ir, ir->r, ofs); } } else { - dest = ra_dest(as, ir, RSET_FPR); + Reg dest = ra_dest(as, ir, RSET_FPR); + emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0); } } - if (destused) - emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0); asm_guardcnb(as, A64I_CBZ, RID_RET); args[0] = ir->op1; /* GCstr *str */ args[1] = ASMREF_TMP1; /* TValue *n */ From 6c599960d15888baa717956faaae83170188800d Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 17:36:40 +0200 Subject: [PATCH 14/95] ARM64: Improve integer IR_MUL code generation. Thanks to Peter Cawley. #1070 --- src/lj_asm_arm64.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index d2cb7823..d9866e9d 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -1391,8 +1391,7 @@ static void asm_intmul(ASMState *as, IRIns *ir) if (irt_isguard(ir->t)) { /* IR_MULOV */ asm_guardcc(as, CC_NE); emit_dm(as, A64I_MOVw, dest, dest); /* Zero-extend. */ - emit_nm(as, A64I_CMPw | A64F_SH(A64SH_ASR, 31), RID_TMP, dest); - emit_dn(as, A64I_ASRx | A64F_IMMR(32), RID_TMP, dest); + emit_nm(as, A64I_CMPx | A64F_EX(A64EX_SXTW), dest, dest); emit_dnm(as, A64I_SMULL, dest, right, left); } else { emit_dnm(as, irt_is64(ir->t) ? 
A64I_MULx : A64I_MULw, dest, left, right); From de2e09f54c75c8767895e48bc1dd5aa0608dbab3 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 17:38:44 +0200 Subject: [PATCH 15/95] ARM64: Improve BC_JLOOP. Thanks to Peter Cawley. #1070 --- src/vm_arm64.dasc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index 698b4210..b94a9c0e 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -3816,9 +3816,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |.if JIT | // RA = base (ignored), RC = traceno | ldr CARG1, [GL, #GL_J(trace)] - | mov CARG2w, #0 // Traces on ARM64 don't store the trace #, so use 0. + | st_vmstate wzr // Traces on ARM64 don't store the trace #, so use 0. | ldr TRACE:RC, [CARG1, RC, lsl #3] - | st_vmstate CARG2w |.if PAUTH | ldr RA, TRACE:RC->mcauth |.else From 43eff4aad45e03ac7e415cd20b97161897b14756 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 17:44:54 +0200 Subject: [PATCH 16/95] Fix mcode limit check for non-x86 archs. Thanks to Peter Cawley. --- src/lj_mcode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_mcode.c b/src/lj_mcode.c index 7013cd7d..4a1ba4aa 100644 --- a/src/lj_mcode.c +++ b/src/lj_mcode.c @@ -371,7 +371,7 @@ void lj_mcode_limiterr(jit_State *J, size_t need) sizemcode = (size_t)J->param[JIT_P_sizemcode] << 10; sizemcode = (sizemcode + LJ_PAGESIZE-1) & ~(size_t)(LJ_PAGESIZE - 1); maxmcode = (size_t)J->param[JIT_P_maxmcode] << 10; - if ((size_t)need > sizemcode) + if (need * sizeof(MCode) > sizemcode) lj_trace_err(J, LJ_TRERR_MCODEOV); /* Too long for any area. */ if (J->szallmcarea + sizemcode > maxmcode) lj_trace_err(J, LJ_TRERR_MCODEAL); From 0705ef6ce41320b097cfb4f3c9a2a876c1949e86 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 17:52:43 +0200 Subject: [PATCH 17/95] ARM64: Ensure branch is in range before emitting TBZ/TBNZ. Thanks to Peter Cawley. 
#1074 --- src/lj_asm_arm64.h | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index d9866e9d..05bdc78a 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -84,18 +84,23 @@ static void asm_guardcc(ASMState *as, A64CC cc) emit_cond_branch(as, cc, target); } -/* Emit test and branch instruction to exit for guard. */ -static void asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit) +/* Emit test and branch instruction to exit for guard, if in range. */ +static int asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit) { MCode *target = asm_exitstub_addr(as, as->snapno); MCode *p = as->mcp; + ptrdiff_t delta = target - p; if (LJ_UNLIKELY(p == as->invmcp)) { + if (as->orignins > 1023) return 0; /* Delta might end up too large. */ as->loopinv = 1; - *p = A64I_B | A64F_S26(target-p); - emit_tnb(as, ai^0x01000000u, r, bit, p-1); - return; + *p = A64I_B | A64F_S26(delta); + ai ^= 0x01000000u; + target = p-1; + } else if (LJ_UNLIKELY(delta >= 0x1fff)) { + return 0; } emit_tnb(as, ai, r, bit, target); + return 1; } /* Emit compare and branch instruction to exit for guard. */ @@ -1651,16 +1656,15 @@ static void asm_intcomp(ASMState *as, IRIns *ir) if (asm_swapops(as, blref, brref)) { Reg tmp = blref; blref = brref; brref = tmp; } + bleft = ra_alloc1(as, blref, RSET_GPR); if (irref_isk(brref)) { uint64_t k = get_k64val(as, brref); - if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE)) { - asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ, - ra_alloc1(as, blref, RSET_GPR), emit_ctz64(k)); + if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE) && + asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ, bleft, + emit_ctz64(k))) return; - } m2 = emit_isk13(k, irt_is64(irl->t)); } - bleft = ra_alloc1(as, blref, RSET_GPR); ai = (irt_is64(irl->t) ? 
A64I_TSTx : A64I_TSTw); if (!m2) m2 = asm_fuseopm(as, ai, brref, rset_exclude(RSET_GPR, bleft)); From b8c6ccd50c61b7a2df5123ddc5a85ac7d089542b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 18:01:37 +0200 Subject: [PATCH 18/95] ARM64: Fix LDP/STP fusion (again). Reported and analyzed by Zhongwei Yao. Fix by Peter Cawley. #1075 --- src/lj_emit_arm64.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index d4c54255..9161c958 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -113,6 +113,17 @@ static int emit_checkofs(A64Ins ai, int64_t ofs) } } +static LJ_AINLINE uint32_t emit_lso_pair_candidate(A64Ins ai, int ofs, int sc) +{ + if (ofs >= 0) { + return ai | A64F_U12(ofs>>sc); /* Subsequent lj_ror checks ofs. */ + } else if (ofs >= -256) { + return (ai^A64I_LS_U) | A64F_S9(ofs & 0x1ff); + } else { + return A64F_D(31); /* Will mismatch prev. */ + } +} + static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs) { int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3; @@ -124,11 +135,9 @@ static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs) uint32_t prev = *as->mcp & ~A64F_D(31); int ofsm = ofs - (1<>sc)) || - prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) { + if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsm, sc)) { aip = (A64F_A(rd) | A64F_D(*as->mcp & 31)); - } else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) || - prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) { + } else if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsp, sc)) { aip = (A64F_D(rd) | A64F_A(*as->mcp & 31)); ofsm = ofs; } else { From 44da356e97a159f5962f32a526525d14bcd13179 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 18:16:31 +0200 Subject: [PATCH 19/95] ARM: Fix stack check code generation. Thanks to Peter Cawley. 
#1068 --- src/lj_asm_arm.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h index 20e57393..f53f708b 100644 --- a/src/lj_asm_arm.h +++ b/src/lj_asm_arm.h @@ -1947,6 +1947,7 @@ static void asm_hiop(ASMState *as, IRIns *ir) static void asm_stack_check(ASMState *as, BCReg topslot, IRIns *irp, RegSet allow, ExitNo exitno) { + int savereg = 0; Reg pbase; uint32_t k; if (irp) { @@ -1957,12 +1958,14 @@ static void asm_stack_check(ASMState *as, BCReg topslot, pbase = rset_pickbot(allow); } else { pbase = RID_RET; - emit_lso(as, ARMI_LDR, RID_RET, RID_SP, 0); /* Restore temp. register. */ + savereg = 1; } } else { pbase = RID_BASE; } emit_branch(as, ARMF_CC(ARMI_BL, CC_LS), exitstub_addr(as->J, exitno)); + if (savereg) + emit_lso(as, ARMI_LDR, RID_RET, RID_SP, 0); /* Restore temp. register. */ k = emit_isk12(0, (int32_t)(8*topslot)); lua_assert(k); emit_n(as, ARMI_CMP^k, RID_TMP); @@ -1974,7 +1977,7 @@ static void asm_stack_check(ASMState *as, BCReg topslot, if (ra_hasspill(irp->s)) emit_lso(as, ARMI_LDR, pbase, RID_SP, sps_scale(irp->s)); emit_lso(as, ARMI_LDR, RID_TMP, RID_TMP, (i & 4095)); - if (ra_hasspill(irp->s) && !allow) + if (savereg) emit_lso(as, ARMI_STR, RID_RET, RID_SP, 0); /* Save temp. register. */ emit_loadi(as, RID_TMP, (i & ~4095)); } else { From ba2b34f5e82baec5f925fa89b7bf4f88ae376da9 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 20:52:02 +0200 Subject: [PATCH 20/95] ARM64: Disassemble rotates on logical operands. Thanks to Peter Cawley. #1076 --- src/jit/dis_arm64.lua | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua index b10e2fb1..3d199bf2 100644 --- a/src/jit/dis_arm64.lua +++ b/src/jit/dis_arm64.lua @@ -107,24 +107,20 @@ local map_logsr = { -- Logical, shifted register. 
[0] = { shift = 29, mask = 3, [0] = { - shift = 21, mask = 7, - [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg", - "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg" + shift = 21, mask = 1, + [0] = "andDNMSg", "bicDNMSg" }, { - shift = 21, mask = 7, - [0] ="orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg", - "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg" + shift = 21, mask = 1, + [0] = "orr|movDN0MSg", "orn|mvnDN0MSg" }, { - shift = 21, mask = 7, - [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg", - "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg" + shift = 21, mask = 1, + [0] = "eorDNMSg", "eonDNMSg" }, { - shift = 21, mask = 7, - [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg", - "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg" + shift = 21, mask = 1, + [0] = "ands|tstD0NMSg", "bicsDNMSg" } }, false -- unallocated @@ -132,24 +128,20 @@ local map_logsr = { -- Logical, shifted register. { shift = 29, mask = 3, [0] = { - shift = 21, mask = 7, - [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg", - "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg" + shift = 21, mask = 1, + [0] = "andDNMSg", "bicDNMSg" }, { - shift = 21, mask = 7, - [0] = "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg", - "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg" + shift = 21, mask = 1, + [0] = "orr|movDN0MSg", "orn|mvnDN0MSg" }, { - shift = 21, mask = 7, - [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg", - "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg" + shift = 21, mask = 1, + [0] = "eorDNMSg", "eonDNMSg" }, { - shift = 21, mask = 7, - [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg", - "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg" + shift = 21, mask = 1, + [0] = "ands|tstD0NMSg", "bicsDNMSg" } } } @@ -735,7 +727,7 @@ local map_cond = { "hi", "ls", "ge", "lt", "gt", "le", "al", } -local map_shift = { [0] = "lsl", "lsr", "asr", } +local map_shift = { [0] = "lsl", 
"lsr", "asr", "ror"} local map_extend = { [0] = "uxtb", "uxth", "uxtw", "uxtx", "sxtb", "sxth", "sxtw", "sxtx", From 90742d91c27d185b70d1b4a6343fb6b7c26002db Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 20:57:46 +0200 Subject: [PATCH 21/95] ARM64: Don't fuse sign extensions into logical operands. Thanks to Peter Cawley. #1076 --- src/lj_asm_arm64.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 05bdc78a..04834f57 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -216,16 +216,13 @@ static Reg asm_fuseahuref(ASMState *as, IRRef ref, int32_t *ofsp, RegSet allow, static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow) { IRIns *ir = IR(ref); + int logical = (ai & 0x1f000000) == 0x0a000000; if (ra_hasreg(ir->r)) { ra_noweak(as, ir->r); return A64F_M(ir->r); } else if (irref_isk(ref)) { - uint32_t m; int64_t k = get_k64val(as, ref); - if ((ai & 0x1f000000) == 0x0a000000) - m = emit_isk13(k, irt_is64(ir->t)); - else - m = emit_isk12(k); + uint32_t m = logical ? emit_isk13(k, irt_is64(ir->t)) : emit_isk12(k); if (m) return m; } else if (mayfuse(as, ref)) { @@ -237,7 +234,7 @@ static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow) (IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31)); IRIns *irl = IR(ir->op1); if (sh == A64SH_LSL && - irl->o == IR_CONV && + irl->o == IR_CONV && !logical && irl->op2 == ((IRT_I64<op1, allow); return A64F_M(m) | A64F_SH(sh, shift); } - } else if (ir->o == IR_CONV && + } else if (ir->o == IR_CONV && !logical && ir->op2 == ((IRT_I64<op1, allow); return A64F_M(m) | A64F_EX(A64EX_SXTW); From 4611e25c0fbe911486cccae4556eb086c0254c5f Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 9 Sep 2023 20:59:18 +0200 Subject: [PATCH 22/95] ARM64: Fuse rotates into logical operands. Thanks to Peter Cawley. 
#1076 --- src/lj_asm_arm64.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 04834f57..4dd6b711 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -244,6 +244,10 @@ static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow) Reg m = ra_alloc1(as, ir->op1, allow); return A64F_M(m) | A64F_SH(sh, shift); } + } else if (ir->o == IR_BROR && logical && irref_isk(ir->op2)) { + Reg m = ra_alloc1(as, ir->op1, allow); + int shift = (IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31)); + return A64F_M(m) | A64F_SH(A64SH_ROR, shift); } else if (ir->o == IR_CONV && !logical && ir->op2 == ((IRT_I64<op1, allow); @@ -1337,12 +1341,12 @@ static int asm_swapops(ASMState *as, IRRef lref, IRRef rref) if (irref_isk(lref)) return 1; /* But swap constants to the right. */ ir = IR(rref); - if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) || + if ((ir->o >= IR_BSHL && ir->o <= IR_BROR) || (ir->o == IR_ADD && ir->op1 == ir->op2) || (ir->o == IR_CONV && ir->op2 == ((IRT_I64<o >= IR_BSHL && ir->o <= IR_BSAR) || + if ((ir->o >= IR_BSHL && ir->o <= IR_BROR) || (ir->o == IR_ADD && ir->op1 == ir->op2) || (ir->o == IR_CONV && ir->op2 == ((IRT_I64< Date: Sat, 9 Sep 2023 23:01:26 +0200 Subject: [PATCH 23/95] Improve architecture detection error messages. 
--- src/Makefile | 2 +- src/lj_arch.h | 22 +++++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/Makefile b/src/Makefile index dad90138..a83b8629 100644 --- a/src/Makefile +++ b/src/Makefile @@ -232,7 +232,7 @@ TARGET_ALDFLAGS= $(LDOPTIONS) $(TARGET_XLDFLAGS) $(TARGET_FLAGS) $(TARGET_LDFLAG TARGET_ASHLDFLAGS= $(LDOPTIONS) $(TARGET_XSHLDFLAGS) $(TARGET_FLAGS) $(TARGET_SHLDFLAGS) TARGET_ALIBS= $(TARGET_XLIBS) $(LIBS) $(TARGET_LIBS) -TARGET_TESTARCH=$(shell $(TARGET_CC) $(TARGET_TCFLAGS) -E lj_arch.h -dM) +TARGET_TESTARCH:=$(shell $(TARGET_CC) $(TARGET_TCFLAGS) -E lj_arch.h -dM) ifneq (,$(findstring LJ_TARGET_X64 ,$(TARGET_TESTARCH))) TARGET_LJARCH= x64 else diff --git a/src/lj_arch.h b/src/lj_arch.h index da53b162..e77865d9 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -52,7 +52,7 @@ #elif defined(__mips__) || defined(__mips) || defined(__MIPS__) || defined(__MIPS) #define LUAJIT_TARGET LUAJIT_ARCH_MIPS #else -#error "No support for this architecture (yet)" +#error "Architecture not supported (in this version), see: https://luajit.org/status.html#architectures" #endif #endif @@ -188,13 +188,13 @@ #define LJ_TARGET_UNIFYROT 2 /* Want only IR_BROR. 
*/ #define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL -#if __ARM_ARCH____ARM_ARCH_8__ || __ARM_ARCH_8A__ +#if __ARM_ARCH >= 8 || __ARM_ARCH_8__ || __ARM_ARCH_8A__ #define LJ_ARCH_VERSION 80 -#elif __ARM_ARCH_7__ || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH_7S__ || __ARM_ARCH_7VE__ +#elif __ARM_ARCH == 7 || __ARM_ARCH_7__ || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH_7S__ || __ARM_ARCH_7VE__ #define LJ_ARCH_VERSION 70 #elif __ARM_ARCH_6T2__ #define LJ_ARCH_VERSION 61 -#elif __ARM_ARCH_6__ || __ARM_ARCH_6J__ || __ARM_ARCH_6K__ || __ARM_ARCH_6Z__ || __ARM_ARCH_6ZK__ +#elif __ARM_ARCH == 6 || __ARM_ARCH_6__ || __ARM_ARCH_6J__ || __ARM_ARCH_6K__ || __ARM_ARCH_6Z__ || __ARM_ARCH_6ZK__ #define LJ_ARCH_VERSION 60 #else #define LJ_ARCH_VERSION 50 @@ -328,29 +328,37 @@ #elif LJ_TARGET_ARM #if defined(__ARMEB__) #error "No support for big-endian ARM" +#undef LJ_TARGET_ARM #endif #if __ARM_ARCH_6M__ || __ARM_ARCH_7M__ || __ARM_ARCH_7EM__ #error "No support for Cortex-M CPUs" +#undef LJ_TARGET_ARM #endif #if !(__ARM_EABI__ || LJ_TARGET_IOS) #error "Only ARM EABI or iOS 3.0+ ABI is supported" +#undef LJ_TARGET_ARM #endif #elif LJ_TARGET_PPC || LJ_TARGET_PPCSPE #if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE) -#error "No support for PowerPC CPUs without double-precision FPU" +#error "No support for PowerPC CPUs without double-precision FPU, use LuaJIT v2.1" +#undef LJ_TARGET_PPC #endif #if defined(_LITTLE_ENDIAN) && (!defined(_BYTE_ORDER) || (_BYTE_ORDER == _LITTLE_ENDIAN)) #error "No support for little-endian PowerPC" +#undef LJ_TARGET_PPC #endif #if defined(_LP64) #error "No support for PowerPC 64 bit mode" +#undef LJ_TARGET_PPC #endif #elif LJ_TARGET_MIPS #if defined(__mips_soft_float) -#error "No support for MIPS CPUs without FPU" +#error "No support for MIPS CPUs without FPU, use LuaJIT v2.1+" +#undef LJ_TARGET_MIPS #endif #if defined(_LP64) -#error "No support for MIPS64" +#error "No support for MIPS64, use LuaJIT v2.1+" +#undef LJ_TARGET_MIPS #endif #endif #endif From 
cb413bf8f4814fe3e47c8c619602c7a161469faf Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 10 Sep 2023 05:20:22 +0200 Subject: [PATCH 24/95] Windows/ARM64: Add initial support. Only builds with native ARM64 Visual Studio for now. Thanks to vanc and Stephen Just. #593 #964 --- src/host/buildvm_peobj.c | 113 +++++++++++++++++++++++++++++++++++---- src/lj_arch.h | 2 +- src/lj_def.h | 24 ++++++--- src/lj_emit_arm64.h | 8 +-- src/lj_mcode.c | 10 ++-- src/lj_target.h | 11 ++-- src/msvcbuild.bat | 16 ++++-- 7 files changed, 147 insertions(+), 37 deletions(-) diff --git a/src/host/buildvm_peobj.c b/src/host/buildvm_peobj.c index 5bca6df8..e3e1026e 100644 --- a/src/host/buildvm_peobj.c +++ b/src/host/buildvm_peobj.c @@ -9,7 +9,7 @@ #include "buildvm.h" #include "lj_bc.h" -#if LJ_TARGET_X86ORX64 +#if LJ_TARGET_WINDOWS /* Context for PE object emitter. */ static char *strtab; @@ -93,6 +93,17 @@ typedef struct PEsymaux { #define PEOBJ_RELOC_ADDR32NB 0x03 #define PEOBJ_RELOC_OFS 0 #define PEOBJ_TEXT_FLAGS 0x60500020 /* 60=r+x, 50=align16, 20=code. */ +#define PEOBJ_PDATA_NRELOC 6 +#define PEOBJ_XDATA_SIZE (8*2+4+6*2) +#elif LJ_TARGET_ARM64 +#define PEOBJ_ARCH_TARGET 0xaa64 +#define PEOBJ_RELOC_REL32 0x03 /* MS: BRANCH26. */ +#define PEOBJ_RELOC_DIR32 0x01 +#define PEOBJ_RELOC_ADDR32NB 0x02 +#define PEOBJ_RELOC_OFS (-4) +#define PEOBJ_TEXT_FLAGS 0x60500020 /* 60=r+x, 50=align16, 20=code. */ +#define PEOBJ_PDATA_NRELOC 4 +#define PEOBJ_XDATA_SIZE (4+24+4 +4+8) #endif /* Section numbers (0-based). 
*/ @@ -100,7 +111,7 @@ enum { PEOBJ_SECT_ABS = -2, PEOBJ_SECT_UNDEF = -1, PEOBJ_SECT_TEXT, -#if LJ_TARGET_X64 +#ifdef PEOBJ_PDATA_NRELOC PEOBJ_SECT_PDATA, PEOBJ_SECT_XDATA, #elif LJ_TARGET_X86 @@ -175,6 +186,9 @@ void emit_peobj(BuildCtx *ctx) uint32_t sofs; int i, nrsym; union { uint8_t b; uint32_t u; } host_endian; +#ifdef PEOBJ_PDATA_NRELOC + uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs; +#endif sofs = sizeof(PEheader) + PEOBJ_NSECTIONS*sizeof(PEsection); @@ -188,18 +202,18 @@ void emit_peobj(BuildCtx *ctx) /* Flags: 60 = read+execute, 50 = align16, 20 = code. */ pesect[PEOBJ_SECT_TEXT].flags = PEOBJ_TEXT_FLAGS; -#if LJ_TARGET_X64 +#ifdef PEOBJ_PDATA_NRELOC memcpy(pesect[PEOBJ_SECT_PDATA].name, ".pdata", sizeof(".pdata")-1); pesect[PEOBJ_SECT_PDATA].ofs = sofs; - sofs += (pesect[PEOBJ_SECT_PDATA].size = 6*4); + sofs += (pesect[PEOBJ_SECT_PDATA].size = PEOBJ_PDATA_NRELOC*4); pesect[PEOBJ_SECT_PDATA].relocofs = sofs; - sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = 6) * PEOBJ_RELOC_SIZE; + sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = PEOBJ_PDATA_NRELOC) * PEOBJ_RELOC_SIZE; /* Flags: 40 = read, 30 = align4, 40 = initialized data. */ pesect[PEOBJ_SECT_PDATA].flags = 0x40300040; memcpy(pesect[PEOBJ_SECT_XDATA].name, ".xdata", sizeof(".xdata")-1); pesect[PEOBJ_SECT_XDATA].ofs = sofs; - sofs += (pesect[PEOBJ_SECT_XDATA].size = 8*2+4+6*2); /* See below. */ + sofs += (pesect[PEOBJ_SECT_XDATA].size = PEOBJ_XDATA_SIZE); /* See below. */ pesect[PEOBJ_SECT_XDATA].relocofs = sofs; sofs += (pesect[PEOBJ_SECT_XDATA].nreloc = 1) * PEOBJ_RELOC_SIZE; /* Flags: 40 = read, 30 = align4, 40 = initialized data. */ @@ -234,7 +248,7 @@ void emit_peobj(BuildCtx *ctx) */ nrsym = ctx->nrelocsym; pehdr.nsyms = 1+PEOBJ_NSECTIONS*2 + 1+ctx->nsym + nrsym; -#if LJ_TARGET_X64 +#ifdef PEOBJ_PDATA_NRELOC pehdr.nsyms += 1; /* Symbol for lj_err_unwind_win. */ #endif @@ -259,7 +273,6 @@ void emit_peobj(BuildCtx *ctx) #if LJ_TARGET_X64 { /* Write .pdata section. 
*/ - uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs; uint32_t pdata[3]; /* Start of .text, end of .text and .xdata. */ PEreloc reloc; pdata[0] = 0; pdata[1] = fcofs; pdata[2] = 0; @@ -308,6 +321,88 @@ void emit_peobj(BuildCtx *ctx) reloc.type = PEOBJ_RELOC_ADDR32NB; owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); } +#elif LJ_TARGET_ARM64 + /* https://learn.microsoft.com/en-us/cpp/build/arm64-exception-handling */ + { /* Write .pdata section. */ + uint32_t pdata[4]; + PEreloc reloc; + pdata[0] = 0; + pdata[1] = 0; + pdata[2] = fcofs; + pdata[3] = 4+24+4; + owrite(ctx, &pdata, sizeof(pdata)); + /* Start of .text and start of .xdata. */ + reloc.vaddr = 0; reloc.symidx = 1+2+nrsym+2+2+1; + reloc.type = PEOBJ_RELOC_ADDR32NB; + owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); + reloc.vaddr = 4; reloc.symidx = 1+2+nrsym+2; + reloc.type = PEOBJ_RELOC_ADDR32NB; + owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); + /* Start of vm_ffi_call and start of second part of .xdata. */ + reloc.vaddr = 8; reloc.symidx = 1+2+nrsym+2+2+1; + reloc.type = PEOBJ_RELOC_ADDR32NB; + owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); + reloc.vaddr = 12; reloc.symidx = 1+2+nrsym+2; + reloc.type = PEOBJ_RELOC_ADDR32NB; + owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); + } + { /* Write .xdata section. 
*/ + uint32_t u32; + uint8_t *p, uwc[24]; + PEreloc reloc; + +#define CBE16(x) (*p = ((x) >> 8) & 0xff, p[1] = (x) & 0xff, p += 2) +#define CALLOC_S(s) (*p++ = ((s) >> 4)) /* s < 512 */ +#define CSAVE_FPLR(o) (*p++ = 0x40 | ((o) >> 3)) /* o <= 504 */ +#define CSAVE_REGP(r,o) CBE16(0xc800 | (((r)-19)<< 6) | ((o) >> 3)) +#define CSAVE_REGS(r1,r2,o1) do { \ + int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_REGP(r, o); \ +} while (0) +#define CSAVE_FREGP(r,o) CBE16(0xd800 | (((r) - 8) << 6) | ((o) >> 3)) +#define CSAVE_FREGS(r1,r2,o1) do { \ + int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_FREGP(r, o); \ +} while (0) +#define CSAVE_REGX(r,o) CBE16(0xd400 | (((r) - 19) << 5) | (~(o) >> 3)) +#define CADD_FP(s) CBE16(0xe200 | ((s) >> 3)) /* s < 8*256 */ +#define CODE_NOP 0xe3 +#define CODE_END 0xe4 +#define CEND_ALIGN do { \ + *p++ = CODE_END; \ + while ((p - uwc) & 3) *p++ = CODE_NOP; \ +} while (0) + + /* Unwind codes for .text section with handler. */ + p = uwc; + CALLOC_S(208); /* +1 */ + CSAVE_FPLR(192); /* +1 */ + CADD_FP(192); /* +2 */ + CSAVE_REGS(19, 28, 184); /* +5*2 */ + CSAVE_FREGS(8, 15, 104); /* +4*2 */ + CEND_ALIGN; /* +1 +1 -> 24 */ + + u32 = ((24u >> 2) << 27) | (1u << 20) | (fcofs >> 2); + owrite(ctx, &u32, 4); + owrite(ctx, &uwc, 24); + + u32 = 0; /* Handler RVA to be relocated at 4 + 24. */ + owrite(ctx, &u32, 4); + + /* Unwind codes for vm_ffi_call without handler. */ + p = uwc; + CSAVE_FPLR(16); /* +1 */ + CADD_FP(16); /* +2 */ + CSAVE_REGX(19, -24); /* +2 */ + CSAVE_REGX(20, -32); /* +2 */ + CEND_ALIGN; /* +1 +0 -> 8 */ + + u32 = ((8u >> 2) << 27) | (((uint32_t)ctx->codesz - fcofs) >> 2); + owrite(ctx, &u32, 4); + owrite(ctx, &uwc, 8); + + reloc.vaddr = 4 + 24; reloc.symidx = 1+2+nrsym+2+2; + reloc.type = PEOBJ_RELOC_ADDR32NB; + owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); + } #elif LJ_TARGET_X86 /* Write .sxdata section. 
*/ for (i = 0; i < nrsym; i++) { @@ -339,7 +434,7 @@ void emit_peobj(BuildCtx *ctx) emit_peobj_sym(ctx, ctx->relocsym[i], 0, PEOBJ_SECT_UNDEF, PEOBJ_TYPE_FUNC, PEOBJ_SCL_EXTERN); -#if LJ_TARGET_X64 +#ifdef PEOBJ_PDATA_NRELOC emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_PDATA); emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_XDATA); emit_peobj_sym(ctx, "lj_err_unwind_win", 0, diff --git a/src/lj_arch.h b/src/lj_arch.h index 3e920f2a..026e741f 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -57,7 +57,7 @@ #define LUAJIT_TARGET LUAJIT_ARCH_X64 #elif defined(__arm__) || defined(__arm) || defined(__ARM__) || defined(__ARM) #define LUAJIT_TARGET LUAJIT_ARCH_ARM -#elif defined(__aarch64__) +#elif defined(__aarch64__) || defined(_M_ARM64) #define LUAJIT_TARGET LUAJIT_ARCH_ARM64 #elif defined(__ppc__) || defined(__ppc) || defined(__PPC__) || defined(__PPC) || defined(__powerpc__) || defined(__powerpc) || defined(__POWERPC__) || defined(__POWERPC) || defined(_M_PPC) #define LUAJIT_TARGET LUAJIT_ARCH_PPC diff --git a/src/lj_def.h b/src/lj_def.h index 88bc6336..1461d3d7 100644 --- a/src/lj_def.h +++ b/src/lj_def.h @@ -146,15 +146,9 @@ typedef uintptr_t BloomFilter; #define LJ_UNLIKELY(x) __builtin_expect(!!(x), 0) #define lj_ffs(x) ((uint32_t)__builtin_ctz(x)) -/* Don't ask ... 
*/ -#if defined(__INTEL_COMPILER) && (defined(__i386__) || defined(__x86_64__)) -static LJ_AINLINE uint32_t lj_fls(uint32_t x) -{ - uint32_t r; __asm__("bsrl %1, %0" : "=r" (r) : "rm" (x) : "cc"); return r; -} -#else #define lj_fls(x) ((uint32_t)(__builtin_clz(x)^31)) -#endif +#define lj_ffs64(x) ((uint32_t)__builtin_ctzll(x)) +#define lj_fls64(x) ((uint32_t)(__builtin_clzll(x)^63)) #if defined(__arm__) static LJ_AINLINE uint32_t lj_bswap(uint32_t x) @@ -265,8 +259,12 @@ static LJ_AINLINE uint32_t lj_fls(uint32_t x) #else unsigned char _BitScanForward(unsigned long *, unsigned long); unsigned char _BitScanReverse(unsigned long *, unsigned long); +unsigned char _BitScanForward64(unsigned long *, uint64_t); +unsigned char _BitScanReverse64(unsigned long *, uint64_t); #pragma intrinsic(_BitScanForward) #pragma intrinsic(_BitScanReverse) +#pragma intrinsic(_BitScanForward64) +#pragma intrinsic(_BitScanReverse64) static LJ_AINLINE uint32_t lj_ffs(uint32_t x) { @@ -277,6 +275,16 @@ static LJ_AINLINE uint32_t lj_fls(uint32_t x) { unsigned long r; _BitScanReverse(&r, x); return (uint32_t)r; } + +static LJ_AINLINE uint32_t lj_ffs64(uint64_t x) +{ + unsigned long r; _BitScanForward64(&r, x); return (uint32_t)r; +} + +static LJ_AINLINE uint32_t lj_fls64(uint64_t x) +{ + unsigned long r; _BitScanReverse64(&r, x); return (uint32_t)r; +} #endif unsigned long _byteswap_ulong(unsigned long); diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 9161c958..fef5d973 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -30,15 +30,15 @@ static uint32_t emit_isk12(int64_t n) uint64_t k = n < 0 ? ~(uint64_t)n+1u : (uint64_t)n; uint32_t m = n < 0 ? 
0x40000000 : 0; if (k < 0x1000) { - return A64I_K12|m|A64F_U12(k); + return (uint32_t)(A64I_K12|m|A64F_U12(k)); } else if ((k & 0xfff000) == k) { - return A64I_K12|m|0x400000|A64F_U12(k>>12); + return (uint32_t)(A64I_K12|m|0x400000|A64F_U12(k>>12)); } return 0; } -#define emit_clz64(n) __builtin_clzll(n) -#define emit_ctz64(n) __builtin_ctzll(n) +#define emit_clz64(n) (lj_fls64(n)^63) +#define emit_ctz64(n) lj_ffs64(n) /* Encode constant in K13 format for logical data processing instructions. */ static uint32_t emit_isk13(uint64_t n, int is64) diff --git a/src/lj_mcode.c b/src/lj_mcode.c index c8ed95e1..8a4851dd 100644 --- a/src/lj_mcode.c +++ b/src/lj_mcode.c @@ -29,6 +29,11 @@ #include #endif +#if LJ_TARGET_WINDOWS +#define WIN32_LEAN_AND_MEAN +#include +#endif + #if LJ_TARGET_IOS void sys_icache_invalidate(void *start, size_t len); #endif @@ -41,6 +46,8 @@ void lj_mcode_sync(void *start, void *end) #endif #if LJ_TARGET_X86ORX64 UNUSED(start); UNUSED(end); +#elif LJ_TARGET_WINDOWS + FlushInstructionCache(GetCurrentProcess(), start, (char *)end-(char *)start); #elif LJ_TARGET_IOS sys_icache_invalidate(start, (char *)end-(char *)start); #elif LJ_TARGET_PPC @@ -58,9 +65,6 @@ void lj_mcode_sync(void *start, void *end) #if LJ_TARGET_WINDOWS -#define WIN32_LEAN_AND_MEAN -#include - #define MCPROT_RW PAGE_READWRITE #define MCPROT_RX PAGE_EXECUTE_READ #define MCPROT_RWX PAGE_EXECUTE_READWRITE diff --git a/src/lj_target.h b/src/lj_target.h index 09d19bd9..e7322c07 100644 --- a/src/lj_target.h +++ b/src/lj_target.h @@ -58,9 +58,13 @@ typedef uint32_t RegSP; #if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 typedef uint64_t RegSet; #define RSET_BITS 6 +#define rset_picktop_(rs) ((Reg)lj_fls64(rs)) +#define rset_pickbot_(rs) ((Reg)lj_ffs64(rs)) #else typedef uint32_t RegSet; #define RSET_BITS 5 +#define rset_picktop_(rs) ((Reg)lj_fls(rs)) +#define rset_pickbot_(rs) ((Reg)lj_ffs(rs)) #endif #define RID2RSET(r) (((RegSet)1) << (r)) @@ -71,13 +75,6 @@ typedef uint32_t 
RegSet; #define rset_set(rs, r) (rs |= RID2RSET(r)) #define rset_clear(rs, r) (rs &= ~RID2RSET(r)) #define rset_exclude(rs, r) (rs & ~RID2RSET(r)) -#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 -#define rset_picktop_(rs) ((Reg)(__builtin_clzll(rs)^63)) -#define rset_pickbot_(rs) ((Reg)__builtin_ctzll(rs)) -#else -#define rset_picktop_(rs) ((Reg)lj_fls(rs)) -#define rset_pickbot_(rs) ((Reg)lj_ffs(rs)) -#endif /* -- Register allocation cost -------------------------------------------- */ diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat index f9bf2528..2cfcf26e 100644 --- a/src/msvcbuild.bat +++ b/src/msvcbuild.bat @@ -34,20 +34,26 @@ if exist minilua.exe.manifest^ %LJMT% -manifest minilua.exe.manifest -outputresource:minilua.exe -@set DASMFLAGS=-D WIN -D JIT -D FFI -D P64 +@set DASMFLAGS=-D WIN -D JIT -D FFI -D ENDIAN_LE -D FPU -D P64 @set LJARCH=x64 @minilua -@if errorlevel 8 goto :X64 +@if errorlevel 8 goto :NO32 @set DASC=vm_x86.dasc -@set DASMFLAGS=-D WIN -D JIT -D FFI +@set DASMFLAGS=-D WIN -D JIT -D FFI -D ENDIAN_LE -D FPU @set LJARCH=x86 @set LJCOMPILE=%LJCOMPILE% /arch:SSE2 +@goto :DA +:NO32 +@if "%VSCMD_ARG_TGT_ARCH%" neq "arm64" goto :X64 +@set DASC=vm_arm64.dasc +@set LJARCH=arm64 +@goto :DA :X64 -@if "%1" neq "nogc64" goto :GC64 +@if "%1" neq "nogc64" goto :DA @shift @set DASC=vm_x86.dasc @set LJCOMPILE=%LJCOMPILE% /DLUAJIT_DISABLE_GC64 -:GC64 +:DA minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h %DASC% @if errorlevel 1 goto :BAD From 9760984638d241531ff8a9eef259aad2272f0f75 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 10 Sep 2023 05:23:10 +0200 Subject: [PATCH 25/95] Allow path overrides in genversion.lua with minilua, too. Thanks to arch1t3cht. #1067 --- src/host/genversion.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/src/host/genversion.lua b/src/host/genversion.lua index 5ead4c2b..28f7206c 100644 --- a/src/host/genversion.lua +++ b/src/host/genversion.lua @@ -5,6 +5,7 @@ -- Released under the MIT license. 
See Copyright Notice in luajit.h ---------------------------------------------------------------------------- +local arg = {...} local FILE_ROLLING_H = arg[1] or "luajit_rolling.h" local FILE_RELVER_TXT = arg[2] or "luajit_relver.txt" local FILE_LUAJIT_H = arg[3] or "luajit.h" From b174d5e66d103515aa88b1ecaf1cc03fe5af5ea4 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 11 Sep 2023 13:10:17 +0200 Subject: [PATCH 26/95] Fix Cygwin build. Thanks to Christopher Ng. #1077 #1078 --- src/host/buildvm_peobj.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host/buildvm_peobj.c b/src/host/buildvm_peobj.c index e3e1026e..fc3ef71a 100644 --- a/src/host/buildvm_peobj.c +++ b/src/host/buildvm_peobj.c @@ -9,7 +9,7 @@ #include "buildvm.h" #include "lj_bc.h" -#if LJ_TARGET_WINDOWS +#if LJ_TARGET_WINDOWS || LJ_TARGET_CYGWIN /* Context for PE object emitter. */ static char *strtab; From 836ab4227a1b024321731fe5a5059368c9f0dff7 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 11 Sep 2023 13:14:09 +0200 Subject: [PATCH 27/95] ARM64: Remove unneeded IRCALL_* defs for math intrinsics. Workaround for MSVC issue. Thanks to Peter Cawley. #593 --- src/lj_ircall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_ircall.h b/src/lj_ircall.h index 569134e9..f342cdd2 100644 --- a/src/lj_ircall.h +++ b/src/lj_ircall.h @@ -63,7 +63,7 @@ typedef struct CCallInfo { /* Helpers for conditional function definitions. */ #define IRCALLCOND_ANY(x) x -#if LJ_TARGET_X86ORX64 +#if LJ_TARGET_X86ORX64 || LJ_TARGET_ARM64 #define IRCALLCOND_FPMATH(x) NULL #else #define IRCALLCOND_FPMATH(x) x From f63bc569fab1450def4c817f100e580dddb425c5 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 11 Sep 2023 13:33:27 +0200 Subject: [PATCH 28/95] Windows/ARM64: Fix exception unwinding. Thanks to Peter Cawley. 
#593 --- src/host/buildvm_peobj.c | 14 +++++++------- src/lj_err.c | 10 ++++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/host/buildvm_peobj.c b/src/host/buildvm_peobj.c index fc3ef71a..cdbb79b6 100644 --- a/src/host/buildvm_peobj.c +++ b/src/host/buildvm_peobj.c @@ -354,7 +354,7 @@ void emit_peobj(BuildCtx *ctx) #define CBE16(x) (*p = ((x) >> 8) & 0xff, p[1] = (x) & 0xff, p += 2) #define CALLOC_S(s) (*p++ = ((s) >> 4)) /* s < 512 */ #define CSAVE_FPLR(o) (*p++ = 0x40 | ((o) >> 3)) /* o <= 504 */ -#define CSAVE_REGP(r,o) CBE16(0xc800 | (((r)-19)<< 6) | ((o) >> 3)) +#define CSAVE_REGP(r,o) CBE16(0xc800 | (((r) - 19) << 6) | ((o) >> 3)) #define CSAVE_REGS(r1,r2,o1) do { \ int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_REGP(r, o); \ } while (0) @@ -362,6 +362,7 @@ void emit_peobj(BuildCtx *ctx) #define CSAVE_FREGS(r1,r2,o1) do { \ int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_FREGP(r, o); \ } while (0) +#define CSAVE_REG(r,o) CBE16(0xd000 | (((r) - 19) << 6) | (~(o) >> 3)) #define CSAVE_REGX(r,o) CBE16(0xd400 | (((r) - 19) << 5) | (~(o) >> 3)) #define CADD_FP(s) CBE16(0xe200 | ((s) >> 3)) /* s < 8*256 */ #define CODE_NOP 0xe3 @@ -373,12 +374,11 @@ void emit_peobj(BuildCtx *ctx) /* Unwind codes for .text section with handler. */ p = uwc; - CALLOC_S(208); /* +1 */ - CSAVE_FPLR(192); /* +1 */ - CADD_FP(192); /* +2 */ CSAVE_REGS(19, 28, 184); /* +5*2 */ CSAVE_FREGS(8, 15, 104); /* +4*2 */ - CEND_ALIGN; /* +1 +1 -> 24 */ + CSAVE_FPLR(192); /* +1 */ + CALLOC_S(208); /* +1 */ + CEND_ALIGN; /* +1 +3 -> 24 */ u32 = ((24u >> 2) << 27) | (1u << 20) | (fcofs >> 2); owrite(ctx, &u32, 4); @@ -389,9 +389,9 @@ void emit_peobj(BuildCtx *ctx) /* Unwind codes for vm_ffi_call without handler. 
*/ p = uwc; - CSAVE_FPLR(16); /* +1 */ CADD_FP(16); /* +2 */ - CSAVE_REGX(19, -24); /* +2 */ + CSAVE_FPLR(16); /* +1 */ + CSAVE_REG(19, 8); /* +2 */ CSAVE_REGX(20, -32); /* +2 */ CEND_ALIGN; /* +1 +0 -> 8 */ diff --git a/src/lj_err.c b/src/lj_err.c index 6e50cbee..8ef51bf2 100644 --- a/src/lj_err.c +++ b/src/lj_err.c @@ -261,6 +261,8 @@ LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec, { #if LJ_TARGET_X86 void *cf = (char *)f - CFRAME_OFS_SEH; +#elif LJ_TARGET_ARM64 + void *cf = (char *)f - CFRAME_SIZE; #else void *cf = f; #endif @@ -297,11 +299,11 @@ LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec, #else /* Unwind the stack and call all handlers for all lower C frames ** (including ourselves) again with EH_UNWINDING set. Then set - ** stack pointer = cf, result = errcode and jump to the specified target. + ** stack pointer = f, result = errcode and jump to the specified target. */ - RtlUnwindEx(cf, (void *)((cframe_unwind_ff(cf2) && errcode != LUA_YIELD) ? - lj_vm_unwind_ff_eh : - lj_vm_unwind_c_eh), + RtlUnwindEx(f, (void *)((cframe_unwind_ff(cf2) && errcode != LUA_YIELD) ? + lj_vm_unwind_ff_eh : + lj_vm_unwind_c_eh), rec, (void *)(uintptr_t)errcode, ctx, dispatch->HistoryTable); /* RtlUnwindEx should never return. */ #endif From 1c33f46314cc4e3cb52ac83c5b27419bc06b5154 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 11 Sep 2023 16:35:28 +0200 Subject: [PATCH 29/95] Windows/ARM64: Support Windows calling conventions. Dear Microsoft: your butchering of the (perfectly fine) ARM64 ABI is a disgrace. Thanks to Peter Cawley. 
#593 --- src/lj_asm_arm64.h | 17 ++++++++++++++++- src/lj_ccall.c | 18 +++++++++++++++++- src/lj_crecord.c | 6 +----- src/lj_emit_arm64.h | 4 ++-- 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 4dd6b711..c2b17737 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -432,6 +432,11 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++) as->cost[gpr] = REGCOST(~0u, ASMREF_L); gpr = REGARG_FIRSTGPR; +#if LJ_HASFFI && LJ_ABI_WIN + if ((ci->flags & CCI_VARARG)) { + fpr = REGARG_LASTFPR+1; + } +#endif for (n = 0; n < nargs; n++) { /* Setup args. */ IRRef ref = args[n]; IRIns *ir = IR(ref); @@ -442,6 +447,11 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) "reg %d not free", fpr); /* Must have been evicted. */ ra_leftov(as, fpr, ref); fpr++; +#if LJ_HASFFI && LJ_ABI_WIN + } else if ((ci->flags & CCI_VARARG) && (gpr <= REGARG_LASTGPR)) { + Reg rf = ra_alloc1(as, ref, RSET_FPR); + emit_dn(as, A64I_FMOV_R_D, gpr++, rf & 31); +#endif } else { Reg r = ra_alloc1(as, ref, RSET_FPR); int32_t al = spalign; @@ -1943,6 +1953,9 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci) int ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR; int spofs = 0, spalign = LJ_TARGET_OSX ? 0 : 7, nslots; asm_collectargs(as, ir, ci, args); +#if LJ_ABI_WIN + if ((ci->flags & CCI_VARARG)) nfpr = 0; +#endif for (i = 0; i < nargs; i++) { int al = spalign; if (!args[i]) { @@ -1954,7 +1967,9 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci) #endif } else if (irt_isfp(IR(args[i])->t)) { if (nfpr > 0) { nfpr--; continue; } -#if LJ_TARGET_OSX +#if LJ_ABI_WIN + if ((ci->flags & CCI_VARARG) && ngpr > 0) { ngpr--; continue; } +#elif LJ_TARGET_OSX al |= irt_isnum(IR(args[i])->t) ? 
7 : 3; #endif } else { diff --git a/src/lj_ccall.c b/src/lj_ccall.c index 00e753b9..5f95f5d8 100644 --- a/src/lj_ccall.c +++ b/src/lj_ccall.c @@ -985,6 +985,14 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, fid = ctf->sib; } +#if LJ_TARGET_ARM64 && LJ_ABI_WIN + if ((ct->info & CTF_VARARG)) { + nsp -= maxgpr * CTSIZE_PTR; /* May end up with negative nsp. */ + ngpr = maxgpr; + nfpr = CCALL_NARG_FPR; + } +#endif + /* Walk through all passed arguments. */ for (o = L->base+1, narg = 1; o < top; o++, narg++) { CTypeID did; @@ -1035,9 +1043,14 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, align = CTSIZE_PTR-1; nsp = (nsp + align) & ~align; } +#if LJ_TARGET_ARM64 && LJ_ABI_WIN + /* A negative nsp points into cc->gpr. Blame MS for their messy ABI. */ + dp = ((uint8_t *)cc->stack) + (int32_t)nsp; +#else dp = ((uint8_t *)cc->stack) + nsp; +#endif nsp += CCALL_PACK_STACKARG ? sz : n * CTSIZE_PTR; - if (nsp > CCALL_SIZE_STACK) { /* Too many arguments. */ + if ((int32_t)nsp > CCALL_SIZE_STACK) { /* Too many arguments. */ err_nyi: lj_err_caller(L, LJ_ERR_FFI_NYICALL); } @@ -1099,6 +1112,9 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, #endif } if (fid) lj_err_caller(L, LJ_ERR_FFI_NUMARG); /* Too few arguments. */ +#if LJ_TARGET_ARM64 && LJ_ABI_WIN + if ((int32_t)nsp < 0) nsp = 0; +#endif #if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP) cc->nfpr = nfpr; /* Required for vararg functions. */ diff --git a/src/lj_crecord.c b/src/lj_crecord.c index d7a522fb..55d0b3ef 100644 --- a/src/lj_crecord.c +++ b/src/lj_crecord.c @@ -1118,12 +1118,8 @@ static TRef crec_call_args(jit_State *J, RecordFFData *rd, ngpr = 1; else if (ctype_cconv(ct->info) == CTCC_FASTCALL) ngpr = 2; -#elif LJ_TARGET_ARM64 -#if LJ_ABI_WIN -#error "NYI: ARM64 Windows ABI calling conventions" -#elif LJ_TARGET_OSX +#elif LJ_TARGET_ARM64 && LJ_TARGET_OSX int ngpr = CCALL_NARG_GPR; -#endif #endif /* Skip initial attributes. 
*/ diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index fef5d973..3c510492 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -124,9 +124,9 @@ static LJ_AINLINE uint32_t emit_lso_pair_candidate(A64Ins ai, int ofs, int sc) } } -static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs) +static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs64) { - int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3; + int ot = emit_checkofs(ai, ofs64), sc = (ai >> 30) & 3, ofs = (int)ofs64; lj_assertA(ot, "load/store offset %d out of range", ofs); /* Combine LDR/STR pairs to LDP/STP. */ if ((sc == 2 || sc == 3) && From 9e0437240f1fb4bfa7248f6ec8be0e3181016119 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 11 Sep 2023 21:06:25 +0200 Subject: [PATCH 30/95] FFI: Fix 64 bit shift fold rules. Thanks to Peter Cawley. #1079 --- src/lj_opt_fold.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index d90477f6..743dfb07 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -377,10 +377,10 @@ static uint64_t kfold_int64arith(jit_State *J, uint64_t k1, uint64_t k2, case IR_BOR: k1 |= k2; break; case IR_BXOR: k1 ^= k2; break; case IR_BSHL: k1 <<= (k2 & 63); break; - case IR_BSHR: k1 = (int32_t)((uint32_t)k1 >> (k2 & 63)); break; - case IR_BSAR: k1 >>= (k2 & 63); break; - case IR_BROL: k1 = (int32_t)lj_rol((uint32_t)k1, (k2 & 63)); break; - case IR_BROR: k1 = (int32_t)lj_ror((uint32_t)k1, (k2 & 63)); break; + case IR_BSHR: k1 >>= (k2 & 63); break; + case IR_BSAR: k1 = (uint64_t)((int64_t)k1 >> (k2 & 63)); break; + case IR_BROL: k1 = lj_rol(k1, (k2 & 63)); break; + case IR_BROR: k1 = lj_ror(k1, (k2 & 63)); break; default: lj_assertJ(0, "bad IR op %d", op); break; } #else From 8af63f992058ebbac2d72ef92811cf22a90fa347 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 11 Sep 2023 23:00:36 +0200 Subject: [PATCH 31/95] Windows/ARM64: Fix typo in exception unwinding. 
Thanks to Peter Cawley. #593 --- src/host/buildvm_peobj.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host/buildvm_peobj.c b/src/host/buildvm_peobj.c index cdbb79b6..667cc3fc 100644 --- a/src/host/buildvm_peobj.c +++ b/src/host/buildvm_peobj.c @@ -362,7 +362,7 @@ void emit_peobj(BuildCtx *ctx) #define CSAVE_FREGS(r1,r2,o1) do { \ int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_FREGP(r, o); \ } while (0) -#define CSAVE_REG(r,o) CBE16(0xd000 | (((r) - 19) << 6) | (~(o) >> 3)) +#define CSAVE_REG(r,o) CBE16(0xd000 | (((r) - 19) << 6) | ((o) >> 3)) #define CSAVE_REGX(r,o) CBE16(0xd400 | (((r) - 19) << 5) | (~(o) >> 3)) #define CADD_FP(s) CBE16(0xe200 | ((s) >> 3)) /* s < 8*256 */ #define CODE_NOP 0xe3 From b36f9fad63de19074d97df787146056c028e8fba Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 15 Sep 2023 05:23:29 +0200 Subject: [PATCH 32/95] Windows/ARM64: Fix exception unwinding (again). Thanks to Peter Cawley. #593 --- src/host/buildvm_peobj.c | 12 +++++------- src/vm_arm64.dasc | 42 +++++++++++++++++++++++++++++++--------- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/src/host/buildvm_peobj.c b/src/host/buildvm_peobj.c index 667cc3fc..7ce3b05a 100644 --- a/src/host/buildvm_peobj.c +++ b/src/host/buildvm_peobj.c @@ -358,12 +358,11 @@ void emit_peobj(BuildCtx *ctx) #define CSAVE_REGS(r1,r2,o1) do { \ int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_REGP(r, o); \ } while (0) +#define CSAVE_REGPX(r,o) CBE16(0xcc00 | (((r) - 19) << 6) | (~(o) >> 3)) #define CSAVE_FREGP(r,o) CBE16(0xd800 | (((r) - 8) << 6) | ((o) >> 3)) #define CSAVE_FREGS(r1,r2,o1) do { \ int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_FREGP(r, o); \ } while (0) -#define CSAVE_REG(r,o) CBE16(0xd000 | (((r) - 19) << 6) | ((o) >> 3)) -#define CSAVE_REGX(r,o) CBE16(0xd400 | (((r) - 19) << 5) | (~(o) >> 3)) #define CADD_FP(s) CBE16(0xe200 | ((s) >> 3)) /* s < 8*256 */ #define CODE_NOP 0xe3 #define CODE_END 0xe4 @@ 
-374,8 +373,8 @@ void emit_peobj(BuildCtx *ctx) /* Unwind codes for .text section with handler. */ p = uwc; - CSAVE_REGS(19, 28, 184); /* +5*2 */ - CSAVE_FREGS(8, 15, 104); /* +4*2 */ + CSAVE_REGS(19, 28, 176); /* +5*2 */ + CSAVE_FREGS(8, 15, 96); /* +4*2 */ CSAVE_FPLR(192); /* +1 */ CALLOC_S(208); /* +1 */ CEND_ALIGN; /* +1 +3 -> 24 */ @@ -391,9 +390,8 @@ void emit_peobj(BuildCtx *ctx) p = uwc; CADD_FP(16); /* +2 */ CSAVE_FPLR(16); /* +1 */ - CSAVE_REG(19, 8); /* +2 */ - CSAVE_REGX(20, -32); /* +2 */ - CEND_ALIGN; /* +1 +0 -> 8 */ + CSAVE_REGPX(19, -32); /* +2 */ + CEND_ALIGN; /* +1 +2 -> 8 */ u32 = ((8u >> 2) << 27) | (((uint32_t)ctx->codesz - fcofs) >> 2); owrite(ctx, &u32, 4); diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index b94a9c0e..d622d2a0 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -113,13 +113,37 @@ | |.define TMPDofs, #24 | +|.if WIN +|// Windows unwind data is suited to r1 stored first. +|.macro stp_unwind, r1, r2, where +| stp r1, r2, where +|.endmacro +|.macro ldp_unwind, r1, r2, where +| ldp r1, r2, where +|.endmacro +|.macro ldp_unwind, r1, r2, where, post_index +| ldp r1, r2, where, post_index +|.endmacro +|.else +|// Otherwise store r2 first for compact unwind info (OSX). 
+|.macro stp_unwind, r1, r2, where +| stp r2, r1, where +|.endmacro +|.macro ldp_unwind, r1, r2, where +| ldp r2, r1, where +|.endmacro +|.macro ldp_unwind, r1, r2, where, post_index +| ldp r2, r1, where, post_index +|.endmacro +|.endif +| |.macro save_, gpr1, gpr2, fpr1, fpr2 -| stp d..fpr2, d..fpr1, [sp, # SAVE_FPR_+(14-fpr1)*8] -| stp x..gpr2, x..gpr1, [sp, # SAVE_GPR_+(27-gpr1)*8] +| stp_unwind d..fpr1, d..fpr2, [sp, # SAVE_FPR_+(14-fpr1)*8] +| stp_unwind x..gpr1, x..gpr2, [sp, # SAVE_GPR_+(27-gpr1)*8] |.endmacro |.macro rest_, gpr1, gpr2, fpr1, fpr2 -| ldp d..fpr2, d..fpr1, [sp, # SAVE_FPR_+(14-fpr1)*8] -| ldp x..gpr2, x..gpr1, [sp, # SAVE_GPR_+(27-gpr1)*8] +| ldp_unwind d..fpr1, d..fpr2, [sp, # SAVE_FPR_+(14-fpr1)*8] +| ldp_unwind x..gpr1, x..gpr2, [sp, # SAVE_GPR_+(27-gpr1)*8] |.endmacro | |.macro saveregs @@ -127,14 +151,14 @@ | sub sp, sp, # CFRAME_SPACE | stp fp, lr, [sp, # SAVE_FP_LR_] | add fp, sp, # SAVE_FP_LR_ -| stp x20, x19, [sp, # SAVE_GPR_+(27-19)*8] +| stp_unwind x19, x20, [sp, # SAVE_GPR_+(27-19)*8] | save_ 21, 22, 8, 9 | save_ 23, 24, 10, 11 | save_ 25, 26, 12, 13 | save_ 27, 28, 14, 15 |.endmacro |.macro restoreregs -| ldp x20, x19, [sp, # SAVE_GPR_+(27-19)*8] +| ldp_unwind x19, x20, [sp, # SAVE_GPR_+(27-19)*8] | rest_ 21, 22, 8, 9 | rest_ 23, 24, 10, 11 | rest_ 25, 26, 12, 13 @@ -2162,7 +2186,7 @@ static void build_subroutines(BuildCtx *ctx) |//----------------------------------------------------------------------- | |// Handler for callback functions. - |// Saveregs already performed. Callback slot number in [sp], g in r12. + |// Saveregs already performed. Callback slot number in w9, g in x10. |->vm_ffi_callback: |.if FFI |.type CTSTATE, CTState, PC @@ -2215,7 +2239,7 @@ static void build_subroutines(BuildCtx *ctx) |.if FFI | .type CCSTATE, CCallState, x19 | sp_auth - | stp x20, CCSTATE, [sp, #-32]! + | stp_unwind CCSTATE, x20, [sp, #-32]! 
| stp fp, lr, [sp, #16] | add fp, sp, #16 | mov CCSTATE, x0 @@ -2247,7 +2271,7 @@ static void build_subroutines(BuildCtx *ctx) | stp d0, d1, CCSTATE->fpr[0] | stp d2, d3, CCSTATE->fpr[2] | ldp fp, lr, [sp, #16] - | ldp x20, CCSTATE, [sp], #32 + | ldp_unwind CCSTATE, x20, [sp], #32 | ret_auth |.endif |// Note: vm_ffi_call must be the last function in this object file! From 18b8fd8de794d1de7c3193189f42c5b0534043f5 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 15 Sep 2023 05:27:29 +0200 Subject: [PATCH 33/95] ARM64: External unwinder already restores non-volatile registers. Thanks to Peter Cawley. #593 --- src/vm_arm64.dasc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index d622d2a0..34d29982 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -432,24 +432,24 @@ static void build_subroutines(BuildCtx *ctx) | // (void *cframe, int errcode) | mov sp, CARG1 | mov CRET1, CARG2 - |->vm_unwind_c_eh: // Landing pad for external unwinder. | ldr L, SAVE_L - | mv_vmstate TMP0w, C | ldr GL, L->glref + |->vm_unwind_c_eh: // Landing pad for external unwinder. + | mv_vmstate TMP0w, C | st_vmstate TMP0w | b ->vm_leave_unw | |->vm_unwind_ff: // Unwind C stack, return from ff pcall. | // (void *cframe) | and sp, CARG1, #CFRAME_RAWMASK - |->vm_unwind_ff_eh: // Landing pad for external unwinder. | ldr L, SAVE_L | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48 | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16 | movn TISNIL, #0 + | ldr GL, L->glref // Setup pointer to global state. + |->vm_unwind_ff_eh: // Landing pad for external unwinder. | mov RC, #16 // 2 results: false + error message. | ldr BASE, L->base - | ldr GL, L->glref // Setup pointer to global state. | mov_false TMP0 | sub RA, BASE, #8 // Results start at BASE-8. | ldr PC, [BASE, FRAME_PC] // Fetch PC of previous frame. 
From 7a1c139569874f371f567d060738a3f5704930a1 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 15 Sep 2023 05:31:26 +0200 Subject: [PATCH 34/95] Windows: Pass scratch CONTEXT record to RtlUnwindEx. Thanks to Peter Cawley. #593 --- src/lj_err.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lj_err.c b/src/lj_err.c index 8ef51bf2..9677a1b0 100644 --- a/src/lj_err.c +++ b/src/lj_err.c @@ -285,8 +285,8 @@ LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec, /* Don't catch access violations etc. */ return 1; /* ExceptionContinueSearch */ } -#if LJ_TARGET_X86 UNUSED(ctx); +#if LJ_TARGET_X86 UNUSED(dispatch); /* Call all handlers for all lower C frames (including ourselves) again ** with EH_UNWINDING set. Then call the specified function, passing cf @@ -304,7 +304,8 @@ LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec, RtlUnwindEx(f, (void *)((cframe_unwind_ff(cf2) && errcode != LUA_YIELD) ? lj_vm_unwind_ff_eh : lj_vm_unwind_c_eh), - rec, (void *)(uintptr_t)errcode, ctx, dispatch->HistoryTable); + rec, (void *)(uintptr_t)errcode, dispatch->ContextRecord, + dispatch->HistoryTable); /* RtlUnwindEx should never return. */ #endif } From bd2d10715165b89d30e46c5075aed725705dfe5b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 15 Sep 2023 05:47:29 +0200 Subject: [PATCH 35/95] Windows: Call C++ destructors without compiling with /EHa. Thanks to Peter Cawley. #593 --- doc/extensions.html | 4 +--- src/lj_err.c | 35 +++++++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/doc/extensions.html b/doc/extensions.html index eb591d1e..a4f20841 100644 --- a/doc/extensions.html +++ b/doc/extensions.html @@ -426,9 +426,7 @@ the toolchain used to compile LuaJIT: on the C stack. The contents of the C++ exception object pass through unmodified.
  • Lua errors can be caught on the C++ side with catch(...). -The corresponding Lua error message can be retrieved from the Lua stack.
    -For MSVC for Windows 64 bit this requires compilation of your C++ code -with /EHa.
  • +The corresponding Lua error message can be retrieved from the Lua stack.
  • Throwing Lua errors across C++ frames is safe. C++ destructors will be called.
  • diff --git a/src/lj_err.c b/src/lj_err.c index 9677a1b0..cadc76bd 100644 --- a/src/lj_err.c +++ b/src/lj_err.c @@ -209,11 +209,6 @@ static void *err_unwind(lua_State *L, void *stopcf, int errcode) ** from 3rd party docs or must be found by trial-and-error. They really ** don't want you to write your own language-specific exception handler ** or to interact gracefully with MSVC. :-( -** -** Apparently MSVC doesn't call C++ destructors for foreign exceptions -** unless you compile your C++ code with /EHa. Unfortunately this means -** catch (...) also catches things like access violations. The use of -** _set_se_translator doesn't really help, because it requires /EHa, too. */ #define WIN32_LEAN_AND_MEAN @@ -270,11 +265,25 @@ LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec, int errcode = LJ_EXCODE_CHECK(rec->ExceptionCode) ? LJ_EXCODE_ERRCODE(rec->ExceptionCode) : LUA_ERRRUN; if ((rec->ExceptionFlags & 6)) { /* EH_UNWINDING|EH_EXIT_UNWIND */ + if (rec->ExceptionCode == STATUS_LONGJUMP && + rec->ExceptionRecord && + LJ_EXCODE_CHECK(rec->ExceptionRecord->ExceptionCode)) { + errcode = LJ_EXCODE_ERRCODE(rec->ExceptionRecord->ExceptionCode); + if ((rec->ExceptionFlags & 0x20)) { /* EH_TARGET_UNWIND */ + /* Unwinding is about to finish; revert the ExceptionCode so that + ** RtlRestoreContext does not try to restore from a _JUMP_BUFFER. + */ + rec->ExceptionCode = 0; + } + } /* Unwind internal frames. */ err_unwind(L, cf, errcode); } else { void *cf2 = err_unwind(L, cf, 0); if (cf2) { /* We catch it, so start unwinding the upper frames. */ +#if !LJ_TARGET_X86 + EXCEPTION_RECORD rec2; +#endif if (rec->ExceptionCode == LJ_MSVC_EXCODE || rec->ExceptionCode == LJ_GCC_EXCODE) { #if !LJ_TARGET_CYGWIN @@ -285,8 +294,8 @@ LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec, /* Don't catch access violations etc. 
*/ return 1; /* ExceptionContinueSearch */ } - UNUSED(ctx); #if LJ_TARGET_X86 + UNUSED(ctx); UNUSED(dispatch); /* Call all handlers for all lower C frames (including ourselves) again ** with EH_UNWINDING set. Then call the specified function, passing cf @@ -297,6 +306,20 @@ LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec, (void *)lj_vm_unwind_ff : (void *)lj_vm_unwind_c, errcode); /* lj_vm_rtlunwind does not return. */ #else + if (LJ_EXCODE_CHECK(rec->ExceptionCode)) { + /* For unwind purposes, wrap the EXCEPTION_RECORD in something that + ** looks like a longjmp, so that MSVC will execute C++ destructors in + ** the frames we unwind over. ExceptionInformation[0] should really + ** contain a _JUMP_BUFFER*, but hopefully nobody is looking too closely + ** at this point. + */ + rec2.ExceptionCode = STATUS_LONGJUMP; + rec2.ExceptionRecord = rec; + rec2.ExceptionAddress = 0; + rec2.NumberParameters = 1; + rec2.ExceptionInformation[0] = (ULONG_PTR)ctx; + rec = &rec2; + } /* Unwind the stack and call all handlers for all lower C frames ** (including ourselves) again with EH_UNWINDING set. Then set ** stack pointer = f, result = errcode and jump to the specified target. From 7a77a3cd85ed49498cc3b17e70c46ad518aebb72 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 15 Sep 2023 06:10:58 +0200 Subject: [PATCH 36/95] Windows/ARM64: Update install docs. --- doc/install.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/install.html b/doc/install.html index be721031..2c685c85 100644 --- a/doc/install.html +++ b/doc/install.html @@ -203,7 +203,7 @@ Or install Microsoft's Visual Studio (MSVC).

     </p>
     <h3>Building with MSVC</h3>
     <p>
    -Open a "Visual Studio Command Prompt" (either x86 or x64), <tt>cd</tt> to the
    +Open a "Visual Studio Command Prompt" (x86, x64 or ARM64), <tt>cd</tt> to the
     directory with the source code and run these commands:
     </p>
     <pre class="code">
    From 42ca6e120feebca85f1618da1c80cfa80b1d63ca Mon Sep 17 00:00:00 2001
    From: Mike Pall <mike>
    Date: Sun, 17 Sep 2023 10:09:58 +0200
    Subject: [PATCH 37/95] ARM64: Set fixed interpreter registers before rethrow.
    
    Thanks to Peter Cawley. #593
    ---
     src/vm_arm64.dasc | 8 ++++----
     1 file changed, 4 insertions(+), 4 deletions(-)
    
    diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc
    index 34d29982..61a3ba6d 100644
    --- a/src/vm_arm64.dasc
    +++ b/src/vm_arm64.dasc
    @@ -2029,13 +2029,13 @@ static void build_subroutines(BuildCtx *ctx)
       |.if JIT
       |  ldr L, SAVE_L
       |1:
    +  |   movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
    +  |   movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
    +  |   movn TISNIL, #0
       |  cmn CARG1w, #LUA_ERRERR
       |  bhs >9				// Check for error from exit.
    -  |   lsl RC, CARG1, #3
       |  ldr LFUNC:CARG2, [BASE, FRAME_FUNC]
    -  |    movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
    -  |    movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
    -  |    movn TISNIL, #0
    +  |   lsl RC, CARG1, #3
       |  and LFUNC:CARG2, CARG2, #LJ_GCVMASK
       |   str RCw, SAVE_MULTRES
       |   str BASE, L->base
    
    From 7a2b83a0c5d980bf3db0aeda33c79e7bb4b3da01 Mon Sep 17 00:00:00 2001
    From: Mike Pall <mike>
    Date: Sun, 17 Sep 2023 10:31:00 +0200
    Subject: [PATCH 38/95] IR_MIN/IR_MAX is non-commutative due to underlying FPU
     ops.
    
    Thanks to Peter Cawley. #1082
    ---
     src/lj_ir.h | 4 ++--
     1 file changed, 2 insertions(+), 2 deletions(-)
    
    diff --git a/src/lj_ir.h b/src/lj_ir.h
    index b32bd095..dbfba258 100644
    --- a/src/lj_ir.h
    +++ b/src/lj_ir.h
    @@ -76,8 +76,8 @@
       \
       _(ABS,	N , ref, ref) \
       _(LDEXP,	N , ref, ref) \
    -  _(MIN,	C , ref, ref) \
    -  _(MAX,	C , ref, ref) \
    +  _(MIN,	N , ref, ref) \
    +  _(MAX,	N , ref, ref) \
       _(FPMATH,	N , ref, lit) \
       \
       /* Overflow-checking arithmetic ops. */ \
    
    From e897c5743f97a6b05c59852709092e7da4119914 Mon Sep 17 00:00:00 2001
    From: Mike Pall <mike>
    Date: Sun, 17 Sep 2023 10:44:04 +0200
    Subject: [PATCH 39/95] Windows/ARM64: Add MSVC cross-build support for x64 to
     ARM64.
    
    Thanks to invertego. #1081
    ---
     doc/install.html  |  3 +++
     src/msvcbuild.bat | 15 ++++++++++++++-
     2 files changed, 17 insertions(+), 1 deletion(-)
    
    diff --git a/doc/install.html b/doc/install.html
    index 2c685c85..04bfe26d 100644
    --- a/doc/install.html
    +++ b/doc/install.html
    @@ -214,6 +214,9 @@ msvcbuild
     Check the <tt>msvcbuild.bat</tt> file for more options.
     Then follow the installation instructions below.
     </p>
    +<p>
    +For an x64 to ARM64 cross-build run this first: <tt>vcvarsall.bat x64_arm64</tt>
    +</p>
     <h3>Building with MinGW or Cygwin</h3>
     <p>
    Open a command prompt window and make sure the MinGW or Cygwin programs diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat index 2cfcf26e..cd25beee 100644 --- a/src/msvcbuild.bat +++ b/src/msvcbuild.bat @@ -27,12 +27,15 @@ @set BUILDTYPE=release @set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c lib_buffer.c +@setlocal +@call :SETHOSTVARS %LJCOMPILE% host\minilua.c @if errorlevel 1 goto :BAD %LJLINK% /out:minilua.exe minilua.obj @if errorlevel 1 goto :BAD if exist minilua.exe.manifest^ %LJMT% -manifest minilua.exe.manifest -outputresource:minilua.exe +@endlocal @set DASMFLAGS=-D WIN -D JIT -D FFI -D ENDIAN_LE -D FPU -D P64 @set LJARCH=x64 @@ -46,6 +49,7 @@ if exist minilua.exe.manifest^ :NO32 @if "%VSCMD_ARG_TGT_ARCH%" neq "arm64" goto :X64 @set DASC=vm_arm64.dasc +@set DASMTARGET=-D LUAJIT_TARGET=LUAJIT_ARCH_ARM64 @set LJARCH=arm64 @goto :DA :X64 @@ -60,12 +64,15 @@ minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h %DASC% if exist ..\.git ( git show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt ) minilua host\genversion.lua -%LJCOMPILE% /I "." /I %DASMDIR% host\buildvm*.c +@setlocal +@call :SETHOSTVARS +%LJCOMPILE% /I "." /I %DASMDIR% %DASMTARGET% host\buildvm*.c @if errorlevel 1 goto :BAD %LJLINK% /out:buildvm.exe buildvm*.obj @if errorlevel 1 goto :BAD if exist buildvm.exe.manifest^ %LJMT% -manifest buildvm.exe.manifest -outputresource:buildvm.exe +@endlocal buildvm -m peobj -o lj_vm.obj @if errorlevel 1 goto :BAD @@ -124,6 +131,12 @@ if exist luajit.exe.manifest^ @echo. @echo === Successfully built LuaJIT for Windows/%LJARCH% === +@goto :END +:SETHOSTVARS +@if "%VSCMD_ARG_HOST_ARCH%_%VSCMD_ARG_TGT_ARCH%" equ "x64_arm64" ( + call "%VSINSTALLDIR%Common7\Tools\VsDevCmd.bat" -arch=%VSCMD_ARG_HOST_ARCH% -no_logo + echo on +) @goto :END :BAD @echo. 
From d2f6c55b05c716e5dbb479b7e684abaee7cf6e12 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 21 Sep 2023 01:58:43 +0200 Subject: [PATCH 40/95] Cleanup stack overflow handling. Reported by Peter Cawley. #962 --- src/lj_state.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/lj_state.c b/src/lj_state.c index d7befaff..1a3473b4 100644 --- a/src/lj_state.c +++ b/src/lj_state.c @@ -97,8 +97,17 @@ void lj_state_shrinkstack(lua_State *L, MSize used) void LJ_FASTCALL lj_state_growstack(lua_State *L, MSize need) { MSize n; - if (L->stacksize > LJ_STACK_MAXEX) /* Overflow while handling overflow? */ - lj_err_throw(L, LUA_ERRERR); + if (L->stacksize >= LJ_STACK_MAXEX) { + /* 4. Throw 'error in error handling' when we are _over_ the limit. */ + if (L->stacksize > LJ_STACK_MAXEX) + lj_err_throw(L, LUA_ERRERR); /* Does not invoke an error handler. */ + /* 1. We are _at_ the limit after the last growth. */ + if (!L->status) { /* 2. Throw 'stack overflow'. */ + L->status = LUA_ERRRUN; /* Prevent ending here again for pushed msg. */ + lj_err_msg(L, LJ_ERR_STKOV); /* May invoke an error handler. */ + } + /* 3. Add space (over the limit) for pushed message and error handler. */ + } n = L->stacksize + need; if (n > LJ_STACK_MAX) { n += 2*LUA_MINSTACK; @@ -108,8 +117,6 @@ void LJ_FASTCALL lj_state_growstack(lua_State *L, MSize need) n = LJ_STACK_MAX; } resizestack(L, n); - if (L->stacksize >= LJ_STACK_MAXEX) - lj_err_msg(L, LJ_ERR_STKOV); } void LJ_FASTCALL lj_state_growstack1(lua_State *L) From 92b89d005ab721a61bce6d471b052bcb236b81d7 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 21 Sep 2023 02:10:18 +0200 Subject: [PATCH 41/95] Add missing coercion when recording select(string, ...) Thanks to Peter Cawley. 
#1083 --- src/lj_record.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lj_record.c b/src/lj_record.c index dfcc3f65..a49f942a 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -1570,8 +1570,11 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults) TRef tr = TREF_NIL; ptrdiff_t idx = lj_ffrecord_select_mode(J, tridx, &J->L->base[dst-1]); if (idx < 0) goto nyivarg; - if (idx != 0 && !tref_isinteger(tridx)) + if (idx != 0 && !tref_isinteger(tridx)) { + if (tref_isstr(tridx)) + tridx = emitir(IRTG(IR_STRTO, IRT_NUM), tridx, 0); tridx = emitir(IRTGI(IR_CONV), tridx, IRCONV_INT_NUM|IRCONV_INDEX); + } if (idx != 0 && tref_isk(tridx)) { emitir(IRTGI(idx <= nvararg ? IR_GE : IR_LT), fr, lj_ir_kint(J, frofs+8*(int32_t)idx)); From b138ccfa918518a152bc830fef3d53cd0a922e36 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 21 Sep 2023 02:15:16 +0200 Subject: [PATCH 42/95] Handle all stack layouts in (delayed) TRACE vmevent. Thanks to Sergey Bronnikov and Peter Cawley. #1087 --- src/lj_trace.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/lj_trace.c b/src/lj_trace.c index a72e73a3..25e610b5 100644 --- a/src/lj_trace.c +++ b/src/lj_trace.c @@ -524,21 +524,27 @@ static int trace_abort(jit_State *J) J->cur.link = 0; J->cur.linktype = LJ_TRLINK_NONE; lj_vmevent_send(L, TRACE, - TValue *frame; + cTValue *bot = tvref(L->stack); + cTValue *frame; const BCIns *pc; - GCfunc *fn; + BCPos pos = 0; setstrV(L, L->top++, lj_str_newlit(L, "abort")); setintV(L->top++, traceno); /* Find original Lua function call to generate a better error message. */ - frame = J->L->base-1; - pc = J->pc; - while (!isluafunc(frame_func(frame))) { - pc = (frame_iscont(frame) ? 
frame_contpc(frame) : frame_pc(frame)) - 1; - frame = frame_prev(frame); + for (frame = J->L->base-1, pc = J->pc; ; frame = frame_prev(frame)) { + if (isluafunc(frame_func(frame))) { + pos = proto_bcpos(funcproto(frame_func(frame)), pc); + break; + } else if (frame_prev(frame) <= bot) { + break; + } else if (frame_iscont(frame)) { + pc = frame_contpc(frame) - 1; + } else { + pc = frame_pc(frame) - 1; + } } - fn = frame_func(frame); - setfuncV(L, L->top++, fn); - setintV(L->top++, proto_bcpos(funcproto(fn), pc)); + setfuncV(L, L->top++, frame_func(frame)); + setintV(L->top++, pos); copyTV(L, L->top++, restorestack(L, errobj)); copyTV(L, L->top++, &J->errinfo); ); From fca1f51bf8209a41f8d7cd13ff09f113ac0d87b6 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 21 Sep 2023 02:38:29 +0200 Subject: [PATCH 43/95] ARM64: Fuse negative 32 bit constants into arithmetic ops again. Thanks to Peter Cawley. #1065 --- src/lj_asm_arm64.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index c2b17737..8673f7df 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -222,7 +222,8 @@ static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow) return A64F_M(ir->r); } else if (irref_isk(ref)) { int64_t k = get_k64val(as, ref); - uint32_t m = logical ? emit_isk13(k, irt_is64(ir->t)) : emit_isk12(k); + uint32_t m = logical ? emit_isk13(k, irt_is64(ir->t)) : + emit_isk12(irt_is64(ir->t) ? k : (int32_t)k); if (m) return m; } else if (mayfuse(as, ref)) { From 91592899275cbb540ca67bbf95b41a2200e4fdbd Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 21 Sep 2023 02:48:12 +0200 Subject: [PATCH 44/95] ARM64: Fix IR_HREF code generation for constant FP keys. Reported by swarn. Fix for 435d8c63 by Peter Cawley. 
#1090 --- src/lj_asm_arm64.h | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 8673f7df..82f14405 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -787,7 +787,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) int destused = ra_used(ir); Reg dest = ra_dest(as, ir, allow); Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest)); - Reg key = 0, tmp = RID_TMP, type = RID_NONE, tkey; + Reg tmp = RID_TMP, type = RID_NONE, key, tkey; IRRef refkey = ir->op2; IRIns *irkey = IR(refkey); int isk = irref_isk(refkey); @@ -797,26 +797,22 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) MCLabel l_end, l_loop; rset_clear(allow, tab); - /* Allocate registers outside of the loop. */ - if (irkey->o != IR_KNUM || !(k = emit_isk12((int64_t)ir_knum(irkey)->u64))) { - key = ra_alloc1(as, refkey, irt_isnum(kt) ? RSET_FPR : allow); - rset_clear(allow, key); - } - if (!isk) { - tkey = ra_scratch(as, allow); - rset_clear(allow, tkey); - } else if (irt_isnum(kt)) { - tkey = key; /* Assumes -0.0 is already canonicalized to +0.0. */ - } else { + /* Allocate register for tkey outside of the loop. */ + if (isk) { int64_t kk; if (irt_isaddr(kt)) { kk = ((int64_t)irt_toitype(kt) << 47) | irkey[1].tv.u64; + } else if (irt_isnum(kt)) { + kk = (int64_t)ir_knum(irkey)->u64; + /* Assumes -0.0 is already canonicalized to +0.0. */ } else { lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type"); kk = ~((int64_t)~irt_toitype(kt) << 47); } - tkey = ra_allock(as, kk, allow); - rset_clear(allow, tkey); + k = emit_isk12(kk); + tkey = k ? 0 : ra_allock(as, kk, allow); + } else { + tkey = ra_scratch(as, allow); } /* Key not found in chain: jump to exit (if merged) or load niltv. */ @@ -849,10 +845,13 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) /* Construct tkey as canonicalized or tagged key. 
*/ if (!isk) { if (irt_isnum(kt)) { + key = ra_alloc1(as, refkey, RSET_FPR); emit_dnm(as, A64I_CSELx | A64F_CC(CC_EQ), tkey, RID_ZERO, tkey); + /* A64I_FMOV_R_D from key to tkey done below. */ } else { lj_assertA(irt_isaddr(kt), "bad HREF key type"); - type = ra_allock(as, irt_toitype(kt) << 15, allow); + key = ra_alloc1(as, refkey, allow); + type = ra_allock(as, irt_toitype(kt) << 15, rset_clear(allow, key)); emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 32), tkey, key, type); } } From b8919781d4717d8c3171b0002d230e03304d8174 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 21 Sep 2023 03:46:33 +0200 Subject: [PATCH 45/95] Consistently use 64 bit constants for 64 bit IR instructions. Thanks to Peter Cawley. #1084 --- src/lj_asm_x86.h | 3 ++- src/lj_ffrecord.c | 23 +++++++++++------------ src/lj_iropt.h | 6 ++++++ src/lj_record.c | 9 +++++---- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 9f779bf5..c92de3d8 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -140,7 +140,8 @@ static IRRef asm_fuseabase(ASMState *as, IRRef ref) } } else if (irb->o == IR_ADD && irref_isk(irb->op2)) { /* Fuse base offset (vararg load). */ - as->mrm.ofs = IR(irb->op2)->i; + IRIns *irk = IR(irb->op2); + as->mrm.ofs = irk->o == IR_KINT ? irk->i : (int32_t)ir_kint64(irk)->u64; return irb->op1; } return ref; /* Otherwise use the given array base. */ diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index 8ebf4165..1233e5f7 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -1130,7 +1130,7 @@ static TRef recff_sbufx_check(jit_State *J, RecordFFData *rd, ptrdiff_t arg) /* Emit BUFHDR for write to extended string buffer. 
*/ static TRef recff_sbufx_write(jit_State *J, TRef ud) { - TRef trbuf = emitir(IRT(IR_ADD, IRT_PGC), ud, lj_ir_kint(J, sizeof(GCudata))); + TRef trbuf = emitir(IRT(IR_ADD, IRT_PGC), ud, lj_ir_kintpgc(J, sizeof(GCudata))); return emitir(IRT(IR_BUFHDR, IRT_PGC), trbuf, IRBUFHDR_WRITE); } @@ -1164,20 +1164,19 @@ static void LJ_FASTCALL recff_buffer_method_reset(jit_State *J, RecordFFData *rd SBufExt *sbx = bufV(&rd->argv[0]); int iscow = (int)sbufiscow(sbx); TRef trl = recff_sbufx_get_L(J, ud); - TRef trcow = emitir(IRT(IR_BAND, IRT_IGC), trl, lj_ir_kint(J, SBUF_FLAG_COW)); - TRef zero = lj_ir_kint(J, 0); - emitir(IRTG(iscow ? IR_NE : IR_EQ, IRT_IGC), trcow, zero); + TRef trcow = emitir(IRT(IR_BAND, IRT_IGC), trl, lj_ir_kintpgc(J, SBUF_FLAG_COW)); + TRef zeropgc = lj_ir_kintpgc(J, 0); + emitir(IRTG(iscow ? IR_NE : IR_EQ, IRT_IGC), trcow, zeropgc); if (iscow) { - trl = emitir(IRT(IR_BXOR, IRT_IGC), trl, - LJ_GC64 ? lj_ir_kint64(J, SBUF_FLAG_COW) : - lj_ir_kint(J, SBUF_FLAG_COW)); - recff_sbufx_set_ptr(J, ud, IRFL_SBUF_W, zero); - recff_sbufx_set_ptr(J, ud, IRFL_SBUF_E, zero); - recff_sbufx_set_ptr(J, ud, IRFL_SBUF_B, zero); + TRef zerop = lj_ir_kintp(J, 0); + trl = emitir(IRT(IR_BXOR, IRT_IGC), trl, lj_ir_kintpgc(J, SBUF_FLAG_COW)); + recff_sbufx_set_ptr(J, ud, IRFL_SBUF_W, zerop); + recff_sbufx_set_ptr(J, ud, IRFL_SBUF_E, zerop); + recff_sbufx_set_ptr(J, ud, IRFL_SBUF_B, zerop); recff_sbufx_set_L(J, ud, trl); emitir(IRT(IR_FSTORE, IRT_PGC), - emitir(IRT(IR_FREF, IRT_PGC), ud, IRFL_SBUF_REF), zero); - recff_sbufx_set_ptr(J, ud, IRFL_SBUF_R, zero); + emitir(IRT(IR_FREF, IRT_PGC), ud, IRFL_SBUF_REF), zeropgc); + recff_sbufx_set_ptr(J, ud, IRFL_SBUF_R, zerop); } else { TRef trb = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_B); recff_sbufx_set_ptr(J, ud, IRFL_SBUF_W, trb); diff --git a/src/lj_iropt.h b/src/lj_iropt.h index 458a5511..a71a717b 100644 --- a/src/lj_iropt.h +++ b/src/lj_iropt.h @@ -56,6 +56,12 @@ LJ_FUNC TRef lj_ir_ktrace(jit_State *J); #define lj_ir_kintp(J, k) 
lj_ir_kint(J, (int32_t)(k)) #endif +#if LJ_GC64 +#define lj_ir_kintpgc lj_ir_kintp +#else +#define lj_ir_kintpgc lj_ir_kint +#endif + static LJ_AINLINE TRef lj_ir_knum(jit_State *J, lua_Number n) { TValue tv; diff --git a/src/lj_record.c b/src/lj_record.c index 7a970628..d44f7737 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -1781,7 +1781,7 @@ noconstify: emitir(IRTG(IR_EQ, IRT_PGC), REF_BASE, emitir(IRT(IR_ADD, IRT_PGC), uref, - lj_ir_kint(J, (slot - 1 - LJ_FR2) * -8))); + lj_ir_kintpgc(J, (slot - 1 - LJ_FR2) * -8))); slot -= (int32_t)J->baseslot; /* Note: slot number may be negative! */ if (val == 0) { return getslot(J, slot); @@ -1794,7 +1794,7 @@ noconstify: } emitir(IRTG(IR_UGT, IRT_PGC), emitir(IRT(IR_SUB, IRT_PGC), uref, REF_BASE), - lj_ir_kint(J, (J->baseslot + J->maxslot) * 8)); + lj_ir_kintpgc(J, (J->baseslot + J->maxslot) * 8)); } else { needbarrier = 1; uref = tref_ref(emitir(IRTG(IR_UREFC, IRT_PGC), fn, uv)); @@ -1972,7 +1972,8 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults) emitir(IRTGI(IR_EQ), fr, lj_ir_kint(J, (int32_t)frame_ftsz(J->L->base-1))); vbase = emitir(IRT(IR_SUB, IRT_IGC), REF_BASE, fr); - vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase, lj_ir_kint(J, frofs-8*(1+LJ_FR2))); + vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase, + lj_ir_kintpgc(J, frofs-8*(1+LJ_FR2))); for (i = 0; i < nload; i++) { IRType t = itype2irt(&J->L->base[i-1-LJ_FR2-nvararg]); J->base[dst+i] = lj_record_vload(J, vbase, (MSize)i, t); @@ -2023,7 +2024,7 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults) IRType t; TRef aref, vbase = emitir(IRT(IR_SUB, IRT_IGC), REF_BASE, fr); vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase, - lj_ir_kint(J, frofs-(8<<LJ_FR2))); + lj_ir_kintpgc(J, frofs-(8<<LJ_FR2))); t = itype2irt(&J->L->base[idx-2-LJ_FR2-nvararg]); aref = emitir(IRT(IR_AREF, IRT_PGC), vbase, tridx); tr = lj_record_vload(J, aref, 0, t); From e86990f7f24a94b0897061f25a84547fe1108bed Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 21 Sep 2023 03:54:08 +0200 Subject: [PATCH 46/95] Restore cur_L for specific
Lua/C API use case. Thanks to Peter Cawley. #1066 --- src/lj_err.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lj_err.c b/src/lj_err.c index cadc76bd..7b11e4d0 100644 --- a/src/lj_err.c +++ b/src/lj_err.c @@ -174,12 +174,15 @@ static void *err_unwind(lua_State *L, void *stopcf, int errcode) case FRAME_PCALL: /* FF pcall() frame. */ case FRAME_PCALLH: /* FF pcall() frame inside hook. */ if (errcode) { + global_State *g; if (errcode == LUA_YIELD) { frame = frame_prevd(frame); break; } + g = G(L); + setgcref(g->cur_L, obj2gco(L)); if (frame_typep(frame) == FRAME_PCALL) - hook_leave(G(L)); + hook_leave(g); L->base = frame_prevd(frame) + 1; L->cframe = cf; unwindstack(L, L->base); From a5d2f70c73e406beb617afa829a7af5b8c1d842c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 21 Sep 2023 04:40:48 +0200 Subject: [PATCH 47/95] Handle OOM error on stack resize in coroutine.resume and lua_checkstack. Thanks to Peter Cawley. #1066 --- src/lib_base.c | 5 ++++- src/lj_api.c | 7 ++++++- src/lj_state.c | 12 ++++++++++++ src/lj_state.h | 1 + 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/lib_base.c b/src/lib_base.c index dd54b9f9..4e6f8a30 100644 --- a/src/lib_base.c +++ b/src/lib_base.c @@ -616,7 +616,10 @@ static int ffh_resume(lua_State *L, lua_State *co, int wrap) setstrV(L, L->base-LJ_FR2, lj_err_str(L, em)); return FFH_RES(2); } - lj_state_growstack(co, (MSize)(L->top - L->base)); + if (lj_state_cpgrowstack(co, (MSize)(L->top - L->base)) != LUA_OK) { + cTValue *msg = --co->top; + lj_err_callermsg(L, strVdata(msg)); + } return FFH_RETRY; } diff --git a/src/lj_api.c b/src/lj_api.c index 386bcada..d4048d79 100644 --- a/src/lj_api.c +++ b/src/lj_api.c @@ -104,7 +104,12 @@ LUA_API int lua_checkstack(lua_State *L, int size) if (size > LUAI_MAXCSTACK || (L->top - L->base + size) > LUAI_MAXCSTACK) { return 0; /* Stack overflow. 
*/ } else if (size > 0) { - lj_state_checkstack(L, (MSize)size); + int avail = (int)(mref(L->maxstack, TValue) - L->top); + if (size > avail && + lj_state_cpgrowstack(L, (MSize)(size - avail)) != LUA_OK) { + L->top--; + return 0; /* Out of memory. */ + } } return 1; } diff --git a/src/lj_state.c b/src/lj_state.c index 6b3f58ff..569e3f38 100644 --- a/src/lj_state.c +++ b/src/lj_state.c @@ -130,6 +130,18 @@ void LJ_FASTCALL lj_state_growstack1(lua_State *L) lj_state_growstack(L, 1); } +static TValue *cpgrowstack(lua_State *co, lua_CFunction dummy, void *ud) +{ + UNUSED(dummy); + lj_state_growstack(co, *(MSize *)ud); + return NULL; +} + +int LJ_FASTCALL lj_state_cpgrowstack(lua_State *L, MSize need) +{ + return lj_vm_cpcall(L, NULL, &need, cpgrowstack); +} + /* Allocate basic stack for new state. */ static void stack_init(lua_State *L1, lua_State *L) { diff --git a/src/lj_state.h b/src/lj_state.h index db67f03b..3850e5a1 100644 --- a/src/lj_state.h +++ b/src/lj_state.h @@ -18,6 +18,7 @@ LJ_FUNC void lj_state_relimitstack(lua_State *L); LJ_FUNC void lj_state_shrinkstack(lua_State *L, MSize used); LJ_FUNCA void LJ_FASTCALL lj_state_growstack(lua_State *L, MSize need); LJ_FUNC void LJ_FASTCALL lj_state_growstack1(lua_State *L); +LJ_FUNC int LJ_FASTCALL lj_state_cpgrowstack(lua_State *L, MSize need); static LJ_AINLINE void lj_state_checkstack(lua_State *L, MSize need) { From aa6b15c1a8922848bd6f596ba384824ca3fe0f5f Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 21 Sep 2023 04:43:40 +0200 Subject: [PATCH 48/95] Follow-up fix for stack overflow handling cleanup. --- src/lj_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_state.c b/src/lj_state.c index 1a3473b4..c2f0b115 100644 --- a/src/lj_state.c +++ b/src/lj_state.c @@ -102,7 +102,7 @@ void LJ_FASTCALL lj_state_growstack(lua_State *L, MSize need) if (L->stacksize > LJ_STACK_MAXEX) lj_err_throw(L, LUA_ERRERR); /* Does not invoke an error handler. */ /* 1. 
We are _at_ the limit after the last growth. */ - if (!L->status) { /* 2. Throw 'stack overflow'. */ + if (L->status < LUA_ERRRUN) { /* 2. Throw 'stack overflow'. */ L->status = LUA_ERRRUN; /* Prevent ending here again for pushed msg. */ lj_err_msg(L, LJ_ERR_STKOV); /* May invoke an error handler. */ } From d1a2fef8a8f53b0055ee041f7f63d83a27444ffa Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 21 Sep 2023 05:19:55 +0200 Subject: [PATCH 49/95] LJ_FR2: Fix stack checks in vararg calls. Thanks to Peter Cawley. #1048 --- src/lj_def.h | 2 +- src/lj_dispatch.c | 2 +- src/vm_arm64.dasc | 1 + src/vm_mips64.dasc | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lj_def.h b/src/lj_def.h index 1461d3d7..0d6c346b 100644 --- a/src/lj_def.h +++ b/src/lj_def.h @@ -69,7 +69,7 @@ typedef unsigned int uintptr_t; #define LJ_MAX_UPVAL 60 /* Max. # of upvalues. */ #define LJ_MAX_IDXCHAIN 100 /* __index/__newindex chain limit. */ -#define LJ_STACK_EXTRA (5+2*LJ_FR2) /* Extra stack space (metamethods). */ +#define LJ_STACK_EXTRA (5+3*LJ_FR2) /* Extra stack space (metamethods). */ #define LJ_NUM_CBPAGE 1 /* Number of FFI callback pages. */ diff --git a/src/lj_dispatch.c b/src/lj_dispatch.c index 57809e62..b9748bba 100644 --- a/src/lj_dispatch.c +++ b/src/lj_dispatch.c @@ -453,7 +453,7 @@ static int call_init(lua_State *L, GCfunc *fn) int numparams = pt->numparams; int gotparams = (int)(L->top - L->base); int need = pt->framesize; - if ((pt->flags & PROTO_VARARG)) need += 1+gotparams; + if ((pt->flags & PROTO_VARARG)) need += 1+LJ_FR2+gotparams; lj_state_checkstack(L, (MSize)need); numparams -= gotparams; return numparams >= 0 ? 
numparams : 0; diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index 61a3ba6d..3044a8ac 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -3916,6 +3916,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | add TMP2, BASE, RC | add LFUNC:CARG3, CARG3, TMP0, lsl #47 | add RA, RA, RC + | sub CARG1, CARG1, #8 | add TMP0, RC, #16+FRAME_VARG | str LFUNC:CARG3, [TMP2], #8 // Store (tagged) copy of LFUNC. | ldr KBASE, [PC, #-4+PC2PROTO(k)] diff --git a/src/vm_mips64.dasc b/src/vm_mips64.dasc index 6c215f2b..ef0d901d 100644 --- a/src/vm_mips64.dasc +++ b/src/vm_mips64.dasc @@ -5396,6 +5396,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | settp LFUNC:RB, TMP0 | daddu TMP0, RA, RC | sd LFUNC:RB, 0(TMP1) // Store (tagged) copy of LFUNC. + | daddiu TMP2, TMP2, -8 | daddiu TMP3, RC, 16+FRAME_VARG | sltu AT, TMP0, TMP2 | ld KBASE, -4+PC2PROTO(k)(PC) From f72c19e482b6f918b7cf42b0436e2b117d160a29 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 22 Sep 2023 21:04:22 +0200 Subject: [PATCH 50/95] Maintain chain invariant in DCE. Thanks to Peter Cawley. #1094 --- src/lj_opt_dce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_opt_dce.c b/src/lj_opt_dce.c index c6c3e1bc..e6fcc552 100644 --- a/src/lj_opt_dce.c +++ b/src/lj_opt_dce.c @@ -44,12 +44,12 @@ static void dce_propagate(jit_State *J) IRIns *ir = IR(ins); if (irt_ismarked(ir->t)) { irt_clearmark(ir->t); - pchain[ir->o] = &ir->prev; } else if (!ir_sideeff(ir)) { *pchain[ir->o] = ir->prev; /* Reroute original instruction chain. */ lj_ir_nop(ir); continue; } + pchain[ir->o] = &ir->prev; if (ir->op1 >= REF_FIRST) irt_setmark(IR(ir->op1)->t); if (ir->op2 >= REF_FIRST) irt_setmark(IR(ir->op2)->t); } From becf5cc65d966a8926466dd43407c48bfea0fa13 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 25 Sep 2023 16:56:17 +0200 Subject: [PATCH 51/95] FFI: Fix ffi.abi("pauth"). Thanks to Peter Cawley. 
#1098 --- src/lib_ffi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib_ffi.c b/src/lib_ffi.c index 6dee2e74..ba783173 100644 --- a/src/lib_ffi.c +++ b/src/lib_ffi.c @@ -746,7 +746,7 @@ LJLIB_CF(ffi_abi) LJLIB_REC(.) "\003win" #endif #if LJ_ABI_PAUTH - "\007pauth" + "\005pauth" #endif #if LJ_TARGET_UWP "\003uwp" From 007e4dce13673b01a38b19384f54fa50a79a66de Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 8 Oct 2023 21:17:43 +0200 Subject: [PATCH 52/95] ARM64: Restore fp before sp in C stack unwinders. Thanks to Peter Cawley. #1096 --- src/host/buildvm_peobj.c | 3 ++- src/vm_arm64.dasc | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/host/buildvm_peobj.c b/src/host/buildvm_peobj.c index 7ce3b05a..8f04c496 100644 --- a/src/host/buildvm_peobj.c +++ b/src/host/buildvm_peobj.c @@ -373,11 +373,12 @@ void emit_peobj(BuildCtx *ctx) /* Unwind codes for .text section with handler. */ p = uwc; + CADD_FP(192); /* +2 */ CSAVE_REGS(19, 28, 176); /* +5*2 */ CSAVE_FREGS(8, 15, 96); /* +4*2 */ CSAVE_FPLR(192); /* +1 */ CALLOC_S(208); /* +1 */ - CEND_ALIGN; /* +1 +3 -> 24 */ + CEND_ALIGN; /* +1 +1 -> 24 */ u32 = ((24u >> 2) << 27) | (1u << 20) | (fcofs >> 2); owrite(ctx, &u32, 4); diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index 3044a8ac..26973686 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -430,6 +430,7 @@ static void build_subroutines(BuildCtx *ctx) | |->vm_unwind_c: // Unwind C stack, return from vm_pcall. | // (void *cframe, int errcode) + | add fp, CARG1, # SAVE_FP_LR_ | mov sp, CARG1 | mov CRET1, CARG2 | ldr L, SAVE_L @@ -441,7 +442,8 @@ static void build_subroutines(BuildCtx *ctx) | |->vm_unwind_ff: // Unwind C stack, return from ff pcall. 
| // (void *cframe) - | and sp, CARG1, #CFRAME_RAWMASK + | add fp, CARG1, # SAVE_FP_LR_ + | mov sp, CARG1 | ldr L, SAVE_L | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48 | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16 From 1e93951b258cdf885779992434201c6114445665 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 8 Oct 2023 21:20:10 +0200 Subject: [PATCH 53/95] ARM64: Fix register hint for FFI calls with FP results. Thanks to Peter Cawley. #1096 --- src/lj_asm_arm64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 82f14405..9f165fa8 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -1985,7 +1985,7 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci) as->evenspill = nslots; } #endif - return REGSP_HINT(RID_RET); + return REGSP_HINT(irt_isfp(ir->t) ? RID_FPRET : RID_RET); } static void asm_setup_target(ASMState *as) From 9cc8bbb7ae3675382d016e33b6d8b022101077b8 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 8 Oct 2023 21:22:50 +0200 Subject: [PATCH 54/95] ARM: Fix register hint for FFI calls with FP results. --- src/lj_asm_arm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h index a003d5ca..ac3d1b58 100644 --- a/src/lj_asm_arm.h +++ b/src/lj_asm_arm.h @@ -2255,7 +2255,7 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci) } if (nslots > as->evenspill) /* Leave room for args in stack slots. */ as->evenspill = nslots; - return REGSP_HINT(RID_RET); + return REGSP_HINT(irt_isfp(ir->t) ? RID_FPRET : RID_RET); } static void asm_setup_target(ASMState *as) From c5b075eb313e7ed4f3184382f6e70bc48b15ec72 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 8 Oct 2023 21:39:40 +0200 Subject: [PATCH 55/95] ARM64: Unify constant register handling in interpreter. Plus minor optimizations. Simplifications for out-of-tree ARM64EC. Thanks to Peter Cawley. 
#1096 --- src/vm_arm64.dasc | 150 ++++++++++++++++++++++------------------------ 1 file changed, 73 insertions(+), 77 deletions(-) diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index 26973686..2aaa64cb 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -291,8 +291,17 @@ | blo target |.endmacro | +|.macro init_constants +| movn TISNIL, #0 +| movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48 +| movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16 +|.endmacro +| |.macro mov_false, reg; movn reg, #0x8000, lsl #32; .endmacro |.macro mov_true, reg; movn reg, #0x0001, lsl #48; .endmacro +|.macro mov_nil, reg; mov reg, TISNIL; .endmacro +|.macro cmp_nil, reg; cmp reg, TISNIL; .endmacro +|.macro add_TISNUM, dst, src; add dst, src, TISNUM; .endmacro | #define GL_J(field) (GG_G2J + (int)offsetof(jit_State, field)) | @@ -445,9 +454,7 @@ static void build_subroutines(BuildCtx *ctx) | add fp, CARG1, # SAVE_FP_LR_ | mov sp, CARG1 | ldr L, SAVE_L - | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48 - | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16 - | movn TISNIL, #0 + | init_constants | ldr GL, L->glref // Setup pointer to global state. |->vm_unwind_ff_eh: // Landing pad for external unwinder. | mov RC, #16 // 2 results: false + error message. @@ -512,11 +519,9 @@ static void build_subroutines(BuildCtx *ctx) | str L, GL->cur_L | mov RA, BASE | ldp BASE, CARG1, L->base - | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48 - | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16 + | init_constants | ldr PC, [BASE, FRAME_PC] | strb wzr, L->status - | movn TISNIL, #0 | sub RC, CARG1, BASE | ands CARG1, PC, #FRAME_TYPE | add RC, RC, #8 @@ -552,10 +557,8 @@ static void build_subroutines(BuildCtx *ctx) |3: // Entry point for vm_cpcall/vm_resume (BASE = base, PC = ftype). | str L, GL->cur_L | ldp RB, CARG1, L->base // RB = old base (for vmeta_call). 
- | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48 - | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16 | add PC, PC, BASE - | movn TISNIL, #0 + | init_constants | sub PC, PC, RB // PC = frame delta + frame type | sub NARGS8:RC, CARG1, BASE | st_vmstate ST_INTERP @@ -664,7 +667,7 @@ static void build_subroutines(BuildCtx *ctx) | b >1 | |->vmeta_tgetb: // RB = table, RC = index - | add RC, RC, TISNUM + | add_TISNUM RC, RC | add CARG2, BASE, RB, lsl #3 | add CARG3, sp, TMPDofs | str RC, TMPD @@ -699,7 +702,7 @@ static void build_subroutines(BuildCtx *ctx) | sxtw CARG2, TMP1w | bl extern lj_tab_getinth // (GCtab *t, int32_t key) | // Returns cTValue * or NULL. - | mov TMP0, TISNIL + | mov_nil TMP0 | cbz CRET1, ->BC_TGETR_Z | ldr TMP0, [CRET1] | b ->BC_TGETR_Z @@ -722,7 +725,7 @@ static void build_subroutines(BuildCtx *ctx) | b >1 | |->vmeta_tsetb: // RB = table, RC = index - | add RC, RC, TISNUM + | add_TISNUM RC, RC | add CARG2, BASE, RB, lsl #3 | add CARG3, sp, TMPDofs | str RC, TMPD @@ -1036,7 +1039,7 @@ static void build_subroutines(BuildCtx *ctx) |1: // Field metatable must be at same offset for GCtab and GCudata! 
| ldr TAB:RB, TAB:CARG1->metatable |2: - | mov CARG1, TISNIL + | mov_nil CARG1 | ldr STR:RC, GL->gcroot[GCROOT_MMNAME+MM_metatable] | cbz TAB:RB, ->fff_restv | ldr TMP1w, TAB:RB->hmask @@ -1058,7 +1061,7 @@ static void build_subroutines(BuildCtx *ctx) | movk CARG1, #(LJ_TTAB>>1)&0xffff, lsl #48 | b ->fff_restv |5: - | cmp TMP0, TISNIL + | cmp_nil TMP0 | bne ->fff_restv | b <4 | @@ -1158,8 +1161,8 @@ static void build_subroutines(BuildCtx *ctx) | cbnz TAB:CARG2, ->fff_fallback #endif | mov RC, #(3+1)*8 - | stp CARG1, TISNIL, [BASE, #-8] - | str CFUNC:CARG4, [BASE, #-16] + | stp CFUNC:CARG4, CARG1, [BASE, #-16] + | str TISNIL, [BASE] | b ->fff_res | |.ffunc_2 ipairs_aux @@ -1171,14 +1174,14 @@ static void build_subroutines(BuildCtx *ctx) | add CARG2w, CARG2w, #1 | cmp CARG2w, TMP1w | ldr PC, [BASE, FRAME_PC] - | add TMP2, CARG2, TISNUM + | add_TISNUM TMP2, CARG2 | mov RC, #(0+1)*8 | str TMP2, [BASE, #-16] | bhs >2 // Not in array part? | ldr TMP0, [CARG3, CARG2, lsl #3] |1: | mov TMP1, #(2+1)*8 - | cmp TMP0, TISNIL + | cmp_nil TMP0 | str TMP0, [BASE, #-8] | csel RC, RC, TMP1, eq | b ->fff_res @@ -1201,8 +1204,8 @@ static void build_subroutines(BuildCtx *ctx) | cbnz TAB:CARG2, ->fff_fallback #endif | mov RC, #(3+1)*8 - | stp CARG1, TISNUM, [BASE, #-8] - | str CFUNC:CARG4, [BASE, #-16] + | stp CFUNC:CARG4, CARG1, [BASE, #-16] + | str TISNUM, [BASE] | b ->fff_res | |//-- Base library: catch errors ---------------------------------------- @@ -1392,7 +1395,7 @@ static void build_subroutines(BuildCtx *ctx) | eor CARG2w, CARG1w, CARG1w, asr #31 | movz CARG3, #0x41e0, lsl #48 // 2^31. | subs CARG1w, CARG2w, CARG1w, asr #31 - | add CARG1, CARG1, TISNUM + | add_TISNUM CARG1, CARG1 | csel CARG1, CARG1, CARG3, pl | // Fallthrough. 
| @@ -1483,7 +1486,7 @@ static void build_subroutines(BuildCtx *ctx) | ldr PC, [BASE, FRAME_PC] | str d0, [BASE, #-16] | mov RC, #(2+1)*8 - | add CARG2, CARG2, TISNUM + | add_TISNUM CARG2, CARG2 | str CARG2, [BASE, #-8] | b ->fff_res | @@ -1549,7 +1552,7 @@ static void build_subroutines(BuildCtx *ctx) | bne ->fff_fallback | ldrb TMP0w, STR:CARG1[1] // Access is always ok (NUL at end). | ldr CARG3w, STR:CARG1->len - | add TMP0, TMP0, TISNUM + | add_TISNUM TMP0, TMP0 | str TMP0, [BASE, #-16] | mov RC, #(0+1)*8 | cbz CARG3, ->fff_res @@ -1695,17 +1698,17 @@ static void build_subroutines(BuildCtx *ctx) |.ffunc_bit tobit | mov TMP0w, CARG1w |9: // Label reused by .ffunc_bit_op users. - | add CARG1, TMP0, TISNUM + | add_TISNUM CARG1, TMP0 | b ->fff_restv | |.ffunc_bit bswap | rev TMP0w, CARG1w - | add CARG1, TMP0, TISNUM + | add_TISNUM CARG1, TMP0 | b ->fff_restv | |.ffunc_bit bnot | mvn TMP0w, CARG1w - | add CARG1, TMP0, TISNUM + | add_TISNUM CARG1, TMP0 | b ->fff_restv | |.macro .ffunc_bit_sh, name, ins, shmod @@ -1726,7 +1729,7 @@ static void build_subroutines(BuildCtx *ctx) | checkint CARG1, ->vm_tobit_fb |2: | ins TMP0w, CARG1w, TMP1w - | add CARG1, TMP0, TISNUM + | add_TISNUM CARG1, TMP0 | b ->fff_restv |.endmacro | @@ -1915,8 +1918,7 @@ static void build_subroutines(BuildCtx *ctx) | and CARG3, CARG3, #LJ_GCVMASK | beq >2 |1: // Move results down. - | ldr CARG1, [RA] - | add RA, RA, #8 + | ldr CARG1, [RA], #8 | subs RB, RB, #8 | str CARG1, [BASE, RC, lsl #3] | add RC, RC, #1 @@ -2031,9 +2033,7 @@ static void build_subroutines(BuildCtx *ctx) |.if JIT | ldr L, SAVE_L |1: - | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48 - | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16 - | movn TISNIL, #0 + | init_constants | cmn CARG1w, #LUA_ERRERR | bhs >9 // Check for error from exit. | ldr LFUNC:CARG2, [BASE, FRAME_FUNC] @@ -2212,9 +2212,7 @@ static void build_subroutines(BuildCtx *ctx) | bl extern lj_ccallback_enter // (CTState *cts, void *cf) | // Returns lua_State *. 
| ldp BASE, RC, L:CRET1->base - | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48 - | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16 - | movn TISNIL, #0 + | init_constants | mov L, CRET1 | ldr LFUNC:CARG3, [BASE, FRAME_FUNC] | sub RC, RC, BASE @@ -2593,7 +2591,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | bne >5 | negs TMP0w, TMP0w | movz CARG3, #0x41e0, lsl #48 // 2^31. - | add TMP0, TMP0, TISNUM + | add_TISNUM TMP0, TMP0 | csel TMP0, TMP0, CARG3, vc |5: | str TMP0, [BASE, RA, lsl #3] @@ -2608,7 +2606,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | bne >2 | ldr CARG1w, STR:CARG1->len |1: - | add CARG1, CARG1, TISNUM + | add_TISNUM CARG1, CARG1 | str CARG1, [BASE, RA, lsl #3] | ins_next | @@ -2716,7 +2714,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | intins CARG1w, CARG1w, CARG2w | ins_arithfallback bvs |.endif - | add CARG1, CARG1, TISNUM + | add_TISNUM CARG1, CARG1 | str CARG1, [BASE, RA, lsl #3] |4: | ins_next @@ -2809,7 +2807,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_KSHORT: | // RA = dst, RC = int16_literal | sxth RCw, RCw - | add TMP0, RC, TISNUM + | add_TISNUM TMP0, RC | str TMP0, [BASE, RA, lsl #3] | ins_next break; @@ -3032,7 +3030,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | cmp TMP1w, CARG1w // In array part? | bhs ->vmeta_tgetv | ldr TMP0, [CARG3] - | cmp TMP0, TISNIL + | cmp_nil TMP0 | beq >5 |1: | str TMP0, [BASE, RA, lsl #3] @@ -3075,7 +3073,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | ldr NODE:CARG3, NODE:CARG3->next | cmp CARG1, CARG4 | bne >4 - | cmp TMP0, TISNIL + | cmp_nil TMP0 | beq >5 |3: | str TMP0, [BASE, RA, lsl #3] @@ -3084,7 +3082,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |4: // Follow hash chain. | cbnz NODE:CARG3, <1 | // End of hash chain: key not found, nil result. - | mov TMP0, TISNIL + | mov_nil TMP0 | |5: // Check for __index if table value is nil. 
| ldr TAB:CARG1, TAB:CARG2->metatable @@ -3105,7 +3103,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | cmp RCw, CARG1w // In array part? | bhs ->vmeta_tgetb | ldr TMP0, [CARG3] - | cmp TMP0, TISNIL + | cmp_nil TMP0 | beq >5 |1: | str TMP0, [BASE, RA, lsl #3] @@ -3152,7 +3150,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | ldr TMP1, [CARG3] | ldr TMP0, [BASE, RA, lsl #3] | ldrb TMP2w, TAB:CARG2->marked - | cmp TMP1, TISNIL // Previous value is nil? + | cmp_nil TMP1 // Previous value is nil? | beq >5 |1: | str TMP0, [CARG3] @@ -3204,7 +3202,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | cmp CARG1, CARG4 | bne >5 | ldr TMP0, [BASE, RA, lsl #3] - | cmp TMP1, TISNIL // Previous value is nil? + | cmp_nil TMP1 // Previous value is nil? | beq >4 |2: | str TMP0, NODE:CARG3->val @@ -3263,7 +3261,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | ldr TMP1, [CARG3] | ldr TMP0, [BASE, RA, lsl #3] | ldrb TMP2w, TAB:CARG2->marked - | cmp TMP1, TISNIL // Previous value is nil? + | cmp_nil TMP1 // Previous value is nil? | beq >5 |1: | str TMP0, [CARG3] @@ -3362,9 +3360,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |->BC_CALL_Z: | mov RB, BASE // Save old BASE for vmeta_call. | add BASE, BASE, RA, lsl #3 - | ldr CARG3, [BASE] + | ldr CARG3, [BASE], #16 | sub NARGS8:RC, NARGS8:RC, #8 - | add BASE, BASE, #16 | checkfunc CARG3, ->vmeta_call | ins_call break; @@ -3380,9 +3377,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | // RA = base, (RB = 0,) RC = (nargs+1)*8 |->BC_CALLT1_Z: | add RA, BASE, RA, lsl #3 - | ldr TMP1, [RA] + | ldr TMP1, [RA], #16 | sub NARGS8:RC, NARGS8:RC, #8 - | add RA, RA, #16 | checktp CARG3, TMP1, LJ_TFUNC, ->vmeta_callt | ldr PC, [BASE, FRAME_PC] |->BC_CALLT2_Z: @@ -3462,10 +3458,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | add CARG3, CARG2, CARG1, lsl #3 | bhs >5 // Index points after array part? 
| ldr TMP0, [CARG3] - | cmp TMP0, TISNIL + | cmp_nil TMP0 | cinc CARG1, CARG1, eq // Skip holes in array part. | beq <1 - | add CARG1, CARG1, TISNUM + | add_TISNUM CARG1, CARG1 | stp CARG1, TMP0, [RA] | add CARG1, CARG1, #1 |3: @@ -3483,7 +3479,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | add NODE:CARG3, NODE:RB, CARG1, lsl #3 // node = tab->node + idx*3*8 | bhi <4 | ldp TMP0, CARG1, NODE:CARG3->val - | cmp TMP0, TISNIL + | cmp_nil TMP0 | add RC, RC, #1 | beq <6 // Skip holes in hash part. | stp CARG1, TMP0, [RA] @@ -3501,8 +3497,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | checkfunc CFUNC:CARG1, >5 | asr TMP0, TAB:CARG3, #47 | ldrb TMP1w, CFUNC:CARG1->ffid - | cmn TMP0, #-LJ_TTAB - | ccmp CARG4, TISNIL, #0, eq + | cmp_nil CARG4 + | ccmn TMP0, #-LJ_TTAB, #0, eq | ccmp TMP1w, #FF_next_N, #0, eq | bne >5 | mov TMP0w, #0xfffe7fff // LJ_KEYINDEX @@ -3542,51 +3538,51 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | and RC, RC, #255 | // RA = base, RB = (nresults+1), RC = numparams | ldr TMP1, [BASE, FRAME_PC] - | add RC, BASE, RC, lsl #3 - | add RA, BASE, RA, lsl #3 - | add RC, RC, #FRAME_VARG - | add TMP2, RA, RB, lsl #3 - | sub RC, RC, TMP1 // RC = vbase - | // Note: RC may now be even _above_ BASE if nargs was < numparams. + | add TMP0, BASE, RC, lsl #3 + | add RC, BASE, RA, lsl #3 // RC = destination + | add TMP0, TMP0, #FRAME_VARG + | add TMP2, RC, RB, lsl #3 + | sub RA, TMP0, TMP1 // RA = vbase + | // Note: RA may now be even _above_ BASE if nargs was < numparams. | sub TMP3, BASE, #16 // TMP3 = vtop | cbz RB, >5 | sub TMP2, TMP2, #16 |1: // Copy vararg slots to destination slots. - | cmp RC, TMP3 - | ldr TMP0, [RC], #8 - | csel TMP0, TMP0, TISNIL, lo - | cmp RA, TMP2 - | str TMP0, [RA], #8 + | cmp RA, TMP3 + | ldr TMP0, [RA], #8 + | csinv TMP0, TMP0, xzr, lo // TISNIL = ~xzr + | cmp RC, TMP2 + | str TMP0, [RC], #8 | blo <1 |2: | ins_next | |5: // Copy all varargs. 
| ldr TMP0, L->maxstack - | subs TMP2, TMP3, RC + | subs TMP2, TMP3, RA | csel RB, xzr, TMP2, le // MULTRES = (max(vtop-vbase,0)+1)*8 | add RB, RB, #8 - | add TMP1, RA, TMP2 + | add TMP1, RC, TMP2 | str RBw, SAVE_MULTRES | ble <2 // Nothing to copy. | cmp TMP1, TMP0 | bhi >7 |6: - | ldr TMP0, [RC], #8 - | str TMP0, [RA], #8 - | cmp RC, TMP3 + | ldr TMP0, [RA], #8 + | str TMP0, [RC], #8 + | cmp RA, TMP3 | blo <6 | b <2 | |7: // Grow stack for varargs. | lsr CARG2, TMP2, #3 - | stp BASE, RA, L->base + | stp BASE, RC, L->base | mov CARG1, L - | sub RC, RC, BASE // Need delta, because BASE may change. + | sub RA, RA, BASE // Need delta, because BASE may change. | str PC, SAVE_PC | bl extern lj_state_growstack // (lua_State *L, int n) - | ldp BASE, RA, L->base - | add RC, BASE, RC + | ldp BASE, RC, L->base + | add RA, BASE, RA | sub TMP3, BASE, #16 | b <6 break; @@ -3730,7 +3726,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) } else { | adds CARG1w, CARG1w, CARG3w | bvs >2 - | add TMP0, CARG1, TISNUM + | add_TISNUM TMP0, CARG1 | tbnz CARG3w, #31, >4 | cmp CARG1w, CARG2w } @@ -3809,7 +3805,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | // RA = base, RC = target | ldr CARG1, [BASE, RA, lsl #3] | add TMP1, BASE, RA, lsl #3 - | cmp CARG1, TISNIL + | cmp_nil CARG1 | beq >1 // Stop if iterator returned nil. if (op == BC_JITERL) { | str CARG1, [TMP1, #-8] From 14866a6828939d86e716939cfd2921ac5aaeca8e Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 8 Oct 2023 21:57:04 +0200 Subject: [PATCH 56/95] ARM64: Fix disassembly of U12 loads. Thanks to Peter Cawley. 
#1100 --- src/jit/dis_arm64.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua index 3d199bf2..a7a29494 100644 --- a/src/jit/dis_arm64.lua +++ b/src/jit/dis_arm64.lua @@ -948,7 +948,7 @@ local function disass_ins(ctx) elseif p == "U" then local rn = map_regs.x[band(rshift(op, 5), 31)] local sz = band(rshift(op, 30), 3) - local imm12 = lshift(arshift(lshift(op, 10), 20), sz) + local imm12 = lshift(rshift(lshift(op, 10), 20), sz) if imm12 ~= 0 then x = "["..rn..", #"..imm12.."]" else From d2a5487fd79b0ce9cd303f84eae13ce12d4db4b7 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 8 Oct 2023 22:10:02 +0200 Subject: [PATCH 57/95] ARM64: Use ADR and ADRP to form constants. Thanks to Peter Cawley. #1100 --- src/lj_emit_arm64.h | 36 +++++++++++++++++++++++++++++------- src/lj_target_arm64.h | 2 ++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 3c510492..51d0c351 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -193,6 +193,32 @@ static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int is64) return 0; /* Failed. */ } +#define glofs(as, k) \ + ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g)) +#define mcpofs(as, k) \ + ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1))) +#define checkmcpofs(as, k) \ + (A64F_S_OK(mcpofs(as, k)>>2, 19)) + +/* Try to form a const as ADR or ADRP or ADRP + ADD. */ +static int emit_kadrp(ASMState *as, Reg rd, uint64_t k) +{ + A64Ins ai = A64I_ADR; + int64_t ofs = mcpofs(as, k); + if (!A64F_S_OK((uint64_t)ofs, 21)) { + uint64_t kpage = k & ~0xfffull; + MCode *adrp = as->mcp - 1 - (k != kpage); + ofs = (int64_t)(kpage - ((uint64_t)adrp & ~0xfffull)) >> 12; + if (!A64F_S_OK(ofs, 21)) + return 0; /* Failed. 
*/ + if (k != kpage) + emit_dn(as, (A64I_ADDx^A64I_K12)|A64F_U12(k - kpage), rd, rd); + ai = A64I_ADRP; + } + emit_d(as, ai|(((uint32_t)ofs&3)<<29)|A64F_S19(ofs>>2), rd); + return 1; +} + static void emit_loadk(ASMState *as, Reg rd, uint64_t u64) { int zeros = 0, ones = 0, neg, lshift = 0; @@ -213,6 +239,9 @@ static void emit_loadk(ASMState *as, Reg rd, uint64_t u64) if (emit_kdelta(as, rd, u64, is64)) { return; } + if (emit_kadrp(as, rd, u64)) { /* Either 1 or 2 ins. */ + return; + } } if (neg) { u64 = ~u64; @@ -240,13 +269,6 @@ static void emit_loadk(ASMState *as, Reg rd, uint64_t u64) /* Load a 64 bit constant into a GPR. */ #define emit_loadu64(as, rd, i) emit_loadk(as, rd, i) -#define glofs(as, k) \ - ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g)) -#define mcpofs(as, k) \ - ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1))) -#define checkmcpofs(as, k) \ - (A64F_S_OK(mcpofs(as, k)>>2, 19)) - static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow); /* Get/set from constant pointer. */ diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index 65a14307..c34f1e59 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -234,6 +234,8 @@ typedef enum A64Ins { A64I_MOVZx = 0xd2800000, A64I_MOVNw = 0x12800000, A64I_MOVNx = 0x92800000, + A64I_ADR = 0x10000000, + A64I_ADRP = 0x90000000, A64I_LDRB = 0x39400000, A64I_LDRH = 0x79400000, From 656ecbcf8f669feb94e0d0ec4b4f59190bcd2e48 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 8 Oct 2023 22:12:01 +0200 Subject: [PATCH 58/95] DynASM/ARM64: Support ldp/stp of q registers. Thanks to Peter Cawley. 
#1096 --- dynasm/dasm_arm64.lua | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dynasm/dasm_arm64.lua b/dynasm/dasm_arm64.lua index e69f8ef3..05ea3e22 100644 --- a/dynasm/dasm_arm64.lua +++ b/dynasm/dasm_arm64.lua @@ -549,7 +549,7 @@ end local function parse_load_pair(params, nparams, n, op) if params[n+2] then werror("too many operands") end local pn, p2 = params[n], params[n+1] - local scale = shr(op, 30) == 0 and 2 or 3 + local scale = 2 + shr(op, 31 - band(shr(op, 26), 1)) local p1, wb = match(pn, "^%[%s*(.-)%s*%](!?)$") if not p1 then if not p2 then @@ -806,8 +806,8 @@ map_op = { ["ldrsw_*"] = "98000000DxB|b8800000DxL", -- NOTE: ldur etc. are handled by ldr et al. - ["stp_*"] = "28000000DAwP|a8000000DAxP|2c000000DAsP|6c000000DAdP", - ["ldp_*"] = "28400000DAwP|a8400000DAxP|2c400000DAsP|6c400000DAdP", + ["stp_*"] = "28000000DAwP|a8000000DAxP|2c000000DAsP|6c000000DAdP|ac000000DAqP", + ["ldp_*"] = "28400000DAwP|a8400000DAxP|2c400000DAsP|6c400000DAdP|ac400000DAqP", ["ldpsw_*"] = "68400000DAxP", -- Branches. @@ -942,7 +942,7 @@ local function parse_template(params, template, nparams, pos) werror("bad register type") end parse_reg_type = false - elseif p == "x" or p == "w" or p == "d" or p == "s" then + elseif p == "x" or p == "w" or p == "d" or p == "s" or p == "q" then if parse_reg_type ~= p then werror("register size mismatch") end From db944b2b56c86fcf133745976763604d96110285 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 21 Oct 2023 13:11:50 +0200 Subject: [PATCH 59/95] FFI: Fix dangling reference to CType in carith_checkarg(). Reported by Sergey Kaplun. 
#1108 --- src/lj_carith.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lj_carith.c b/src/lj_carith.c index 96384e87..bad5fe66 100644 --- a/src/lj_carith.c +++ b/src/lj_carith.c @@ -42,9 +42,13 @@ static int carith_checkarg(lua_State *L, CTState *cts, CDArith *ca) p = (uint8_t *)cdata_getptr(p, ct->size); if (ctype_isref(ct->info)) ct = ctype_rawchild(cts, ct); } else if (ctype_isfunc(ct->info)) { + CTypeID id0 = i ? ctype_typeid(cts, ca->ct[0]) : 0; p = (uint8_t *)*(void **)p; ct = ctype_get(cts, lj_ctype_intern(cts, CTINFO(CT_PTR, CTALIGN_PTR|id), CTSIZE_PTR)); + if (i) { /* cts->tab may have been reallocated. */ + ca->ct[0] = ctype_get(cts, id0); + } } if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct); ca->ct[i] = ct; From 4eb47df605883e983dadb78f303b22dd0232dd03 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 21 Oct 2023 13:18:51 +0200 Subject: [PATCH 60/95] FFI/Windows: Fix type declaration for int64_t and uint64_t. Thanks to Peter Cawley. #1106 --- src/lj_ctype.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lj_ctype.h b/src/lj_ctype.h index 45e7234e..cde1cf01 100644 --- a/src/lj_ctype.h +++ b/src/lj_ctype.h @@ -276,6 +276,8 @@ typedef struct CTState { #define CTTYDEFP(_) #endif +#define CTF_LONG_IF8 (CTF_LONG * (sizeof(long) == 8)) + /* Common types. 
*/ #define CTTYDEF(_) \ _(NONE, 0, CT_ATTRIB, CTATTRIB(CTA_BAD)) \ @@ -289,8 +291,8 @@ typedef struct CTState { _(UINT16, 2, CT_NUM, CTF_UNSIGNED|CTALIGN(1)) \ _(INT32, 4, CT_NUM, CTALIGN(2)) \ _(UINT32, 4, CT_NUM, CTF_UNSIGNED|CTALIGN(2)) \ - _(INT64, 8, CT_NUM, CTF_LONG|CTALIGN(3)) \ - _(UINT64, 8, CT_NUM, CTF_UNSIGNED|CTF_LONG|CTALIGN(3)) \ + _(INT64, 8, CT_NUM, CTF_LONG_IF8|CTALIGN(3)) \ + _(UINT64, 8, CT_NUM, CTF_UNSIGNED|CTF_LONG_IF8|CTALIGN(3)) \ _(FLOAT, 4, CT_NUM, CTF_FP|CTALIGN(2)) \ _(DOUBLE, 8, CT_NUM, CTF_FP|CTALIGN(3)) \ _(COMPLEX_FLOAT, 8, CT_ARRAY, CTF_COMPLEX|CTALIGN(2)|CTID_FLOAT) \ From e826d0c101d750fac8334d71e221c50d8dbe236c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 21 Oct 2023 13:31:45 +0200 Subject: [PATCH 61/95] Add 'cc' file type for saving bytecode. Contributed by Sergey Bronnikov. #1105 --- doc/running.html | 3 ++- src/jit/bcsave.lua | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/running.html b/doc/running.html index 3afc1b56..9dd2b411 100644 --- a/doc/running.html +++ b/doc/running.html @@ -120,7 +120,8 @@ file name:

    • c — C source file, exported bytecode data.
    • -
    • h — C header file, static bytecode data.
    • +
    • cc — C++ source file, exported bytecode data.
    • +
    • h — C/C++ header file, static bytecode data.
    • obj or o — Object file, exported bytecode data (OS- and architecture-specific).
    • raw or any other extension — Raw bytecode file (portable). diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua index 74699f3d..390d297c 100644 --- a/src/jit/bcsave.lua +++ b/src/jit/bcsave.lua @@ -38,7 +38,7 @@ Save LuaJIT bytecode: luajit -b[options] input output -- Stop handling options. - Use stdin as input and/or stdout as output. -File types: c h obj o raw (default) +File types: c cc h obj o raw (default) ]] os.exit(1) end @@ -81,7 +81,7 @@ end ------------------------------------------------------------------------------ local map_type = { - raw = "raw", c = "c", h = "h", o = "obj", obj = "obj", + raw = "raw", c = "c", cc = "c", h = "h", o = "obj", obj = "obj", } local map_arch = { From f2e955dae8411ccdce693806f15b1f221a49015c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 5 Nov 2023 11:27:35 +0100 Subject: [PATCH 62/95] Windows/x86: _BitScan*64 are only available on 64 bit archs. Reported by memcorrupt. #1109 --- src/lj_def.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/lj_def.h b/src/lj_def.h index 0d6c346b..2a1d7b56 100644 --- a/src/lj_def.h +++ b/src/lj_def.h @@ -259,12 +259,8 @@ static LJ_AINLINE uint32_t lj_fls(uint32_t x) #else unsigned char _BitScanForward(unsigned long *, unsigned long); unsigned char _BitScanReverse(unsigned long *, unsigned long); -unsigned char _BitScanForward64(unsigned long *, uint64_t); -unsigned char _BitScanReverse64(unsigned long *, uint64_t); #pragma intrinsic(_BitScanForward) #pragma intrinsic(_BitScanReverse) -#pragma intrinsic(_BitScanForward64) -#pragma intrinsic(_BitScanReverse64) static LJ_AINLINE uint32_t lj_ffs(uint32_t x) { @@ -276,6 +272,12 @@ static LJ_AINLINE uint32_t lj_fls(uint32_t x) unsigned long r; _BitScanReverse(&r, x); return (uint32_t)r; } +#if defined(_M_X64) || defined(_M_ARM64) +unsigned char _BitScanForward64(unsigned long *, uint64_t); +unsigned char _BitScanReverse64(unsigned long *, uint64_t); +#pragma intrinsic(_BitScanForward64) +#pragma 
intrinsic(_BitScanReverse64) + static LJ_AINLINE uint32_t lj_ffs64(uint64_t x) { unsigned long r; _BitScanForward64(&r, x); return (uint32_t)r; @@ -286,6 +288,7 @@ static LJ_AINLINE uint32_t lj_fls64(uint64_t x) unsigned long r; _BitScanReverse64(&r, x); return (uint32_t)r; } #endif +#endif unsigned long _byteswap_ulong(unsigned long); uint64_t _byteswap_uint64(uint64_t); From d133d67c881f363f0b5584ebd21a965eb3435aa1 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 5 Nov 2023 11:31:08 +0100 Subject: [PATCH 63/95] x64: Properly fix __call metamethod return dispatch. Reported by Sergey Kaplun. #1110 --- src/vm_x86.dasc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc index 5b3356dc..56712f90 100644 --- a/src/vm_x86.dasc +++ b/src/vm_x86.dasc @@ -1243,7 +1243,7 @@ static void build_subroutines(BuildCtx *ctx) | mov LFUNC:RB, [RA-8] | add NARGS:RD, 1 | // This is fragile. L->base must not move, KBASE must always be defined. - |.if x64 + |.if X64 | cmp KBASEa, rdx // Continue with CALLT if flag set. |.else | cmp KBASE, BASE // Continue with CALLT if flag set. From 07b3cd3cf9b57a3801a1ebc48144767e31671f21 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 5 Nov 2023 16:34:46 +0100 Subject: [PATCH 64/95] Check for upvalue state transition in IR_UREFO. Thanks to Peter Cawley. 
#1085 --- src/lj_asm_arm.h | 32 +++++++++++++++++++------------ src/lj_asm_arm64.h | 20 ++++++++++++++------ src/lj_asm_mips.h | 27 ++++++++++++++++---------- src/lj_asm_ppc.h | 29 +++++++++++++++++----------- src/lj_asm_x86.h | 27 ++++++++++++++++---------- src/lj_opt_fold.c | 47 +++++++++++++++++++++++++++++++++++++--------- src/lj_opt_mem.c | 15 ++++++++++----- src/lj_record.c | 13 +++++++++++-- src/lj_state.c | 7 +++++-- 9 files changed, 150 insertions(+), 67 deletions(-) diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h index ac3d1b58..348cd79f 100644 --- a/src/lj_asm_arm.h +++ b/src/lj_asm_arm.h @@ -969,24 +969,32 @@ static void asm_hrefk(ASMState *as, IRIns *ir) static void asm_uref(ASMState *as, IRIns *ir) { Reg dest = ra_dest(as, ir, RSET_GPR); - if (irref_isk(ir->op1)) { + int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC); + if (irref_isk(ir->op1) && !guarded) { GCfunc *fn = ir_kfunc(IR(ir->op1)); MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v; emit_lsptr(as, ARMI_LDR, dest, v); } else { - Reg uv = ra_scratch(as, RSET_GPR); - Reg func = ra_alloc1(as, ir->op1, RSET_GPR); - if (ir->o == IR_UREFC) { - asm_guardcc(as, CC_NE); + if (guarded) { + asm_guardcc(as, ir->o == IR_UREFC ? 
CC_NE : CC_EQ); emit_n(as, ARMI_CMP|ARMI_K12|1, RID_TMP); - emit_opk(as, ARMI_ADD, dest, uv, - (int32_t)offsetof(GCupval, tv), RSET_GPR); - emit_lso(as, ARMI_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed)); - } else { - emit_lso(as, ARMI_LDR, dest, uv, (int32_t)offsetof(GCupval, v)); } - emit_lso(as, ARMI_LDR, uv, func, - (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8)); + if (ir->o == IR_UREFC) + emit_opk(as, ARMI_ADD, dest, dest, + (int32_t)offsetof(GCupval, tv), RSET_GPR); + else + emit_lso(as, ARMI_LDR, dest, dest, (int32_t)offsetof(GCupval, v)); + if (guarded) + emit_lso(as, ARMI_LDRB, RID_TMP, dest, + (int32_t)offsetof(GCupval, closed)); + if (irref_isk(ir->op1)) { + GCfunc *fn = ir_kfunc(IR(ir->op1)); + int32_t k = (int32_t)gcrefu(fn->l.uvptr[(ir->op2 >> 8)]); + emit_loadi(as, dest, k); + } else { + emit_lso(as, ARMI_LDR, dest, ra_alloc1(as, ir->op1, RSET_GPR), + (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8)); + } } } diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 9f165fa8..5b40f4cc 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -931,22 +931,30 @@ static void asm_hrefk(ASMState *as, IRIns *ir) static void asm_uref(ASMState *as, IRIns *ir) { Reg dest = ra_dest(as, ir, RSET_GPR); - if (irref_isk(ir->op1)) { + int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC); + if (irref_isk(ir->op1) && !guarded) { GCfunc *fn = ir_kfunc(IR(ir->op1)); MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v; emit_lsptr(as, A64I_LDRx, dest, v); } else { - if (ir->o == IR_UREFC) { - asm_guardcnb(as, A64I_CBZ, RID_TMP); + if (guarded) + asm_guardcnb(as, ir->o == IR_UREFC ? 
A64I_CBZ : A64I_CBNZ, RID_TMP); + if (ir->o == IR_UREFC) emit_opk(as, A64I_ADDx, dest, dest, (int32_t)offsetof(GCupval, tv), RSET_GPR); + else + emit_lso(as, A64I_LDRx, dest, dest, (int32_t)offsetof(GCupval, v)); + if (guarded) emit_lso(as, A64I_LDRB, RID_TMP, dest, (int32_t)offsetof(GCupval, closed)); + if (irref_isk(ir->op1)) { + GCfunc *fn = ir_kfunc(IR(ir->op1)); + uint64_t k = gcrefu(fn->l.uvptr[(ir->op2 >> 8)]); + emit_loadu64(as, dest, k); } else { - emit_lso(as, A64I_LDRx, dest, dest, (int32_t)offsetof(GCupval, v)); + emit_lso(as, A64I_LDRx, dest, ra_alloc1(as, ir->op1, RSET_GPR), + (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8)); } - emit_lso(as, A64I_LDRx, dest, ra_alloc1(as, ir->op1, RSET_GPR), - (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8)); } } diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h index b02da663..d4e40c91 100644 --- a/src/lj_asm_mips.h +++ b/src/lj_asm_mips.h @@ -1207,22 +1207,29 @@ nolo: static void asm_uref(ASMState *as, IRIns *ir) { Reg dest = ra_dest(as, ir, RSET_GPR); - if (irref_isk(ir->op1)) { + int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC); + if (irref_isk(ir->op1) && !guarded) { GCfunc *fn = ir_kfunc(IR(ir->op1)); MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v; emit_lsptr(as, MIPSI_AL, dest, v, RSET_GPR); } else { - Reg uv = ra_scratch(as, RSET_GPR); - Reg func = ra_alloc1(as, ir->op1, RSET_GPR); - if (ir->o == IR_UREFC) { - asm_guard(as, MIPSI_BEQ, RID_TMP, RID_ZERO); - emit_tsi(as, MIPSI_AADDIU, dest, uv, (int32_t)offsetof(GCupval, tv)); - emit_tsi(as, MIPSI_LBU, RID_TMP, uv, (int32_t)offsetof(GCupval, closed)); + if (guarded) + asm_guard(as, ir->o == IR_UREFC ? 
MIPSI_BEQ : MIPSI_BNE, RID_TMP, RID_ZERO); + if (ir->o == IR_UREFC) + emit_tsi(as, MIPSI_AADDIU, dest, dest, (int32_t)offsetof(GCupval, tv)); + else + emit_tsi(as, MIPSI_AL, dest, dest, (int32_t)offsetof(GCupval, v)); + if (guarded) + emit_tsi(as, MIPSI_LBU, RID_TMP, dest, (int32_t)offsetof(GCupval, closed)); + if (irref_isk(ir->op1)) { + GCfunc *fn = ir_kfunc(IR(ir->op1)); + GCobj *o = gcref(fn->l.uvptr[(ir->op2 >> 8)]); + emit_loada(as, dest, o); } else { - emit_tsi(as, MIPSI_AL, dest, uv, (int32_t)offsetof(GCupval, v)); + emit_tsi(as, MIPSI_AL, dest, ra_alloc1(as, ir->op1, RSET_GPR), + (int32_t)offsetof(GCfuncL, uvptr) + + (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8)); } - emit_tsi(as, MIPSI_AL, uv, func, (int32_t)offsetof(GCfuncL, uvptr) + - (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8)); } } diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h index 6555312d..8e9a92a4 100644 --- a/src/lj_asm_ppc.h +++ b/src/lj_asm_ppc.h @@ -840,23 +840,30 @@ static void asm_hrefk(ASMState *as, IRIns *ir) static void asm_uref(ASMState *as, IRIns *ir) { Reg dest = ra_dest(as, ir, RSET_GPR); - if (irref_isk(ir->op1)) { + int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC); + if (irref_isk(ir->op1) && !guarded) { GCfunc *fn = ir_kfunc(IR(ir->op1)); MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v; emit_lsptr(as, PPCI_LWZ, dest, v, RSET_GPR); } else { - Reg uv = ra_scratch(as, RSET_GPR); - Reg func = ra_alloc1(as, ir->op1, RSET_GPR); - if (ir->o == IR_UREFC) { - asm_guardcc(as, CC_NE); + if (guarded) { + asm_guardcc(as, ir->o == IR_UREFC ? 
CC_NE : CC_EQ); emit_ai(as, PPCI_CMPWI, RID_TMP, 1); - emit_tai(as, PPCI_ADDI, dest, uv, (int32_t)offsetof(GCupval, tv)); - emit_tai(as, PPCI_LBZ, RID_TMP, uv, (int32_t)offsetof(GCupval, closed)); - } else { - emit_tai(as, PPCI_LWZ, dest, uv, (int32_t)offsetof(GCupval, v)); } - emit_tai(as, PPCI_LWZ, uv, func, - (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8)); + if (ir->o == IR_UREFC) + emit_tai(as, PPCI_ADDI, dest, dest, (int32_t)offsetof(GCupval, tv)); + else + emit_tai(as, PPCI_LWZ, dest, dest, (int32_t)offsetof(GCupval, v)); + if (guarded) + emit_tai(as, PPCI_LBZ, RID_TMP, dest, (int32_t)offsetof(GCupval, closed)); + if (irref_isk(ir->op1)) { + GCfunc *fn = ir_kfunc(IR(ir->op1)); + int32_t k = (int32_t)gcrefu(fn->l.uvptr[(ir->op2 >> 8)]); + emit_loadi(as, dest, k); + } else { + emit_tai(as, PPCI_LWZ, dest, ra_alloc1(as, ir->op1, RSET_GPR), + (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8)); + } } } diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index c92de3d8..0e0b28a4 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -1373,24 +1373,31 @@ static void asm_hrefk(ASMState *as, IRIns *ir) static void asm_uref(ASMState *as, IRIns *ir) { Reg dest = ra_dest(as, ir, RSET_GPR); - if (irref_isk(ir->op1)) { + int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC); + if (irref_isk(ir->op1) && !guarded) { GCfunc *fn = ir_kfunc(IR(ir->op1)); MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v; emit_rma(as, XO_MOV, dest|REX_GC64, v); } else { Reg uv = ra_scratch(as, RSET_GPR); - Reg func = ra_alloc1(as, ir->op1, RSET_GPR); - if (ir->o == IR_UREFC) { + if (ir->o == IR_UREFC) emit_rmro(as, XO_LEA, dest|REX_GC64, uv, offsetof(GCupval, tv)); - asm_guardcc(as, CC_NE); - emit_i8(as, 1); - emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed)); - } else { + else emit_rmro(as, XO_MOV, dest|REX_GC64, uv, offsetof(GCupval, v)); + if (guarded) { + asm_guardcc(as, ir->o == IR_UREFC ? 
CC_E : CC_NE); + emit_i8(as, 0); + emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed)); + } + if (irref_isk(ir->op1)) { + GCfunc *fn = ir_kfunc(IR(ir->op1)); + GCobj *o = gcref(fn->l.uvptr[(ir->op2 >> 8)]); + emit_loada(as, uv, o); + } else { + emit_rmro(as, XO_MOV, uv|REX_GC64, ra_alloc1(as, ir->op1, RSET_GPR), + (int32_t)offsetof(GCfuncL, uvptr) + + (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8)); } - emit_rmro(as, XO_MOV, uv|REX_GC64, func, - (int32_t)offsetof(GCfuncL, uvptr) + - (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8)); } } diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 743dfb07..ce78505b 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -2134,8 +2134,26 @@ LJFOLDX(lj_opt_fwd_uload) LJFOLD(ALEN any any) LJFOLDX(lj_opt_fwd_alen) +/* Try to merge UREFO/UREFC into referenced instruction. */ +static TRef merge_uref(jit_State *J, IRRef ref, IRIns* ir) +{ + if (ir->o == IR_UREFO && irt_isguard(ir->t)) { + /* Might be pointing to some other coroutine's stack. + ** And GC might shrink said stack, thereby repointing the upvalue. + ** GC might even collect said coroutine, thereby closing the upvalue. + */ + if (gcstep_barrier(J, ref)) + return EMITFOLD; /* So cannot merge. */ + /* Current fins wants a check, but ir doesn't have one. */ + if ((irt_t(fins->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC) && + irt_type(ir->t) == IRT_IGC) + ir->t.irt += IRT_PGC-IRT_IGC; /* So install a check. */ + } + return ref; /* Not a TRef, but the caller doesn't care. */ +} + /* Upvalue refs are really loads, but there are no corresponding stores. -** So CSE is ok for them, except for UREFO across a GC step (see below). +** So CSE is ok for them, except for guarded UREFO across a GC step. ** If the referenced function is const, its upvalue addresses are const, too. ** This can be used to improve CSE by looking for the same address, ** even if the upvalues originate from a different function. 
@@ -2153,9 +2171,7 @@ LJFOLDF(cse_uref) if (irref_isk(ir->op1)) { GCfunc *fn2 = ir_kfunc(IR(ir->op1)); if (gco2uv(gcref(fn2->l.uvptr[(ir->op2 >> 8)])) == uv) { - if (fins->o == IR_UREFO && gcstep_barrier(J, ref)) - break; - return ref; + return merge_uref(J, ref, ir); } } ref = ir->prev; @@ -2164,6 +2180,24 @@ LJFOLDF(cse_uref) return EMITFOLD; } +/* Custom CSE for UREFO. */ +LJFOLD(UREFO any any) +LJFOLDF(cse_urefo) +{ + if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) { + IRRef ref = J->chain[IR_UREFO]; + IRRef lim = fins->op1; + IRRef2 op12 = (IRRef2)fins->op1 + ((IRRef2)fins->op2 << 16); + while (ref > lim) { + IRIns *ir = IR(ref); + if (ir->op12 == op12) + return merge_uref(J, ref, ir); + ref = ir->prev; + } + } + return EMITFOLD; +} + LJFOLD(HREFK any any) LJFOLDX(lj_opt_fwd_hrefk) @@ -2384,14 +2418,9 @@ LJFOLDF(fold_base) /* Write barriers are amenable to CSE, but not across any incremental ** GC steps. -** -** The same logic applies to open upvalue references, because a stack -** may be resized during a GC step (not the current stack, but maybe that -** of a coroutine). */ LJFOLD(TBAR any) LJFOLD(OBAR any any) -LJFOLD(UREFO any any) LJFOLDF(barrier_tab) { TRef tr = lj_opt_cse(J); diff --git a/src/lj_opt_mem.c b/src/lj_opt_mem.c index 351d958c..631ac9e4 100644 --- a/src/lj_opt_mem.c +++ b/src/lj_opt_mem.c @@ -464,18 +464,23 @@ doemit: */ static AliasRet aa_uref(IRIns *refa, IRIns *refb) { - if (refa->o != refb->o) - return ALIAS_NO; /* Different UREFx type. */ if (refa->op1 == refb->op1) { /* Same function. */ if (refa->op2 == refb->op2) return ALIAS_MUST; /* Same function, same upvalue idx. */ else return ALIAS_NO; /* Same function, different upvalue idx. */ } else { /* Different functions, check disambiguation hash values. */ - if (((refa->op2 ^ refb->op2) & 0xff)) + if (((refa->op2 ^ refb->op2) & 0xff)) { return ALIAS_NO; /* Upvalues with different hash values cannot alias. */ - else - return ALIAS_MAY; /* No conclusion can be drawn for same hash value. 
*/ + } else if (refa->o != refb->o) { + /* Different UREFx type, but need to confirm the UREFO really is open. */ + if (irt_type(refa->t) == IRT_IGC) refa->t.irt += IRT_PGC-IRT_IGC; + else if (irt_type(refb->t) == IRT_IGC) refb->t.irt += IRT_PGC-IRT_IGC; + return ALIAS_NO; + } else { + /* No conclusion can be drawn for same hash value and same UREFx type. */ + return ALIAS_MAY; + } } } diff --git a/src/lj_record.c b/src/lj_record.c index d44f7737..1dd310d4 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -1772,12 +1772,12 @@ noconstify: /* Note: this effectively limits LJ_MAX_UPVAL to 127. */ uv = (uv << 8) | (hashrot(uvp->dhash, uvp->dhash + HASH_BIAS) & 0xff); if (!uvp->closed) { - uref = tref_ref(emitir(IRTG(IR_UREFO, IRT_PGC), fn, uv)); /* In current stack? */ if (uvval(uvp) >= tvref(J->L->stack) && uvval(uvp) < tvref(J->L->maxstack)) { int32_t slot = (int32_t)(uvval(uvp) - (J->L->base - J->baseslot)); if (slot >= 0) { /* Aliases an SSA slot? */ + uref = tref_ref(emitir(IRT(IR_UREFO, IRT_PGC), fn, uv)); emitir(IRTG(IR_EQ, IRT_PGC), REF_BASE, emitir(IRT(IR_ADD, IRT_PGC), uref, @@ -1792,12 +1792,21 @@ noconstify: } } } + /* IR_UREFO+IRT_IGC is not checked for open-ness at runtime. + ** Always marked as a guard, since it might get promoted to IRT_PGC later. + */ + uref = emitir(IRTG(IR_UREFO, tref_isgcv(val) ? IRT_PGC : IRT_IGC), fn, uv); + uref = tref_ref(uref); emitir(IRTG(IR_UGT, IRT_PGC), emitir(IRT(IR_SUB, IRT_PGC), uref, REF_BASE), lj_ir_kintpgc(J, (J->baseslot + J->maxslot) * 8)); } else { + /* If fn is constant, then so is the GCupval*, and the upvalue cannot + ** transition back to open, so no guard is required in this case. + */ + IRType t = (tref_isk(fn) ? 
0 : IRT_GUARD) | IRT_PGC; + uref = tref_ref(emitir(IRT(IR_UREFC, t), fn, uv)); needbarrier = 1; - uref = tref_ref(emitir(IRTG(IR_UREFC, IRT_PGC), fn, uv)); } if (val == 0) { /* Upvalue load */ IRType t = itype2irt(uvval(uvp)); diff --git a/src/lj_state.c b/src/lj_state.c index 6efe189d..7e4961bd 100644 --- a/src/lj_state.c +++ b/src/lj_state.c @@ -346,8 +346,11 @@ void LJ_FASTCALL lj_state_free(global_State *g, lua_State *L) lj_assertG(L != mainthread(g), "free of main thread"); if (obj2gco(L) == gcref(g->cur_L)) setgcrefnull(g->cur_L); - lj_func_closeuv(L, tvref(L->stack)); - lj_assertG(gcref(L->openupval) == NULL, "stale open upvalues"); + if (gcref(L->openupval) != NULL) { + lj_func_closeuv(L, tvref(L->stack)); + lj_trace_abort(g); /* For aa_uref soundness. */ + lj_assertG(gcref(L->openupval) == NULL, "stale open upvalues"); + } lj_mem_freevec(g, tvref(L->stack), L->stacksize, TValue); lj_mem_freet(g, L); } From ce2cd617398412984c52ca90f833b30ae3dbd08b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 6 Nov 2023 23:14:22 +0100 Subject: [PATCH 65/95] ARM64: Fix disassembly of ldp/stp offsets. Thanks to Peter Cawley. #1113 --- src/jit/dis_arm64.lua | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua index a7a29494..84677666 100644 --- a/src/jit/dis_arm64.lua +++ b/src/jit/dis_arm64.lua @@ -985,8 +985,7 @@ local function disass_ins(ctx) x = x.."]" end elseif p == "P" then - local opcv, sh = rshift(op, 26), 2 - if opcv >= 0x2a then sh = 4 elseif opcv >= 0x1b then sh = 3 end + local sh = 2 + rshift(op, 31 - band(rshift(op, 26), 1)) local imm7 = lshift(arshift(lshift(op, 10), 25), sh) local rn = map_regs.x[band(rshift(op, 5), 31)] local ind = band(rshift(op, 23), 3) From 433d7e8d8d182f44e88b5cfdc4b2d3026469dfb7 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 7 Nov 2023 22:25:42 +0100 Subject: [PATCH 66/95] FFI: Fix pragma push stack limit check and throw on overflow. Reported by Sergey Kaplun. 
#1114 --- src/lj_cparse.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lj_cparse.c b/src/lj_cparse.c index f807c5ae..2ef7dbe1 100644 --- a/src/lj_cparse.c +++ b/src/lj_cparse.c @@ -1747,9 +1747,11 @@ static void cp_pragma(CPState *cp, BCLine pragmaline) cp_check(cp, '('); if (cp->tok == CTOK_IDENT) { if (cp->str->hash == H_(738e923c,a1b65954)) { /* push */ - if (cp->curpack < CPARSE_MAX_PACKSTACK) { + if (cp->curpack < CPARSE_MAX_PACKSTACK-1) { cp->packstack[cp->curpack+1] = cp->packstack[cp->curpack]; cp->curpack++; + } else { + cp_errmsg(cp, cp->tok, LJ_ERR_XLEVELS); } } else if (cp->str->hash == H_(6c71cf27,6c71cf27)) { /* pop */ if (cp->curpack > 0) cp->curpack--; From 65c849390702b1150d52e64db86cbc6b3c98413e Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 9 Nov 2023 11:02:36 +0100 Subject: [PATCH 67/95] Invalidate SCEV entry when returning to lower frame. Thanks to Zhongwei Yao. #1115 --- src/lj_record.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lj_record.c b/src/lj_record.c index a49f942a..0122105b 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -755,6 +755,7 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults) emitir(IRTG(IR_RETF, IRT_P32), trpt, trpc); J->retdepth++; J->needsnap = 1; + J->scev.idx = REF_NIL; lua_assert(J->baseslot == 1); /* Shift result slots up and clear the slots of the new frame below. */ memmove(J->base + cbase, J->base-1, sizeof(TRef)*nresults); From a4c1640432a9d8a60624cdc8065b15078c228e36 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 12 Nov 2023 14:42:24 +0100 Subject: [PATCH 68/95] Add stack check to pcall/xpcall. Analyzed by Peter Cawley. 
#1048 --- src/vm_arm.dasc | 7 +++++++ src/vm_mips.dasc | 10 +++++++++- src/vm_ppc.dasc | 8 ++++++++ src/vm_ppcspe.dasc | 8 ++++++++ src/vm_x86.dasc | 6 ++++++ 5 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc index 7dae1a53..872de45a 100644 --- a/src/vm_arm.dasc +++ b/src/vm_arm.dasc @@ -1155,8 +1155,11 @@ static void build_subroutines(BuildCtx *ctx) |//-- Base library: catch errors ---------------------------------------- | |.ffunc pcall + | ldr RB, L->maxstack + | add INS, BASE, NARGS8:RC | ldrb RA, [DISPATCH, #DISPATCH_GL(hookmask)] | cmp NARGS8:RC, #8 + | cmphs RB, INS | blo ->fff_fallback | tst RA, #HOOK_ACTIVE // Remember active hook before pcall. | mov RB, BASE @@ -1167,7 +1170,11 @@ static void build_subroutines(BuildCtx *ctx) | b ->vm_call_dispatch | |.ffunc_2 xpcall + | ldr RB, L->maxstack + | add INS, BASE, NARGS8:RC | ldrb RA, [DISPATCH, #DISPATCH_GL(hookmask)] + | cmp RB, INS + | blo ->fff_fallback | checkfunc CARG4, ->fff_fallback // Traceback must be a function. | mov RB, BASE | strd CARG12, [BASE, #8] // Swap function and traceback. diff --git a/src/vm_mips.dasc b/src/vm_mips.dasc index f6f801f2..c4c0a416 100644 --- a/src/vm_mips.dasc +++ b/src/vm_mips.dasc @@ -1244,9 +1244,13 @@ static void build_subroutines(BuildCtx *ctx) |//-- Base library: catch errors ---------------------------------------- | |.ffunc pcall + | lw TMP1, L->maxstack + | addu TMP2, BASE, NARGS8:RC | lbu TMP3, DISPATCH_GL(hookmask)(DISPATCH) | beqz NARGS8:RC, ->fff_fallback - | move TMP2, BASE + |. sltu AT, TMP1, TMP2 + | bnez AT, ->fff_fallback + |. move TMP2, BASE | addiu BASE, BASE, 8 | // Remember active hook before pcall. | srl TMP3, TMP3, HOOK_ACTIVE_SHIFT @@ -1256,8 +1260,12 @@ static void build_subroutines(BuildCtx *ctx) |. 
addiu NARGS8:RC, NARGS8:RC, -8 | |.ffunc xpcall + | lw TMP1, L->maxstack + | addu TMP2, BASE, NARGS8:RC | sltiu AT, NARGS8:RC, 16 | lw CARG4, 8+HI(BASE) + | sltu TMP1, TMP1, TMP2 + | or AT, AT, TMP1 | bnez AT, ->fff_fallback |. ldc1 FARG2, 8(BASE) | ldc1 FARG1, 0(BASE) diff --git a/src/vm_ppc.dasc b/src/vm_ppc.dasc index 61ebbb04..d6792f2c 100644 --- a/src/vm_ppc.dasc +++ b/src/vm_ppc.dasc @@ -1537,8 +1537,12 @@ static void build_subroutines(BuildCtx *ctx) |//-- Base library: catch errors ---------------------------------------- | |.ffunc pcall + | lwz TMP1, L->maxstack + | add TMP2, BASE, NARGS8:RC | cmplwi NARGS8:RC, 8 | lbz TMP3, DISPATCH_GL(hookmask)(DISPATCH) + | cmplw cr1, TMP1, TMP2 + | cror 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | blt ->fff_fallback | mr TMP2, BASE | la BASE, 8(BASE) @@ -1549,9 +1553,13 @@ static void build_subroutines(BuildCtx *ctx) | b ->vm_call_dispatch | |.ffunc xpcall + | lwz TMP1, L->maxstack + | add TMP2, BASE, NARGS8:RC | cmplwi NARGS8:RC, 16 | lwz CARG4, 8(BASE) + | cmplw cr1, TMP1, TMP2 | lfd FARG2, 8(BASE) + | cror 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | lfd FARG1, 0(BASE) | blt ->fff_fallback | lbz TMP1, DISPATCH_GL(hookmask)(DISPATCH) diff --git a/src/vm_ppcspe.dasc b/src/vm_ppcspe.dasc index c4a44191..ea33c08b 100644 --- a/src/vm_ppcspe.dasc +++ b/src/vm_ppcspe.dasc @@ -1184,8 +1184,12 @@ static void build_subroutines(BuildCtx *ctx) |//-- Base library: catch errors ---------------------------------------- | |.ffunc pcall + | lwz TMP1, L->maxstack + | add TMP2, BASE, NARGS8:RC | cmplwi NARGS8:RC, 8 | lbz TMP3, DISPATCH_GL(hookmask)(DISPATCH) + | cmplw cr1, TMP1, TMP2 + | cror 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | blt ->fff_fallback | mr TMP2, BASE | la BASE, 8(BASE) @@ -1196,8 +1200,12 @@ static void build_subroutines(BuildCtx *ctx) | b ->vm_call_dispatch | |.ffunc_2 xpcall + | lwz TMP1, L->maxstack + | add TMP2, BASE, NARGS8:RC | lbz TMP3, DISPATCH_GL(hookmask)(DISPATCH) | mr TMP2, BASE + | cmplw TMP1, TMP2 + | blt ->fff_fallback | checkfunc CARG2 // 
Traceback must be a function. | checkfail ->fff_fallback | la BASE, 16(BASE) diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc index 56712f90..811d5e75 100644 --- a/src/vm_x86.dasc +++ b/src/vm_x86.dasc @@ -1720,6 +1720,9 @@ static void build_subroutines(BuildCtx *ctx) |//-- Base library: catch errors ---------------------------------------- | |.ffunc_1 pcall + | mov L:RB, SAVE_L + | lea RA, [BASE+NARGS:RD*8] + | cmp RA, L:RB->maxstack; ja ->fff_fallback | lea RA, [BASE+8] | sub NARGS:RD, 1 | mov PC, 8+FRAME_PCALL @@ -1731,6 +1734,9 @@ static void build_subroutines(BuildCtx *ctx) | jmp ->vm_call_dispatch | |.ffunc_2 xpcall + | mov L:RB, SAVE_L + | lea RA, [BASE+NARGS:RD*8] + | cmp RA, L:RB->maxstack; ja ->fff_fallback | cmp dword [BASE+12], LJ_TFUNC; jne ->fff_fallback | mov RB, [BASE+4] // Swap function and traceback. | mov [BASE+12], RB From d854d00ce94b274359e5181bed13e977420daf5c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 12 Nov 2023 15:18:44 +0100 Subject: [PATCH 69/95] x86/x64: Add more red zone checks to assembler backend. Thanks to Peter Cawley. #1116 --- src/lj_asm_x86.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index e01def59..6b114802 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -674,6 +674,7 @@ static void asm_tointg(ASMState *as, IRIns *ir, Reg left) emit_rr(as, XO_CVTSI2SD, tmp, dest); if (!(as->flags & JIT_F_SPLIT_XMM)) emit_rr(as, XO_XORPS, tmp, tmp); /* Avoid partial register stall. */ + checkmclim(as); emit_rr(as, XO_CVTTSD2SI, dest, left); /* Can't fuse since left is needed twice. */ } @@ -713,6 +714,7 @@ static void asm_conv(ASMState *as, IRIns *ir) emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */ emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */ emit_loadn(as, bias, k); + checkmclim(as); emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR)); return; } else { /* Integer to FP conversion. 
*/ @@ -1025,6 +1027,7 @@ static void asm_href(ASMState *as, IRIns *ir) emit_jcc(as, CC_E, nilexit); else emit_sjcc(as, CC_E, l_end); + checkmclim(as); if (irt_isnum(kt)) { if (isk) { /* Assumes -0.0 is already canonicalized to +0.0. */ @@ -1065,7 +1068,6 @@ static void asm_href(ASMState *as, IRIns *ir) emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it)); } emit_sfixup(as, l_loop); - checkmclim(as); /* Load main position relative to tab->node into dest. */ khash = isk ? ir_khash(irkey) : 1; @@ -1091,6 +1093,7 @@ static void asm_href(ASMState *as, IRIns *ir) emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp); emit_shifti(as, XOg_ROL, tmp, HASH_ROT3); emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp); + checkmclim(as); emit_shifti(as, XOg_ROL, dest, HASH_ROT2); emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest); emit_shifti(as, XOg_ROL, dest, HASH_ROT1); @@ -1375,6 +1378,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) if (irt_islightud(ir->t)) { Reg dest = asm_load_lightud64(as, ir, 1); if (ra_hasreg(dest)) { + checkmclim(as); asm_fuseahuref(as, ir->op1, RSET_GPR); emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM); } @@ -1394,6 +1398,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) asm_guardcc(as, irt_isnum(ir->t) ? CC_AE : CC_NE); if (LJ_64 && irt_type(ir->t) >= IRT_NUM) { lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t)); + checkmclim(as); emit_u32(as, LJ_TISNUM); emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM); } else { From 45c88b7963de2969a9a656c03ba06ad995d7fd5f Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 12 Nov 2023 15:41:52 +0100 Subject: [PATCH 70/95] x86/x64: Don't fuse loads across table.clear. Reported by Peter Cawley. 
#1117 --- src/lj_asm_x86.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index d98fb827..a105b439 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -473,6 +473,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) } } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) { if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0) && + noconflict(as, ref, IR_CALLS, 0) && /* Don't cross table.clear. */ !(LJ_GC64 && irt_isaddr(ir->t))) { asm_fuseahuref(as, ir->op1, xallow); return RID_MRM; From 113a168b792cd367822ec04cdc2ef32facd28efa Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 12 Nov 2023 16:11:11 +0100 Subject: [PATCH 71/95] Improve last commit. --- src/lj_asm_x86.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index a105b439..955a54a4 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -473,7 +473,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) } } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) { if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0) && - noconflict(as, ref, IR_CALLS, 0) && /* Don't cross table.clear. */ + noconflict(as, ref, IR_CALLS, 1) && /* Don't cross table.clear. */ !(LJ_GC64 && irt_isaddr(ir->t))) { asm_fuseahuref(as, ir->op1, xallow); return RID_MRM; From 644723649ea04cb23b72c814b88b72a29e4afed4 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 14 Nov 2023 22:50:21 +0100 Subject: [PATCH 72/95] x86/x64: Don't fuse loads across IR_NEWREF. Reported by Peter Cawley. #1117 --- src/lj_asm_x86.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 6b114802..ddbe9c55 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -93,7 +93,7 @@ static int asm_isk32(ASMState *as, IRRef ref, int32_t *k) /* Check if there's no conflicting instruction between curins and ref. 
** Also avoid fusing loads if there are multiple references. */ -static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload) +static int noconflict(ASMState *as, IRRef ref, IROp conflict, int check) { IRIns *ir = as->ir; IRRef i = as->curins; @@ -102,7 +102,9 @@ static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload) while (--i > ref) { if (ir[i].o == conflict) return 0; /* Conflict found. */ - else if (!noload && (ir[i].op1 == ref || ir[i].op2 == ref)) + else if ((check & 1) && ir[i].o == IR_NEWREF) + return 0; + else if ((check & 2) && (ir[i].op1 == ref || ir[i].op2 == ref)) return 0; } return 1; /* Ok, no conflict. */ @@ -118,7 +120,7 @@ static IRRef asm_fuseabase(ASMState *as, IRRef ref) lua_assert(irb->op2 == IRFL_TAB_ARRAY); /* We can avoid the FLOAD of t->array for colocated arrays. */ if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE && - !neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 1)) { + !neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 0)) { as->mrm.ofs = (int32_t)sizeof(GCtab); /* Ofs to colocated array. */ return irb->op1; /* Table obj. */ } @@ -337,7 +339,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR; if (ir->o == IR_SLOAD) { if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) && - noconflict(as, ref, IR_RETF, 0)) { + noconflict(as, ref, IR_RETF, 2)) { as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow); as->mrm.ofs = 8*((int32_t)ir->op1-1) + ((ir->op2&IRSLOAD_FRAME)?4:0); as->mrm.idx = RID_NONE; @@ -346,12 +348,12 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) } else if (ir->o == IR_FLOAD) { /* Generic fusion is only ok for 32 bit operand (but see asm_comp). 
*/ if ((irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t)) && - noconflict(as, ref, IR_FSTORE, 0)) { + noconflict(as, ref, IR_FSTORE, 2)) { asm_fusefref(as, ir, xallow); return RID_MRM; } } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) { - if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0)) { + if (noconflict(as, ref, ir->o + IRDELTA_L2S, 2+(ir->o != IR_ULOAD))) { asm_fuseahuref(as, ir->op1, xallow); return RID_MRM; } @@ -360,7 +362,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) ** Fusing unaligned memory operands is ok on x86 (except for SIMD types). */ if ((!irt_typerange(ir->t, IRT_I8, IRT_U16)) && - noconflict(as, ref, IR_XSTORE, 0)) { + noconflict(as, ref, IR_XSTORE, 2)) { asm_fusexref(as, ir->op1, xallow); return RID_MRM; } From 43d0a19158ceabaa51b0462c1ebc97612b420a2e Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 15 Nov 2023 01:41:31 +0100 Subject: [PATCH 73/95] Fix last commit. --- src/lj_asm_x86.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 9fa411a0..aee33716 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -118,7 +118,7 @@ static int noconflict(ASMState *as, IRRef ref, IROp conflict, int check) while (--i > ref) { if (ir[i].o == conflict) return 0; /* Conflict found. */ - else if ((check & 1) && ir[i].o == IR_NEWREF) + else if ((check & 1) && (ir[i].o == IR_NEWREF || ir[i].o == IR_CALLS)) return 0; else if ((check & 2) && (ir[i].op1 == ref || ir[i].op2 == ref)) return 0; From 1761fd2ef79ffe1778011c7e9cb03ed361b48c5e Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 10 Dec 2023 14:29:45 +0100 Subject: [PATCH 74/95] Emit sunk IR_NEWREF only once per key on snapshot replay. Thanks to Sergey Kaplun and Peter Cawley. 
#1128 --- src/lj_snap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/lj_snap.c b/src/lj_snap.c index a6cd93d4..5a5c481b 100644 --- a/src/lj_snap.c +++ b/src/lj_snap.c @@ -575,9 +575,21 @@ void lj_snap_replay(jit_State *J, GCtrace *T) if (irr->o == IR_HREFK || irr->o == IR_AREF) { IRIns *irf = &T->ir[irr->op1]; tmp = emitir(irf->ot, tmp, irf->op2); + } else if (irr->o == IR_NEWREF) { + IRRef allocref = tref_ref(tr); + IRRef keyref = tref_ref(key); + IRRef newref_ref = J->chain[IR_NEWREF]; + IRIns *newref = &J->cur.ir[newref_ref]; + lua_assert(irref_isk(keyref)); + if (newref_ref > allocref && newref->op2 == keyref) { + lua_assert(newref->op1 == allocref); + tmp = newref_ref; + goto skip_newref; + } } } tmp = emitir(irr->ot, tmp, key); + skip_newref: val = snap_pref(J, T, map, nent, seen, irs->op2); if (val == 0) { IRIns *irc = &T->ir[irs->op2]; From d1236a4caa999b29e774ef5103df3b424d821d9b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 10 Dec 2023 14:41:56 +0100 Subject: [PATCH 75/95] Optimize table.new() with constant args to (sinkable) IR_TNEW. Thanks to Peter Cawley. #1128 --- src/lj_ffrecord.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index 1233e5f7..151c4c8c 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -1444,6 +1444,15 @@ static void LJ_FASTCALL recff_table_new(jit_State *J, RecordFFData *rd) { TRef tra = lj_opt_narrow_toint(J, J->base[0]); TRef trh = lj_opt_narrow_toint(J, J->base[1]); + if (tref_isk(tra) && tref_isk(trh)) { + int32_t a = IR(tref_ref(tra))->i; + if (a < 0x7fff) { + uint32_t hbits = hsize2hbits(IR(tref_ref(trh))->i); + a = a > 0 ? 
a+1 : 0; + J->base[0] = emitir(IRTG(IR_TNEW, IRT_TAB), (uint32_t)a, hbits); + return; + } + } J->base[0] = lj_ir_call(J, IRCALL_lj_tab_new_ah, tra, trh); UNUSED(rd); } From dcf3627d79091e8c5535b15fc0ef40281ec9b9f7 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 10 Dec 2023 14:48:34 +0100 Subject: [PATCH 76/95] Fix .debug_abbrev section in GDB JIT API. Thanks to Dmitry Stogov. #1129 --- src/lj_gdbjit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_gdbjit.c b/src/lj_gdbjit.c index 01f51ba7..c0d7a164 100644 --- a/src/lj_gdbjit.c +++ b/src/lj_gdbjit.c @@ -633,7 +633,7 @@ static void LJ_FASTCALL gdbjit_debugabbrev(GDBJITctx *ctx) DUV(DW_AT_low_pc); DUV(DW_FORM_addr); DUV(DW_AT_high_pc); DUV(DW_FORM_addr); DUV(DW_AT_stmt_list); DUV(DW_FORM_data4); - DB(0); DB(0); + DB(0); DB(0); DB(0); ctx->p = p; } From 856423f5dabb5bbf86d36816a247663e90c69d35 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 10 Dec 2023 15:00:52 +0100 Subject: [PATCH 77/95] Fix runtime library flags for MSVC debug builds. Reported by igor725. #1127 --- src/msvcbuild.bat | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat index 1f60b8f4..0ac65409 100644 --- a/src/msvcbuild.bat +++ b/src/msvcbuild.bat @@ -15,6 +15,7 @@ @rem Add more debug flags here, e.g. 
DEBUGCFLAGS=/DLUA_USE_APICHECK @set DEBUGCFLAGS= @set LJCOMPILE=cl /nologo /c /O2 /W3 /D_CRT_SECURE_NO_DEPRECATE /D_CRT_STDIO_INLINE=__declspec(dllexport)__inline +@set LJDYNBUILD=/MD /DLUA_BUILD_AS_DLL @set LJLINK=link /nologo @set LJMT=mt /nologo @set LJLIB=lib /nologo /nodefaultlib @@ -71,11 +72,12 @@ buildvm -m folddef -o lj_folddef.h lj_opt_fold.c @shift @set BUILDTYPE=debug @set LJCOMPILE=%LJCOMPILE% /Zi %DEBUGCFLAGS% +@set LJDYNBUILD=/MDd /DLUA_BUILD_AS_DLL :NODEBUG @set LJLINK=%LJLINK% /%BUILDTYPE% @if "%1"=="amalg" goto :AMALGDLL @if "%1"=="static" goto :STATIC -%LJCOMPILE% /MD /DLUA_BUILD_AS_DLL lj_*.c lib_*.c +%LJCOMPILE% %LJDYNBUILD% lj_*.c lib_*.c @if errorlevel 1 goto :BAD %LJLINK% /DLL /out:%LJDLLNAME% lj_*.obj lib_*.obj @if errorlevel 1 goto :BAD @@ -87,7 +89,7 @@ buildvm -m folddef -o lj_folddef.h lj_opt_fold.c @if errorlevel 1 goto :BAD @goto :MTDLL :AMALGDLL -%LJCOMPILE% /MD /DLUA_BUILD_AS_DLL ljamalg.c +%LJCOMPILE% %LJDYNBUILD% ljamalg.c @if errorlevel 1 goto :BAD %LJLINK% /DLL /out:%LJDLLNAME% ljamalg.obj lj_vm.obj @if errorlevel 1 goto :BAD From e02cb19b570d79133a7581e0163e86b69cc792be Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 10 Dec 2023 15:33:47 +0100 Subject: [PATCH 78/95] Fix anchoring for string buffer set() method. Thanks to Peter Cawley. #1125 --- src/lj_ffrecord.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index 151c4c8c..c70793a4 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -1204,6 +1204,15 @@ static void LJ_FASTCALL recff_buffer_method_set(jit_State *J, RecordFFData *rd) if (tref_isstr(tr)) { TRef trp = emitir(IRT(IR_STRREF, IRT_PGC), tr, lj_ir_kint(J, 0)); TRef len = emitir(IRTI(IR_FLOAD), tr, IRFL_STR_LEN); + IRIns *irp = IR(tref_ref(trp)); + /* Anchor (potentially different) obj into which trp points after fold. 
*/ + if (irp->o == IR_STRREF) { + tr = irp->op1; + } else if (irp->o == IR_KKPTR && !tref_isk(tr)) { + GCstr *str = strV(&rd->argv[1]); /* Constify the argument. */ + tr = lj_ir_kstr(J, str); + trp = lj_ir_kkptr(J, (char *)strdata(str)); + } lj_ir_call(J, IRCALL_lj_bufx_set, trbuf, trp, len, tr); #if LJ_HASFFI } else if (tref_iscdata(tr)) { From 1b38c736550004fba1b9712c1a5788b3eefa49be Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 10 Dec 2023 15:45:10 +0100 Subject: [PATCH 79/95] Document workaround for multilib vs. cross-compiler conflict. Reported by igorpupkinable. #1126 --- doc/install.html | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/install.html b/doc/install.html index 21866315..7f2e40e4 100644 --- a/doc/install.html +++ b/doc/install.html @@ -240,7 +240,10 @@ for any supported target, as long as both architectures have the same pointer size. If you want to cross-compile to any 32 bit target on an x64 OS, you need to install the multilib development package (e.g. libc6-dev-i386 on Debian/Ubuntu) and build a 32 bit host part -(HOST_CC="gcc -m32"). +(HOST_CC="gcc -m32"). On some distro versions, multilib conflicts +with cross-compilers. The workaround is to install the x86 cross-compiler +package gcc-i686-linux-gnu and use it to build the host part +(HOST_CC=i686-linux-gnu-gcc).

      You need to specify TARGET_SYS whenever the host OS and the From 10cc759f259e1f3b6572ce663858c8ce4d34a483 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 10 Dec 2023 16:10:48 +0100 Subject: [PATCH 80/95] ARM: Fix stack restore for FP slots. Thanks to Peter Cawley. #1131 --- src/lj_asm_arm.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h index f53f708b..8869af32 100644 --- a/src/lj_asm_arm.h +++ b/src/lj_asm_arm.h @@ -1991,11 +1991,12 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap) SnapEntry *map = &as->T->snapmap[snap->mapofs]; SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1]; MSize n, nent = snap->nent; + int32_t bias = 0; /* Store the value of all modified slots to the Lua stack. */ for (n = 0; n < nent; n++) { SnapEntry sn = map[n]; BCReg s = snap_slot(sn); - int32_t ofs = 8*((int32_t)s-1); + int32_t ofs = 8*((int32_t)s-1) - bias; IRRef ref = snap_ref(sn); IRIns *ir = IR(ref); if ((sn & SNAP_NORESTORE)) @@ -2013,6 +2014,12 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap) emit_lso(as, ARMI_STR, tmp, RID_BASE, ofs+4); #else Reg src = ra_alloc1(as, ref, RSET_FPR); + if (LJ_UNLIKELY(ofs < -1020 || ofs > 1020)) { + int32_t adj = ofs & 0xffffff00; /* K12-friendly. */ + bias += adj; + ofs -= adj; + emit_addptr(as, RID_BASE, -adj); + } emit_vlso(as, ARMI_VSTR_D, src, RID_BASE, ofs); #endif } else { @@ -2038,6 +2045,7 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap) } checkmclim(as); } + emit_addptr(as, RID_BASE, bias); lua_assert(map + nent == flinks); } From ff204d0350575cf710f6f4af982db146cb454e1a Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 10 Dec 2023 19:42:22 +0100 Subject: [PATCH 81/95] Fix anchoring for string buffer set() method (again). Thanks to Peter Cawley. 
#1125 --- src/lj_ffrecord.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index c70793a4..30dc6bfc 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -1205,14 +1205,11 @@ static void LJ_FASTCALL recff_buffer_method_set(jit_State *J, RecordFFData *rd) TRef trp = emitir(IRT(IR_STRREF, IRT_PGC), tr, lj_ir_kint(J, 0)); TRef len = emitir(IRTI(IR_FLOAD), tr, IRFL_STR_LEN); IRIns *irp = IR(tref_ref(trp)); - /* Anchor (potentially different) obj into which trp points after fold. */ - if (irp->o == IR_STRREF) { + /* trp must point into the anchored obj, even after folding. */ + if (irp->o == IR_STRREF) tr = irp->op1; - } else if (irp->o == IR_KKPTR && !tref_isk(tr)) { - GCstr *str = strV(&rd->argv[1]); /* Constify the argument. */ - tr = lj_ir_kstr(J, str); - trp = lj_ir_kkptr(J, (char *)strdata(str)); - } + else if (!tref_isk(tr)) + trp = emitir(IRT(IR_ADD, IRT_PGC), tr, lj_ir_kintpgc(J, sizeof(GCstr))); lj_ir_call(J, IRCALL_lj_bufx_set, trbuf, trp, len, tr); #if LJ_HASFFI } else if (tref_iscdata(tr)) { From 9bdfd34dccb913777be0efcc6869b6eeb5b9b43b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 11 Dec 2023 13:01:36 +0100 Subject: [PATCH 82/95] Only emit proper parent references in snapshot replay. Thanks to Peter Cawley. 
#1132 --- src/lj_snap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/lj_snap.c b/src/lj_snap.c index 5a5c481b..b387dd76 100644 --- a/src/lj_snap.c +++ b/src/lj_snap.c @@ -510,12 +510,14 @@ void lj_snap_replay(jit_State *J, GCtrace *T) IRRef refp = snap_ref(sn); IRIns *ir = &T->ir[refp]; if (regsp_reg(ir->r) == RID_SUNK) { + uint8_t m; if (J->slot[snap_slot(sn)] != snap_slot(sn)) continue; pass23 = 1; lua_assert(ir->o == IR_TNEW || ir->o == IR_TDUP || ir->o == IR_CNEW || ir->o == IR_CNEWI); - if (ir->op1 >= T->nk) snap_pref(J, T, map, nent, seen, ir->op1); - if (ir->op2 >= T->nk) snap_pref(J, T, map, nent, seen, ir->op2); + m = lj_ir_mode[ir->o]; + if (irm_op1(m) == IRMref) snap_pref(J, T, map, nent, seen, ir->op1); + if (irm_op2(m) == IRMref) snap_pref(J, T, map, nent, seen, ir->op2); if (LJ_HASFFI && ir->o == IR_CNEWI) { if (LJ_32 && refp+1 < T->nins && (ir+1)->o == IR_HIOP) snap_pref(J, T, map, nent, seen, (ir+1)->op2); @@ -542,14 +544,16 @@ void lj_snap_replay(jit_State *J, GCtrace *T) IRIns *ir = &T->ir[refp]; if (regsp_reg(ir->r) == RID_SUNK) { TRef op1, op2; + uint8_t m; if (J->slot[snap_slot(sn)] != snap_slot(sn)) { /* De-dup allocs. */ J->slot[snap_slot(sn)] = J->slot[J->slot[snap_slot(sn)]]; continue; } op1 = ir->op1; - if (op1 >= T->nk) op1 = snap_pref(J, T, map, nent, seen, op1); + m = lj_ir_mode[ir->o]; + if (irm_op1(m) == IRMref) op1 = snap_pref(J, T, map, nent, seen, op1); op2 = ir->op2; - if (op2 >= T->nk) op2 = snap_pref(J, T, map, nent, seen, op2); + if (irm_op2(m) == IRMref) op2 = snap_pref(J, T, map, nent, seen, op2); if (LJ_HASFFI && ir->o == IR_CNEWI) { if (LJ_32 && refp+1 < T->nins && (ir+1)->o == IR_HIOP) { lj_needsplit(J); /* Emit joining HIOP. */ From c42c62e71a45a677b8b1cbf749bd33cf4d5918ff Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 23 Dec 2023 19:14:32 +0100 Subject: [PATCH 83/95] Simplify handling of instable types in TNEW/TDUP load forwarding. Thanks to Peter Cawley. 
#994 --- src/lj_opt_mem.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/lj_opt_mem.c b/src/lj_opt_mem.c index dc74a06d..04b95a6f 100644 --- a/src/lj_opt_mem.c +++ b/src/lj_opt_mem.c @@ -185,25 +185,23 @@ static TRef fwd_ahload(jit_State *J, IRRef xref) } ref = store->prev; } - if (ir->o == IR_TNEW && !irt_isnil(fins->t)) - return 0; /* Type instability in loop-carried dependency. */ - if (irt_ispri(fins->t)) { - return TREF_PRI(irt_type(fins->t)); - } else if (irt_isnum(fins->t) || (LJ_DUALNUM && irt_isint(fins->t)) || - irt_isstr(fins->t)) { + /* Simplified here: let loop_unroll() figure out any type instability. */ + if (ir->o == IR_TNEW) { + return TREF_NIL; + } else { TValue keyv; cTValue *tv; IRIns *key = IR(xr->op2); if (key->o == IR_KSLOT) key = IR(key->op1); lj_ir_kvalue(J->L, &keyv, key); tv = lj_tab_get(J->L, ir_ktab(IR(ir->op1)), &keyv); - if (itype2irt(tv) != irt_type(fins->t)) - return 0; /* Type instability in loop-carried dependency. */ - if (irt_isnum(fins->t)) + if (tvispri(tv)) + return TREF_PRI(itype2irt(tv)); + else if (tvisnum(tv)) return lj_ir_knum_u64(J, tv->u64); - else if (LJ_DUALNUM && irt_isint(fins->t)) + else if (tvisint(tv)) return lj_ir_kint(J, intV(tv)); - else + else if (tvisgcv(tv)) return lj_ir_kstr(J, strV(tv)); } /* Othwerwise: don't intern as a constant. */ From 7dbe545933485849977d50384f2f20f2cccf0cf9 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 23 Dec 2023 19:22:34 +0100 Subject: [PATCH 84/95] Respect jit.off() on pending trace exit. Thanks to Sergey Kaplun. 
#1134 --- src/lj_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_trace.c b/src/lj_trace.c index 25e610b5..d015f2ab 100644 --- a/src/lj_trace.c +++ b/src/lj_trace.c @@ -804,7 +804,7 @@ int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr) if (G(L)->gc.state == GCSatomic || G(L)->gc.state == GCSfinalize) { if (!(G(L)->hookmask & HOOK_GC)) lj_gc_step(L); /* Exited because of GC: drive GC forward. */ - } else { + } else if ((J->flags & JIT_F_ON)) { trace_hotside(J, pc); } if (bc_op(*pc) == BC_JLOOP) { From 658530562c2ac7ffa8e4ca5d18856857471244e9 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 23 Dec 2023 19:43:03 +0100 Subject: [PATCH 85/95] Check for IR_HREF vs. IR_HREFK aliasing in non-nil store check. Thanks to Peter Cawley. #1133 --- src/lj_ir.h | 1 + src/lj_opt_mem.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/lj_ir.h b/src/lj_ir.h index 9fd4e275..6d974ed2 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -346,6 +346,7 @@ typedef struct IRType1 { uint8_t irt; } IRType1; #define irt_isu32(t) (irt_type(t) == IRT_U32) #define irt_isi64(t) (irt_type(t) == IRT_I64) #define irt_isu64(t) (irt_type(t) == IRT_U64) +#define irt_isp32(t) (irt_type(t) == IRT_P32) #define irt_isfp(t) (irt_isnum(t) || irt_isfloat(t)) #define irt_isinteger(t) (irt_typerange((t), IRT_I8, IRT_INT)) diff --git a/src/lj_opt_mem.c b/src/lj_opt_mem.c index 04b95a6f..214fb632 100644 --- a/src/lj_opt_mem.c +++ b/src/lj_opt_mem.c @@ -879,6 +879,8 @@ int lj_opt_fwd_wasnonnil(jit_State *J, IROpT loadop, IRRef xref) if (skref == xkref || !irref_isk(skref) || !irref_isk(xkref)) return 0; /* A nil store with same const key or var key MAY alias. */ /* Different const keys CANNOT alias. */ + } else if (irt_isp32(IR(skref)->t) != irt_isp32(IR(xkref)->t)) { + return 0; /* HREF and HREFK MAY alias. */ } /* Different key types CANNOT alias. */ } /* Other non-nil stores MAY alias. 
*/ ref = store->prev; From c525bcb9024510cad9e170e12b6209aedb330f83 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 23 Dec 2023 20:06:17 +0100 Subject: [PATCH 86/95] DynASM/x86: Allow [&expr] operand. Thanks to Dmitry Stogov. #1138 --- dynasm/dasm_x86.lua | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dynasm/dasm_x86.lua b/dynasm/dasm_x86.lua index 787163c0..df70fed8 100644 --- a/dynasm/dasm_x86.lua +++ b/dynasm/dasm_x86.lua @@ -627,7 +627,11 @@ local function wputmrmsib(t, imark, s, vsreg, psz, sk) werror("NYI: rip-relative displacement followed by immediate") end -- The previous byte in the action buffer cannot be 0xe9 or 0x80-0x8f. - wputlabel("REL_", disp[1], 2) + if disp[2] == "iPJ" then + waction("REL_A", disp[1]) + else + wputlabel("REL_", disp[1], 2) + end else wputdarg(disp) end @@ -744,9 +748,9 @@ local function dispexpr(expr) return imm*map_opsizenum[ops] end local mode, iexpr = immexpr(dispt) - if mode == "iJ" then + if mode == "iJ" or mode == "iPJ" then if c == "-" then werror("cannot invert label reference") end - return { iexpr } + return { iexpr, mode } end return expr -- Need to return original signed expression. end From 4b90f6c4d7420139c135435e1580acb52ea18436 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 22 Jan 2024 19:06:36 +0100 Subject: [PATCH 87/95] Add cross-32/64 bit and deterministic bytecode generation. Contributed by Peter Cawley. 
#993 #1008 --- doc/extensions.html | 51 +++++++++++++++----- doc/running.html | 3 ++ src/host/genlibbc.lua | 99 +++++++++++++++++++++----------------- src/jit/bcsave.lua | 31 ++++++++---- src/lib_base.c | 6 ++- src/lib_jit.c | 26 ++-------- src/lib_string.c | 22 +++++++-- src/lj_bcdump.h | 4 +- src/lj_bcread.c | 9 ++-- src/lj_bcwrite.c | 109 ++++++++++++++++++++++++++++++++++++------ src/lj_lex.c | 1 + src/lj_lex.h | 1 + src/lj_lib.c | 18 +++++++ src/lj_lib.h | 1 + src/lj_load.c | 29 ++++++++--- src/lj_parse.c | 28 ++++++----- 16 files changed, 306 insertions(+), 132 deletions(-) diff --git a/doc/extensions.html b/doc/extensions.html index a4f20841..1d28475c 100644 --- a/doc/extensions.html +++ b/doc/extensions.html @@ -160,13 +160,33 @@ passes any arguments after the error function to the function which is called in a protected context.

      -

      loadfile() etc. handle UTF-8 source code

      +

      load*() handle UTF-8 source code

      Non-ASCII characters are handled transparently by the Lua source code parser. This allows the use of UTF-8 characters in identifiers and strings. A UTF-8 BOM is skipped at the start of the source code.

      +

      load*() add a mode parameter

      +

      +As an extension from Lua 5.2, the functions loadstring(), +loadfile() and (new) load() add an optional +mode parameter. +

      +

      +The default mode string is "bt", which allows loading of both +source code and bytecode. Use "t" to allow only source code +or "b" to allow only bytecode to be loaded. +

      +

      +By default, the load* functions generate the native bytecode format. +For cross-compilation purposes, add W to the mode string to +force the 32 bit format and X to force the 64 bit format. +Add both to force the opposite format. Note that non-native bytecode +generated by load* cannot be run, but can still be passed +to string.dump. +

      +

      tostring() etc. canonicalize NaN and ±Inf

      All number-to-string conversions consistently convert non-finite numbers @@ -186,26 +206,33 @@ works independently of the current locale and it supports hex floating-point numbers (e.g. 0x1.5p-3).

      -

      string.dump(f [,strip]) generates portable bytecode

      +

      string.dump(f [,mode]) generates portable bytecode

      An extra argument has been added to string.dump(). If set to -true, 'stripped' bytecode without debug information is -generated. This speeds up later bytecode loading and reduces memory -usage. See also the +true or to a string which contains the character s, +'stripped' bytecode without debug information is generated. This speeds +up later bytecode loading and reduces memory usage. See also the -b command line option.

      The generated bytecode is portable and can be loaded on any architecture -that LuaJIT supports, independent of word size or endianess. However, the -bytecode compatibility versions must match. Bytecode stays compatible -for dot releases (x.y.0 → x.y.1), but may change with major or -minor releases (2.0 → 2.1) or between any beta release. Foreign -bytecode (e.g. from Lua 5.1) is incompatible and cannot be loaded. +that LuaJIT supports. However, the bytecode compatibility versions must +match. Bytecode only stays compatible within a major+minor version +(x.y.aaa → x.y.bbb), except for development branches. Foreign bytecode +(e.g. from Lua 5.1) is incompatible and cannot be loaded.

      Note: LJ_GC64 mode requires a different frame layout, which implies -a different, incompatible bytecode format for all 64 bit ports. This may be -rectified in the future. +a different, incompatible bytecode format between 32 bit and 64 bit ports. +This may be rectified in the future. In the meantime, use the W +and X modes of the load* functions +for cross-compilation purposes. +

      +

      +Due to VM hardening, bytecode is not deterministic. Add d to the +mode string to dump it in a deterministic manner: identical source code +always gives a byte-for-byte identical bytecode dump. This feature is +mainly useful for reproducible builds.

      table.new(narray, nhash) allocates a pre-sized table

      diff --git a/doc/running.html b/doc/running.html index 9dd2b411..142b810f 100644 --- a/doc/running.html +++ b/doc/running.html @@ -106,6 +106,9 @@ are accepted:
    • -l — Only list bytecode.
    • -s — Strip debug info (this is the default).
    • -g — Keep debug info.
    • +
    • -W — Generate 32 bit (non-GC64) bytecode.
    • +
    • -X — Generate 64 bit (GC64) bytecode.
    • +
    • -d — Generate bytecode in a deterministic manner.
    • -n name — Set module name (default: auto-detect from input name)
    • -t type — Set output file type (default: auto-detect from output name).
    • -a arch — Override architecture for object files (default: native).
    • diff --git a/src/host/genlibbc.lua b/src/host/genlibbc.lua index 3621c3f5..e697fceb 100644 --- a/src/host/genlibbc.lua +++ b/src/host/genlibbc.lua @@ -138,65 +138,73 @@ local function fixup_dump(dump, fixup) return { dump = ndump, startbc = startbc, sizebc = sizebc } end -local function find_defs(src) +local function find_defs(src, mode) local defs = {} for name, code in string.gmatch(src, "LJLIB_LUA%(([^)]*)%)%s*/%*(.-)%*/") do - local env = {} local tcode, fixup = transform_lua(code) - local func = assert(load(tcode, "", nil, env))() - defs[name] = fixup_dump(string.dump(func, true), fixup) + local func = assert(load(tcode, "", mode)) + defs[name] = fixup_dump(string.dump(func, mode), fixup) defs[#defs+1] = name end return defs end -local function gen_header(defs) +local function gen_header(defs32, defs64) local t = {} local function w(x) t[#t+1] = x end w("/* This is a generated file. DO NOT EDIT! */\n\n") w("static const int libbc_endian = ") w(isbe and 1 or 0) w(";\n\n") - local s, sb = "", "" - for i,name in ipairs(defs) do - local d = defs[name] - s = s .. d.dump - sb = sb .. string.char(i) .. ("\0"):rep(d.startbc - 1) - .. (isbe and "\0\0\0\255" or "\255\0\0\0"):rep(d.sizebc) - .. ("\0"):rep(#d.dump - d.startbc - d.sizebc*4) - end - w("static const uint8_t libbc_code[] = {\n") - local n = 0 - for i=1,#s do - local x = string.byte(s, i) - local xb = string.byte(sb, i) - if xb == 255 then - local name = BCN[x] - local m = #name + 4 - if n + m > 78 then n = 0; w("\n") end - n = n + m - w("BC_"); w(name) - else - local m = x < 10 and 2 or (x < 100 and 3 or 4) - if xb == 0 then - if n + m > 78 then n = 0; w("\n") end - else - local name = defs[xb]:gsub("_", ".") - if n ~= 0 then w("\n") end - w("/* "); w(name); w(" */ ") - n = #name + 7 - end - n = n + m - w(x) + for j,defs in ipairs{defs64, defs32} do + local s, sb = "", "" + for i,name in ipairs(defs) do + local d = defs[name] + s = s .. d.dump + sb = sb .. string.char(i) .. 
("\0"):rep(d.startbc - 1) + .. (isbe and "\0\0\0\255" or "\255\0\0\0"):rep(d.sizebc) + .. ("\0"):rep(#d.dump - d.startbc - d.sizebc*4) + end + if j == 1 then + w("static const uint8_t libbc_code[] = {\n#if LJ_FR2\n") + else + w("\n#else\n") + end + local n = 0 + for i=1,#s do + local x = string.byte(s, i) + local xb = string.byte(sb, i) + if xb == 255 then + local name = BCN[x] + local m = #name + 4 + if n + m > 78 then n = 0; w("\n") end + n = n + m + w("BC_"); w(name) + else + local m = x < 10 and 2 or (x < 100 and 3 or 4) + if xb == 0 then + if n + m > 78 then n = 0; w("\n") end + else + local name = defs[xb]:gsub("_", ".") + if n ~= 0 then w("\n") end + w("/* "); w(name); w(" */ ") + n = #name + 7 + end + n = n + m + w(x) + end + w(",") end - w(",") end - w("\n0\n};\n\n") + w("\n#endif\n0\n};\n\n") w("static const struct { const char *name; int ofs; } libbc_map[] = {\n") - local m = 0 - for _,name in ipairs(defs) do - w('{"'); w(name); w('",'); w(m) w('},\n') - m = m + #defs[name].dump + local m32, m64 = 0, 0 + for i,name in ipairs(defs32) do + assert(name == defs64[i]) + w('{"'); w(name); w('",'); w(m32) w('},\n') + m32 = m32 + #defs32[name].dump + m64 = m64 + #defs64[name].dump + assert(m32 == m64) end - w("{NULL,"); w(m); w("}\n};\n\n") + w("{NULL,"); w(m32); w("}\n};\n\n") return table.concat(t) end @@ -219,7 +227,8 @@ end local outfile = parse_arg(arg) local src = read_files(arg) -local defs = find_defs(src) -local hdr = gen_header(defs) +local defs32 = find_defs(src, "Wdts") +local defs64 = find_defs(src, "Xdts") +local hdr = gen_header(defs32, defs64) write_file(outfile, hdr) diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua index 390d297c..131bf39b 100644 --- a/src/jit/bcsave.lua +++ b/src/jit/bcsave.lua @@ -29,6 +29,9 @@ Save LuaJIT bytecode: luajit -b[options] input output -l Only list bytecode. -s Strip debug info (default). -g Keep debug info. + -W Generate 32 bit (non-GC64) bytecode. + -X Generate 64 bit (GC64) bytecode. 
+ -d Generate bytecode in deterministic manner. -n name Set module name (default: auto-detect from input name). -t type Set output file type (default: auto-detect from output name). -a arch Override architecture for object files (default: native). @@ -51,8 +54,9 @@ local function check(ok, ...) end local function readfile(ctx, input) - if type(input) == "function" then return input end - if ctx.filename then + if ctx.string then + return check(loadstring(input, nil, ctx.mode)) + elseif ctx.filename then local data if input == "-" then data = io.stdin:read("*a") @@ -61,10 +65,10 @@ local function readfile(ctx, input) data = assert(fp:read("*a")) assert(fp:close()) end - return check(load(data, ctx.filename)) + return check(load(data, ctx.filename, ctx.mode)) else if input == "-" then input = nil end - return check(loadfile(input)) + return check(loadfile(input, ctx.mode)) end end @@ -624,7 +628,7 @@ end local function bcsave(ctx, input, output) local f = readfile(ctx, input) - local s = string.dump(f, ctx.strip) + local s = string.dump(f, ctx.mode) local t = ctx.type if not t then t = detecttype(output) @@ -647,9 +651,11 @@ local function docmd(...) local n = 1 local list = false local ctx = { - strip = true, arch = jit.arch, os = jit.os:lower(), - type = false, modname = false, + mode = "bt", arch = jit.arch, os = jit.os:lower(), + type = false, modname = false, string = false, } + local strip = "s" + local gc64 = "" while n <= #arg do local a = arg[n] if type(a) == "string" and a:sub(1, 1) == "-" and a ~= "-" then @@ -660,14 +666,18 @@ local function docmd(...) if opt == "l" then list = true elseif opt == "s" then - ctx.strip = true + strip = "s" elseif opt == "g" then - ctx.strip = false + strip = "" + elseif opt == "W" or opt == "X" then + gc64 = opt + elseif opt == "d" then + ctx.mode = ctx.mode .. 
opt else if arg[n] == nil or m ~= #a then usage() end if opt == "e" then if n ~= 1 then usage() end - arg[1] = check(loadstring(arg[1])) + ctx.string = true elseif opt == "n" then ctx.modname = checkmodname(tremove(arg, n)) elseif opt == "t" then @@ -687,6 +697,7 @@ local function docmd(...) n = n + 1 end end + ctx.mode = ctx.mode .. strip .. gc64 if list then if #arg == 0 or #arg > 2 then usage() end bclist(ctx, arg[1], arg[2] or "-") diff --git a/src/lib_base.c b/src/lib_base.c index 4e6f8a30..d644b4f2 100644 --- a/src/lib_base.c +++ b/src/lib_base.c @@ -360,7 +360,11 @@ LJLIB_ASM_(xpcall) LJLIB_REC(.) static int load_aux(lua_State *L, int status, int envarg) { if (status == LUA_OK) { - if (tvistab(L->base+envarg-1)) { + /* + ** Set environment table for top-level function. + ** Don't do this for non-native bytecode, which returns a prototype. + */ + if (tvistab(L->base+envarg-1) && tvisfunc(L->top-1)) { GCfunc *fn = funcV(L->top-1); GCtab *t = tabV(L->base+envarg-1); setgcref(fn->c.env, obj2gco(t)); diff --git a/src/lib_jit.c b/src/lib_jit.c index c0294927..b83c865a 100644 --- a/src/lib_jit.c +++ b/src/lib_jit.c @@ -161,24 +161,6 @@ LJLIB_PUSH(top-2) LJLIB_SET(version) /* -- Reflection API for Lua functions ------------------------------------ */ -/* Return prototype of first argument (Lua function or prototype object) */ -static GCproto *check_Lproto(lua_State *L, int nolua) -{ - TValue *o = L->base; - if (L->top > o) { - if (tvisproto(o)) { - return protoV(o); - } else if (tvisfunc(o)) { - if (isluafunc(funcV(o))) - return funcproto(funcV(o)); - else if (nolua) - return NULL; - } - } - lj_err_argt(L, 1, LUA_TFUNCTION); - return NULL; /* unreachable */ -} - static void setintfield(lua_State *L, GCtab *t, const char *name, int32_t val) { setintV(lj_tab_setstr(L, t, lj_str_newz(L, name)), val); @@ -187,7 +169,7 @@ static void setintfield(lua_State *L, GCtab *t, const char *name, int32_t val) /* local info = jit.util.funcinfo(func [,pc]) */ 
LJLIB_CF(jit_util_funcinfo) { - GCproto *pt = check_Lproto(L, 1); + GCproto *pt = lj_lib_checkLproto(L, 1, 1); if (pt) { BCPos pc = (BCPos)lj_lib_optint(L, 2, 0); GCtab *t; @@ -229,7 +211,7 @@ LJLIB_CF(jit_util_funcinfo) /* local ins, m = jit.util.funcbc(func, pc) */ LJLIB_CF(jit_util_funcbc) { - GCproto *pt = check_Lproto(L, 0); + GCproto *pt = lj_lib_checkLproto(L, 1, 0); BCPos pc = (BCPos)lj_lib_checkint(L, 2); if (pc < pt->sizebc) { BCIns ins = proto_bc(pt)[pc]; @@ -246,7 +228,7 @@ LJLIB_CF(jit_util_funcbc) /* local k = jit.util.funck(func, idx) */ LJLIB_CF(jit_util_funck) { - GCproto *pt = check_Lproto(L, 0); + GCproto *pt = lj_lib_checkLproto(L, 1, 0); ptrdiff_t idx = (ptrdiff_t)lj_lib_checkint(L, 2); if (idx >= 0) { if (idx < (ptrdiff_t)pt->sizekn) { @@ -266,7 +248,7 @@ LJLIB_CF(jit_util_funck) /* local name = jit.util.funcuvname(func, idx) */ LJLIB_CF(jit_util_funcuvname) { - GCproto *pt = check_Lproto(L, 0); + GCproto *pt = lj_lib_checkLproto(L, 1, 0); uint32_t idx = (uint32_t)lj_lib_checkint(L, 2); if (idx < pt->sizeuv) { setstrV(L, L->top-1, lj_str_newz(L, lj_debug_uvname(pt, idx))); diff --git a/src/lib_string.c b/src/lib_string.c index 29bcb8fe..255689ce 100644 --- a/src/lib_string.c +++ b/src/lib_string.c @@ -122,11 +122,25 @@ static int writer_buf(lua_State *L, const void *p, size_t size, void *sb) LJLIB_CF(string_dump) { - GCfunc *fn = lj_lib_checkfunc(L, 1); - int strip = L->base+1 < L->top && tvistruecond(L->base+1); - SBuf *sb = lj_buf_tmp_(L); /* Assumes lj_bcwrite() doesn't use tmpbuf. */ + GCproto *pt = lj_lib_checkLproto(L, 1, 1); + uint32_t flags = 0; + SBuf *sb; + TValue *o = L->base+1; + if (o < L->top) { + if (tvisstr(o)) { + const char *mode = strVdata(o); + char c; + while ((c = *mode++)) { + if (c == 's') flags |= BCDUMP_F_STRIP; + if (c == 'd') flags |= BCDUMP_F_DETERMINISTIC; + } + } else if (tvistruecond(o)) { + flags |= BCDUMP_F_STRIP; + } + } + sb = lj_buf_tmp_(L); /* Assumes lj_bcwrite() doesn't use tmpbuf. 
*/ L->top = L->base+1; - if (!isluafunc(fn) || lj_bcwrite(L, funcproto(fn), writer_buf, sb, strip)) + if (!pt || lj_bcwrite(L, pt, writer_buf, sb, flags)) lj_err_caller(L, LJ_ERR_STRDUMP); setstrV(L, L->top-1, lj_buf_str(L, sb)); lj_gc_check(L); diff --git a/src/lj_bcdump.h b/src/lj_bcdump.h index 6ba71e25..3e56e39c 100644 --- a/src/lj_bcdump.h +++ b/src/lj_bcdump.h @@ -46,6 +46,8 @@ #define BCDUMP_F_KNOWN (BCDUMP_F_FR2*2-1) +#define BCDUMP_F_DETERMINISTIC 0x80000000 + /* Type codes for the GC constants of a prototype. Plus length for strings. */ enum { BCDUMP_KGC_CHILD, BCDUMP_KGC_TAB, BCDUMP_KGC_I64, BCDUMP_KGC_U64, @@ -61,7 +63,7 @@ enum { /* -- Bytecode reader/writer ---------------------------------------------- */ LJ_FUNC int lj_bcwrite(lua_State *L, GCproto *pt, lua_Writer writer, - void *data, int strip); + void *data, uint32_t flags); LJ_FUNC GCproto *lj_bcread_proto(LexState *ls); LJ_FUNC GCproto *lj_bcread(LexState *ls); diff --git a/src/lj_bcread.c b/src/lj_bcread.c index c98c0d42..637ef067 100644 --- a/src/lj_bcread.c +++ b/src/lj_bcread.c @@ -281,8 +281,11 @@ static void bcread_knum(LexState *ls, GCproto *pt, MSize sizekn) static void bcread_bytecode(LexState *ls, GCproto *pt, MSize sizebc) { BCIns *bc = proto_bc(pt); - bc[0] = BCINS_AD((pt->flags & PROTO_VARARG) ? BC_FUNCV : BC_FUNCF, - pt->framesize, 0); + BCIns op; + if (ls->fr2 != LJ_FR2) op = BC_NOT; /* Mark non-native prototype. */ + else if ((pt->flags & PROTO_VARARG)) op = BC_FUNCV; + else op = BC_FUNCF; + bc[0] = BCINS_AD(op, pt->framesize, 0); bcread_block(ls, bc+1, (sizebc-1)*(MSize)sizeof(BCIns)); /* Swap bytecode instructions if the endianess differs. 
*/ if (bcread_swap(ls)) { @@ -395,7 +398,7 @@ static int bcread_header(LexState *ls) bcread_byte(ls) != BCDUMP_VERSION) return 0; bcread_flags(ls) = flags = bcread_uleb128(ls); if ((flags & ~(BCDUMP_F_KNOWN)) != 0) return 0; - if ((flags & BCDUMP_F_FR2) != LJ_FR2*BCDUMP_F_FR2) return 0; + if ((flags & BCDUMP_F_FR2) != (uint32_t)ls->fr2*BCDUMP_F_FR2) return 0; if ((flags & BCDUMP_F_FFI)) { #if LJ_HASFFI lua_State *L = ls->L; diff --git a/src/lj_bcwrite.c b/src/lj_bcwrite.c index dd969413..c062dc49 100644 --- a/src/lj_bcwrite.c +++ b/src/lj_bcwrite.c @@ -27,7 +27,9 @@ typedef struct BCWriteCtx { GCproto *pt; /* Root prototype. */ lua_Writer wfunc; /* Writer callback. */ void *wdata; /* Writer callback data. */ - int strip; /* Strip debug info. */ + TValue **heap; /* Heap used for deterministic sorting. */ + uint32_t heapsz; /* Size of heap. */ + uint32_t flags; /* BCDUMP_F_* flags. */ int status; /* Status from writer callback. */ #ifdef LUA_USE_ASSERT global_State *g; @@ -76,6 +78,75 @@ static void bcwrite_ktabk(BCWriteCtx *ctx, cTValue *o, int narrow) ctx->sb.w = p; } +/* Compare two template table keys. */ +static LJ_AINLINE int bcwrite_ktabk_lt(TValue *a, TValue *b) +{ + uint32_t at = itype(a), bt = itype(b); + if (at != bt) { /* This also handles false and true keys. */ + return at < bt; + } else if (at == LJ_TSTR) { + return lj_str_cmp(strV(a), strV(b)) < 0; + } else { + return a->u64 < b->u64; /* This works for numbers and integers. */ + } +} + +/* Insert key into a sorted heap. */ +static void bcwrite_ktabk_heap_insert(TValue **heap, MSize idx, MSize end, + TValue *key) +{ + MSize child; + while ((child = idx * 2 + 1) < end) { + /* Find lower of the two children. */ + TValue *c0 = heap[child]; + if (child + 1 < end) { + TValue *c1 = heap[child + 1]; + if (bcwrite_ktabk_lt(c1, c0)) { + c0 = c1; + child++; + } + } + if (bcwrite_ktabk_lt(key, c0)) break; /* Key lower? Found our position. */ + heap[idx] = c0; /* Move lower child up. */ + idx = child; /* Descend. 
*/ + } + heap[idx] = key; /* Insert key here. */ +} + +/* Resize heap, dropping content. */ +static void bcwrite_heap_resize(BCWriteCtx *ctx, uint32_t nsz) +{ + lua_State *L = sbufL(&ctx->sb); + if (ctx->heapsz) { + lj_mem_freevec(G(L), ctx->heap, ctx->heapsz, TValue *); + ctx->heapsz = 0; + } + if (nsz) { + ctx->heap = lj_mem_newvec(L, nsz, TValue *); + ctx->heapsz = nsz; + } +} + +/* Write hash part of template table in sorted order. */ +static void bcwrite_ktab_sorted_hash(BCWriteCtx *ctx, Node *node, MSize nhash) +{ + TValue **heap = ctx->heap; + MSize i = nhash; + for (;; node--) { /* Build heap. */ + if (!tvisnil(&node->val)) { + bcwrite_ktabk_heap_insert(heap, --i, nhash, &node->key); + if (i == 0) break; + } + } + do { /* Drain heap. */ + TValue *key = heap[0]; /* Output lowest key from top. */ + bcwrite_ktabk(ctx, key, 0); + bcwrite_ktabk(ctx, (TValue *)((char *)key - offsetof(Node, key)), 1); + key = heap[--nhash]; /* Remove last key. */ + bcwrite_ktabk_heap_insert(heap, 0, nhash, key); /* Re-insert. */ + } while (nhash); +} + /* Write a template table. */ static void bcwrite_ktab(BCWriteCtx *ctx, char *p, const GCtab *t) { @@ -105,14 +176,20 @@ static void bcwrite_ktab(BCWriteCtx *ctx, char *p, const GCtab *t) bcwrite_ktabk(ctx, o, 1); } if (nhash) { /* Write hash entries. 
*/ - MSize i = nhash; Node *node = noderef(t->node) + t->hmask; - for (;; node--) - if (!tvisnil(&node->val)) { - bcwrite_ktabk(ctx, &node->key, 0); - bcwrite_ktabk(ctx, &node->val, 1); - if (--i == 0) break; - } + if ((ctx->flags & BCDUMP_F_DETERMINISTIC) && nhash > 1) { + if (ctx->heapsz < nhash) + bcwrite_heap_resize(ctx, t->hmask + 1); + bcwrite_ktab_sorted_hash(ctx, node, nhash); + } else { + MSize i = nhash; + for (;; node--) + if (!tvisnil(&node->val)) { + bcwrite_ktabk(ctx, &node->key, 0); + bcwrite_ktabk(ctx, &node->val, 1); + if (--i == 0) break; + } + } } } @@ -269,7 +346,7 @@ static void bcwrite_proto(BCWriteCtx *ctx, GCproto *pt) p = lj_strfmt_wuleb128(p, pt->sizekgc); p = lj_strfmt_wuleb128(p, pt->sizekn); p = lj_strfmt_wuleb128(p, pt->sizebc-1); - if (!ctx->strip) { + if (!(ctx->flags & BCDUMP_F_STRIP)) { if (proto_lineinfo(pt)) sizedbg = pt->sizept - (MSize)((char *)proto_lineinfo(pt) - (char *)pt); p = lj_strfmt_wuleb128(p, sizedbg); @@ -317,11 +394,10 @@ static void bcwrite_header(BCWriteCtx *ctx) *p++ = BCDUMP_HEAD2; *p++ = BCDUMP_HEAD3; *p++ = BCDUMP_VERSION; - *p++ = (ctx->strip ? BCDUMP_F_STRIP : 0) + + *p++ = (ctx->flags & (BCDUMP_F_STRIP | BCDUMP_F_FR2)) + LJ_BE*BCDUMP_F_BE + - ((ctx->pt->flags & PROTO_FFI) ? BCDUMP_F_FFI : 0) + - LJ_FR2*BCDUMP_F_FR2; - if (!ctx->strip) { + ((ctx->pt->flags & PROTO_FFI) ? BCDUMP_F_FFI : 0); + if (!(ctx->flags & BCDUMP_F_STRIP)) { p = lj_strfmt_wuleb128(p, len); p = lj_buf_wmem(p, name, len); } @@ -352,14 +428,16 @@ static TValue *cpwriter(lua_State *L, lua_CFunction dummy, void *ud) /* Write bytecode for a prototype. 
*/ int lj_bcwrite(lua_State *L, GCproto *pt, lua_Writer writer, void *data, - int strip) + uint32_t flags) { BCWriteCtx ctx; int status; ctx.pt = pt; ctx.wfunc = writer; ctx.wdata = data; - ctx.strip = strip; + ctx.heapsz = 0; + if ((bc_op(proto_bc(pt)[0]) != BC_NOT) == LJ_FR2) flags |= BCDUMP_F_FR2; + ctx.flags = flags; ctx.status = 0; #ifdef LUA_USE_ASSERT ctx.g = G(L); @@ -368,6 +446,7 @@ int lj_bcwrite(lua_State *L, GCproto *pt, lua_Writer writer, void *data, status = lj_vm_cpcall(L, NULL, &ctx, cpwriter); if (status == 0) status = ctx.status; lj_buf_free(G(sbufL(&ctx.sb)), &ctx.sb); + bcwrite_heap_resize(&ctx, 0); return status; } diff --git a/src/lj_lex.c b/src/lj_lex.c index 61b04c4b..bd81dc40 100644 --- a/src/lj_lex.c +++ b/src/lj_lex.c @@ -411,6 +411,7 @@ int lj_lex_setup(lua_State *L, LexState *ls) ls->linenumber = 1; ls->lastline = 1; ls->endmark = 0; + ls->fr2 = LJ_FR2; /* Generate native bytecode by default. */ lex_next(ls); /* Read-ahead first char. */ if (ls->c == 0xef && ls->p + 2 <= ls->pe && (uint8_t)ls->p[0] == 0xbb && (uint8_t)ls->p[1] == 0xbf) { /* Skip UTF-8 BOM (if buffered). */ diff --git a/src/lj_lex.h b/src/lj_lex.h index e46fbd89..2ef7fc77 100644 --- a/src/lj_lex.h +++ b/src/lj_lex.h @@ -74,6 +74,7 @@ typedef struct LexState { MSize sizebcstack; /* Size of bytecode stack. */ uint32_t level; /* Syntactical nesting level. */ int endmark; /* Trust bytecode end marker, even if not at EOF. */ + int fr2; /* Generate bytecode for LJ_FR2 mode. 
*/ } LexState; LJ_FUNC int lj_lex_setup(lua_State *L, LexState *ls); diff --git a/src/lj_lib.c b/src/lj_lib.c index ebe0dc78..06ae4fcf 100644 --- a/src/lj_lib.c +++ b/src/lj_lib.c @@ -62,6 +62,7 @@ static const uint8_t *lib_read_lfunc(lua_State *L, const uint8_t *p, GCtab *tab) ls.pe = (const char *)~(uintptr_t)0; ls.c = -1; ls.level = (BCDUMP_F_STRIP|(LJ_BE*BCDUMP_F_BE)); + ls.fr2 = LJ_FR2; ls.chunkname = name; pt = lj_bcread_proto(&ls); pt->firstline = ~(BCLine)0; @@ -266,6 +267,23 @@ GCfunc *lj_lib_checkfunc(lua_State *L, int narg) return funcV(o); } +GCproto *lj_lib_checkLproto(lua_State *L, int narg, int nolua) +{ + TValue *o = L->base + narg-1; + if (L->top > o) { + if (tvisproto(o)) { + return protoV(o); + } else if (tvisfunc(o)) { + if (isluafunc(funcV(o))) + return funcproto(funcV(o)); + else if (nolua) + return NULL; + } + } + lj_err_argt(L, narg, LUA_TFUNCTION); + return NULL; /* unreachable */ +} + GCtab *lj_lib_checktab(lua_State *L, int narg) { TValue *o = L->base + narg-1; diff --git a/src/lj_lib.h b/src/lj_lib.h index 6c3a1c83..a48e3c98 100644 --- a/src/lj_lib.h +++ b/src/lj_lib.h @@ -42,6 +42,7 @@ LJ_FUNC lua_Number lj_lib_checknum(lua_State *L, int narg); LJ_FUNC int32_t lj_lib_checkint(lua_State *L, int narg); LJ_FUNC int32_t lj_lib_optint(lua_State *L, int narg, int32_t def); LJ_FUNC GCfunc *lj_lib_checkfunc(lua_State *L, int narg); +LJ_FUNC GCproto *lj_lib_checkLproto(lua_State *L, int narg, int nolua); LJ_FUNC GCtab *lj_lib_checktab(lua_State *L, int narg); LJ_FUNC GCtab *lj_lib_checktabornil(lua_State *L, int narg); LJ_FUNC int lj_lib_checkopt(lua_State *L, int narg, int def, const char *lst); diff --git a/src/lj_load.c b/src/lj_load.c index 07304487..152ef6da 100644 --- a/src/lj_load.c +++ b/src/lj_load.c @@ -34,14 +34,28 @@ static TValue *cpparser(lua_State *L, lua_CFunction dummy, void *ud) UNUSED(dummy); cframe_errfunc(L->cframe) = -1; /* Inherit error function. */ bc = lj_lex_setup(L, ls); - if (ls->mode && !strchr(ls->mode, bc ? 
'b' : 't')) { - setstrV(L, L->top++, lj_err_str(L, LJ_ERR_XMODE)); - lj_err_throw(L, LUA_ERRSYNTAX); + if (ls->mode) { + int xmode = 1; + const char *mode = ls->mode; + char c; + while ((c = *mode++)) { + if (c == (bc ? 'b' : 't')) xmode = 0; + if (c == (LJ_FR2 ? 'W' : 'X')) ls->fr2 = !LJ_FR2; + } + if (xmode) { + setstrV(L, L->top++, lj_err_str(L, LJ_ERR_XMODE)); + lj_err_throw(L, LUA_ERRSYNTAX); + } } pt = bc ? lj_bcread(ls) : lj_parse(ls); - fn = lj_func_newL_empty(L, pt, tabref(L->env)); - /* Don't combine above/below into one statement. */ - setfuncV(L, L->top++, fn); + if (ls->fr2 == LJ_FR2) { + fn = lj_func_newL_empty(L, pt, tabref(L->env)); + /* Don't combine above/below into one statement. */ + setfuncV(L, L->top++, fn); + } else { + /* Non-native generation returns a dumpable, but non-runnable prototype. */ + setprotoV(L, L->top++, pt); + } return NULL; } @@ -159,9 +173,10 @@ LUALIB_API int luaL_loadstring(lua_State *L, const char *s) LUA_API int lua_dump(lua_State *L, lua_Writer writer, void *data) { cTValue *o = L->top-1; + uint32_t flags = LJ_FR2*BCDUMP_F_FR2; /* Default mode for legacy C API. */ lj_checkapi(L->top > L->base, "top slot empty"); if (tvisfunc(o) && isluafunc(funcV(o))) - return lj_bcwrite(L, funcproto(funcV(o)), writer, data, 0); + return lj_bcwrite(L, funcproto(funcV(o)), writer, data, flags); else return 1; } diff --git a/src/lj_parse.c b/src/lj_parse.c index a30921af..5a44f8db 100644 --- a/src/lj_parse.c +++ b/src/lj_parse.c @@ -667,19 +667,20 @@ static void bcemit_store(FuncState *fs, ExpDesc *var, ExpDesc *e) /* Emit method lookup expression. */ static void bcemit_method(FuncState *fs, ExpDesc *e, ExpDesc *key) { - BCReg idx, func, obj = expr_toanyreg(fs, e); + BCReg idx, func, fr2, obj = expr_toanyreg(fs, e); expr_free(fs, e); func = fs->freereg; - bcemit_AD(fs, BC_MOV, func+1+LJ_FR2, obj); /* Copy object to 1st argument. */ + fr2 = fs->ls->fr2; + bcemit_AD(fs, BC_MOV, func+1+fr2, obj); /* Copy object to 1st argument. 
*/ lj_assertFS(expr_isstrk(key), "bad usage"); idx = const_str(fs, key); if (idx <= BCMAX_C) { - bcreg_reserve(fs, 2+LJ_FR2); + bcreg_reserve(fs, 2+fr2); bcemit_ABC(fs, BC_TGETS, func, obj, idx); } else { - bcreg_reserve(fs, 3+LJ_FR2); - bcemit_AD(fs, BC_KSTR, func+2+LJ_FR2, idx); - bcemit_ABC(fs, BC_TGETV, func, obj, func+2+LJ_FR2); + bcreg_reserve(fs, 3+fr2); + bcemit_AD(fs, BC_KSTR, func+2+fr2, idx); + bcemit_ABC(fs, BC_TGETV, func, obj, func+2+fr2); fs->freereg--; } e->u.s.info = func; @@ -1326,9 +1327,12 @@ static void fs_fixup_bc(FuncState *fs, GCproto *pt, BCIns *bc, MSize n) { BCInsLine *base = fs->bcbase; MSize i; + BCIns op; pt->sizebc = n; - bc[0] = BCINS_AD((fs->flags & PROTO_VARARG) ? BC_FUNCV : BC_FUNCF, - fs->framesize, 0); + if (fs->ls->fr2 != LJ_FR2) op = BC_NOT; /* Mark non-native prototype. */ + else if ((fs->flags & PROTO_VARARG)) op = BC_FUNCV; + else op = BC_FUNCF; + bc[0] = BCINS_AD(op, fs->framesize, 0); for (i = 1; i < n; i++) bc[i] = base[i].ins; } @@ -1936,11 +1940,11 @@ static void parse_args(LexState *ls, ExpDesc *e) lj_assertFS(e->k == VNONRELOC, "bad expr type %d", e->k); base = e->u.s.info; /* Base register for call. 
*/ if (args.k == VCALL) { - ins = BCINS_ABC(BC_CALLM, base, 2, args.u.s.aux - base - 1 - LJ_FR2); + ins = BCINS_ABC(BC_CALLM, base, 2, args.u.s.aux - base - 1 - ls->fr2); } else { if (args.k != VVOID) expr_tonextreg(fs, &args); - ins = BCINS_ABC(BC_CALL, base, 2, fs->freereg - base - LJ_FR2); + ins = BCINS_ABC(BC_CALL, base, 2, fs->freereg - base - ls->fr2); } expr_init(e, VCALL, bcemit_INS(fs, ins)); e->u.s.aux = base; @@ -1980,7 +1984,7 @@ static void expr_primary(LexState *ls, ExpDesc *v) parse_args(ls, v); } else if (ls->tok == '(' || ls->tok == TK_string || ls->tok == '{') { expr_tonextreg(fs, v); - if (LJ_FR2) bcreg_reserve(fs, 1); + if (ls->fr2) bcreg_reserve(fs, 1); parse_args(ls, v); } else { break; @@ -2565,7 +2569,7 @@ static void parse_for_iter(LexState *ls, GCstr *indexname) line = ls->linenumber; assign_adjust(ls, 3, expr_list(ls, &e), &e); /* The iterator needs another 3 [4] slots (func [pc] | state ctl). */ - bcreg_bump(fs, 3+LJ_FR2); + bcreg_bump(fs, 3+ls->fr2); isnext = (nvars <= 5 && predict_next(ls, fs, exprpc)); var_add(ls, 3); /* Hidden control variables. */ lex_check(ls, TK_do); From 2f35cb45fdd557aacb3875ec6ffd5721f92c9a51 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 22 Jan 2024 19:12:13 +0100 Subject: [PATCH 88/95] MIPS64 R2/R6: Fix FP to integer conversions. Thanks to Peter Cawley. #1146 --- src/lj_asm_mips.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h index d4e40c91..5b83e34d 100644 --- a/src/lj_asm_mips.h +++ b/src/lj_asm_mips.h @@ -653,11 +653,11 @@ static void asm_conv(ASMState *as, IRIns *ir) rset_exclude(RSET_GPR, dest)); emit_fg(as, MIPSI_TRUNC_L_D, tmp, left); /* Delay slot. 
*/ #if !LJ_TARGET_MIPSR6 - emit_branch(as, MIPSI_BC1T, 0, 0, l_end); - emit_fgh(as, MIPSI_C_OLT_D, 0, left, tmp); + emit_branch(as, MIPSI_BC1T, 0, 0, l_end); + emit_fgh(as, MIPSI_C_OLT_D, 0, left, tmp); #else - emit_branch(as, MIPSI_BC1NEZ, 0, (left&31), l_end); - emit_fgh(as, MIPSI_CMP_LT_D, left, left, tmp); + emit_branch(as, MIPSI_BC1NEZ, 0, (tmp&31), l_end); + emit_fgh(as, MIPSI_CMP_LT_D, tmp, left, tmp); #endif emit_lsptr(as, MIPSI_LDC1, (tmp & 31), (void *)&as->J->k64[LJ_K64_2P63], @@ -670,11 +670,11 @@ static void asm_conv(ASMState *as, IRIns *ir) rset_exclude(RSET_GPR, dest)); emit_fg(as, MIPSI_TRUNC_L_S, tmp, left); /* Delay slot. */ #if !LJ_TARGET_MIPSR6 - emit_branch(as, MIPSI_BC1T, 0, 0, l_end); - emit_fgh(as, MIPSI_C_OLT_S, 0, left, tmp); + emit_branch(as, MIPSI_BC1T, 0, 0, l_end); + emit_fgh(as, MIPSI_C_OLT_S, 0, left, tmp); #else - emit_branch(as, MIPSI_BC1NEZ, 0, (left&31), l_end); - emit_fgh(as, MIPSI_CMP_LT_S, left, left, tmp); + emit_branch(as, MIPSI_BC1NEZ, 0, (tmp&31), l_end); + emit_fgh(as, MIPSI_CMP_LT_S, tmp, left, tmp); #endif emit_lsptr(as, MIPSI_LWC1, (tmp & 31), (void *)&as->J->k32[LJ_K32_2P63], @@ -690,8 +690,8 @@ static void asm_conv(ASMState *as, IRIns *ir) MIPSIns mi = irt_is64(ir->t) ? (st == IRT_NUM ? MIPSI_TRUNC_L_D : MIPSI_TRUNC_L_S) : (st == IRT_NUM ? MIPSI_TRUNC_W_D : MIPSI_TRUNC_W_S); - emit_tg(as, irt_is64(ir->t) ? MIPSI_DMFC1 : MIPSI_MFC1, dest, left); - emit_fg(as, mi, left, left); + emit_tg(as, irt_is64(ir->t) ? MIPSI_DMFC1 : MIPSI_MFC1, dest, tmp); + emit_fg(as, mi, tmp, left); #endif } } From 3ca0a80711ef53e2e788bca7b282f8ad7c927b59 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 22 Jan 2024 19:17:45 +0100 Subject: [PATCH 89/95] DynASM/x86: Add endbr instruction. Thanks to Dmitry Stogov. 
#1143 #1142 --- dynasm/dasm_x86.lua | 2 ++ dynasm/dynasm.lua | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dynasm/dasm_x86.lua b/dynasm/dasm_x86.lua index df70fed8..7c789f82 100644 --- a/dynasm/dasm_x86.lua +++ b/dynasm/dasm_x86.lua @@ -1151,6 +1151,8 @@ local map_op = { rep_0 = "F3", repe_0 = "F3", repz_0 = "F3", + endbr32_0 = "F30F1EFB", + endbr64_0 = "F30F1EFA", -- F4: *hlt cmc_0 = "F5", -- F6: test... mb,i; div... mb diff --git a/dynasm/dynasm.lua b/dynasm/dynasm.lua index 5be75f7f..0d15a872 100644 --- a/dynasm/dynasm.lua +++ b/dynasm/dynasm.lua @@ -75,7 +75,7 @@ local function wline(line, needindent) g_synclineno = g_synclineno + 1 end --- Write assembler line as a comment, if requestd. +-- Write assembler line as a comment, if requested. local function wcomment(aline) if g_opt.comment then wline(g_opt.comment..aline..g_opt.endcomment, true) From 85b4fed0b0353dd78c8c875c2f562d522a2b310f Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 23 Jan 2024 18:58:52 +0100 Subject: [PATCH 90/95] Fix unsinking of IR_FSTORE for NULL metatable. Reported by pwnhacker0x18. #1147 --- src/lj_snap.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/lj_snap.c b/src/lj_snap.c index b387dd76..4a773048 100644 --- a/src/lj_snap.c +++ b/src/lj_snap.c @@ -412,6 +412,7 @@ static TRef snap_replay_const(jit_State *J, IRIns *ir) case IR_KNUM: return lj_ir_k64(J, IR_KNUM, ir_knum(ir)); case IR_KINT64: return lj_ir_k64(J, IR_KINT64, ir_kint64(ir)); case IR_KPTR: return lj_ir_kptr(J, ir_kptr(ir)); /* Continuation. */ + case IR_KNULL: return lj_ir_knull(J, irt_type(ir->t)); default: lua_assert(0); return TREF_NIL; break; } } @@ -821,9 +822,13 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex, if (irk->o == IR_FREF) { switch (irk->op2) { case IRFL_TAB_META: - snap_restoreval(J, T, ex, snapno, rfilt, irs->op2, &tmp); - /* NOBARRIER: The table is new (marked white). 
*/ - setgcref(t->metatable, obj2gco(tabV(&tmp))); + if (T->ir[irs->op2].o == IR_KNULL) { + setgcrefnull(t->metatable); + } else { + snap_restoreval(J, T, ex, snapno, rfilt, irs->op2, &tmp); + /* NOBARRIER: The table is new (marked white). */ + setgcref(t->metatable, obj2gco(tabV(&tmp))); + } break; case IRFL_TAB_NOMM: /* Negative metamethod cache invalidated by lj_tab_set() below. */ From 343ce0edaf3906a62022936175b2f5410024cbfc Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 25 Jan 2024 13:23:48 +0100 Subject: [PATCH 91/95] Fix zero stripping in %g number formatting. Reported by pwnhacker0x18. #1149 --- src/lj_strfmt_num.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lj_strfmt_num.c b/src/lj_strfmt_num.c index 79ec0263..c6e776aa 100644 --- a/src/lj_strfmt_num.c +++ b/src/lj_strfmt_num.c @@ -454,7 +454,8 @@ static char *lj_strfmt_wfnum(SBuf *sb, SFormat sf, lua_Number n, char *p) prec--; if (!i) { if (ndlo == ndhi) { prec = 0; break; } - lj_strfmt_wuint9(tail, nd[++ndlo]); + ndlo = (ndlo + 1) & 0x3f; + lj_strfmt_wuint9(tail, nd[ndlo]); i = 9; } } From e6c0ade97c6b250a44e8f4b964024a22d913b860 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 26 Jan 2024 23:17:33 +0100 Subject: [PATCH 92/95] Fix documentation bug about '\z' string escape. --- doc/extensions.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/extensions.html b/doc/extensions.html index f8b45c28..04a9ae07 100644 --- a/doc/extensions.html +++ b/doc/extensions.html @@ -244,7 +244,7 @@ enabled:

      • goto and ::labels::.
      • -
      • Hex escapes '\x3F' and '\*' escape in strings.
      • +
      • Hex escapes '\x3F' and '\z' escape in strings.
      • load(string|reader [, chunkname [,mode [,env]]]).
      • loadstring() is an alias for load().
      • loadfile(filename [,mode [,env]]).
      • From 14987af80ab583514f19ef36d1023655324fc757 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 31 Jan 2024 14:29:23 +0100 Subject: [PATCH 93/95] Prevent include of luajit_rolling.h. Thanks to Peter Cawley. #1145 --- src/host/genversion.lua | 2 +- src/luajit_rolling.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/host/genversion.lua b/src/host/genversion.lua index 28f7206c..f0925160 100644 --- a/src/host/genversion.lua +++ b/src/host/genversion.lua @@ -29,7 +29,7 @@ local function file_write_mod(file, data) assert(fp:close()) end -local text = file_read(FILE_ROLLING_H) +local text = file_read(FILE_ROLLING_H):gsub("#error.-\n", "") local relver = file_read(FILE_RELVER_TXT):match("(%d+)") if relver then diff --git a/src/luajit_rolling.h b/src/luajit_rolling.h index 27368836..e7ff2c23 100644 --- a/src/luajit_rolling.h +++ b/src/luajit_rolling.h @@ -67,4 +67,5 @@ LUA_API int luaJIT_setmode(lua_State *L, int idx, int mode); /* Enforce (dynamic) linker error for version mismatches. Call from main. */ LUA_API void LUAJIT_VERSION_SYM(void); +#error "DO NOT USE luajit_rolling.h -- only include build-generated luajit.h" #endif From 9cdd5a9479d2265f42dfefc17d068174969bbcff Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 31 Jan 2024 14:32:04 +0100 Subject: [PATCH 94/95] Preserve keys with dynamic values in template tables when saving bytecode. Reported by Lyrthras. Fixed by Peter Cawley. #1155 --- src/lj_bcwrite.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lj_bcwrite.c b/src/lj_bcwrite.c index 6282f767..9820ad12 100644 --- a/src/lj_bcwrite.c +++ b/src/lj_bcwrite.c @@ -124,7 +124,7 @@ static void bcwrite_ktab(BCWriteCtx *ctx, const GCtab *t) MSize i, hmask = t->hmask; Node *node = noderef(t->node); for (i = 0; i <= hmask; i++) - nhash += !tvisnil(&node[i].val); + nhash += !tvisnil(&node[i].key); } /* Write number of array slots and hash slots. 
*/ bcwrite_uleb128(ctx, narray); @@ -139,7 +139,7 @@ static void bcwrite_ktab(BCWriteCtx *ctx, const GCtab *t) MSize i = nhash; Node *node = noderef(t->node) + t->hmask; for (;; node--) - if (!tvisnil(&node->val)) { + if (!tvisnil(&node->key)) { bcwrite_ktabk(ctx, &node->key, 0); bcwrite_ktabk(ctx, &node->val, 1); if (--i == 0) break; From defe61a56751a0db5f00ff3ab7b8f45436ba74c8 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 4 Feb 2024 16:34:30 +0100 Subject: [PATCH 95/95] Rework stack overflow handling. Reported by pwnhacker0x18. Fixed by Peter Cawley. #1152 --- src/lj_debug.c | 1 + src/lj_err.c | 22 +++++++++++++++++--- src/lj_err.h | 1 + src/lj_state.c | 56 +++++++++++++++++++++++++++++++++----------------- 4 files changed, 58 insertions(+), 22 deletions(-) diff --git a/src/lj_debug.c b/src/lj_debug.c index e6a8be54..bca1d7a5 100644 --- a/src/lj_debug.c +++ b/src/lj_debug.c @@ -63,6 +63,7 @@ static BCPos debug_framepc(lua_State *L, GCfunc *fn, cTValue *nextframe) if (cf == NULL || (char *)cframe_pc(cf) == (char *)cframe_L(cf)) return NO_BCPOS; ins = cframe_pc(cf); /* Only happens during error/hook handling. */ + if (!ins) return NO_BCPOS; } else { if (frame_islua(nextframe)) { ins = frame_pc(nextframe); diff --git a/src/lj_err.c b/src/lj_err.c index 4a2d6bbd..7afe1e29 100644 --- a/src/lj_err.c +++ b/src/lj_err.c @@ -488,7 +488,14 @@ LJ_NOINLINE void lj_err_mem(lua_State *L) { if (L->status == LUA_ERRERR+1) /* Don't touch the stack during lua_open. */ lj_vm_unwind_c(L->cframe, LUA_ERRMEM); - if (curr_funcisL(L)) L->top = curr_topL(L); + if (curr_funcisL(L)) { + L->top = curr_topL(L); + if (LJ_UNLIKELY(L->top > tvref(L->maxstack))) { + /* The current Lua frame violates the stack. Replace it with a dummy. 
*/ + L->top = L->base; + setframe_gc(L->base - 1, obj2gco(L)); + } + } setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRMEM)); lj_err_throw(L, LUA_ERRMEM); } @@ -551,9 +558,11 @@ LJ_NOINLINE void LJ_FASTCALL lj_err_run(lua_State *L) { ptrdiff_t ef = finderrfunc(L); if (ef) { - TValue *errfunc = restorestack(L, ef); - TValue *top = L->top; + TValue *errfunc, *top; + lj_state_checkstack(L, LUA_MINSTACK * 2); /* Might raise new error. */ lj_trace_abort(G(L)); + errfunc = restorestack(L, ef); + top = L->top; if (!tvisfunc(errfunc) || L->status == LUA_ERRERR) { setstrV(L, top-1, lj_err_str(L, LJ_ERR_ERRERR)); lj_err_throw(L, LUA_ERRERR); @@ -567,6 +576,13 @@ LJ_NOINLINE void LJ_FASTCALL lj_err_run(lua_State *L) lj_err_throw(L, LUA_ERRRUN); } +/* Stack overflow error. */ +void LJ_FASTCALL lj_err_stkov(lua_State *L) +{ + lj_debug_addloc(L, err2msg(LJ_ERR_STKOV), L->base-1, NULL); + lj_err_run(L); +} + /* Formatted runtime error message. */ LJ_NORET LJ_NOINLINE static void err_msgv(lua_State *L, ErrMsg em, ...) { diff --git a/src/lj_err.h b/src/lj_err.h index 321719a9..15040922 100644 --- a/src/lj_err.h +++ b/src/lj_err.h @@ -23,6 +23,7 @@ LJ_DATA const char *lj_err_allmsg; LJ_FUNC GCstr *lj_err_str(lua_State *L, ErrMsg em); LJ_FUNCA_NORET void LJ_FASTCALL lj_err_throw(lua_State *L, int errcode); LJ_FUNC_NORET void lj_err_mem(lua_State *L); +LJ_FUNC_NORET void LJ_FASTCALL lj_err_stkov(lua_State *L); LJ_FUNCA_NORET void LJ_FASTCALL lj_err_run(lua_State *L); LJ_FUNC_NORET void lj_err_msg(lua_State *L, ErrMsg em); LJ_FUNC_NORET void lj_err_lex(lua_State *L, GCstr *src, const char *tok, diff --git a/src/lj_state.c b/src/lj_state.c index c2f0b115..adedb66c 100644 --- a/src/lj_state.c +++ b/src/lj_state.c @@ -96,27 +96,45 @@ void lj_state_shrinkstack(lua_State *L, MSize used) /* Try to grow stack. */ void LJ_FASTCALL lj_state_growstack(lua_State *L, MSize need) { - MSize n; - if (L->stacksize >= LJ_STACK_MAXEX) { - /* 4. Throw 'error in error handling' when we are _over_ the limit. 
*/ - if (L->stacksize > LJ_STACK_MAXEX) - lj_err_throw(L, LUA_ERRERR); /* Does not invoke an error handler. */ - /* 1. We are _at_ the limit after the last growth. */ - if (L->status < LUA_ERRRUN) { /* 2. Throw 'stack overflow'. */ - L->status = LUA_ERRRUN; /* Prevent ending here again for pushed msg. */ - lj_err_msg(L, LJ_ERR_STKOV); /* May invoke an error handler. */ + MSize n = L->stacksize + need; + if (LJ_LIKELY(n < LJ_STACK_MAX)) { /* The stack can grow as requested. */ + if (n < 2 * L->stacksize) { /* Try to double the size. */ + n = 2 * L->stacksize; + if (n > LJ_STACK_MAX) + n = LJ_STACK_MAX; + } + resizestack(L, n); + } else { /* Request would overflow. Raise a stack overflow error. */ + if (curr_funcisL(L)) { + L->top = curr_topL(L); + if (L->top > tvref(L->maxstack)) { + /* The current Lua frame violates the stack, so replace it with a + ** dummy. This can happen when BC_IFUNCF is trying to grow the stack. + */ + L->top = L->base; + setframe_gc(L->base - 1, obj2gco(L)); + } + } + if (L->stacksize <= LJ_STACK_MAXEX) { + /* An error handler might want to inspect the stack overflow error, but + ** will need some stack space to run in. We give it a stack size beyond + ** the normal limit in order to do so, then rely on lj_state_relimitstack + ** calls during unwinding to bring us back to a conventional stack size. + ** The + 1 is space for the error message, and 2 * LUA_MINSTACK is for + ** the lj_state_checkstack() call in lj_err_run(). + */ + resizestack(L, LJ_STACK_MAX + 1 + 2 * LUA_MINSTACK); + lj_err_stkov(L); /* May invoke an error handler. */ + } else { + /* If we're here, then the stack overflow error handler is requesting + ** to grow the stack even further. We have no choice but to abort the + ** error handler. + */ + GCstr *em = lj_err_str(L, LJ_ERR_STKOV); /* Might OOM. */ + setstrV(L, L->top++, em); /* There is always space to push an error. */ + lj_err_throw(L, LUA_ERRERR); /* Does not invoke an error handler. */ } - /* 3. 
Add space (over the limit) for pushed message and error handler. */ } - n = L->stacksize + need; - if (n > LJ_STACK_MAX) { - n += 2*LUA_MINSTACK; - } else if (n < 2*L->stacksize) { - n = 2*L->stacksize; - if (n >= LJ_STACK_MAX) - n = LJ_STACK_MAX; - } - resizestack(L, n); } void LJ_FASTCALL lj_state_growstack1(lua_State *L)