diff --git a/doc/extensions.html b/doc/extensions.html
index eb591d1e..c1c9a808 100644
--- a/doc/extensions.html
+++ b/doc/extensions.html
@@ -160,13 +160,33 @@ passes any arguments after the error function to the function
which is called in a protected context.
-loadfile() etc. handle UTF-8 source code
+load*() handle UTF-8 source code
Non-ASCII characters are handled transparently by the Lua source code parser.
This allows the use of UTF-8 characters in identifiers and strings.
A UTF-8 BOM is skipped at the start of the source code.
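A minimal sketch of what this enables, assuming the source file is saved as UTF-8:

```lua
-- Non-ASCII bytes in identifiers and string contents pass through the parser as-is.
local π = 3.14159
local grüße = "héllo, wörld"
print(π, grüße)
```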
+load*() add a mode parameter
+
+As an extension from Lua 5.2, the functions loadstring(),
+loadfile() and (new) load() add an optional
+mode parameter.
+
+
+The default mode string is "bt", which allows loading of both
+source code and bytecode. Use "t" to allow only source code
+or "b" to allow only bytecode to be loaded.
+
+
+By default, the load* functions generate the native bytecode format.
+For cross-compilation purposes, add W to the mode string to
+force the 32 bit format and X to force the 64 bit format.
+Add both to force the opposite format. Note that non-native bytecode
+generated by load* cannot be run, but can still be passed
+to string.dump.
+
+
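A short sketch of the mode parameter, assuming a 64 bit (GC64) host; the chunk name is a placeholder:

```lua
-- "t" = source only, "b" = bytecode only, "bt" (default) = both.
local f = assert(load("return 1 + 2", "=chunk", "t"))
print(f())  --> 3

-- "W" forces the non-native 32 bit format on this host. The result is a
-- prototype object that cannot be run here, but can be passed to string.dump().
local proto = assert(load("return 1 + 2", "=chunk", "tW"))
local bc32 = string.dump(proto)
```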
tostring() etc. canonicalize NaN and ±Inf
All number-to-string conversions consistently convert non-finite numbers
@@ -186,26 +206,33 @@ works independently of the current locale and it supports hex floating-point
numbers (e.g. 0x1.5p-3).
-string.dump(f [,strip]) generates portable bytecode
+string.dump(f [,mode]) generates portable bytecode
An extra argument has been added to string.dump(). If set to
-true, 'stripped' bytecode without debug information is
-generated. This speeds up later bytecode loading and reduces memory
-usage. See also the
+true or to a string which contains the character s,
+'stripped' bytecode without debug information is generated. This speeds
+up later bytecode loading and reduces memory usage. See also the
-b command line option.
The generated bytecode is portable and can be loaded on any architecture
-that LuaJIT supports, independent of word size or endianess. However, the
-bytecode compatibility versions must match. Bytecode stays compatible
-for dot releases (x.y.0 → x.y.1), but may change with major or
-minor releases (2.0 → 2.1) or between any beta release. Foreign
-bytecode (e.g. from Lua 5.1) is incompatible and cannot be loaded.
+that LuaJIT supports. However, the bytecode compatibility versions must
+match. Bytecode only stays compatible within a major+minor version
+(x.y.aaa → x.y.bbb), except for development branches. Foreign bytecode
+(e.g. from Lua 5.1) is incompatible and cannot be loaded.
Note: LJ_GC64 mode requires a different frame layout, which implies
-a different, incompatible bytecode format for all 64 bit ports. This may be
-rectified in the future.
+a different, incompatible bytecode format between 32 bit and 64 bit ports.
+This may be rectified in the future. In the meantime, use the W
+and X modes of the load* functions
+for cross-compilation purposes.
+
+
+Due to VM hardening, bytecode is not deterministic. Add d to the
+mode string to dump it in a deterministic manner: identical source code
+always gives a byte-for-byte identical bytecode dump. This feature is
+mainly useful for reproducible builds.
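A sketch of the extended argument; the boolean form keeps working:

```lua
local function add(a, b) return a + b end
local plain    = string.dump(add)        -- with debug info
local stripped = string.dump(add, "s")   -- same effect as string.dump(add, true)
local det      = string.dump(add, "ds")  -- stripped + deterministic
-- With "d", separate runs over identical source produce byte-for-byte
-- identical dumps, as needed for reproducible builds.
```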
table.new(narray, nhash) allocates a pre-sized table
@@ -286,7 +313,7 @@ enabled:
- goto and ::labels::.
-- Hex escapes '\x3F' and '\*' escape in strings.
+- Hex escapes '\x3F' and '\z' escape in strings.
- load(string|reader [, chunkname [,mode [,env]]]).
- loadstring() is an alias for load().
- loadfile(filename [,mode [,env]]).
@@ -426,9 +453,7 @@ the toolchain used to compile LuaJIT:
on the C stack. The contents of the C++ exception object
pass through unmodified.
- Lua errors can be caught on the C++ side with catch(...).
-The corresponding Lua error message can be retrieved from the Lua stack.
-For MSVC for Windows 64 bit this requires compilation of your C++ code
-with /EHa.
+The corresponding Lua error message can be retrieved from the Lua stack.
- Throwing Lua errors across C++ frames is safe. C++ destructors
will be called.
diff --git a/doc/install.html b/doc/install.html
index be721031..b6481443 100644
--- a/doc/install.html
+++ b/doc/install.html
@@ -203,7 +203,7 @@ Or install Microsoft's Visual Studio (MSVC).
Building with MSVC
-Open a "Visual Studio Command Prompt" (either x86 or x64), cd to the
+Open a "Visual Studio Command Prompt" (x86, x64 or ARM64), cd to the
directory with the source code and run these commands:
@@ -214,6 +214,9 @@ msvcbuild
Check the msvcbuild.bat file for more options.
Then follow the installation instructions below.
+
+For an x64 to ARM64 cross-build, run this first: vcvarsall.bat x64_arm64
+
Building with MinGW or Cygwin
Open a command prompt window and make sure the MinGW or Cygwin programs
@@ -266,6 +269,7 @@ for any supported target:
Yes, you need a toolchain for both your host and your target!
Both host and target architectures must have the same pointer size.
E.g. if you want to cross-compile to a 32 bit target on a 64 bit host, you need to install the multilib development package (e.g. libc6-dev-i386 on Debian/Ubuntu) and build a 32 bit host part (HOST_CC="gcc -m32").
+On some distro versions, multilib conflicts with cross-compilers. The workaround is to install the x86 cross-compiler package gcc-i686-linux-gnu and use it to build the host part (HOST_CC=i686-linux-gnu-gcc).
64 bit targets always require compilation on a 64 bit host.
diff --git a/doc/running.html b/doc/running.html
index 3afc1b56..142b810f 100644
--- a/doc/running.html
+++ b/doc/running.html
@@ -106,6 +106,9 @@ are accepted:
-l — Only list bytecode.
-s — Strip debug info (this is the default).
-g — Keep debug info.
+-W — Generate 32 bit (non-GC64) bytecode.
+-X — Generate 64 bit (GC64) bytecode.
+-d — Generate bytecode in a deterministic manner.
-n name — Set module name (default: auto-detect from input name)
-t type — Set output file type (default: auto-detect from output name).
-a arch — Override architecture for object files (default: native).
@@ -120,7 +123,8 @@ file name:
- c — C source file, exported bytecode data.
-- h — C header file, static bytecode data.
+- cc — C++ source file, exported bytecode data.
+- h — C/C++ header file, static bytecode data.
- obj or o — Object file, exported bytecode data
(OS- and architecture-specific).
- raw or any other extension — Raw bytecode file (portable).
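The new flags map onto the load*/string.dump mode strings above. The -b machinery lives in jit/bcsave.lua (patched below), so the same functionality can be driven from Lua; a sketch with placeholder file names:

```lua
-- Equivalent of: luajit -b -W -d input.lua out32.raw
local bcsave = require("jit.bcsave")
bcsave.start("-W", "-d", "input.lua", "out32.raw")
```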
diff --git a/dynasm/dasm_arm64.lua b/dynasm/dasm_arm64.lua
index e69f8ef3..05ea3e22 100644
--- a/dynasm/dasm_arm64.lua
+++ b/dynasm/dasm_arm64.lua
@@ -549,7 +549,7 @@ end
local function parse_load_pair(params, nparams, n, op)
if params[n+2] then werror("too many operands") end
local pn, p2 = params[n], params[n+1]
- local scale = shr(op, 30) == 0 and 2 or 3
+ local scale = 2 + shr(op, 31 - band(shr(op, 26), 1))
local p1, wb = match(pn, "^%[%s*(.-)%s*%](!?)$")
if not p1 then
if not p2 then
@@ -806,8 +806,8 @@ map_op = {
["ldrsw_*"] = "98000000DxB|b8800000DxL",
-- NOTE: ldur etc. are handled by ldr et al.
- ["stp_*"] = "28000000DAwP|a8000000DAxP|2c000000DAsP|6c000000DAdP",
- ["ldp_*"] = "28400000DAwP|a8400000DAxP|2c400000DAsP|6c400000DAdP",
+ ["stp_*"] = "28000000DAwP|a8000000DAxP|2c000000DAsP|6c000000DAdP|ac000000DAqP",
+ ["ldp_*"] = "28400000DAwP|a8400000DAxP|2c400000DAsP|6c400000DAdP|ac400000DAqP",
["ldpsw_*"] = "68400000DAxP",
-- Branches.
@@ -942,7 +942,7 @@ local function parse_template(params, template, nparams, pos)
werror("bad register type")
end
parse_reg_type = false
- elseif p == "x" or p == "w" or p == "d" or p == "s" then
+ elseif p == "x" or p == "w" or p == "d" or p == "s" or p == "q" then
if parse_reg_type ~= p then
werror("register size mismatch")
end
diff --git a/dynasm/dasm_x86.lua b/dynasm/dasm_x86.lua
index 787163c0..7c789f82 100644
--- a/dynasm/dasm_x86.lua
+++ b/dynasm/dasm_x86.lua
@@ -627,7 +627,11 @@ local function wputmrmsib(t, imark, s, vsreg, psz, sk)
werror("NYI: rip-relative displacement followed by immediate")
end
-- The previous byte in the action buffer cannot be 0xe9 or 0x80-0x8f.
- wputlabel("REL_", disp[1], 2)
+ if disp[2] == "iPJ" then
+ waction("REL_A", disp[1])
+ else
+ wputlabel("REL_", disp[1], 2)
+ end
else
wputdarg(disp)
end
@@ -744,9 +748,9 @@ local function dispexpr(expr)
return imm*map_opsizenum[ops]
end
local mode, iexpr = immexpr(dispt)
- if mode == "iJ" then
+ if mode == "iJ" or mode == "iPJ" then
if c == "-" then werror("cannot invert label reference") end
- return { iexpr }
+ return { iexpr, mode }
end
return expr -- Need to return original signed expression.
end
@@ -1147,6 +1151,8 @@ local map_op = {
rep_0 = "F3",
repe_0 = "F3",
repz_0 = "F3",
+ endbr32_0 = "F30F1EFB",
+ endbr64_0 = "F30F1EFA",
-- F4: *hlt
cmc_0 = "F5",
-- F6: test... mb,i; div... mb
diff --git a/dynasm/dynasm.lua b/dynasm/dynasm.lua
index 5be75f7f..0d15a872 100644
--- a/dynasm/dynasm.lua
+++ b/dynasm/dynasm.lua
@@ -75,7 +75,7 @@ local function wline(line, needindent)
g_synclineno = g_synclineno + 1
end
--- Write assembler line as a comment, if requestd.
+-- Write assembler line as a comment, if requested.
local function wcomment(aline)
if g_opt.comment then
wline(g_opt.comment..aline..g_opt.endcomment, true)
diff --git a/src/Makefile b/src/Makefile
index 8c47a16b..224d21e7 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -233,7 +233,7 @@ TARGET_ALDFLAGS= $(LDOPTIONS) $(TARGET_XLDFLAGS) $(TARGET_FLAGS) $(TARGET_LDFLAG
TARGET_ASHLDFLAGS= $(LDOPTIONS) $(TARGET_XSHLDFLAGS) $(TARGET_FLAGS) $(TARGET_SHLDFLAGS)
TARGET_ALIBS= $(TARGET_XLIBS) $(LIBS) $(TARGET_LIBS)
-TARGET_TESTARCH=$(shell $(TARGET_CC) $(TARGET_TCFLAGS) -E lj_arch.h -dM)
+TARGET_TESTARCH:=$(shell $(TARGET_CC) $(TARGET_TCFLAGS) -E lj_arch.h -dM)
ifneq (,$(findstring LJ_TARGET_X64 ,$(TARGET_TESTARCH)))
TARGET_LJARCH= x64
else
@@ -475,7 +475,11 @@ DASM_FLAGS= $(DASM_XFLAGS) $(DASM_AFLAGS)
DASM_DASC= vm_$(DASM_ARCH).dasc
GIT= git
-GIT_RELVER= [ -e ../.git ] && $(GIT) show -s --format=%ct >luajit_relver.txt 2>/dev/null || cat ../.relver >luajit_relver.txt 2>/dev/null || :
+ifeq (Windows,$(HOST_SYS)$(HOST_MSYS))
+ GIT_RELVER= if exist ..\.git ( $(GIT) show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt )
+else
+ GIT_RELVER= [ -e ../.git ] && $(GIT) show -s --format=%ct >luajit_relver.txt 2>/dev/null || cat ../.relver >luajit_relver.txt 2>/dev/null || :
+endif
GIT_DEP= $(wildcard ../.git/HEAD ../.git/refs/heads/*)
BUILDVM_O= host/buildvm.o host/buildvm_asm.o host/buildvm_peobj.o \
diff --git a/src/host/buildvm_peobj.c b/src/host/buildvm_peobj.c
index 5bca6df8..8f04c496 100644
--- a/src/host/buildvm_peobj.c
+++ b/src/host/buildvm_peobj.c
@@ -9,7 +9,7 @@
#include "buildvm.h"
#include "lj_bc.h"
-#if LJ_TARGET_X86ORX64
+#if LJ_TARGET_WINDOWS || LJ_TARGET_CYGWIN
/* Context for PE object emitter. */
static char *strtab;
@@ -93,6 +93,17 @@ typedef struct PEsymaux {
#define PEOBJ_RELOC_ADDR32NB 0x03
#define PEOBJ_RELOC_OFS 0
#define PEOBJ_TEXT_FLAGS 0x60500020 /* 60=r+x, 50=align16, 20=code. */
+#define PEOBJ_PDATA_NRELOC 6
+#define PEOBJ_XDATA_SIZE (8*2+4+6*2)
+#elif LJ_TARGET_ARM64
+#define PEOBJ_ARCH_TARGET 0xaa64
+#define PEOBJ_RELOC_REL32 0x03 /* MS: BRANCH26. */
+#define PEOBJ_RELOC_DIR32 0x01
+#define PEOBJ_RELOC_ADDR32NB 0x02
+#define PEOBJ_RELOC_OFS (-4)
+#define PEOBJ_TEXT_FLAGS 0x60500020 /* 60=r+x, 50=align16, 20=code. */
+#define PEOBJ_PDATA_NRELOC 4
+#define PEOBJ_XDATA_SIZE (4+24+4 +4+8)
#endif
/* Section numbers (0-based). */
@@ -100,7 +111,7 @@ enum {
PEOBJ_SECT_ABS = -2,
PEOBJ_SECT_UNDEF = -1,
PEOBJ_SECT_TEXT,
-#if LJ_TARGET_X64
+#ifdef PEOBJ_PDATA_NRELOC
PEOBJ_SECT_PDATA,
PEOBJ_SECT_XDATA,
#elif LJ_TARGET_X86
@@ -175,6 +186,9 @@ void emit_peobj(BuildCtx *ctx)
uint32_t sofs;
int i, nrsym;
union { uint8_t b; uint32_t u; } host_endian;
+#ifdef PEOBJ_PDATA_NRELOC
+ uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs;
+#endif
sofs = sizeof(PEheader) + PEOBJ_NSECTIONS*sizeof(PEsection);
@@ -188,18 +202,18 @@ void emit_peobj(BuildCtx *ctx)
/* Flags: 60 = read+execute, 50 = align16, 20 = code. */
pesect[PEOBJ_SECT_TEXT].flags = PEOBJ_TEXT_FLAGS;
-#if LJ_TARGET_X64
+#ifdef PEOBJ_PDATA_NRELOC
memcpy(pesect[PEOBJ_SECT_PDATA].name, ".pdata", sizeof(".pdata")-1);
pesect[PEOBJ_SECT_PDATA].ofs = sofs;
- sofs += (pesect[PEOBJ_SECT_PDATA].size = 6*4);
+ sofs += (pesect[PEOBJ_SECT_PDATA].size = PEOBJ_PDATA_NRELOC*4);
pesect[PEOBJ_SECT_PDATA].relocofs = sofs;
- sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = 6) * PEOBJ_RELOC_SIZE;
+ sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = PEOBJ_PDATA_NRELOC) * PEOBJ_RELOC_SIZE;
/* Flags: 40 = read, 30 = align4, 40 = initialized data. */
pesect[PEOBJ_SECT_PDATA].flags = 0x40300040;
memcpy(pesect[PEOBJ_SECT_XDATA].name, ".xdata", sizeof(".xdata")-1);
pesect[PEOBJ_SECT_XDATA].ofs = sofs;
- sofs += (pesect[PEOBJ_SECT_XDATA].size = 8*2+4+6*2); /* See below. */
+ sofs += (pesect[PEOBJ_SECT_XDATA].size = PEOBJ_XDATA_SIZE); /* See below. */
pesect[PEOBJ_SECT_XDATA].relocofs = sofs;
sofs += (pesect[PEOBJ_SECT_XDATA].nreloc = 1) * PEOBJ_RELOC_SIZE;
/* Flags: 40 = read, 30 = align4, 40 = initialized data. */
@@ -234,7 +248,7 @@ void emit_peobj(BuildCtx *ctx)
*/
nrsym = ctx->nrelocsym;
pehdr.nsyms = 1+PEOBJ_NSECTIONS*2 + 1+ctx->nsym + nrsym;
-#if LJ_TARGET_X64
+#ifdef PEOBJ_PDATA_NRELOC
pehdr.nsyms += 1; /* Symbol for lj_err_unwind_win. */
#endif
@@ -259,7 +273,6 @@ void emit_peobj(BuildCtx *ctx)
#if LJ_TARGET_X64
{ /* Write .pdata section. */
- uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs;
uint32_t pdata[3]; /* Start of .text, end of .text and .xdata. */
PEreloc reloc;
pdata[0] = 0; pdata[1] = fcofs; pdata[2] = 0;
@@ -308,6 +321,87 @@ void emit_peobj(BuildCtx *ctx)
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
}
+#elif LJ_TARGET_ARM64
+ /* https://learn.microsoft.com/en-us/cpp/build/arm64-exception-handling */
+ { /* Write .pdata section. */
+ uint32_t pdata[4];
+ PEreloc reloc;
+ pdata[0] = 0;
+ pdata[1] = 0;
+ pdata[2] = fcofs;
+ pdata[3] = 4+24+4;
+ owrite(ctx, &pdata, sizeof(pdata));
+ /* Start of .text and start of .xdata. */
+ reloc.vaddr = 0; reloc.symidx = 1+2+nrsym+2+2+1;
+ reloc.type = PEOBJ_RELOC_ADDR32NB;
+ owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
+ reloc.vaddr = 4; reloc.symidx = 1+2+nrsym+2;
+ reloc.type = PEOBJ_RELOC_ADDR32NB;
+ owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
+ /* Start of vm_ffi_call and start of second part of .xdata. */
+ reloc.vaddr = 8; reloc.symidx = 1+2+nrsym+2+2+1;
+ reloc.type = PEOBJ_RELOC_ADDR32NB;
+ owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
+ reloc.vaddr = 12; reloc.symidx = 1+2+nrsym+2;
+ reloc.type = PEOBJ_RELOC_ADDR32NB;
+ owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
+ }
+ { /* Write .xdata section. */
+ uint32_t u32;
+ uint8_t *p, uwc[24];
+ PEreloc reloc;
+
+#define CBE16(x) (*p = ((x) >> 8) & 0xff, p[1] = (x) & 0xff, p += 2)
+#define CALLOC_S(s) (*p++ = ((s) >> 4)) /* s < 512 */
+#define CSAVE_FPLR(o) (*p++ = 0x40 | ((o) >> 3)) /* o <= 504 */
+#define CSAVE_REGP(r,o) CBE16(0xc800 | (((r) - 19) << 6) | ((o) >> 3))
+#define CSAVE_REGS(r1,r2,o1) do { \
+ int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_REGP(r, o); \
+} while (0)
+#define CSAVE_REGPX(r,o) CBE16(0xcc00 | (((r) - 19) << 6) | (~(o) >> 3))
+#define CSAVE_FREGP(r,o) CBE16(0xd800 | (((r) - 8) << 6) | ((o) >> 3))
+#define CSAVE_FREGS(r1,r2,o1) do { \
+ int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_FREGP(r, o); \
+} while (0)
+#define CADD_FP(s) CBE16(0xe200 | ((s) >> 3)) /* s < 8*256 */
+#define CODE_NOP 0xe3
+#define CODE_END 0xe4
+#define CEND_ALIGN do { \
+ *p++ = CODE_END; \
+ while ((p - uwc) & 3) *p++ = CODE_NOP; \
+} while (0)
+
+ /* Unwind codes for .text section with handler. */
+ p = uwc;
+ CADD_FP(192); /* +2 */
+ CSAVE_REGS(19, 28, 176); /* +5*2 */
+ CSAVE_FREGS(8, 15, 96); /* +4*2 */
+ CSAVE_FPLR(192); /* +1 */
+ CALLOC_S(208); /* +1 */
+ CEND_ALIGN; /* +1 +1 -> 24 */
+
+ u32 = ((24u >> 2) << 27) | (1u << 20) | (fcofs >> 2);
+ owrite(ctx, &u32, 4);
+ owrite(ctx, &uwc, 24);
+
+ u32 = 0; /* Handler RVA to be relocated at 4 + 24. */
+ owrite(ctx, &u32, 4);
+
+ /* Unwind codes for vm_ffi_call without handler. */
+ p = uwc;
+ CADD_FP(16); /* +2 */
+ CSAVE_FPLR(16); /* +1 */
+ CSAVE_REGPX(19, -32); /* +2 */
+ CEND_ALIGN; /* +1 +2 -> 8 */
+
+ u32 = ((8u >> 2) << 27) | (((uint32_t)ctx->codesz - fcofs) >> 2);
+ owrite(ctx, &u32, 4);
+ owrite(ctx, &uwc, 8);
+
+ reloc.vaddr = 4 + 24; reloc.symidx = 1+2+nrsym+2+2;
+ reloc.type = PEOBJ_RELOC_ADDR32NB;
+ owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
+ }
#elif LJ_TARGET_X86
/* Write .sxdata section. */
for (i = 0; i < nrsym; i++) {
@@ -339,7 +433,7 @@ void emit_peobj(BuildCtx *ctx)
emit_peobj_sym(ctx, ctx->relocsym[i], 0,
PEOBJ_SECT_UNDEF, PEOBJ_TYPE_FUNC, PEOBJ_SCL_EXTERN);
-#if LJ_TARGET_X64
+#ifdef PEOBJ_PDATA_NRELOC
emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_PDATA);
emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_XDATA);
emit_peobj_sym(ctx, "lj_err_unwind_win", 0,
diff --git a/src/host/genlibbc.lua b/src/host/genlibbc.lua
index 3621c3f5..e697fceb 100644
--- a/src/host/genlibbc.lua
+++ b/src/host/genlibbc.lua
@@ -138,65 +138,73 @@ local function fixup_dump(dump, fixup)
return { dump = ndump, startbc = startbc, sizebc = sizebc }
end
-local function find_defs(src)
+local function find_defs(src, mode)
local defs = {}
for name, code in string.gmatch(src, "LJLIB_LUA%(([^)]*)%)%s*/%*(.-)%*/") do
- local env = {}
local tcode, fixup = transform_lua(code)
- local func = assert(load(tcode, "", nil, env))()
- defs[name] = fixup_dump(string.dump(func, true), fixup)
+ local func = assert(load(tcode, "", mode))
+ defs[name] = fixup_dump(string.dump(func, mode), fixup)
defs[#defs+1] = name
end
return defs
end
-local function gen_header(defs)
+local function gen_header(defs32, defs64)
local t = {}
local function w(x) t[#t+1] = x end
w("/* This is a generated file. DO NOT EDIT! */\n\n")
w("static const int libbc_endian = ") w(isbe and 1 or 0) w(";\n\n")
- local s, sb = "", ""
- for i,name in ipairs(defs) do
- local d = defs[name]
- s = s .. d.dump
- sb = sb .. string.char(i) .. ("\0"):rep(d.startbc - 1)
- .. (isbe and "\0\0\0\255" or "\255\0\0\0"):rep(d.sizebc)
- .. ("\0"):rep(#d.dump - d.startbc - d.sizebc*4)
- end
- w("static const uint8_t libbc_code[] = {\n")
- local n = 0
- for i=1,#s do
- local x = string.byte(s, i)
- local xb = string.byte(sb, i)
- if xb == 255 then
- local name = BCN[x]
- local m = #name + 4
- if n + m > 78 then n = 0; w("\n") end
- n = n + m
- w("BC_"); w(name)
- else
- local m = x < 10 and 2 or (x < 100 and 3 or 4)
- if xb == 0 then
- if n + m > 78 then n = 0; w("\n") end
- else
- local name = defs[xb]:gsub("_", ".")
- if n ~= 0 then w("\n") end
- w("/* "); w(name); w(" */ ")
- n = #name + 7
- end
- n = n + m
- w(x)
+ for j,defs in ipairs{defs64, defs32} do
+ local s, sb = "", ""
+ for i,name in ipairs(defs) do
+ local d = defs[name]
+ s = s .. d.dump
+ sb = sb .. string.char(i) .. ("\0"):rep(d.startbc - 1)
+ .. (isbe and "\0\0\0\255" or "\255\0\0\0"):rep(d.sizebc)
+ .. ("\0"):rep(#d.dump - d.startbc - d.sizebc*4)
+ end
+ if j == 1 then
+ w("static const uint8_t libbc_code[] = {\n#if LJ_FR2\n")
+ else
+ w("\n#else\n")
+ end
+ local n = 0
+ for i=1,#s do
+ local x = string.byte(s, i)
+ local xb = string.byte(sb, i)
+ if xb == 255 then
+ local name = BCN[x]
+ local m = #name + 4
+ if n + m > 78 then n = 0; w("\n") end
+ n = n + m
+ w("BC_"); w(name)
+ else
+ local m = x < 10 and 2 or (x < 100 and 3 or 4)
+ if xb == 0 then
+ if n + m > 78 then n = 0; w("\n") end
+ else
+ local name = defs[xb]:gsub("_", ".")
+ if n ~= 0 then w("\n") end
+ w("/* "); w(name); w(" */ ")
+ n = #name + 7
+ end
+ n = n + m
+ w(x)
+ end
+ w(",")
end
- w(",")
end
- w("\n0\n};\n\n")
+ w("\n#endif\n0\n};\n\n")
w("static const struct { const char *name; int ofs; } libbc_map[] = {\n")
- local m = 0
- for _,name in ipairs(defs) do
- w('{"'); w(name); w('",'); w(m) w('},\n')
- m = m + #defs[name].dump
+ local m32, m64 = 0, 0
+ for i,name in ipairs(defs32) do
+ assert(name == defs64[i])
+ w('{"'); w(name); w('",'); w(m32) w('},\n')
+ m32 = m32 + #defs32[name].dump
+ m64 = m64 + #defs64[name].dump
+ assert(m32 == m64)
end
- w("{NULL,"); w(m); w("}\n};\n\n")
+ w("{NULL,"); w(m32); w("}\n};\n\n")
return table.concat(t)
end
@@ -219,7 +227,8 @@ end
local outfile = parse_arg(arg)
local src = read_files(arg)
-local defs = find_defs(src)
-local hdr = gen_header(defs)
+local defs32 = find_defs(src, "Wdts")
+local defs64 = find_defs(src, "Xdts")
+local hdr = gen_header(defs32, defs64)
write_file(outfile, hdr)
diff --git a/src/host/genversion.lua b/src/host/genversion.lua
index 42b5e6fe..f0925160 100644
--- a/src/host/genversion.lua
+++ b/src/host/genversion.lua
@@ -5,9 +5,10 @@
-- Released under the MIT license. See Copyright Notice in luajit.h
----------------------------------------------------------------------------
-local FILE_ROLLING_H = "luajit_rolling.h"
-local FILE_RELVER_TXT = "luajit_relver.txt"
-local FILE_LUAJIT_H = "luajit.h"
+local arg = {...}
+local FILE_ROLLING_H = arg[1] or "luajit_rolling.h"
+local FILE_RELVER_TXT = arg[2] or "luajit_relver.txt"
+local FILE_LUAJIT_H = arg[3] or "luajit.h"
local function file_read(file)
local fp = assert(io.open(file, "rb"), "run from the wrong directory")
@@ -28,7 +29,7 @@ local function file_write_mod(file, data)
assert(fp:close())
end
-local text = file_read(FILE_ROLLING_H)
+local text = file_read(FILE_ROLLING_H):gsub("#error.-\n", "")
local relver = file_read(FILE_RELVER_TXT):match("(%d+)")
if relver then
diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua
index 74699f3d..131bf39b 100644
--- a/src/jit/bcsave.lua
+++ b/src/jit/bcsave.lua
@@ -29,6 +29,9 @@ Save LuaJIT bytecode: luajit -b[options] input output
-l Only list bytecode.
-s Strip debug info (default).
-g Keep debug info.
+ -W Generate 32 bit (non-GC64) bytecode.
+ -X Generate 64 bit (GC64) bytecode.
+  -d        Generate bytecode in a deterministic manner.
-n name Set module name (default: auto-detect from input name).
-t type Set output file type (default: auto-detect from output name).
-a arch Override architecture for object files (default: native).
@@ -38,7 +41,7 @@ Save LuaJIT bytecode: luajit -b[options] input output
-- Stop handling options.
- Use stdin as input and/or stdout as output.
-File types: c h obj o raw (default)
+File types: c cc h obj o raw (default)
]]
os.exit(1)
end
@@ -51,8 +54,9 @@ local function check(ok, ...)
end
local function readfile(ctx, input)
- if type(input) == "function" then return input end
- if ctx.filename then
+ if ctx.string then
+ return check(loadstring(input, nil, ctx.mode))
+ elseif ctx.filename then
local data
if input == "-" then
data = io.stdin:read("*a")
@@ -61,10 +65,10 @@ local function readfile(ctx, input)
data = assert(fp:read("*a"))
assert(fp:close())
end
- return check(load(data, ctx.filename))
+ return check(load(data, ctx.filename, ctx.mode))
else
if input == "-" then input = nil end
- return check(loadfile(input))
+ return check(loadfile(input, ctx.mode))
end
end
@@ -81,7 +85,7 @@ end
------------------------------------------------------------------------------
local map_type = {
- raw = "raw", c = "c", h = "h", o = "obj", obj = "obj",
+ raw = "raw", c = "c", cc = "c", h = "h", o = "obj", obj = "obj",
}
local map_arch = {
@@ -624,7 +628,7 @@ end
local function bcsave(ctx, input, output)
local f = readfile(ctx, input)
- local s = string.dump(f, ctx.strip)
+ local s = string.dump(f, ctx.mode)
local t = ctx.type
if not t then
t = detecttype(output)
@@ -647,9 +651,11 @@ local function docmd(...)
local n = 1
local list = false
local ctx = {
- strip = true, arch = jit.arch, os = jit.os:lower(),
- type = false, modname = false,
+ mode = "bt", arch = jit.arch, os = jit.os:lower(),
+ type = false, modname = false, string = false,
}
+ local strip = "s"
+ local gc64 = ""
while n <= #arg do
local a = arg[n]
if type(a) == "string" and a:sub(1, 1) == "-" and a ~= "-" then
@@ -660,14 +666,18 @@ local function docmd(...)
if opt == "l" then
list = true
elseif opt == "s" then
- ctx.strip = true
+ strip = "s"
elseif opt == "g" then
- ctx.strip = false
+ strip = ""
+ elseif opt == "W" or opt == "X" then
+ gc64 = opt
+ elseif opt == "d" then
+ ctx.mode = ctx.mode .. opt
else
if arg[n] == nil or m ~= #a then usage() end
if opt == "e" then
if n ~= 1 then usage() end
- arg[1] = check(loadstring(arg[1]))
+ ctx.string = true
elseif opt == "n" then
ctx.modname = checkmodname(tremove(arg, n))
elseif opt == "t" then
@@ -687,6 +697,7 @@ local function docmd(...)
n = n + 1
end
end
+ ctx.mode = ctx.mode .. strip .. gc64
if list then
if #arg == 0 or #arg > 2 then usage() end
bclist(ctx, arg[1], arg[2] or "-")
diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua
index b10e2fb1..84677666 100644
--- a/src/jit/dis_arm64.lua
+++ b/src/jit/dis_arm64.lua
@@ -107,24 +107,20 @@ local map_logsr = { -- Logical, shifted register.
[0] = {
shift = 29, mask = 3,
[0] = {
- shift = 21, mask = 7,
- [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
- "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
+ shift = 21, mask = 1,
+ [0] = "andDNMSg", "bicDNMSg"
},
{
- shift = 21, mask = 7,
- [0] ="orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
- "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
+ shift = 21, mask = 1,
+ [0] = "orr|movDN0MSg", "orn|mvnDN0MSg"
},
{
- shift = 21, mask = 7,
- [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
- "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
+ shift = 21, mask = 1,
+ [0] = "eorDNMSg", "eonDNMSg"
},
{
- shift = 21, mask = 7,
- [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
- "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
+ shift = 21, mask = 1,
+ [0] = "ands|tstD0NMSg", "bicsDNMSg"
}
},
false -- unallocated
@@ -132,24 +128,20 @@ local map_logsr = { -- Logical, shifted register.
{
shift = 29, mask = 3,
[0] = {
- shift = 21, mask = 7,
- [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
- "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
+ shift = 21, mask = 1,
+ [0] = "andDNMSg", "bicDNMSg"
},
{
- shift = 21, mask = 7,
- [0] = "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
- "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
+ shift = 21, mask = 1,
+ [0] = "orr|movDN0MSg", "orn|mvnDN0MSg"
},
{
- shift = 21, mask = 7,
- [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
- "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
+ shift = 21, mask = 1,
+ [0] = "eorDNMSg", "eonDNMSg"
},
{
- shift = 21, mask = 7,
- [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
- "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
+ shift = 21, mask = 1,
+ [0] = "ands|tstD0NMSg", "bicsDNMSg"
}
}
}
@@ -735,7 +727,7 @@ local map_cond = {
"hi", "ls", "ge", "lt", "gt", "le", "al",
}
-local map_shift = { [0] = "lsl", "lsr", "asr", }
+local map_shift = { [0] = "lsl", "lsr", "asr", "ror"}
local map_extend = {
[0] = "uxtb", "uxth", "uxtw", "uxtx", "sxtb", "sxth", "sxtw", "sxtx",
@@ -956,7 +948,7 @@ local function disass_ins(ctx)
elseif p == "U" then
local rn = map_regs.x[band(rshift(op, 5), 31)]
local sz = band(rshift(op, 30), 3)
- local imm12 = lshift(arshift(lshift(op, 10), 20), sz)
+ local imm12 = lshift(rshift(lshift(op, 10), 20), sz)
if imm12 ~= 0 then
x = "["..rn..", #"..imm12.."]"
else
@@ -993,8 +985,7 @@ local function disass_ins(ctx)
x = x.."]"
end
elseif p == "P" then
- local opcv, sh = rshift(op, 26), 2
- if opcv >= 0x2a then sh = 4 elseif opcv >= 0x1b then sh = 3 end
+ local sh = 2 + rshift(op, 31 - band(rshift(op, 26), 1))
local imm7 = lshift(arshift(lshift(op, 10), 25), sh)
local rn = map_regs.x[band(rshift(op, 5), 31)]
local ind = band(rshift(op, 23), 3)
diff --git a/src/lib_base.c b/src/lib_base.c
index a0712674..2b0ff151 100644
--- a/src/lib_base.c
+++ b/src/lib_base.c
@@ -360,7 +360,11 @@ LJLIB_ASM_(xpcall) LJLIB_REC(.)
static int load_aux(lua_State *L, int status, int envarg)
{
if (status == LUA_OK) {
- if (tvistab(L->base+envarg-1)) {
+ /*
+ ** Set environment table for top-level function.
+ ** Don't do this for non-native bytecode, which returns a prototype.
+ */
+ if (tvistab(L->base+envarg-1) && tvisfunc(L->top-1)) {
GCfunc *fn = funcV(L->top-1);
GCtab *t = tabV(L->base+envarg-1);
setgcref(fn->c.env, obj2gco(t));
@@ -697,7 +701,10 @@ static int ffh_resume(lua_State *L, lua_State *co, int wrap)
setstrV(L, L->base-LJ_FR2, lj_err_str(L, em));
return FFH_RES(2);
}
- lj_state_growstack(co, (MSize)(L->top - L->base));
+ if (lj_state_cpgrowstack(co, (MSize)(L->top - L->base)) != LUA_OK) {
+ cTValue *msg = --co->top;
+ lj_err_callermsg(L, strVdata(msg));
+ }
return FFH_RETRY;
}
diff --git a/src/lib_ffi.c b/src/lib_ffi.c
index 6dee2e74..ba783173 100644
--- a/src/lib_ffi.c
+++ b/src/lib_ffi.c
@@ -746,7 +746,7 @@ LJLIB_CF(ffi_abi) LJLIB_REC(.)
"\003win"
#endif
#if LJ_ABI_PAUTH
- "\007pauth"
+ "\005pauth"
#endif
#if LJ_TARGET_UWP
"\003uwp"
diff --git a/src/lib_jit.c b/src/lib_jit.c
index 34b35e66..6d159474 100644
--- a/src/lib_jit.c
+++ b/src/lib_jit.c
@@ -161,24 +161,6 @@ LJLIB_PUSH(top-2) LJLIB_SET(version)
/* -- Reflection API for Lua functions ------------------------------------ */
-/* Return prototype of first argument (Lua function or prototype object) */
-static GCproto *check_Lproto(lua_State *L, int nolua)
-{
- TValue *o = L->base;
- if (L->top > o) {
- if (tvisproto(o)) {
- return protoV(o);
- } else if (tvisfunc(o)) {
- if (isluafunc(funcV(o)))
- return funcproto(funcV(o));
- else if (nolua)
- return NULL;
- }
- }
- lj_err_argt(L, 1, LUA_TFUNCTION);
- return NULL; /* unreachable */
-}
-
static void setintfield(lua_State *L, GCtab *t, const char *name, int32_t val)
{
setintV(lj_tab_setstr(L, t, lj_str_newz(L, name)), val);
@@ -187,7 +169,7 @@ static void setintfield(lua_State *L, GCtab *t, const char *name, int32_t val)
/* local info = jit.util.funcinfo(func [,pc]) */
LJLIB_CF(jit_util_funcinfo)
{
- GCproto *pt = check_Lproto(L, 1);
+ GCproto *pt = lj_lib_checkLproto(L, 1, 1);
if (pt) {
BCPos pc = (BCPos)lj_lib_optint(L, 2, 0);
GCtab *t;
@@ -229,7 +211,7 @@ LJLIB_CF(jit_util_funcinfo)
/* local ins, m = jit.util.funcbc(func, pc) */
LJLIB_CF(jit_util_funcbc)
{
- GCproto *pt = check_Lproto(L, 0);
+ GCproto *pt = lj_lib_checkLproto(L, 1, 0);
BCPos pc = (BCPos)lj_lib_checkint(L, 2);
if (pc < pt->sizebc) {
BCIns ins = proto_bc(pt)[pc];
@@ -246,7 +228,7 @@ LJLIB_CF(jit_util_funcbc)
/* local k = jit.util.funck(func, idx) */
LJLIB_CF(jit_util_funck)
{
- GCproto *pt = check_Lproto(L, 0);
+ GCproto *pt = lj_lib_checkLproto(L, 1, 0);
ptrdiff_t idx = (ptrdiff_t)lj_lib_checkint(L, 2);
if (idx >= 0) {
if (idx < (ptrdiff_t)pt->sizekn) {
@@ -266,7 +248,7 @@ LJLIB_CF(jit_util_funck)
/* local name = jit.util.funcuvname(func, idx) */
LJLIB_CF(jit_util_funcuvname)
{
- GCproto *pt = check_Lproto(L, 0);
+ GCproto *pt = lj_lib_checkLproto(L, 1, 0);
uint32_t idx = (uint32_t)lj_lib_checkint(L, 2);
if (idx < pt->sizeuv) {
setstrV(L, L->top-1, lj_str_newz(L, lj_debug_uvname(pt, idx)));
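check_Lproto() moves into lj_lib as lj_lib_checkLproto() so string.dump() below can share it; these jit.util functions still accept either a Lua function or a prototype object. A minimal sketch:

```lua
local jutil = require("jit.util")
local f = assert(load("return 1 + 2"))
print(jutil.funcinfo(f).source)  -- chunk name of the loaded function
print(jutil.funcbc(f, 1))        -- one bytecode instruction plus its mode
```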
diff --git a/src/lib_string.c b/src/lib_string.c
index a1521573..487ef22e 100644
--- a/src/lib_string.c
+++ b/src/lib_string.c
@@ -122,11 +122,25 @@ static int writer_buf(lua_State *L, const void *p, size_t size, void *sb)
LJLIB_CF(string_dump)
{
- GCfunc *fn = lj_lib_checkfunc(L, 1);
- int strip = L->base+1 < L->top && tvistruecond(L->base+1);
- SBuf *sb = lj_buf_tmp_(L); /* Assumes lj_bcwrite() doesn't use tmpbuf. */
+ GCproto *pt = lj_lib_checkLproto(L, 1, 1);
+ uint32_t flags = 0;
+ SBuf *sb;
+ TValue *o = L->base+1;
+ if (o < L->top) {
+ if (tvisstr(o)) {
+ const char *mode = strVdata(o);
+ char c;
+ while ((c = *mode++)) {
+ if (c == 's') flags |= BCDUMP_F_STRIP;
+ if (c == 'd') flags |= BCDUMP_F_DETERMINISTIC;
+ }
+ } else if (tvistruecond(o)) {
+ flags |= BCDUMP_F_STRIP;
+ }
+ }
+ sb = lj_buf_tmp_(L); /* Assumes lj_bcwrite() doesn't use tmpbuf. */
L->top = L->base+1;
- if (!isluafunc(fn) || lj_bcwrite(L, funcproto(fn), writer_buf, sb, strip))
+ if (!pt || lj_bcwrite(L, pt, writer_buf, sb, flags))
lj_err_caller(L, LJ_ERR_STRDUMP);
setstrV(L, L->top-1, lj_buf_str(L, sb));
lj_gc_check(L);
diff --git a/src/lj_api.c b/src/lj_api.c
index 06ba87c6..c5a2ff1a 100644
--- a/src/lj_api.c
+++ b/src/lj_api.c
@@ -104,7 +104,12 @@ LUA_API int lua_checkstack(lua_State *L, int size)
if (size > LUAI_MAXCSTACK || (L->top - L->base + size) > LUAI_MAXCSTACK) {
return 0; /* Stack overflow. */
} else if (size > 0) {
- lj_state_checkstack(L, (MSize)size);
+ int avail = (int)(mref(L->maxstack, TValue) - L->top);
+ if (size > avail &&
+ lj_state_cpgrowstack(L, (MSize)(size - avail)) != LUA_OK) {
+ L->top--;
+ return 0; /* Out of memory. */
+ }
}
return 1;
}
diff --git a/src/lj_arch.h b/src/lj_arch.h
index 40030291..24b90945 100644
--- a/src/lj_arch.h
+++ b/src/lj_arch.h
@@ -57,7 +57,7 @@
#define LUAJIT_TARGET LUAJIT_ARCH_X64
#elif defined(__arm__) || defined(__arm) || defined(__ARM__) || defined(__ARM)
#define LUAJIT_TARGET LUAJIT_ARCH_ARM
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) || defined(_M_ARM64)
#define LUAJIT_TARGET LUAJIT_ARCH_ARM64
#elif defined(__ppc__) || defined(__ppc) || defined(__PPC__) || defined(__PPC) || defined(__powerpc__) || defined(__powerpc) || defined(__POWERPC__) || defined(__POWERPC) || defined(_M_PPC)
#define LUAJIT_TARGET LUAJIT_ARCH_PPC
@@ -66,7 +66,7 @@
#elif defined(__mips__) || defined(__mips) || defined(__MIPS__) || defined(__MIPS)
#define LUAJIT_TARGET LUAJIT_ARCH_MIPS32
#else
-#error "No support for this architecture (yet)"
+#error "Architecture not supported (in this version), see: https://luajit.org/status.html#architectures"
#endif
#endif
@@ -237,7 +237,7 @@
#define LJ_TARGET_UNIFYROT 2 /* Want only IR_BROR. */
#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL
-#if __ARM_ARCH == 8 || __ARM_ARCH_8__ || __ARM_ARCH_8A__
+#if __ARM_ARCH >= 8 || __ARM_ARCH_8__ || __ARM_ARCH_8A__
#define LJ_ARCH_VERSION 80
#elif __ARM_ARCH == 7 || __ARM_ARCH_7__ || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH_7S__ || __ARM_ARCH_7VE__
#define LJ_ARCH_VERSION 70
@@ -331,6 +331,7 @@
#define LJ_ARCH_NOFFI 1
#elif LJ_ARCH_BITS == 64
#error "No support for PPC64"
+#undef LJ_TARGET_PPC
#endif
#if _ARCH_PWR7
@@ -490,36 +491,45 @@
#elif LJ_TARGET_ARM
#if defined(__ARMEB__)
#error "No support for big-endian ARM"
+#undef LJ_TARGET_ARM
#endif
#if __ARM_ARCH_6M__ || __ARM_ARCH_7M__ || __ARM_ARCH_7EM__
#error "No support for Cortex-M CPUs"
+#undef LJ_TARGET_ARM
#endif
#if !(__ARM_EABI__ || LJ_TARGET_IOS)
#error "Only ARM EABI or iOS 3.0+ ABI is supported"
+#undef LJ_TARGET_ARM
#endif
#elif LJ_TARGET_ARM64
#if defined(_ILP32)
#error "No support for ILP32 model on ARM64"
+#undef LJ_TARGET_ARM64
#endif
#elif LJ_TARGET_PPC
#if defined(_LITTLE_ENDIAN) && (!defined(_BYTE_ORDER) || (_BYTE_ORDER == _LITTLE_ENDIAN))
#error "No support for little-endian PPC32"
+#undef LJ_TARGET_PPC
#endif
#if defined(__NO_FPRS__) && !defined(_SOFT_FLOAT)
-#error "No support for PPC/e500 anymore (use LuaJIT 2.0)"
+#error "No support for PPC/e500, use LuaJIT 2.0"
+#undef LJ_TARGET_PPC
#endif
#elif LJ_TARGET_MIPS32
#if !((defined(_MIPS_SIM_ABI32) && _MIPS_SIM == _MIPS_SIM_ABI32) || (defined(_ABIO32) && _MIPS_SIM == _ABIO32))
#error "Only o32 ABI supported for MIPS32"
+#undef LJ_TARGET_MIPS
#endif
#if LJ_TARGET_MIPSR6
/* Not that useful, since most available r6 CPUs are 64 bit. */
#error "No support for MIPS32R6"
+#undef LJ_TARGET_MIPS
#endif
#elif LJ_TARGET_MIPS64
#if !((defined(_MIPS_SIM_ABI64) && _MIPS_SIM == _MIPS_SIM_ABI64) || (defined(_ABI64) && _MIPS_SIM == _ABI64))
/* MIPS32ON64 aka n32 ABI support might be desirable, but difficult. */
#error "Only n64 ABI supported for MIPS64"
+#undef LJ_TARGET_MIPS
#endif
#endif
#endif
diff --git a/src/lj_asm.c b/src/lj_asm.c
index c02a1b9e..844910ad 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -606,7 +606,11 @@ static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow)
IRIns *ir = IR(ref);
if ((ir->o == IR_KINT64 && k == (int64_t)ir_kint64(ir)->u64) ||
#if LJ_GC64
+#if LJ_TARGET_ARM64
+ (ir->o == IR_KINT && (uint64_t)k == (uint32_t)ir->i) ||
+#else
(ir->o == IR_KINT && k == ir->i) ||
+#endif
(ir->o == IR_KGC && k == (intptr_t)ir_kgc(ir)) ||
((ir->o == IR_KPTR || ir->o == IR_KKPTR) &&
k == (intptr_t)ir_kptr(ir))
diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h
index b3b1f096..bd5fbeb1 100644
--- a/src/lj_asm_arm.h
+++ b/src/lj_asm_arm.h
@@ -969,24 +969,32 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
static void asm_uref(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
- if (irref_isk(ir->op1)) {
+ int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
+ if (irref_isk(ir->op1) && !guarded) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
emit_lsptr(as, ARMI_LDR, dest, v);
} else {
- Reg uv = ra_scratch(as, RSET_GPR);
- Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
- if (ir->o == IR_UREFC) {
- asm_guardcc(as, CC_NE);
+ if (guarded) {
+ asm_guardcc(as, ir->o == IR_UREFC ? CC_NE : CC_EQ);
emit_n(as, ARMI_CMP|ARMI_K12|1, RID_TMP);
- emit_opk(as, ARMI_ADD, dest, uv,
- (int32_t)offsetof(GCupval, tv), RSET_GPR);
- emit_lso(as, ARMI_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
- } else {
- emit_lso(as, ARMI_LDR, dest, uv, (int32_t)offsetof(GCupval, v));
}
- emit_lso(as, ARMI_LDR, uv, func,
- (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
+ if (ir->o == IR_UREFC)
+ emit_opk(as, ARMI_ADD, dest, dest,
+ (int32_t)offsetof(GCupval, tv), RSET_GPR);
+ else
+ emit_lso(as, ARMI_LDR, dest, dest, (int32_t)offsetof(GCupval, v));
+ if (guarded)
+ emit_lso(as, ARMI_LDRB, RID_TMP, dest,
+ (int32_t)offsetof(GCupval, closed));
+ if (irref_isk(ir->op1)) {
+ GCfunc *fn = ir_kfunc(IR(ir->op1));
+ int32_t k = (int32_t)gcrefu(fn->l.uvptr[(ir->op2 >> 8)]);
+ emit_loadi(as, dest, k);
+ } else {
+ emit_lso(as, ARMI_LDR, dest, ra_alloc1(as, ir->op1, RSET_GPR),
+ (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
+ }
}
}
@@ -1990,6 +1998,7 @@ static void asm_prof(ASMState *as, IRIns *ir)
static void asm_stack_check(ASMState *as, BCReg topslot,
IRIns *irp, RegSet allow, ExitNo exitno)
{
+ int savereg = 0;
Reg pbase;
uint32_t k;
if (irp) {
@@ -2000,12 +2009,14 @@ static void asm_stack_check(ASMState *as, BCReg topslot,
pbase = rset_pickbot(allow);
} else {
pbase = RID_RET;
- emit_lso(as, ARMI_LDR, RID_RET, RID_SP, 0); /* Restore temp. register. */
+ savereg = 1;
}
} else {
pbase = RID_BASE;
}
emit_branch(as, ARMF_CC(ARMI_BL, CC_LS), exitstub_addr(as->J, exitno));
+ if (savereg)
+ emit_lso(as, ARMI_LDR, RID_RET, RID_SP, 0); /* Restore temp. register. */
k = emit_isk12(0, (int32_t)(8*topslot));
lj_assertA(k, "slot offset %d does not fit in K12", 8*topslot);
emit_n(as, ARMI_CMP^k, RID_TMP);
@@ -2017,7 +2028,7 @@ static void asm_stack_check(ASMState *as, BCReg topslot,
if (ra_hasspill(irp->s))
emit_lso(as, ARMI_LDR, pbase, RID_SP, sps_scale(irp->s));
emit_lso(as, ARMI_LDR, RID_TMP, RID_TMP, (i & 4095));
- if (ra_hasspill(irp->s) && !allow)
+ if (savereg)
emit_lso(as, ARMI_STR, RID_RET, RID_SP, 0); /* Save temp. register. */
emit_loadi(as, RID_TMP, (i & ~4095));
} else {
@@ -2031,11 +2042,12 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
SnapEntry *map = &as->T->snapmap[snap->mapofs];
SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1];
MSize n, nent = snap->nent;
+ int32_t bias = 0;
/* Store the value of all modified slots to the Lua stack. */
for (n = 0; n < nent; n++) {
SnapEntry sn = map[n];
BCReg s = snap_slot(sn);
- int32_t ofs = 8*((int32_t)s-1);
+ int32_t ofs = 8*((int32_t)s-1) - bias;
IRRef ref = snap_ref(sn);
IRIns *ir = IR(ref);
if ((sn & SNAP_NORESTORE))
@@ -2054,6 +2066,12 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
emit_lso(as, ARMI_STR, tmp, RID_BASE, ofs+4);
#else
Reg src = ra_alloc1(as, ref, RSET_FPR);
+ if (LJ_UNLIKELY(ofs < -1020 || ofs > 1020)) {
+ int32_t adj = ofs & 0xffffff00; /* K12-friendly. */
+ bias += adj;
+ ofs -= adj;
+ emit_addptr(as, RID_BASE, -adj);
+ }
emit_vlso(as, ARMI_VSTR_D, src, RID_BASE, ofs);
#endif
} else {
@@ -2082,6 +2100,7 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
}
checkmclim(as);
}
+ emit_addptr(as, RID_BASE, bias);
lj_assertA(map + nent == flinks, "inconsistent frames in snapshot");
}
@@ -2252,7 +2271,7 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
}
if (nslots > as->evenspill) /* Leave room for args in stack slots. */
as->evenspill = nslots;
- return REGSP_HINT(RID_RET);
+ return REGSP_HINT(irt_isfp(ir->t) ? RID_FPRET : RID_RET);
}
static void asm_setup_target(ASMState *as)
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h
index 5e690308..5b40f4cc 100644
--- a/src/lj_asm_arm64.h
+++ b/src/lj_asm_arm64.h
@@ -84,18 +84,23 @@ static void asm_guardcc(ASMState *as, A64CC cc)
emit_cond_branch(as, cc, target);
}
-/* Emit test and branch instruction to exit for guard. */
-static void asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit)
+/* Emit test and branch instruction to exit for guard, if in range. */
+static int asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit)
{
MCode *target = asm_exitstub_addr(as, as->snapno);
MCode *p = as->mcp;
+ ptrdiff_t delta = target - p;
if (LJ_UNLIKELY(p == as->invmcp)) {
+ if (as->orignins > 1023) return 0; /* Delta might end up too large. */
as->loopinv = 1;
- *p = A64I_B | A64F_S26(target-p);
- emit_tnb(as, ai^0x01000000u, r, bit, p-1);
- return;
+ *p = A64I_B | A64F_S26(delta);
+ ai ^= 0x01000000u;
+ target = p-1;
+ } else if (LJ_UNLIKELY(delta >= 0x1fff)) {
+ return 0;
}
emit_tnb(as, ai, r, bit, target);
+ return 1;
}
/* Emit compare and branch instruction to exit for guard. */
@@ -211,16 +216,14 @@ static Reg asm_fuseahuref(ASMState *as, IRRef ref, int32_t *ofsp, RegSet allow,
static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow)
{
IRIns *ir = IR(ref);
+ int logical = (ai & 0x1f000000) == 0x0a000000;
if (ra_hasreg(ir->r)) {
ra_noweak(as, ir->r);
return A64F_M(ir->r);
} else if (irref_isk(ref)) {
- uint32_t m;
int64_t k = get_k64val(as, ref);
- if ((ai & 0x1f000000) == 0x0a000000)
- m = emit_isk13(k, irt_is64(ir->t));
- else
- m = emit_isk12(k);
+ uint32_t m = logical ? emit_isk13(k, irt_is64(ir->t)) :
+ emit_isk12(irt_is64(ir->t) ? k : (int32_t)k);
if (m)
return m;
} else if (mayfuse(as, ref)) {
@@ -232,7 +235,7 @@ static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow)
(IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31));
IRIns *irl = IR(ir->op1);
if (sh == A64SH_LSL &&
- irl->o == IR_CONV &&
+ irl->o == IR_CONV && !logical &&
          irl->op2 == ((IRT_I64<<IRT_SHIFT)|IRT_INT) &&
          shift <= 4 &&
          canfuse(as, irl)) {
        Reg m = ra_alloc1(as, irl->op1, allow);
return A64F_M(m) | A64F_SH(sh, shift);
}
- } else if (ir->o == IR_CONV &&
+ } else if (ir->o == IR_BROR && logical && irref_isk(ir->op2)) {
+ Reg m = ra_alloc1(as, ir->op1, allow);
+ int shift = (IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31));
+ return A64F_M(m) | A64F_SH(A64SH_ROR, shift);
+ } else if (ir->o == IR_CONV && !logical &&
               ir->op2 == ((IRT_I64<<IRT_SHIFT)|IRT_INT)) {
      Reg m = ra_alloc1(as, ir->op1, allow);
return A64F_M(m) | A64F_EX(A64EX_SXTW);
@@ -426,6 +433,11 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++)
as->cost[gpr] = REGCOST(~0u, ASMREF_L);
gpr = REGARG_FIRSTGPR;
+#if LJ_HASFFI && LJ_ABI_WIN
+ if ((ci->flags & CCI_VARARG)) {
+ fpr = REGARG_LASTFPR+1;
+ }
+#endif
for (n = 0; n < nargs; n++) { /* Setup args. */
IRRef ref = args[n];
IRIns *ir = IR(ref);
@@ -436,6 +448,11 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
"reg %d not free", fpr); /* Must have been evicted. */
ra_leftov(as, fpr, ref);
fpr++;
+#if LJ_HASFFI && LJ_ABI_WIN
+ } else if ((ci->flags & CCI_VARARG) && (gpr <= REGARG_LASTGPR)) {
+ Reg rf = ra_alloc1(as, ref, RSET_FPR);
+ emit_dn(as, A64I_FMOV_R_D, gpr++, rf & 31);
+#endif
} else {
Reg r = ra_alloc1(as, ref, RSET_FPR);
int32_t al = spalign;
@@ -541,8 +558,6 @@ static void asm_retf(ASMState *as, IRIns *ir)
as->topslot -= (BCReg)delta;
if ((int32_t)as->topslot < 0) as->topslot = 0;
irt_setmark(IR(REF_BASE)->t); /* Children must not coalesce with BASE reg. */
- /* Need to force a spill on REF_BASE now to update the stack slot. */
- emit_lso(as, A64I_STRx, base, RID_SP, ra_spill(as, IR(REF_BASE)));
emit_setgl(as, base, jit_base);
emit_addptr(as, base, -8*delta);
asm_guardcc(as, CC_NE);
@@ -666,25 +681,22 @@ static void asm_strto(ASMState *as, IRIns *ir)
{
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
IRRef args[2];
- Reg dest = 0, tmp;
- int destused = ra_used(ir);
+ Reg tmp;
int32_t ofs = 0;
ra_evictset(as, RSET_SCRATCH);
- if (destused) {
+ if (ra_used(ir)) {
if (ra_hasspill(ir->s)) {
ofs = sps_scale(ir->s);
- destused = 0;
if (ra_hasreg(ir->r)) {
ra_free(as, ir->r);
ra_modified(as, ir->r);
emit_spload(as, ir, ir->r, ofs);
}
} else {
- dest = ra_dest(as, ir, RSET_FPR);
+ Reg dest = ra_dest(as, ir, RSET_FPR);
+ emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0);
}
}
- if (destused)
- emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0);
asm_guardcnb(as, A64I_CBZ, RID_RET);
args[0] = ir->op1; /* GCstr *str */
args[1] = ASMREF_TMP1; /* TValue *n */
@@ -775,113 +787,75 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
int destused = ra_used(ir);
Reg dest = ra_dest(as, ir, allow);
Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
- Reg key = 0, tmp = RID_TMP;
- Reg ftmp = RID_NONE, type = RID_NONE, scr = RID_NONE, tisnum = RID_NONE;
+ Reg tmp = RID_TMP, type = RID_NONE, key, tkey;
IRRef refkey = ir->op2;
IRIns *irkey = IR(refkey);
- int isk = irref_isk(ir->op2);
+ int isk = irref_isk(refkey);
IRType1 kt = irkey->t;
uint32_t k = 0;
uint32_t khash;
- MCLabel l_end, l_loop, l_next;
+ MCLabel l_end, l_loop;
rset_clear(allow, tab);
- if (!isk) {
- key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
- rset_clear(allow, key);
- if (!irt_isstr(kt)) {
- tmp = ra_scratch(as, allow);
- rset_clear(allow, tmp);
- }
- } else if (irt_isnum(kt)) {
- int64_t val = (int64_t)ir_knum(irkey)->u64;
- if (!(k = emit_isk12(val))) {
- key = ra_allock(as, val, allow);
- rset_clear(allow, key);
- }
- } else if (!irt_ispri(kt)) {
- if (!(k = emit_isk12(irkey->i))) {
- key = ra_alloc1(as, refkey, allow);
- rset_clear(allow, key);
- }
- }
-
- /* Allocate constants early. */
- if (irt_isnum(kt)) {
- if (!isk) {
- tisnum = ra_allock(as, LJ_TISNUM << 15, allow);
- ftmp = ra_scratch(as, rset_exclude(RSET_FPR, key));
- rset_clear(allow, tisnum);
- }
- } else if (irt_isaddr(kt)) {
- if (isk) {
- int64_t kk = ((int64_t)irt_toitype(kt) << 47) | irkey[1].tv.u64;
- scr = ra_allock(as, kk, allow);
+ /* Allocate register for tkey outside of the loop. */
+ if (isk) {
+ int64_t kk;
+ if (irt_isaddr(kt)) {
+ kk = ((int64_t)irt_toitype(kt) << 47) | irkey[1].tv.u64;
+ } else if (irt_isnum(kt)) {
+ kk = (int64_t)ir_knum(irkey)->u64;
+ /* Assumes -0.0 is already canonicalized to +0.0. */
} else {
- scr = ra_scratch(as, allow);
+ lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type");
+ kk = ~((int64_t)~irt_toitype(kt) << 47);
}
- rset_clear(allow, scr);
+ k = emit_isk12(kk);
+ tkey = k ? 0 : ra_allock(as, kk, allow);
} else {
- lj_assertA(irt_ispri(kt) && !irt_isnil(kt), "bad HREF key type");
- type = ra_allock(as, ~((int64_t)~irt_toitype(kt) << 47), allow);
- scr = ra_scratch(as, rset_clear(allow, type));
- rset_clear(allow, scr);
+ tkey = ra_scratch(as, allow);
}
/* Key not found in chain: jump to exit (if merged) or load niltv. */
l_end = emit_label(as);
as->invmcp = NULL;
- if (merge == IR_NE)
+ if (merge == IR_NE) {
asm_guardcc(as, CC_AL);
- else if (destused)
- emit_loada(as, dest, niltvg(J2G(as->J)));
+ } else if (destused) {
+ uint32_t k12 = emit_isk12(offsetof(global_State, nilnode.val));
+ lj_assertA(k12 != 0, "Cannot k12 encode niltv(L)");
+ emit_dn(as, A64I_ADDx^k12, dest, RID_GL);
+ }
/* Follow hash chain until the end. */
l_loop = --as->mcp;
- emit_n(as, A64I_CMPx^A64I_K12^0, dest);
- emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
- l_next = emit_label(as);
+ if (destused)
+ emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
/* Type and value comparison. */
if (merge == IR_EQ)
asm_guardcc(as, CC_EQ);
else
emit_cond_branch(as, CC_EQ, l_end);
+ emit_nm(as, A64I_CMPx^k, tmp, tkey);
+ if (!destused)
+ emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
+ emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key));
+ *l_loop = A64I_X | A64I_CBNZ | A64F_S19(as->mcp - l_loop) | dest;
- if (irt_isnum(kt)) {
- if (isk) {
- /* Assumes -0.0 is already canonicalized to +0.0. */
- if (k)
- emit_n(as, A64I_CMPx^k, tmp);
- else
- emit_nm(as, A64I_CMPx, key, tmp);
- emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
+ /* Construct tkey as canonicalized or tagged key. */
+ if (!isk) {
+ if (irt_isnum(kt)) {
+ key = ra_alloc1(as, refkey, RSET_FPR);
+ emit_dnm(as, A64I_CSELx | A64F_CC(CC_EQ), tkey, RID_ZERO, tkey);
+ /* A64I_FMOV_R_D from key to tkey done below. */
} else {
- emit_nm(as, A64I_FCMPd, key, ftmp);
- emit_dn(as, A64I_FMOV_D_R, (ftmp & 31), (tmp & 31));
- emit_cond_branch(as, CC_LO, l_next);
- emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), tisnum, tmp);
- emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.n));
+ lj_assertA(irt_isaddr(kt), "bad HREF key type");
+ key = ra_alloc1(as, refkey, allow);
+ type = ra_allock(as, irt_toitype(kt) << 15, rset_clear(allow, key));
+ emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 32), tkey, key, type);
}
- } else if (irt_isaddr(kt)) {
- if (isk) {
- emit_nm(as, A64I_CMPx, scr, tmp);
- emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
- } else {
- emit_nm(as, A64I_CMPx, tmp, scr);
- emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key.u64));
- }
- } else {
- emit_nm(as, A64I_CMPx, scr, type);
- emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key));
}
- *l_loop = A64I_BCC | A64F_S19(as->mcp - l_loop) | CC_NE;
- if (!isk && irt_isaddr(kt)) {
- type = ra_allock(as, (int32_t)irt_toitype(kt), allow);
- emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, key, type);
- rset_clear(allow, type);
- }
/* Load main position relative to tab->node into dest. */
khash = isk ? ir_khash(as, irkey) : 1;
if (khash == 0) {
@@ -895,7 +869,6 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
emit_dnm(as, A64I_ANDw, dest, dest, tmphash);
emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
} else if (irt_isstr(kt)) {
- /* Fetch of str->sid is cheaper than ra_allock. */
emit_dnm(as, A64I_ANDw, dest, dest, tmp);
emit_lso(as, A64I_LDRw, tmp, key, offsetof(GCstr, sid));
emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
@@ -904,23 +877,18 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
emit_lso(as, A64I_LDRw, tmp, tab, offsetof(GCtab, hmask));
emit_dnm(as, A64I_SUBw, dest, dest, tmp);
emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT3)), tmp, tmp, tmp);
- emit_dnm(as, A64I_EORw, dest, dest, tmp);
- emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT2)), dest, dest, dest);
+ emit_dnm(as, A64I_EORw | A64F_SH(A64SH_ROR, 32-HASH_ROT2), dest, tmp, dest);
emit_dnm(as, A64I_SUBw, tmp, tmp, dest);
emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT1)), dest, dest, dest);
- emit_dnm(as, A64I_EORw, tmp, tmp, dest);
if (irt_isnum(kt)) {
+ emit_dnm(as, A64I_EORw, tmp, tkey, dest);
emit_dnm(as, A64I_ADDw, dest, dest, dest);
- emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
- emit_dm(as, A64I_MOVw, tmp, dest);
- emit_dn(as, A64I_FMOV_R_D, dest, (key & 31));
+ emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, tkey);
+ emit_nm(as, A64I_FCMPZd, (key & 31), 0);
+ emit_dn(as, A64I_FMOV_R_D, tkey, (key & 31));
} else {
- checkmclim(as);
- emit_dm(as, A64I_MOVw, tmp, key);
- emit_dnm(as, A64I_EORw, dest, dest,
- ra_allock(as, irt_toitype(kt) << 15, allow));
- emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
- emit_dm(as, A64I_MOVx, dest, key);
+ emit_dnm(as, A64I_EORw, tmp, key, dest);
+ emit_dnm(as, A64I_EORx | A64F_SH(A64SH_LSR, 32), dest, type, key);
}
}
}
@@ -935,7 +903,7 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
int bigofs = !emit_checkofs(A64I_LDRx, kofs);
Reg dest = (ra_used(ir) || bigofs) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
- Reg key, idx = node;
+ Reg idx = node;
RegSet allow = rset_exclude(RSET_GPR, node);
uint64_t k;
lj_assertA(ofs % sizeof(Node) == 0, "unaligned HREFK slot");
@@ -954,9 +922,8 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
} else {
k = ((uint64_t)irt_toitype(irkey->t) << 47) | (uint64_t)ir_kgc(irkey);
}
- key = ra_scratch(as, allow);
- emit_nm(as, A64I_CMPx, key, ra_allock(as, k, rset_exclude(allow, key)));
- emit_lso(as, A64I_LDRx, key, idx, kofs);
+ emit_nm(as, A64I_CMPx, RID_TMP, ra_allock(as, k, allow));
+ emit_lso(as, A64I_LDRx, RID_TMP, idx, kofs);
if (bigofs)
emit_opk(as, A64I_ADDx, dest, node, ofs, rset_exclude(RSET_GPR, node));
}
@@ -964,24 +931,30 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
static void asm_uref(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
- if (irref_isk(ir->op1)) {
+ int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
+ if (irref_isk(ir->op1) && !guarded) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
emit_lsptr(as, A64I_LDRx, dest, v);
} else {
- Reg uv = ra_scratch(as, RSET_GPR);
- Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
- if (ir->o == IR_UREFC) {
- asm_guardcc(as, CC_NE);
- emit_n(as, (A64I_CMPx^A64I_K12) | A64F_U12(1), RID_TMP);
- emit_opk(as, A64I_ADDx, dest, uv,
+ if (guarded)
+ asm_guardcnb(as, ir->o == IR_UREFC ? A64I_CBZ : A64I_CBNZ, RID_TMP);
+ if (ir->o == IR_UREFC)
+ emit_opk(as, A64I_ADDx, dest, dest,
(int32_t)offsetof(GCupval, tv), RSET_GPR);
- emit_lso(as, A64I_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
+ else
+ emit_lso(as, A64I_LDRx, dest, dest, (int32_t)offsetof(GCupval, v));
+ if (guarded)
+ emit_lso(as, A64I_LDRB, RID_TMP, dest,
+ (int32_t)offsetof(GCupval, closed));
+ if (irref_isk(ir->op1)) {
+ GCfunc *fn = ir_kfunc(IR(ir->op1));
+ uint64_t k = gcrefu(fn->l.uvptr[(ir->op2 >> 8)]);
+ emit_loadu64(as, dest, k);
} else {
- emit_lso(as, A64I_LDRx, dest, uv, (int32_t)offsetof(GCupval, v));
+ emit_lso(as, A64I_LDRx, dest, ra_alloc1(as, ir->op1, RSET_GPR),
+ (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8));
}
- emit_lso(as, A64I_LDRx, uv, func,
- (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8));
}
}
@@ -1086,7 +1059,7 @@ static void asm_xstore(ASMState *as, IRIns *ir)
static void asm_ahuvload(ASMState *as, IRIns *ir)
{
- Reg idx, tmp, type;
+ Reg idx, tmp;
int32_t ofs = 0;
RegSet gpr = RSET_GPR, allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
lj_assertA(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
@@ -1105,8 +1078,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
} else {
tmp = ra_scratch(as, gpr);
}
- type = ra_scratch(as, rset_clear(gpr, tmp));
- idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, type), A64I_LDRx);
+ idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, tmp), A64I_LDRx);
rset_clear(gpr, idx);
if (ofs & FUSE_REG) rset_clear(gpr, ofs & 31);
if (ir->o == IR_VLOAD) ofs += 8 * ir->op2;
@@ -1118,8 +1090,8 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
ra_allock(as, LJ_TISNUM << 15, gpr), tmp);
} else if (irt_isaddr(ir->t)) {
- emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), type);
- emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
+ emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), RID_TMP);
+ emit_dn(as, A64I_ASRx | A64F_IMMR(47), RID_TMP, tmp);
} else if (irt_isnil(ir->t)) {
emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
} else {
@@ -1242,9 +1214,8 @@ dotypecheck:
emit_nm(as, A64I_CMPx,
ra_allock(as, ~((int64_t)~irt_toitype(t) << 47) , allow), tmp);
} else {
- Reg type = ra_scratch(as, allow);
- emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), type);
- emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
+ emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), RID_TMP);
+ emit_dn(as, A64I_ASRx | A64F_IMMR(47), RID_TMP, tmp);
}
emit_lso(as, A64I_LDRx, tmp, base, ofs);
return;
@@ -1330,7 +1301,6 @@ static void asm_obar(ASMState *as, IRIns *ir)
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
IRRef args[2];
MCLabel l_end;
- RegSet allow = RSET_GPR;
Reg obj, val, tmp;
/* No need for other object barriers (yet). */
lj_assertA(IR(ir->op1)->o == IR_UREFC, "bad OBAR type");
@@ -1341,14 +1311,13 @@ static void asm_obar(ASMState *as, IRIns *ir)
asm_gencall(as, ci, args);
emit_dm(as, A64I_MOVx, ra_releasetmp(as, ASMREF_TMP1), RID_GL);
obj = IR(ir->op1)->r;
- tmp = ra_scratch(as, rset_exclude(allow, obj));
- emit_cond_branch(as, CC_EQ, l_end);
- emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), tmp);
+ tmp = ra_scratch(as, rset_exclude(RSET_GPR, obj));
+ emit_tnb(as, A64I_TBZ, tmp, lj_ffs(LJ_GC_BLACK), l_end);
emit_cond_branch(as, CC_EQ, l_end);
emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_WHITES, 0), RID_TMP);
val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj));
emit_lso(as, A64I_LDRB, tmp, obj,
- (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
+ (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
emit_lso(as, A64I_LDRB, RID_TMP, val, (int32_t)offsetof(GChead, marked));
}
@@ -1390,12 +1359,12 @@ static int asm_swapops(ASMState *as, IRRef lref, IRRef rref)
if (irref_isk(lref))
return 1; /* But swap constants to the right. */
ir = IR(rref);
-  if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
+  if ((ir->o >= IR_BSHL && ir->o <= IR_BROR) ||
       (ir->o == IR_ADD && ir->op1 == ir->op2) ||
       (ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
     return 0;  /* Don't swap fusable operands to the left. */
   ir = IR(lref);
-  if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
+  if ((ir->o >= IR_BSHL && ir->o <= IR_BROR) ||
       (ir->o == IR_ADD && ir->op1 == ir->op2) ||
       (ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
     return 1;  /* But swap fusable operands to the right. */
@@ ... @@
   if (irt_isguard(ir->t)) {  /* IR_MULOV */
asm_guardcc(as, CC_NE);
emit_dm(as, A64I_MOVw, dest, dest); /* Zero-extend. */
- emit_nm(as, A64I_CMPw | A64F_SH(A64SH_ASR, 31), RID_TMP, dest);
- emit_dn(as, A64I_ASRx | A64F_IMMR(32), RID_TMP, dest);
+ emit_nm(as, A64I_CMPx | A64F_EX(A64EX_SXTW), dest, dest);
emit_dnm(as, A64I_SMULL, dest, right, left);
} else {
emit_dnm(as, irt_is64(ir->t) ? A64I_MULx : A64I_MULw, dest, left, right);
@@ -1707,16 +1675,15 @@ static void asm_intcomp(ASMState *as, IRIns *ir)
if (asm_swapops(as, blref, brref)) {
Reg tmp = blref; blref = brref; brref = tmp;
}
+ bleft = ra_alloc1(as, blref, RSET_GPR);
if (irref_isk(brref)) {
uint64_t k = get_k64val(as, brref);
- if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE)) {
- asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ,
- ra_alloc1(as, blref, RSET_GPR), emit_ctz64(k));
+ if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE) &&
+ asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ, bleft,
+ emit_ctz64(k)))
return;
- }
m2 = emit_isk13(k, irt_is64(irl->t));
}
- bleft = ra_alloc1(as, blref, RSET_GPR);
ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw);
if (!m2)
m2 = asm_fuseopm(as, ai, brref, rset_exclude(RSET_GPR, bleft));
@@ -1791,37 +1758,28 @@ static void asm_prof(ASMState *as, IRIns *ir)
static void asm_stack_check(ASMState *as, BCReg topslot,
IRIns *irp, RegSet allow, ExitNo exitno)
{
- Reg pbase;
uint32_t k;
+ Reg pbase = RID_BASE;
if (irp) {
- if (!ra_hasspill(irp->s)) {
- pbase = irp->r;
- lj_assertA(ra_hasreg(pbase), "base reg lost");
- } else if (allow) {
- pbase = rset_pickbot(allow);
- } else {
- pbase = RID_RET;
- emit_lso(as, A64I_LDRx, RID_RET, RID_SP, 0); /* Restore temp register. */
- }
- } else {
- pbase = RID_BASE;
+ pbase = irp->r;
+ if (!ra_hasreg(pbase))
+ pbase = allow ? (0x40 | rset_pickbot(allow)) : (0xC0 | RID_RET);
}
emit_cond_branch(as, CC_LS, asm_exitstub_addr(as, exitno));
+ if (pbase & 0x80) /* Restore temp. register. */
+ emit_lso(as, A64I_LDRx, (pbase & 31), RID_SP, 0);
k = emit_isk12((8*topslot));
lj_assertA(k, "slot offset %d does not fit in K12", 8*topslot);
emit_n(as, A64I_CMPx^k, RID_TMP);
- emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, pbase);
+ emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, (pbase & 31));
emit_lso(as, A64I_LDRx, RID_TMP, RID_TMP,
(int32_t)offsetof(lua_State, maxstack));
- if (irp) { /* Must not spill arbitrary registers in head of side trace. */
- if (ra_hasspill(irp->s))
- emit_lso(as, A64I_LDRx, pbase, RID_SP, sps_scale(irp->s));
- emit_lso(as, A64I_LDRx, RID_TMP, RID_GL, glofs(as, &J2G(as->J)->cur_L));
- if (ra_hasspill(irp->s) && !allow)
- emit_lso(as, A64I_STRx, RID_RET, RID_SP, 0); /* Save temp register. */
- } else {
- emit_getgl(as, RID_TMP, cur_L);
+ if (pbase & 0x40) {
+ emit_getgl(as, (pbase & 31), jit_base);
+ if (pbase & 0x80) /* Save temp register. */
+ emit_lso(as, A64I_STRx, (pbase & 31), RID_SP, 0);
}
+ emit_getgl(as, RID_TMP, cur_L);
}
/* Restore Lua stack from on-trace state. */
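
Note: the reworked asm_stack_check() packs its state into `pbase` instead of branching up front: the low five bits hold a register number and two high bits carry flags. A short decode sketch (the flag names are ours, not the source's):

  /* Hypothetical names for the bits packed into 'pbase' above. */
  enum {
    PBASE_REG    = 0x1f,  /* Bits 0-4: GPR holding (or receiving) BASE. */
    PBASE_RELOAD = 0x40,  /* BASE not live in a register: reload it from
                          ** g->jit_base into the picked register. */
    PBASE_BORROW = 0x80   /* No free register: borrow RID_RET and
                          ** save/restore it in the spill slot at SP+0. */
  };
  /* So 0x40|rset_pickbot(allow) means "reload into a free register" and
  ** 0xC0|RID_RET means "reload into RID_RET and save/restore it". */
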
@@ -1863,7 +1821,7 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap)
/* Marker to prevent patching the GC check exit. */
#define ARM64_NOPATCH_GC_CHECK \
- (A64I_ORRx|A64F_D(RID_TMP)|A64F_M(RID_TMP)|A64F_N(RID_TMP))
+ (A64I_ORRx|A64F_D(RID_ZERO)|A64F_M(RID_ZERO)|A64F_N(RID_ZERO))
/* Check GC threshold and do one or more GC steps. */
static void asm_gc_check(ASMState *as)
@@ -1918,46 +1876,40 @@ static void asm_loop_tail_fixup(ASMState *as)
/* -- Head of trace ------------------------------------------------------- */
-/* Reload L register from g->cur_L. */
-static void asm_head_lreg(ASMState *as)
-{
- IRIns *ir = IR(ASMREF_L);
- if (ra_used(ir)) {
- Reg r = ra_dest(as, ir, RSET_GPR);
- emit_getgl(as, r, cur_L);
- ra_evictk(as);
- }
-}
-
/* Coalesce BASE register for a root trace. */
static void asm_head_root_base(ASMState *as)
{
- IRIns *ir;
- asm_head_lreg(as);
- ir = IR(REF_BASE);
- if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
- ra_spill(as, ir);
- ra_destreg(as, ir, RID_BASE);
+ IRIns *ir = IR(REF_BASE);
+ Reg r = ir->r;
+ if (ra_hasreg(r)) {
+ ra_free(as, r);
+ if (rset_test(as->modset, r) || irt_ismarked(ir->t))
+ ir->r = RID_INIT; /* No inheritance for modified BASE register. */
+ if (r != RID_BASE)
+ emit_movrr(as, ir, r, RID_BASE);
+ }
}
/* Coalesce BASE register for a side trace. */
static Reg asm_head_side_base(ASMState *as, IRIns *irp)
{
- IRIns *ir;
- asm_head_lreg(as);
- ir = IR(REF_BASE);
- if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
- ra_spill(as, ir);
- if (ra_hasspill(irp->s)) {
- return ra_dest(as, ir, RSET_GPR);
- } else {
- Reg r = irp->r;
- lj_assertA(ra_hasreg(r), "base reg lost");
- if (r != ir->r && !rset_test(as->freeset, r))
- ra_restore(as, regcost_ref(as->cost[r]));
- ra_destreg(as, ir, r);
- return r;
+ IRIns *ir = IR(REF_BASE);
+ Reg r = ir->r;
+ if (ra_hasreg(r)) {
+ ra_free(as, r);
+ if (rset_test(as->modset, r) || irt_ismarked(ir->t))
+ ir->r = RID_INIT; /* No inheritance for modified BASE register. */
+ if (irp->r == r) {
+ return r; /* Same BASE register already coalesced. */
+ } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) {
+ /* Move from coalesced parent reg. */
+ emit_movrr(as, ir, r, irp->r);
+ return irp->r;
+ } else {
+ emit_getgl(as, r, jit_base); /* Otherwise reload BASE. */
+ }
}
+ return RID_NONE;
}
/* -- Tail of trace ------------------------------------------------------- */
@@ -2009,6 +1961,9 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
int ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
int spofs = 0, spalign = LJ_TARGET_OSX ? 0 : 7, nslots;
asm_collectargs(as, ir, ci, args);
+#if LJ_ABI_WIN
+ if ((ci->flags & CCI_VARARG)) nfpr = 0;
+#endif
for (i = 0; i < nargs; i++) {
int al = spalign;
if (!args[i]) {
@@ -2020,7 +1975,9 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
#endif
} else if (irt_isfp(IR(args[i])->t)) {
if (nfpr > 0) { nfpr--; continue; }
-#if LJ_TARGET_OSX
+#if LJ_ABI_WIN
+ if ((ci->flags & CCI_VARARG) && ngpr > 0) { ngpr--; continue; }
+#elif LJ_TARGET_OSX
al |= irt_isnum(IR(args[i])->t) ? 7 : 3;
#endif
} else {
@@ -2036,7 +1993,7 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
as->evenspill = nslots;
}
#endif
- return REGSP_HINT(RID_RET);
+ return REGSP_HINT(irt_isfp(ir->t) ? RID_FPRET : RID_RET);
}
static void asm_setup_target(ASMState *as)
diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h
index b02da663..5b83e34d 100644
--- a/src/lj_asm_mips.h
+++ b/src/lj_asm_mips.h
@@ -653,11 +653,11 @@ static void asm_conv(ASMState *as, IRIns *ir)
rset_exclude(RSET_GPR, dest));
emit_fg(as, MIPSI_TRUNC_L_D, tmp, left); /* Delay slot. */
#if !LJ_TARGET_MIPSR6
- emit_branch(as, MIPSI_BC1T, 0, 0, l_end);
- emit_fgh(as, MIPSI_C_OLT_D, 0, left, tmp);
+ emit_branch(as, MIPSI_BC1T, 0, 0, l_end);
+ emit_fgh(as, MIPSI_C_OLT_D, 0, left, tmp);
#else
- emit_branch(as, MIPSI_BC1NEZ, 0, (left&31), l_end);
- emit_fgh(as, MIPSI_CMP_LT_D, left, left, tmp);
+ emit_branch(as, MIPSI_BC1NEZ, 0, (tmp&31), l_end);
+ emit_fgh(as, MIPSI_CMP_LT_D, tmp, left, tmp);
#endif
emit_lsptr(as, MIPSI_LDC1, (tmp & 31),
(void *)&as->J->k64[LJ_K64_2P63],
@@ -670,11 +670,11 @@ static void asm_conv(ASMState *as, IRIns *ir)
rset_exclude(RSET_GPR, dest));
emit_fg(as, MIPSI_TRUNC_L_S, tmp, left); /* Delay slot. */
#if !LJ_TARGET_MIPSR6
- emit_branch(as, MIPSI_BC1T, 0, 0, l_end);
- emit_fgh(as, MIPSI_C_OLT_S, 0, left, tmp);
+ emit_branch(as, MIPSI_BC1T, 0, 0, l_end);
+ emit_fgh(as, MIPSI_C_OLT_S, 0, left, tmp);
#else
- emit_branch(as, MIPSI_BC1NEZ, 0, (left&31), l_end);
- emit_fgh(as, MIPSI_CMP_LT_S, left, left, tmp);
+ emit_branch(as, MIPSI_BC1NEZ, 0, (tmp&31), l_end);
+ emit_fgh(as, MIPSI_CMP_LT_S, tmp, left, tmp);
#endif
emit_lsptr(as, MIPSI_LWC1, (tmp & 31),
(void *)&as->J->k32[LJ_K32_2P63],
@@ -690,8 +690,8 @@ static void asm_conv(ASMState *as, IRIns *ir)
MIPSIns mi = irt_is64(ir->t) ?
(st == IRT_NUM ? MIPSI_TRUNC_L_D : MIPSI_TRUNC_L_S) :
(st == IRT_NUM ? MIPSI_TRUNC_W_D : MIPSI_TRUNC_W_S);
- emit_tg(as, irt_is64(ir->t) ? MIPSI_DMFC1 : MIPSI_MFC1, dest, left);
- emit_fg(as, mi, left, left);
+ emit_tg(as, irt_is64(ir->t) ? MIPSI_DMFC1 : MIPSI_MFC1, dest, tmp);
+ emit_fg(as, mi, tmp, left);
#endif
}
}
@@ -1207,22 +1207,29 @@ nolo:
static void asm_uref(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
- if (irref_isk(ir->op1)) {
+ int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
+ if (irref_isk(ir->op1) && !guarded) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
emit_lsptr(as, MIPSI_AL, dest, v, RSET_GPR);
} else {
- Reg uv = ra_scratch(as, RSET_GPR);
- Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
- if (ir->o == IR_UREFC) {
- asm_guard(as, MIPSI_BEQ, RID_TMP, RID_ZERO);
- emit_tsi(as, MIPSI_AADDIU, dest, uv, (int32_t)offsetof(GCupval, tv));
- emit_tsi(as, MIPSI_LBU, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
+ if (guarded)
+ asm_guard(as, ir->o == IR_UREFC ? MIPSI_BEQ : MIPSI_BNE, RID_TMP, RID_ZERO);
+ if (ir->o == IR_UREFC)
+ emit_tsi(as, MIPSI_AADDIU, dest, dest, (int32_t)offsetof(GCupval, tv));
+ else
+ emit_tsi(as, MIPSI_AL, dest, dest, (int32_t)offsetof(GCupval, v));
+ if (guarded)
+ emit_tsi(as, MIPSI_LBU, RID_TMP, dest, (int32_t)offsetof(GCupval, closed));
+ if (irref_isk(ir->op1)) {
+ GCfunc *fn = ir_kfunc(IR(ir->op1));
+ GCobj *o = gcref(fn->l.uvptr[(ir->op2 >> 8)]);
+ emit_loada(as, dest, o);
} else {
- emit_tsi(as, MIPSI_AL, dest, uv, (int32_t)offsetof(GCupval, v));
+ emit_tsi(as, MIPSI_AL, dest, ra_alloc1(as, ir->op1, RSET_GPR),
+ (int32_t)offsetof(GCfuncL, uvptr) +
+ (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
}
- emit_tsi(as, MIPSI_AL, uv, func, (int32_t)offsetof(GCfuncL, uvptr) +
- (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
}
}
diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h
index 6555312d..8e9a92a4 100644
--- a/src/lj_asm_ppc.h
+++ b/src/lj_asm_ppc.h
@@ -840,23 +840,30 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
static void asm_uref(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
- if (irref_isk(ir->op1)) {
+ int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
+ if (irref_isk(ir->op1) && !guarded) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
emit_lsptr(as, PPCI_LWZ, dest, v, RSET_GPR);
} else {
- Reg uv = ra_scratch(as, RSET_GPR);
- Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
- if (ir->o == IR_UREFC) {
- asm_guardcc(as, CC_NE);
+ if (guarded) {
+ asm_guardcc(as, ir->o == IR_UREFC ? CC_NE : CC_EQ);
emit_ai(as, PPCI_CMPWI, RID_TMP, 1);
- emit_tai(as, PPCI_ADDI, dest, uv, (int32_t)offsetof(GCupval, tv));
- emit_tai(as, PPCI_LBZ, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
- } else {
- emit_tai(as, PPCI_LWZ, dest, uv, (int32_t)offsetof(GCupval, v));
}
- emit_tai(as, PPCI_LWZ, uv, func,
- (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
+ if (ir->o == IR_UREFC)
+ emit_tai(as, PPCI_ADDI, dest, dest, (int32_t)offsetof(GCupval, tv));
+ else
+ emit_tai(as, PPCI_LWZ, dest, dest, (int32_t)offsetof(GCupval, v));
+ if (guarded)
+ emit_tai(as, PPCI_LBZ, RID_TMP, dest, (int32_t)offsetof(GCupval, closed));
+ if (irref_isk(ir->op1)) {
+ GCfunc *fn = ir_kfunc(IR(ir->op1));
+ int32_t k = (int32_t)gcrefu(fn->l.uvptr[(ir->op2 >> 8)]);
+ emit_loadi(as, dest, k);
+ } else {
+ emit_tai(as, PPCI_LWZ, dest, ra_alloc1(as, ir->op1, RSET_GPR),
+ (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8));
+ }
}
}
diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h
index 9f779bf5..aee33716 100644
--- a/src/lj_asm_x86.h
+++ b/src/lj_asm_x86.h
@@ -109,7 +109,7 @@ static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
/* Check if there's no conflicting instruction between curins and ref.
** Also avoid fusing loads if there are multiple references.
*/
-static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
+static int noconflict(ASMState *as, IRRef ref, IROp conflict, int check)
{
IRIns *ir = as->ir;
IRRef i = as->curins;
@@ -118,7 +118,9 @@ static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
while (--i > ref) {
if (ir[i].o == conflict)
return 0; /* Conflict found. */
- else if (!noload && (ir[i].op1 == ref || ir[i].op2 == ref))
+ else if ((check & 1) && (ir[i].o == IR_NEWREF || ir[i].o == IR_CALLS))
+ return 0;
+ else if ((check & 2) && (ir[i].op1 == ref || ir[i].op2 == ref))
return 0;
}
return 1; /* Ok, no conflict. */
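
Note: noconflict()'s former `noload` boolean becomes a two-bit mask, so callers can independently request escape checking and single-use checking. A sketch with descriptive names (ours, not the source's):

  /* Hypothetical names for the 'check' bits of noconflict(). */
  enum {
    NOCONFLICT_ESCAPE = 1,  /* Also treat IR_NEWREF/IR_CALLS as conflicts:
                            ** both may move or clobber the loaded location. */
    NOCONFLICT_UNIQUE = 2   /* Also refuse fusion if 'ref' is referenced
                            ** again between ref and curins. */
  };
  /* Call sites below: asm_fuseabase passes 0, IR_ULOAD passes 2, and
  ** ALOAD/HLOAD pass 2+1, i.e. both checks. */
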
@@ -134,13 +136,14 @@ static IRRef asm_fuseabase(ASMState *as, IRRef ref)
lj_assertA(irb->op2 == IRFL_TAB_ARRAY, "expected FLOAD TAB_ARRAY");
/* We can avoid the FLOAD of t->array for colocated arrays. */
if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
- !neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 1)) {
+ !neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 0)) {
as->mrm.ofs = (int32_t)sizeof(GCtab); /* Ofs to colocated array. */
return irb->op1; /* Table obj. */
}
} else if (irb->o == IR_ADD && irref_isk(irb->op2)) {
/* Fuse base offset (vararg load). */
- as->mrm.ofs = IR(irb->op2)->i;
+ IRIns *irk = IR(irb->op2);
+ as->mrm.ofs = irk->o == IR_KINT ? irk->i : (int32_t)ir_kint64(irk)->u64;
return irb->op1;
}
return ref; /* Otherwise use the given array base. */
@@ -455,7 +458,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
if (ir->o == IR_SLOAD) {
if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) &&
- noconflict(as, ref, IR_RETF, 0) &&
+ noconflict(as, ref, IR_RETF, 2) &&
!(LJ_GC64 && irt_isaddr(ir->t))) {
as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
as->mrm.ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
@@ -466,12 +469,12 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
} else if (ir->o == IR_FLOAD) {
/* Generic fusion is only ok for 32 bit operand (but see asm_comp). */
if ((irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t)) &&
- noconflict(as, ref, IR_FSTORE, 0)) {
+ noconflict(as, ref, IR_FSTORE, 2)) {
asm_fusefref(as, ir, xallow);
return RID_MRM;
}
} else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
- if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0) &&
+ if (noconflict(as, ref, ir->o + IRDELTA_L2S, 2+(ir->o != IR_ULOAD)) &&
!(LJ_GC64 && irt_isaddr(ir->t))) {
asm_fuseahuref(as, ir->op1, xallow);
return RID_MRM;
@@ -481,7 +484,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
*/
if ((!irt_typerange(ir->t, IRT_I8, IRT_U16)) &&
- noconflict(as, ref, IR_XSTORE, 0)) {
+ noconflict(as, ref, IR_XSTORE, 2)) {
asm_fusexref(as, ir->op1, xallow);
return RID_MRM;
}
@@ -814,6 +817,7 @@ static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
emit_rr(as, XO_UCOMISD, left, tmp);
emit_rr(as, XO_CVTSI2SD, tmp, dest);
emit_rr(as, XO_XORPS, tmp, tmp); /* Avoid partial register stall. */
+ checkmclim(as);
emit_rr(as, XO_CVTTSD2SI, dest, left);
/* Can't fuse since left is needed twice. */
}
@@ -856,6 +860,7 @@ static void asm_conv(ASMState *as, IRIns *ir)
emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */
emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */
emit_rma(as, XO_MOVSD, bias, k);
+ checkmclim(as);
emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
return;
} else { /* Integer to FP conversion. */
@@ -1172,6 +1177,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
asm_guardcc(as, CC_E);
else
emit_sjcc(as, CC_E, l_end);
+ checkmclim(as);
if (irt_isnum(kt)) {
if (isk) {
/* Assumes -0.0 is already canonicalized to +0.0. */
@@ -1231,7 +1237,6 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
#endif
}
emit_sfixup(as, l_loop);
- checkmclim(as);
#if LJ_GC64
if (!isk && irt_isaddr(kt)) {
emit_rr(as, XO_OR, tmp|REX_64, key);
@@ -1258,6 +1263,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp);
emit_shifti(as, XOg_ROL, tmp, HASH_ROT3);
emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp);
+ checkmclim(as);
emit_shifti(as, XOg_ROL, dest, HASH_ROT2);
emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest);
emit_shifti(as, XOg_ROL, dest, HASH_ROT1);
@@ -1275,7 +1281,6 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
} else {
emit_rr(as, XO_MOV, tmp, key);
#if LJ_GC64
- checkmclim(as);
emit_gri(as, XG_ARITHi(XOg_XOR), dest, irt_toitype(kt) << 15);
if ((as->flags & JIT_F_BMI2)) {
emit_i8(as, 32);
@@ -1372,24 +1377,31 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
static void asm_uref(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
- if (irref_isk(ir->op1)) {
+ int guarded = (irt_t(ir->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC);
+ if (irref_isk(ir->op1) && !guarded) {
GCfunc *fn = ir_kfunc(IR(ir->op1));
MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
emit_rma(as, XO_MOV, dest|REX_GC64, v);
} else {
Reg uv = ra_scratch(as, RSET_GPR);
- Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
- if (ir->o == IR_UREFC) {
+ if (ir->o == IR_UREFC)
emit_rmro(as, XO_LEA, dest|REX_GC64, uv, offsetof(GCupval, tv));
- asm_guardcc(as, CC_NE);
- emit_i8(as, 1);
- emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
- } else {
+ else
emit_rmro(as, XO_MOV, dest|REX_GC64, uv, offsetof(GCupval, v));
+ if (guarded) {
+ asm_guardcc(as, ir->o == IR_UREFC ? CC_E : CC_NE);
+ emit_i8(as, 0);
+ emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed));
+ }
+ if (irref_isk(ir->op1)) {
+ GCfunc *fn = ir_kfunc(IR(ir->op1));
+ GCobj *o = gcref(fn->l.uvptr[(ir->op2 >> 8)]);
+ emit_loada(as, uv, o);
+ } else {
+ emit_rmro(as, XO_MOV, uv|REX_GC64, ra_alloc1(as, ir->op1, RSET_GPR),
+ (int32_t)offsetof(GCfuncL, uvptr) +
+ (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
}
- emit_rmro(as, XO_MOV, uv|REX_GC64, func,
- (int32_t)offsetof(GCfuncL, uvptr) +
- (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8));
}
}
@@ -1546,6 +1558,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
if (irt_islightud(ir->t)) {
Reg dest = asm_load_lightud64(as, ir, 1);
if (ra_hasreg(dest)) {
+ checkmclim(as);
asm_fuseahuref(as, ir->op1, RSET_GPR);
if (ir->o == IR_VLOAD) as->mrm.ofs += 8 * ir->op2;
emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
@@ -1593,6 +1606,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
if (LJ_64 && irt_type(ir->t) >= IRT_NUM) {
lj_assertA(irt_isinteger(ir->t) || irt_isnum(ir->t),
"bad load type %d", irt_type(ir->t));
+ checkmclim(as);
#if LJ_GC64
emit_u32(as, LJ_TISNUM << 15);
#else
diff --git a/src/lj_bcdump.h b/src/lj_bcdump.h
index 6ba71e25..3e56e39c 100644
--- a/src/lj_bcdump.h
+++ b/src/lj_bcdump.h
@@ -46,6 +46,8 @@
#define BCDUMP_F_KNOWN (BCDUMP_F_FR2*2-1)
+#define BCDUMP_F_DETERMINISTIC 0x80000000
+
/* Type codes for the GC constants of a prototype. Plus length for strings. */
enum {
BCDUMP_KGC_CHILD, BCDUMP_KGC_TAB, BCDUMP_KGC_I64, BCDUMP_KGC_U64,
@@ -61,7 +63,7 @@ enum {
/* -- Bytecode reader/writer ---------------------------------------------- */
LJ_FUNC int lj_bcwrite(lua_State *L, GCproto *pt, lua_Writer writer,
- void *data, int strip);
+ void *data, uint32_t flags);
LJ_FUNC GCproto *lj_bcread_proto(LexState *ls);
LJ_FUNC GCproto *lj_bcread(LexState *ls);
diff --git a/src/lj_bcread.c b/src/lj_bcread.c
index c98c0d42..637ef067 100644
--- a/src/lj_bcread.c
+++ b/src/lj_bcread.c
@@ -281,8 +281,11 @@ static void bcread_knum(LexState *ls, GCproto *pt, MSize sizekn)
static void bcread_bytecode(LexState *ls, GCproto *pt, MSize sizebc)
{
BCIns *bc = proto_bc(pt);
- bc[0] = BCINS_AD((pt->flags & PROTO_VARARG) ? BC_FUNCV : BC_FUNCF,
- pt->framesize, 0);
+ BCIns op;
+ if (ls->fr2 != LJ_FR2) op = BC_NOT; /* Mark non-native prototype. */
+ else if ((pt->flags & PROTO_VARARG)) op = BC_FUNCV;
+ else op = BC_FUNCF;
+ bc[0] = BCINS_AD(op, pt->framesize, 0);
bcread_block(ls, bc+1, (sizebc-1)*(MSize)sizeof(BCIns));
/* Swap bytecode instructions if the endianness differs. */
if (bcread_swap(ls)) {
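
Note: bcread_bytecode() repurposes the first instruction slot as an in-band format tag. A valid prototype always starts with BC_FUNCF or BC_FUNCV, so BC_NOT can safely mark a prototype whose frame layout (FR2) is not the native one; lj_bcwrite() below recovers the dump's FR2 flag from that slot. A sketch of the round trip, assuming the lj_bc.h/lj_bcdump.h definitions:

  /* Sketch of the FR2 marker round trip (relies on BC_NOT never being
  ** a legitimate first instruction of a prototype). */
  static void tag_proto(BCIns *bc, int native, int vararg, int framesize)
  {
    BCOp op = !native ? BC_NOT : (vararg ? BC_FUNCV : BC_FUNCF);
    bc[0] = BCINS_AD(op, framesize, 0);
  }

  static uint32_t proto_fr2_flag(const BCIns *bc)
  {
    /* Mirrors lj_bcwrite(): a native proto dumps with the host's FR2
    ** bit, a BC_NOT-tagged proto dumps with the opposite one. */
    return ((bc_op(bc[0]) != BC_NOT) == LJ_FR2) ? BCDUMP_F_FR2 : 0;
  }
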
@@ -395,7 +398,7 @@ static int bcread_header(LexState *ls)
bcread_byte(ls) != BCDUMP_VERSION) return 0;
bcread_flags(ls) = flags = bcread_uleb128(ls);
if ((flags & ~(BCDUMP_F_KNOWN)) != 0) return 0;
- if ((flags & BCDUMP_F_FR2) != LJ_FR2*BCDUMP_F_FR2) return 0;
+ if ((flags & BCDUMP_F_FR2) != (uint32_t)ls->fr2*BCDUMP_F_FR2) return 0;
if ((flags & BCDUMP_F_FFI)) {
#if LJ_HASFFI
lua_State *L = ls->L;
diff --git a/src/lj_bcwrite.c b/src/lj_bcwrite.c
index dd969413..ddfa46c5 100644
--- a/src/lj_bcwrite.c
+++ b/src/lj_bcwrite.c
@@ -27,7 +27,9 @@ typedef struct BCWriteCtx {
GCproto *pt; /* Root prototype. */
lua_Writer wfunc; /* Writer callback. */
void *wdata; /* Writer callback data. */
- int strip; /* Strip debug info. */
+ TValue **heap; /* Heap used for deterministic sorting. */
+ uint32_t heapsz; /* Size of heap. */
+ uint32_t flags; /* BCDUMP_F_* flags. */
int status; /* Status from writer callback. */
#ifdef LUA_USE_ASSERT
global_State *g;
@@ -76,6 +78,75 @@ static void bcwrite_ktabk(BCWriteCtx *ctx, cTValue *o, int narrow)
ctx->sb.w = p;
}
+/* Compare two template table keys. */
+static LJ_AINLINE int bcwrite_ktabk_lt(TValue *a, TValue *b)
+{
+ uint32_t at = itype(a), bt = itype(b);
+ if (at != bt) { /* This also handles false and true keys. */
+ return at < bt;
+ } else if (at == LJ_TSTR) {
+ return lj_str_cmp(strV(a), strV(b)) < 0;
+ } else {
+ return a->u64 < b->u64; /* This works for numbers and integers. */
+ }
+}
+
+/* Insert key into a sorted heap. */
+static void bcwrite_ktabk_heap_insert(TValue **heap, MSize idx, MSize end,
+ TValue *key)
+{
+ MSize child;
+ while ((child = idx * 2 + 1) < end) {
+ /* Find lower of the two children. */
+ TValue *c0 = heap[child];
+ if (child + 1 < end) {
+ TValue *c1 = heap[child + 1];
+ if (bcwrite_ktabk_lt(c1, c0)) {
+ c0 = c1;
+ child++;
+ }
+ }
+ if (bcwrite_ktabk_lt(key, c0)) break; /* Key lower? Found our position. */
+ heap[idx] = c0; /* Move lower child up. */
+ idx = child; /* Descend. */
+ }
+ heap[idx] = key; /* Insert key here. */
+}
+
+/* Resize heap, dropping content. */
+static void bcwrite_heap_resize(BCWriteCtx *ctx, uint32_t nsz)
+{
+ lua_State *L = sbufL(&ctx->sb);
+ if (ctx->heapsz) {
+ lj_mem_freevec(G(L), ctx->heap, ctx->heapsz, TValue *);
+ ctx->heapsz = 0;
+ }
+ if (nsz) {
+ ctx->heap = lj_mem_newvec(L, nsz, TValue *);
+ ctx->heapsz = nsz;
+ }
+}
+
+/* Write hash part of template table in sorted order. */
+static void bcwrite_ktab_sorted_hash(BCWriteCtx *ctx, Node *node, MSize nhash)
+{
+ TValue **heap = ctx->heap;
+ MSize i = nhash;
+ for (;; node--) { /* Build heap. */
+ if (!tvisnil(&node->key)) {
+ bcwrite_ktabk_heap_insert(heap, --i, nhash, &node->key);
+ if (i == 0) break;
+ }
+ }
+ do { /* Drain heap. */
+ TValue *key = heap[0]; /* Output lowest key from top. */
+ bcwrite_ktabk(ctx, key, 0);
+ bcwrite_ktabk(ctx, (TValue *)((char *)key - offsetof(Node, key)), 1);
+ key = heap[--nhash]; /* Remove last key. */
+ bcwrite_ktabk_heap_insert(heap, 0, nhash, key); /* Re-insert. */
+ } while (nhash);
+}
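
Note: the deterministic hash dump is a textbook in-place heapsort: build a min-heap over the key pointers with the sift-down above, then repeatedly emit the root and re-sift the last element. The matching value is recovered from the key pointer via the Node layout: val sits at offset 0, so subtracting offsetof(Node, key) from the key address lands on the value. The same sift-down/drain pattern over plain integers, runnable standalone:

  #include <stdio.h>

  /* Sift 'key' down from 'idx' in a min-heap of 'end' elements
  ** (same shape as bcwrite_ktabk_heap_insert above). */
  static void sift_down(int *heap, int idx, int end, int key)
  {
    int child;
    while ((child = idx*2 + 1) < end) {
      if (child + 1 < end && heap[child+1] < heap[child]) child++;
      if (key < heap[child]) break;  /* Found the slot for key. */
      heap[idx] = heap[child];       /* Move smaller child up. */
      idx = child;
    }
    heap[idx] = key;
  }

  int main(void)
  {
    int a[] = {42, 7, 19, 3, 23}, n = 5, i;
    for (i = n-1; i >= 0; i--) sift_down(a, i, n, a[i]);  /* Build heap. */
    while (n) {                    /* Drain: emit root, re-sift last. */
      printf("%d\n", a[0]);
      sift_down(a, 0, --n, a[n]);
    }
    return 0;                      /* Prints 3 7 19 23 42. */
  }
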
+
/* Write a template table. */
static void bcwrite_ktab(BCWriteCtx *ctx, char *p, const GCtab *t)
{
@@ -92,7 +163,7 @@ static void bcwrite_ktab(BCWriteCtx *ctx, char *p, const GCtab *t)
MSize i, hmask = t->hmask;
Node *node = noderef(t->node);
for (i = 0; i <= hmask; i++)
- nhash += !tvisnil(&node[i].val);
+ nhash += !tvisnil(&node[i].key);
}
/* Write number of array slots and hash slots. */
p = lj_strfmt_wuleb128(p, narray);
@@ -105,14 +176,20 @@ static void bcwrite_ktab(BCWriteCtx *ctx, char *p, const GCtab *t)
bcwrite_ktabk(ctx, o, 1);
}
if (nhash) { /* Write hash entries. */
- MSize i = nhash;
Node *node = noderef(t->node) + t->hmask;
- for (;; node--)
- if (!tvisnil(&node->val)) {
- bcwrite_ktabk(ctx, &node->key, 0);
- bcwrite_ktabk(ctx, &node->val, 1);
- if (--i == 0) break;
- }
+ if ((ctx->flags & BCDUMP_F_DETERMINISTIC) && nhash > 1) {
+ if (ctx->heapsz < nhash)
+ bcwrite_heap_resize(ctx, t->hmask + 1);
+ bcwrite_ktab_sorted_hash(ctx, node, nhash);
+ } else {
+ MSize i = nhash;
+ for (;; node--)
+ if (!tvisnil(&node->key)) {
+ bcwrite_ktabk(ctx, &node->key, 0);
+ bcwrite_ktabk(ctx, &node->val, 1);
+ if (--i == 0) break;
+ }
+ }
}
}
@@ -269,7 +346,7 @@ static void bcwrite_proto(BCWriteCtx *ctx, GCproto *pt)
p = lj_strfmt_wuleb128(p, pt->sizekgc);
p = lj_strfmt_wuleb128(p, pt->sizekn);
p = lj_strfmt_wuleb128(p, pt->sizebc-1);
- if (!ctx->strip) {
+ if (!(ctx->flags & BCDUMP_F_STRIP)) {
if (proto_lineinfo(pt))
sizedbg = pt->sizept - (MSize)((char *)proto_lineinfo(pt) - (char *)pt);
p = lj_strfmt_wuleb128(p, sizedbg);
@@ -317,11 +394,10 @@ static void bcwrite_header(BCWriteCtx *ctx)
*p++ = BCDUMP_HEAD2;
*p++ = BCDUMP_HEAD3;
*p++ = BCDUMP_VERSION;
- *p++ = (ctx->strip ? BCDUMP_F_STRIP : 0) +
+ *p++ = (ctx->flags & (BCDUMP_F_STRIP | BCDUMP_F_FR2)) +
LJ_BE*BCDUMP_F_BE +
- ((ctx->pt->flags & PROTO_FFI) ? BCDUMP_F_FFI : 0) +
- LJ_FR2*BCDUMP_F_FR2;
- if (!ctx->strip) {
+ ((ctx->pt->flags & PROTO_FFI) ? BCDUMP_F_FFI : 0);
+ if (!(ctx->flags & BCDUMP_F_STRIP)) {
p = lj_strfmt_wuleb128(p, len);
p = lj_buf_wmem(p, name, len);
}
@@ -352,14 +428,16 @@ static TValue *cpwriter(lua_State *L, lua_CFunction dummy, void *ud)
/* Write bytecode for a prototype. */
int lj_bcwrite(lua_State *L, GCproto *pt, lua_Writer writer, void *data,
- int strip)
+ uint32_t flags)
{
BCWriteCtx ctx;
int status;
ctx.pt = pt;
ctx.wfunc = writer;
ctx.wdata = data;
- ctx.strip = strip;
+ ctx.heapsz = 0;
+ if ((bc_op(proto_bc(pt)[0]) != BC_NOT) == LJ_FR2) flags |= BCDUMP_F_FR2;
+ ctx.flags = flags;
ctx.status = 0;
#ifdef LUA_USE_ASSERT
ctx.g = G(L);
@@ -368,6 +446,7 @@ int lj_bcwrite(lua_State *L, GCproto *pt, lua_Writer writer, void *data,
status = lj_vm_cpcall(L, NULL, &ctx, cpwriter);
if (status == 0) status = ctx.status;
lj_buf_free(G(sbufL(&ctx.sb)), &ctx.sb);
+ bcwrite_heap_resize(&ctx, 0);
return status;
}
diff --git a/src/lj_carith.c b/src/lj_carith.c
index df5f801e..9bea0a33 100644
--- a/src/lj_carith.c
+++ b/src/lj_carith.c
@@ -44,9 +44,13 @@ static int carith_checkarg(lua_State *L, CTState *cts, CDArith *ca)
p = (uint8_t *)cdata_getptr(p, ct->size);
if (ctype_isref(ct->info)) ct = ctype_rawchild(cts, ct);
} else if (ctype_isfunc(ct->info)) {
+ CTypeID id0 = i ? ctype_typeid(cts, ca->ct[0]) : 0;
p = (uint8_t *)*(void **)p;
ct = ctype_get(cts,
lj_ctype_intern(cts, CTINFO(CT_PTR, CTALIGN_PTR|id), CTSIZE_PTR));
+ if (i) { /* cts->tab may have been reallocated. */
+ ca->ct[0] = ctype_get(cts, id0);
+ }
}
if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct);
ca->ct[i] = ct;
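
Note: the lj_carith.c change fixes a pointer-invalidation hazard: lj_ctype_intern() can reallocate cts->tab, so the CType pointer cached in ca->ct[0] must be saved as a stable index and re-derived after the call. The general pattern, sketched standalone (error handling omitted):

  #include <stdlib.h>

  typedef struct { int info; } CT;             /* Stand-in for CType. */
  typedef struct { CT *tab; size_t n, cap; } State;

  /* May grow s->tab, invalidating every CT* taken before the call. */
  static size_t intern(State *s, int info)
  {
    if (s->n == s->cap) {
      s->cap = s->cap ? 2*s->cap : 8;
      s->tab = (CT *)realloc(s->tab, s->cap * sizeof(CT));
    }
    s->tab[s->n].info = info;
    return s->n++;
  }

  static CT *use_after_intern(State *s, CT *cached)
  {
    size_t id = (size_t)(cached - s->tab);  /* Save the stable index... */
    intern(s, 42);                          /* ...this may move s->tab... */
    return &s->tab[id];                     /* ...so re-derive the pointer. */
  }
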
diff --git a/src/lj_ccall.c b/src/lj_ccall.c
index 00e753b9..5f95f5d8 100644
--- a/src/lj_ccall.c
+++ b/src/lj_ccall.c
@@ -985,6 +985,14 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
fid = ctf->sib;
}
+#if LJ_TARGET_ARM64 && LJ_ABI_WIN
+ if ((ct->info & CTF_VARARG)) {
+ nsp -= maxgpr * CTSIZE_PTR; /* May end up with negative nsp. */
+ ngpr = maxgpr;
+ nfpr = CCALL_NARG_FPR;
+ }
+#endif
+
/* Walk through all passed arguments. */
for (o = L->base+1, narg = 1; o < top; o++, narg++) {
CTypeID did;
@@ -1035,9 +1043,14 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
align = CTSIZE_PTR-1;
nsp = (nsp + align) & ~align;
}
+#if LJ_TARGET_ARM64 && LJ_ABI_WIN
+ /* A negative nsp points into cc->gpr. Blame MS for their messy ABI. */
+ dp = ((uint8_t *)cc->stack) + (int32_t)nsp;
+#else
dp = ((uint8_t *)cc->stack) + nsp;
+#endif
nsp += CCALL_PACK_STACKARG ? sz : n * CTSIZE_PTR;
- if (nsp > CCALL_SIZE_STACK) { /* Too many arguments. */
+ if ((int32_t)nsp > CCALL_SIZE_STACK) { /* Too many arguments. */
err_nyi:
lj_err_caller(L, LJ_ERR_FFI_NYICALL);
}
@@ -1099,6 +1112,9 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct,
#endif
}
if (fid) lj_err_caller(L, LJ_ERR_FFI_NUMARG); /* Too few arguments. */
+#if LJ_TARGET_ARM64 && LJ_ABI_WIN
+ if ((int32_t)nsp < 0) nsp = 0;
+#endif
#if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP)
cc->nfpr = nfpr; /* Required for vararg functions. */
diff --git a/src/lj_cparse.c b/src/lj_cparse.c
index 226bab46..9e646c7f 100644
--- a/src/lj_cparse.c
+++ b/src/lj_cparse.c
@@ -1768,9 +1768,11 @@ static void cp_pragma(CPState *cp, BCLine pragmaline)
cp_check(cp, '(');
if (cp->tok == CTOK_IDENT) {
if (cp_str_is(cp->str, "push")) {
- if (cp->curpack < CPARSE_MAX_PACKSTACK) {
+ if (cp->curpack < CPARSE_MAX_PACKSTACK-1) {
cp->packstack[cp->curpack+1] = cp->packstack[cp->curpack];
cp->curpack++;
+ } else {
+ cp_errmsg(cp, cp->tok, LJ_ERR_XLEVELS);
}
} else if (cp_str_is(cp->str, "pop")) {
if (cp->curpack > 0) cp->curpack--;
diff --git a/src/lj_crecord.c b/src/lj_crecord.c
index d7a522fb..55d0b3ef 100644
--- a/src/lj_crecord.c
+++ b/src/lj_crecord.c
@@ -1118,12 +1118,8 @@ static TRef crec_call_args(jit_State *J, RecordFFData *rd,
ngpr = 1;
else if (ctype_cconv(ct->info) == CTCC_FASTCALL)
ngpr = 2;
-#elif LJ_TARGET_ARM64
-#if LJ_ABI_WIN
-#error "NYI: ARM64 Windows ABI calling conventions"
-#elif LJ_TARGET_OSX
+#elif LJ_TARGET_ARM64 && LJ_TARGET_OSX
int ngpr = CCALL_NARG_GPR;
-#endif
#endif
/* Skip initial attributes. */
diff --git a/src/lj_ctype.h b/src/lj_ctype.h
index 2a74e321..917346a3 100644
--- a/src/lj_ctype.h
+++ b/src/lj_ctype.h
@@ -276,6 +276,8 @@ typedef struct CTState {
#define CTTYDEFP(_)
#endif
+#define CTF_LONG_IF8 (CTF_LONG * (sizeof(long) == 8))
+
/* Common types. */
#define CTTYDEF(_) \
_(NONE, 0, CT_ATTRIB, CTATTRIB(CTA_BAD)) \
@@ -289,8 +291,8 @@ typedef struct CTState {
_(UINT16, 2, CT_NUM, CTF_UNSIGNED|CTALIGN(1)) \
_(INT32, 4, CT_NUM, CTALIGN(2)) \
_(UINT32, 4, CT_NUM, CTF_UNSIGNED|CTALIGN(2)) \
- _(INT64, 8, CT_NUM, CTF_LONG|CTALIGN(3)) \
- _(UINT64, 8, CT_NUM, CTF_UNSIGNED|CTF_LONG|CTALIGN(3)) \
+ _(INT64, 8, CT_NUM, CTF_LONG_IF8|CTALIGN(3)) \
+ _(UINT64, 8, CT_NUM, CTF_UNSIGNED|CTF_LONG_IF8|CTALIGN(3)) \
_(FLOAT, 4, CT_NUM, CTF_FP|CTALIGN(2)) \
_(DOUBLE, 8, CT_NUM, CTF_FP|CTALIGN(3)) \
_(COMPLEX_FLOAT, 8, CT_ARRAY, CTF_COMPLEX|CTALIGN(2)|CTID_FLOAT) \
diff --git a/src/lj_debug.c b/src/lj_debug.c
index fa189b6e..8d8b9eb5 100644
--- a/src/lj_debug.c
+++ b/src/lj_debug.c
@@ -64,6 +64,7 @@ static BCPos debug_framepc(lua_State *L, GCfunc *fn, cTValue *nextframe)
if (cf == NULL || (char *)cframe_pc(cf) == (char *)cframe_L(cf))
return NO_BCPOS;
ins = cframe_pc(cf); /* Only happens during error/hook handling. */
+ if (!ins) return NO_BCPOS;
} else {
if (frame_islua(nextframe)) {
ins = frame_pc(nextframe);
diff --git a/src/lj_def.h b/src/lj_def.h
index 9b5a5977..7eebf58c 100644
--- a/src/lj_def.h
+++ b/src/lj_def.h
@@ -69,7 +69,7 @@ typedef unsigned int uintptr_t;
#define LJ_MAX_UPVAL 160 /* Max. # of upvalues. */
#define LJ_MAX_IDXCHAIN 100 /* __index/__newindex chain limit. */
-#define LJ_STACK_EXTRA (5+2*LJ_FR2) /* Extra stack space (metamethods). */
+#define LJ_STACK_EXTRA (5+3*LJ_FR2) /* Extra stack space (metamethods). */
#define LJ_NUM_CBPAGE 1 /* Number of FFI callback pages. */
@@ -146,15 +146,9 @@ typedef uintptr_t BloomFilter;
#define LJ_UNLIKELY(x) __builtin_expect(!!(x), 0)
#define lj_ffs(x) ((uint32_t)__builtin_ctz(x))
-/* Don't ask ... */
-#if defined(__INTEL_COMPILER) && (defined(__i386__) || defined(__x86_64__))
-static LJ_AINLINE uint32_t lj_fls(uint32_t x)
-{
- uint32_t r; __asm__("bsrl %1, %0" : "=r" (r) : "rm" (x) : "cc"); return r;
-}
-#else
#define lj_fls(x) ((uint32_t)(__builtin_clz(x)^31))
-#endif
+#define lj_ffs64(x) ((uint32_t)__builtin_ctzll(x))
+#define lj_fls64(x) ((uint32_t)(__builtin_clzll(x)^63))
#if defined(__arm__)
static LJ_AINLINE uint32_t lj_bswap(uint32_t x)
@@ -277,6 +271,23 @@ static LJ_AINLINE uint32_t lj_fls(uint32_t x)
{
unsigned long r; _BitScanReverse(&r, x); return (uint32_t)r;
}
+
+#if defined(_M_X64) || defined(_M_ARM64)
+unsigned char _BitScanForward64(unsigned long *, uint64_t);
+unsigned char _BitScanReverse64(unsigned long *, uint64_t);
+#pragma intrinsic(_BitScanForward64)
+#pragma intrinsic(_BitScanReverse64)
+
+static LJ_AINLINE uint32_t lj_ffs64(uint64_t x)
+{
+ unsigned long r; _BitScanForward64(&r, x); return (uint32_t)r;
+}
+
+static LJ_AINLINE uint32_t lj_fls64(uint64_t x)
+{
+ unsigned long r; _BitScanReverse64(&r, x); return (uint32_t)r;
+}
+#endif
#endif
unsigned long _byteswap_ulong(unsigned long);
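
Note: lj_ffs64()/lj_fls64() return the bit index of the lowest/highest set bit of a 64-bit value; like the 32-bit variants, the result for 0 is undefined. Quick sanity check, assuming the GCC/Clang builtins shown above:

  #include <assert.h>
  #include <stdint.h>

  #define lj_ffs64(x) ((uint32_t)__builtin_ctzll(x))
  #define lj_fls64(x) ((uint32_t)(__builtin_clzll(x)^63))

  int main(void)
  {
    assert(lj_ffs64(0x10) == 4);
    assert(lj_fls64(0x10) == 4);
    assert(lj_ffs64(0x8000000000000001ull) == 0);
    assert(lj_fls64(0x8000000000000001ull) == 63);
    return 0;
  }
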
diff --git a/src/lj_dispatch.c b/src/lj_dispatch.c
index 57809e62..b9748bba 100644
--- a/src/lj_dispatch.c
+++ b/src/lj_dispatch.c
@@ -453,7 +453,7 @@ static int call_init(lua_State *L, GCfunc *fn)
int numparams = pt->numparams;
int gotparams = (int)(L->top - L->base);
int need = pt->framesize;
- if ((pt->flags & PROTO_VARARG)) need += 1+gotparams;
+ if ((pt->flags & PROTO_VARARG)) need += 1+LJ_FR2+gotparams;
lj_state_checkstack(L, (MSize)need);
numparams -= gotparams;
return numparams >= 0 ? numparams : 0;
diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h
index 6926c71a..51d0c351 100644
--- a/src/lj_emit_arm64.h
+++ b/src/lj_emit_arm64.h
@@ -20,7 +20,7 @@ static uint64_t get_k64val(ASMState *as, IRRef ref)
} else {
lj_assertA(ir->o == IR_KINT || ir->o == IR_KNULL,
"bad 64 bit const IR op %d", ir->o);
- return ir->i; /* Sign-extended. */
+ return (uint32_t)ir->i; /* Zero-extended. */
}
}
@@ -30,39 +30,31 @@ static uint32_t emit_isk12(int64_t n)
uint64_t k = n < 0 ? ~(uint64_t)n+1u : (uint64_t)n;
uint32_t m = n < 0 ? 0x40000000 : 0;
if (k < 0x1000) {
- return A64I_K12|m|A64F_U12(k);
+ return (uint32_t)(A64I_K12|m|A64F_U12(k));
} else if ((k & 0xfff000) == k) {
- return A64I_K12|m|0x400000|A64F_U12(k>>12);
+ return (uint32_t)(A64I_K12|m|0x400000|A64F_U12(k>>12));
}
return 0;
}
-#define emit_clz64(n) __builtin_clzll(n)
-#define emit_ctz64(n) __builtin_ctzll(n)
+#define emit_clz64(n) (lj_fls64(n)^63)
+#define emit_ctz64(n) lj_ffs64(n)
/* Encode constant in K13 format for logical data processing instructions. */
static uint32_t emit_isk13(uint64_t n, int is64)
{
- int inv = 0, w = 128, lz, tz;
- if (n & 1) { n = ~n; w = 64; inv = 1; } /* Avoid wrap-around of ones. */
- if (!n) return 0; /* Neither all-zero nor all-ones are allowed. */
- do { /* Find the repeat width. */
- if (is64 && (uint32_t)(n^(n>>32))) break;
- n = (uint32_t)n;
- if (!n) return 0; /* Ditto when passing n=0xffffffff and is64=0. */
- w = 32; if ((n^(n>>16)) & 0xffff) break;
- n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break;
- n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break;
- n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break;
- n = n & 0x3; w = 2;
- } while (0);
- lz = emit_clz64(n);
- tz = emit_ctz64(n);
- if ((int64_t)(n << lz) >> (lz+tz) != -1ll) return 0; /* Non-contiguous? */
- if (inv)
- return A64I_K13 | (((lz-w) & 127) << 16) | (((lz+tz-w-1) & 63) << 10);
- else
- return A64I_K13 | ((w-tz) << 16) | (((63-lz-tz-w-w) & 63) << 10);
+ /* Thanks to: https://dougallj.wordpress.com/2021/10/30/ */
+ int rot, ones, size, immr, imms;
+ if (!is64) n = ((uint64_t)n << 32) | (uint32_t)n;
+ if ((n+1u) <= 1u) return 0; /* Neither all-zero nor all-ones are allowed. */
+ rot = (n & (n+1u)) ? emit_ctz64(n & (n+1u)) : 64;
+ n = lj_ror(n, rot & 63);
+ ones = emit_ctz64(~n);
+ size = emit_clz64(n) + ones;
+ if (lj_ror(n, size & 63) != n) return 0; /* Non-repeating? */
+ immr = -rot & (size - 1);
+ imms = (-(size << 1) | (ones - 1)) & 63;
+ return A64I_K13 | A64F_IMMR(immr | (size & 64)) | A64F_IMMS(imms);
}
static uint32_t emit_isfpk64(uint64_t n)
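
Note: the rewritten emit_isk13() is Dougall Johnson's branch-light bitmask-immediate test: rotate the value so a run of ones sits at bit 0, measure the run length and the element size, then verify the element repeats across the word. A standalone model with a worked example; the IMMR/IMMS field offsets 16 and 10 match A64F_IMMR/A64F_IMMS above, the N bit is folded in as bit 6 of the immr argument, and the real function additionally ORs in the A64I_K13 marker:

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t ror64(uint64_t x, int r)
  {
    return (x >> (r & 63)) | (x << (-r & 63));
  }

  static uint32_t isk13(uint64_t n, int is64)
  {
    int rot, ones, size, immr, imms;
    if (!is64) n = (n << 32) | (uint32_t)n;
    if (n+1u <= 1u) return 0;          /* All-zeros/all-ones: invalid. */
    rot = (n & (n+1u)) ? __builtin_ctzll(n & (n+1u)) : 64;
    n = ror64(n, rot);                 /* Put a run of ones at bit 0. */
    ones = __builtin_ctzll(~n);        /* Length of that run. */
    size = __builtin_clzll(n) + ones;  /* Element size: 2, 4, ..., 64. */
    if (ror64(n, size) != n) return 0; /* Element must repeat. */
    immr = -rot & (size - 1);
    imms = (-(size << 1) | (ones - 1)) & 63;
    return (uint32_t)(((immr | (size & 64)) << 16) | (imms << 10));
  }

  int main(void)
  {
    uint32_t k = isk13(0x00ff00ff00ff00ffull, 1);
    /* Expect N=0, immr=0, imms=0x27: element size 16, run of 8 ones. */
    printf("N=%u immr=%u imms=0x%02x\n", (k>>22)&1, (k>>16)&63, (k>>10)&63);
    return 0;
  }
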
@@ -121,9 +113,20 @@ static int emit_checkofs(A64Ins ai, int64_t ofs)
}
}
-static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
+static LJ_AINLINE uint32_t emit_lso_pair_candidate(A64Ins ai, int ofs, int sc)
{
- int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3;
+ if (ofs >= 0) {
+ return ai | A64F_U12(ofs>>sc); /* Subsequent lj_ror checks ofs. */
+ } else if (ofs >= -256) {
+ return (ai^A64I_LS_U) | A64F_S9(ofs & 0x1ff);
+ } else {
+ return A64F_D(31); /* Will mismatch prev. */
+ }
+}
+
+static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs64)
+{
+ int ot = emit_checkofs(ai, ofs64), sc = (ai >> 30) & 3, ofs = (int)ofs64;
lj_assertA(ot, "load/store offset %d out of range", ofs);
/* Combine LDR/STR pairs to LDP/STP. */
if ((sc == 2 || sc == 3) &&
@@ -132,11 +135,9 @@ static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
uint32_t prev = *as->mcp & ~A64F_D(31);
     int ofsm = ofs - (1<<sc), ofsp = ofs + (1<<sc);
     A64Ins aip;
-    if (prev == (ai | A64F_N(rn) | A64F_U12(ofsm>>sc)) ||
- prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) {
+ if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsm, sc)) {
aip = (A64F_A(rd) | A64F_D(*as->mcp & 31));
- } else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) ||
- prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) {
+ } else if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsp, sc)) {
aip = (A64F_D(rd) | A64F_A(*as->mcp & 31));
ofsm = ofs;
} else {
@@ -158,13 +159,12 @@ nopair:
/* -- Emit loads/stores --------------------------------------------------- */
/* Prefer rematerialization of BASE/L from global_State over spills. */
-#define emit_canremat(ref) ((ref) <= ASMREF_L)
+#define emit_canremat(ref) ((ref) <= REF_BASE)
-/* Try to find an N-step delta relative to other consts with N < lim. */
-static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim)
+/* Try to find a one-step delta relative to other consts. */
+static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int is64)
{
RegSet work = (~as->freeset & RSET_GPR) | RID2RSET(RID_GL);
- if (lim <= 1) return 0; /* Can't beat that. */
while (work) {
Reg r = rset_picktop(work);
IRRef ref = regcost_ref(as->cost[r]);
@@ -173,13 +173,14 @@ static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim)
uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) :
get_k64val(as, ref);
int64_t delta = (int64_t)(k - kx);
+ if (!is64) delta = (int64_t)(int32_t)delta; /* Sign-extend. */
if (delta == 0) {
- emit_dm(as, A64I_MOVx, rd, r);
+ emit_dm(as, is64|A64I_MOVw, rd, r);
return 1;
} else {
uint32_t k12 = emit_isk12(delta < 0 ? (int64_t)(~(uint64_t)delta+1u) : delta);
if (k12) {
- emit_dn(as, (delta < 0 ? A64I_SUBx : A64I_ADDx)^k12, rd, r);
+ emit_dn(as, (delta < 0 ? A64I_SUBw : A64I_ADDw)^is64^k12, rd, r);
return 1;
}
/* Do other ops or multi-step deltas pay off? Probably not.
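
Note: emit_kdelta() now only attempts one-step deltas: if some live register already holds a constant kx, the target k is synthesized either as a plain register move (delta 0) or as a single ADD/SUB with a K12 immediate, i.e. a 12-bit value optionally shifted left by 12. A quick feasibility check of that rule:

  #include <stdint.h>

  /* Can k be derived from a register holding kx in one instruction?
  ** Mirrors emit_isk12(): plain imm12, or imm12 shifted by 12 bits. */
  static int kdelta_fits(uint64_t k, uint64_t kx)
  {
    int64_t d = (int64_t)(k - kx);
    uint64_t a = d < 0 ? ~(uint64_t)d + 1u : (uint64_t)d;
    return d == 0 || a < 0x1000 || (a & 0xfff000ull) == a;
  }
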
@@ -192,54 +193,6 @@ static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim)
return 0; /* Failed. */
}
-static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64)
-{
- int i, zeros = 0, ones = 0, neg;
- if (!is64) u64 = (int64_t)(int32_t)u64; /* Sign-extend. */
- /* Count homogeneous 16 bit fragments. */
- for (i = 0; i < 4; i++) {
- uint64_t frag = (u64 >> i*16) & 0xffff;
- zeros += (frag == 0);
- ones += (frag == 0xffff);
- }
- neg = ones > zeros; /* Use MOVN if it pays off. */
- if ((neg ? ones : zeros) < 3) { /* Need 2+ ins. Try shorter K13 encoding. */
- uint32_t k13 = emit_isk13(u64, is64);
- if (k13) {
- emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO);
- return;
- }
- }
- if (!emit_kdelta(as, rd, u64, 4 - (neg ? ones : zeros))) {
- int shift = 0, lshift = 0;
- uint64_t n64 = neg ? ~u64 : u64;
- if (n64 != 0) {
- /* Find first/last fragment to be filled. */
- shift = (63-emit_clz64(n64)) & ~15;
- lshift = emit_ctz64(n64) & ~15;
- }
- /* MOVK requires the original value (u64). */
- while (shift > lshift) {
- uint32_t u16 = (u64 >> shift) & 0xffff;
- /* Skip fragments that are correctly filled by MOVN/MOVZ. */
- if (u16 != (neg ? 0xffff : 0))
- emit_d(as, is64 | A64I_MOVKw | A64F_U16(u16) | A64F_LSL16(shift), rd);
- shift -= 16;
- }
- /* But MOVN needs an inverted value (n64). */
- emit_d(as, (neg ? A64I_MOVNx : A64I_MOVZx) |
- A64F_U16((n64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd);
- }
-}
-
-/* Load a 32 bit constant into a GPR. */
-#define emit_loadi(as, rd, i) emit_loadk(as, rd, i, 0)
-
-/* Load a 64 bit constant into a GPR. */
-#define emit_loadu64(as, rd, i) emit_loadk(as, rd, i, A64I_X)
-
-#define emit_loada(as, r, addr) emit_loadu64(as, (r), (uintptr_t)(addr))
-
#define glofs(as, k) \
((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
#define mcpofs(as, k) \
@@ -247,24 +200,94 @@ static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64)
#define checkmcpofs(as, k) \
(A64F_S_OK(mcpofs(as, k)>>2, 19))
+/* Try to form a const as ADR or ADRP or ADRP + ADD. */
+static int emit_kadrp(ASMState *as, Reg rd, uint64_t k)
+{
+ A64Ins ai = A64I_ADR;
+ int64_t ofs = mcpofs(as, k);
+ if (!A64F_S_OK((uint64_t)ofs, 21)) {
+ uint64_t kpage = k & ~0xfffull;
+ MCode *adrp = as->mcp - 1 - (k != kpage);
+ ofs = (int64_t)(kpage - ((uint64_t)adrp & ~0xfffull)) >> 12;
+ if (!A64F_S_OK(ofs, 21))
+ return 0; /* Failed. */
+ if (k != kpage)
+ emit_dn(as, (A64I_ADDx^A64I_K12)|A64F_U12(k - kpage), rd, rd);
+ ai = A64I_ADRP;
+ }
+ emit_d(as, ai|(((uint32_t)ofs&3)<<29)|A64F_S19(ofs>>2), rd);
+ return 1;
+}
+
+static void emit_loadk(ASMState *as, Reg rd, uint64_t u64)
+{
+ int zeros = 0, ones = 0, neg, lshift = 0;
+ int is64 = (u64 >> 32) ? A64I_X : 0, i = is64 ? 4 : 2;
+ /* Count non-homogeneous 16 bit fragments. */
+ while (--i >= 0) {
+ uint32_t frag = (u64 >> i*16) & 0xffff;
+ zeros += (frag != 0);
+ ones += (frag != 0xffff);
+ }
+ neg = ones < zeros; /* Use MOVN if it pays off. */
+ if ((neg ? ones : zeros) > 1) { /* Need 2+ ins. Try 1 ins encodings. */
+ uint32_t k13 = emit_isk13(u64, is64);
+ if (k13) {
+ emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO);
+ return;
+ }
+ if (emit_kdelta(as, rd, u64, is64)) {
+ return;
+ }
+ if (emit_kadrp(as, rd, u64)) { /* Either 1 or 2 ins. */
+ return;
+ }
+ }
+ if (neg) {
+ u64 = ~u64;
+ if (!is64) u64 = (uint32_t)u64;
+ }
+ if (u64) {
+ /* Find first/last fragment to be filled. */
+ int shift = (63-emit_clz64(u64)) & ~15;
+ lshift = emit_ctz64(u64) & ~15;
+ for (; shift > lshift; shift -= 16) {
+ uint32_t frag = (u64 >> shift) & 0xffff;
+ if (frag == 0) continue; /* Will be correctly filled by MOVN/MOVZ. */
+ if (neg) frag ^= 0xffff; /* MOVK requires the original value. */
+ emit_d(as, is64 | A64I_MOVKw | A64F_U16(frag) | A64F_LSL16(shift), rd);
+ }
+ }
+ /* But MOVN needs an inverted value. */
+ emit_d(as, is64 | (neg ? A64I_MOVNw : A64I_MOVZw) |
+ A64F_U16((u64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd);
+}
+
+/* Load a 32 bit constant into a GPR. */
+#define emit_loadi(as, rd, i) emit_loadk(as, rd, (uint32_t)i)
+
+/* Load a 64 bit constant into a GPR. */
+#define emit_loadu64(as, rd, i) emit_loadk(as, rd, i)
+
static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
/* Get/set from constant pointer. */
static void emit_lsptr(ASMState *as, A64Ins ai, Reg r, void *p)
{
- /* First, check if ip + offset is in range. */
- if ((ai & 0x00400000) && checkmcpofs(as, p)) {
+ Reg base = RID_GL;
+ int64_t ofs = glofs(as, p);
+ if (emit_checkofs(ai, ofs)) {
+ /* GL + offset, might subsequently fuse to LDP/STP. */
+ } else if (ai == A64I_LDRx && checkmcpofs(as, p)) {
+ /* IP + offset is cheaper than allock, but address must be in range. */
emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, p)>>2), r);
- } else {
- Reg base = RID_GL; /* Next, try GL + offset. */
- int64_t ofs = glofs(as, p);
- if (!emit_checkofs(ai, ofs)) { /* Else split up into base reg + offset. */
- int64_t i64 = i64ptr(p);
- base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r));
- ofs = i64 & 0x7fffull;
- }
- emit_lso(as, ai, r, base, ofs);
+ return;
+ } else { /* Split up into base reg + offset. */
+ int64_t i64 = i64ptr(p);
+ base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r));
+ ofs = i64 & 0x7fffull;
}
+ emit_lso(as, ai, r, base, ofs);
}
/* Load 64 bit IR constant into register. */
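
Note: the merged emit_loadk() above first counts how many 16-bit fragments a single MOVZ (or MOVN, whichever wins) would leave unfilled, and only when at least two instructions would be needed does it try the one-instruction encodings (ORR with a K13 immediate, a register delta, ADR/ADRP). A standalone model of the MOVZ/MOVN + MOVK fallback; note that LuaJIT emits machine code backwards, so the base instruction is printed last here but executes first:

  #include <stdint.h>
  #include <stdio.h>

  /* Print a MOVZ/MOVN + MOVK plan for u64 (model of the fallback path). */
  static void plan_loadk(uint64_t u64)
  {
    int i, nfrag = (u64 >> 32) ? 4 : 2, need_z = 0, need_n = 0;
    int neg, shift = 0, lshift = 0;
    for (i = 0; i < nfrag; i++) {
      uint32_t frag = (u64 >> i*16) & 0xffff;
      need_z += (frag != 0);      /* MOVKs needed on top of a MOVZ base. */
      need_n += (frag != 0xffff); /* MOVKs needed on top of a MOVN base. */
    }
    neg = need_n < need_z;        /* Pick the cheaper base instruction. */
    if (neg) u64 = (nfrag == 4) ? ~u64 : (uint32_t)~u64;
    if (u64) {
      shift = (63 - __builtin_clzll(u64)) & ~15;  /* Highest used fragment. */
      lshift = __builtin_ctzll(u64) & ~15;        /* Lowest used fragment. */
      for (; shift > lshift; shift -= 16) {
        uint32_t frag = (u64 >> shift) & 0xffff;
        if (frag == 0) continue;  /* Base MOVZ/MOVN already fills this. */
        if (neg) frag ^= 0xffff;  /* MOVK always takes the original bits. */
        printf("MOVK rd, #0x%04x, LSL #%d\n", frag, shift);
      }
    }
    printf("%s rd, #0x%04x, LSL #%d\n", neg ? "MOVN" : "MOVZ",
           (unsigned)((u64 >> lshift) & 0xffff), lshift);
  }

  int main(void)
  {
    plan_loadk(0xffff00001234ffffull);  /* 2 ins: MOVK + MOVN base. */
    return 0;
  }
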
diff --git a/src/lj_err.c b/src/lj_err.c
index 6e50cbee..414ef477 100644
--- a/src/lj_err.c
+++ b/src/lj_err.c
@@ -174,12 +174,15 @@ static void *err_unwind(lua_State *L, void *stopcf, int errcode)
case FRAME_PCALL: /* FF pcall() frame. */
case FRAME_PCALLH: /* FF pcall() frame inside hook. */
if (errcode) {
+ global_State *g;
if (errcode == LUA_YIELD) {
frame = frame_prevd(frame);
break;
}
+ g = G(L);
+ setgcref(g->cur_L, obj2gco(L));
if (frame_typep(frame) == FRAME_PCALL)
- hook_leave(G(L));
+ hook_leave(g);
L->base = frame_prevd(frame) + 1;
L->cframe = cf;
unwindstack(L, L->base);
@@ -209,11 +212,6 @@ static void *err_unwind(lua_State *L, void *stopcf, int errcode)
** from 3rd party docs or must be found by trial-and-error. They really
** don't want you to write your own language-specific exception handler
** or to interact gracefully with MSVC. :-(
-**
-** Apparently MSVC doesn't call C++ destructors for foreign exceptions
-** unless you compile your C++ code with /EHa. Unfortunately this means
-** catch (...) also catches things like access violations. The use of
-** _set_se_translator doesn't really help, because it requires /EHa, too.
*/
#define WIN32_LEAN_AND_MEAN
@@ -261,6 +259,8 @@ LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec,
{
#if LJ_TARGET_X86
void *cf = (char *)f - CFRAME_OFS_SEH;
+#elif LJ_TARGET_ARM64
+ void *cf = (char *)f - CFRAME_SIZE;
#else
void *cf = f;
#endif
@@ -268,11 +268,25 @@ LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec,
int errcode = LJ_EXCODE_CHECK(rec->ExceptionCode) ?
LJ_EXCODE_ERRCODE(rec->ExceptionCode) : LUA_ERRRUN;
if ((rec->ExceptionFlags & 6)) { /* EH_UNWINDING|EH_EXIT_UNWIND */
+ if (rec->ExceptionCode == STATUS_LONGJUMP &&
+ rec->ExceptionRecord &&
+ LJ_EXCODE_CHECK(rec->ExceptionRecord->ExceptionCode)) {
+ errcode = LJ_EXCODE_ERRCODE(rec->ExceptionRecord->ExceptionCode);
+ if ((rec->ExceptionFlags & 0x20)) { /* EH_TARGET_UNWIND */
+ /* Unwinding is about to finish; revert the ExceptionCode so that
+ ** RtlRestoreContext does not try to restore from a _JUMP_BUFFER.
+ */
+ rec->ExceptionCode = 0;
+ }
+ }
/* Unwind internal frames. */
err_unwind(L, cf, errcode);
} else {
void *cf2 = err_unwind(L, cf, 0);
if (cf2) { /* We catch it, so start unwinding the upper frames. */
+#if !LJ_TARGET_X86
+ EXCEPTION_RECORD rec2;
+#endif
if (rec->ExceptionCode == LJ_MSVC_EXCODE ||
rec->ExceptionCode == LJ_GCC_EXCODE) {
#if !LJ_TARGET_CYGWIN
@@ -295,14 +309,29 @@ LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec,
(void *)lj_vm_unwind_ff : (void *)lj_vm_unwind_c, errcode);
/* lj_vm_rtlunwind does not return. */
#else
+ if (LJ_EXCODE_CHECK(rec->ExceptionCode)) {
+ /* For unwind purposes, wrap the EXCEPTION_RECORD in something that
+ ** looks like a longjmp, so that MSVC will execute C++ destructors in
+ ** the frames we unwind over. ExceptionInformation[0] should really
+ ** contain a _JUMP_BUFFER*, but hopefully nobody is looking too closely
+ ** at this point.
+ */
+ rec2.ExceptionCode = STATUS_LONGJUMP;
+ rec2.ExceptionRecord = rec;
+ rec2.ExceptionAddress = 0;
+ rec2.NumberParameters = 1;
+ rec2.ExceptionInformation[0] = (ULONG_PTR)ctx;
+ rec = &rec2;
+ }
/* Unwind the stack and call all handlers for all lower C frames
** (including ourselves) again with EH_UNWINDING set. Then set
- ** stack pointer = cf, result = errcode and jump to the specified target.
+ ** stack pointer = f, result = errcode and jump to the specified target.
*/
- RtlUnwindEx(cf, (void *)((cframe_unwind_ff(cf2) && errcode != LUA_YIELD) ?
- lj_vm_unwind_ff_eh :
- lj_vm_unwind_c_eh),
- rec, (void *)(uintptr_t)errcode, ctx, dispatch->HistoryTable);
+ RtlUnwindEx(f, (void *)((cframe_unwind_ff(cf2) && errcode != LUA_YIELD) ?
+ lj_vm_unwind_ff_eh :
+ lj_vm_unwind_c_eh),
+ rec, (void *)(uintptr_t)errcode, dispatch->ContextRecord,
+ dispatch->HistoryTable);
/* RtlUnwindEx should never return. */
#endif
}
@@ -789,7 +818,14 @@ LJ_NOINLINE void lj_err_mem(lua_State *L)
TValue *base = tvref(G(L)->jit_base);
if (base) L->base = base;
}
- if (curr_funcisL(L)) L->top = curr_topL(L);
+ if (curr_funcisL(L)) {
+ L->top = curr_topL(L);
+ if (LJ_UNLIKELY(L->top > tvref(L->maxstack))) {
+ /* The current Lua frame violates the stack. Replace it with a dummy. */
+ L->top = L->base;
+ setframe_gc(L->base - 1 - LJ_FR2, obj2gco(L), LJ_TTHREAD);
+ }
+ }
setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRMEM));
lj_err_throw(L, LUA_ERRMEM);
}
@@ -850,9 +886,11 @@ LJ_NOINLINE void LJ_FASTCALL lj_err_run(lua_State *L)
{
ptrdiff_t ef = (LJ_HASJIT && tvref(G(L)->jit_base)) ? 0 : finderrfunc(L);
if (ef) {
- TValue *errfunc = restorestack(L, ef);
- TValue *top = L->top;
+ TValue *errfunc, *top;
+ lj_state_checkstack(L, LUA_MINSTACK * 2); /* Might raise new error. */
lj_trace_abort(G(L));
+ errfunc = restorestack(L, ef);
+ top = L->top;
if (!tvisfunc(errfunc) || L->status == LUA_ERRERR) {
setstrV(L, top-1, lj_err_str(L, LJ_ERR_ERRERR));
lj_err_throw(L, LUA_ERRERR);
@@ -867,7 +905,15 @@ LJ_NOINLINE void LJ_FASTCALL lj_err_run(lua_State *L)
lj_err_throw(L, LUA_ERRRUN);
}
+/* Stack overflow error. */
+void LJ_FASTCALL lj_err_stkov(lua_State *L)
+{
+ lj_debug_addloc(L, err2msg(LJ_ERR_STKOV), L->base-1, NULL);
+ lj_err_run(L);
+}
+
#if LJ_HASJIT
+/* Rethrow error after doing a trace exit. */
LJ_NOINLINE void LJ_FASTCALL lj_err_trace(lua_State *L, int errcode)
{
if (errcode == LUA_ERRRUN)
diff --git a/src/lj_err.h b/src/lj_err.h
index 8768fefd..67686cb7 100644
--- a/src/lj_err.h
+++ b/src/lj_err.h
@@ -23,6 +23,7 @@ LJ_DATA const char *lj_err_allmsg;
LJ_FUNC GCstr *lj_err_str(lua_State *L, ErrMsg em);
LJ_FUNCA_NORET void LJ_FASTCALL lj_err_throw(lua_State *L, int errcode);
LJ_FUNC_NORET void lj_err_mem(lua_State *L);
+LJ_FUNC_NORET void LJ_FASTCALL lj_err_stkov(lua_State *L);
LJ_FUNC_NORET void LJ_FASTCALL lj_err_run(lua_State *L);
#if LJ_HASJIT
LJ_FUNCA_NORET void LJ_FASTCALL lj_err_trace(lua_State *L, int errcode);
diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c
index 8ebf4165..30dc6bfc 100644
--- a/src/lj_ffrecord.c
+++ b/src/lj_ffrecord.c
@@ -1130,7 +1130,7 @@ static TRef recff_sbufx_check(jit_State *J, RecordFFData *rd, ptrdiff_t arg)
/* Emit BUFHDR for write to extended string buffer. */
static TRef recff_sbufx_write(jit_State *J, TRef ud)
{
- TRef trbuf = emitir(IRT(IR_ADD, IRT_PGC), ud, lj_ir_kint(J, sizeof(GCudata)));
+ TRef trbuf = emitir(IRT(IR_ADD, IRT_PGC), ud, lj_ir_kintpgc(J, sizeof(GCudata)));
return emitir(IRT(IR_BUFHDR, IRT_PGC), trbuf, IRBUFHDR_WRITE);
}
@@ -1164,20 +1164,19 @@ static void LJ_FASTCALL recff_buffer_method_reset(jit_State *J, RecordFFData *rd
SBufExt *sbx = bufV(&rd->argv[0]);
int iscow = (int)sbufiscow(sbx);
TRef trl = recff_sbufx_get_L(J, ud);
- TRef trcow = emitir(IRT(IR_BAND, IRT_IGC), trl, lj_ir_kint(J, SBUF_FLAG_COW));
- TRef zero = lj_ir_kint(J, 0);
- emitir(IRTG(iscow ? IR_NE : IR_EQ, IRT_IGC), trcow, zero);
+ TRef trcow = emitir(IRT(IR_BAND, IRT_IGC), trl, lj_ir_kintpgc(J, SBUF_FLAG_COW));
+ TRef zeropgc = lj_ir_kintpgc(J, 0);
+ emitir(IRTG(iscow ? IR_NE : IR_EQ, IRT_IGC), trcow, zeropgc);
if (iscow) {
- trl = emitir(IRT(IR_BXOR, IRT_IGC), trl,
- LJ_GC64 ? lj_ir_kint64(J, SBUF_FLAG_COW) :
- lj_ir_kint(J, SBUF_FLAG_COW));
- recff_sbufx_set_ptr(J, ud, IRFL_SBUF_W, zero);
- recff_sbufx_set_ptr(J, ud, IRFL_SBUF_E, zero);
- recff_sbufx_set_ptr(J, ud, IRFL_SBUF_B, zero);
+ TRef zerop = lj_ir_kintp(J, 0);
+ trl = emitir(IRT(IR_BXOR, IRT_IGC), trl, lj_ir_kintpgc(J, SBUF_FLAG_COW));
+ recff_sbufx_set_ptr(J, ud, IRFL_SBUF_W, zerop);
+ recff_sbufx_set_ptr(J, ud, IRFL_SBUF_E, zerop);
+ recff_sbufx_set_ptr(J, ud, IRFL_SBUF_B, zerop);
recff_sbufx_set_L(J, ud, trl);
emitir(IRT(IR_FSTORE, IRT_PGC),
- emitir(IRT(IR_FREF, IRT_PGC), ud, IRFL_SBUF_REF), zero);
- recff_sbufx_set_ptr(J, ud, IRFL_SBUF_R, zero);
+ emitir(IRT(IR_FREF, IRT_PGC), ud, IRFL_SBUF_REF), zeropgc);
+ recff_sbufx_set_ptr(J, ud, IRFL_SBUF_R, zerop);
} else {
TRef trb = recff_sbufx_get_ptr(J, ud, IRFL_SBUF_B);
recff_sbufx_set_ptr(J, ud, IRFL_SBUF_W, trb);
@@ -1205,6 +1204,12 @@ static void LJ_FASTCALL recff_buffer_method_set(jit_State *J, RecordFFData *rd)
if (tref_isstr(tr)) {
TRef trp = emitir(IRT(IR_STRREF, IRT_PGC), tr, lj_ir_kint(J, 0));
TRef len = emitir(IRTI(IR_FLOAD), tr, IRFL_STR_LEN);
+ IRIns *irp = IR(tref_ref(trp));
+ /* trp must point into the anchored obj, even after folding. */
+ if (irp->o == IR_STRREF)
+ tr = irp->op1;
+ else if (!tref_isk(tr))
+ trp = emitir(IRT(IR_ADD, IRT_PGC), tr, lj_ir_kintpgc(J, sizeof(GCstr)));
lj_ir_call(J, IRCALL_lj_bufx_set, trbuf, trp, len, tr);
#if LJ_HASFFI
} else if (tref_iscdata(tr)) {
@@ -1445,6 +1450,15 @@ static void LJ_FASTCALL recff_table_new(jit_State *J, RecordFFData *rd)
{
TRef tra = lj_opt_narrow_toint(J, J->base[0]);
TRef trh = lj_opt_narrow_toint(J, J->base[1]);
+ if (tref_isk(tra) && tref_isk(trh)) {
+ int32_t a = IR(tref_ref(tra))->i;
+ if (a < 0x7fff) {
+ uint32_t hbits = hsize2hbits(IR(tref_ref(trh))->i);
+ a = a > 0 ? a+1 : 0;
+ J->base[0] = emitir(IRTG(IR_TNEW, IRT_TAB), (uint32_t)a, hbits);
+ return;
+ }
+ }
J->base[0] = lj_ir_call(J, IRCALL_lj_tab_new_ah, tra, trh);
UNUSED(rd);
}
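
Note: recff_table_new() can now fold table.new(narray, nhash) with constant arguments straight into a pre-sized IR_TNEW; hsize2hbits converts the requested hash-slot count into the log2 size the VM stores. A sketch of that helper's contract (modeled on the lj_tab.h definition: 0 slots means no hash part, otherwise ceil(log2)):

  #include <assert.h>
  #include <stdint.h>

  /* Number of hash-part bits for a requested slot count. */
  static uint32_t hbits(int32_t s)
  {
    return s ? (s == 1 ? 1 : 1 + (31 - (uint32_t)__builtin_clz((uint32_t)(s-1))))
             : 0;
  }

  int main(void)
  {
    assert(hbits(0) == 0);   /* No hash part at all. */
    assert(hbits(1) == 1);   /* 2 slots. */
    assert(hbits(16) == 4);  /* Exactly 2^4 slots. */
    assert(hbits(17) == 5);  /* Rounded up to 2^5. */
    return 0;
  }
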
diff --git a/src/lj_gdbjit.c b/src/lj_gdbjit.c
index e8a66635..56094cf1 100644
--- a/src/lj_gdbjit.c
+++ b/src/lj_gdbjit.c
@@ -637,7 +637,7 @@ static void LJ_FASTCALL gdbjit_debugabbrev(GDBJITctx *ctx)
DUV(DW_AT_low_pc); DUV(DW_FORM_addr);
DUV(DW_AT_high_pc); DUV(DW_FORM_addr);
DUV(DW_AT_stmt_list); DUV(DW_FORM_data4);
- DB(0); DB(0);
+ DB(0); DB(0); DB(0);
ctx->p = p;
}
diff --git a/src/lj_ir.h b/src/lj_ir.h
index b32bd095..cc73a849 100644
--- a/src/lj_ir.h
+++ b/src/lj_ir.h
@@ -76,8 +76,8 @@
\
_(ABS, N , ref, ref) \
_(LDEXP, N , ref, ref) \
- _(MIN, C , ref, ref) \
- _(MAX, C , ref, ref) \
+ _(MIN, N , ref, ref) \
+ _(MAX, N , ref, ref) \
_(FPMATH, N , ref, lit) \
\
/* Overflow-checking arithmetic ops. */ \
@@ -383,6 +383,7 @@ typedef struct IRType1 { uint8_t irt; } IRType1;
#define irt_isu32(t) (irt_type(t) == IRT_U32)
#define irt_isi64(t) (irt_type(t) == IRT_I64)
#define irt_isu64(t) (irt_type(t) == IRT_U64)
+#define irt_isp32(t) (irt_type(t) == IRT_P32)
#define irt_isfp(t) (irt_isnum(t) || irt_isfloat(t))
#define irt_isinteger(t) (irt_typerange((t), IRT_I8, IRT_INT))
diff --git a/src/lj_ircall.h b/src/lj_ircall.h
index 569134e9..f342cdd2 100644
--- a/src/lj_ircall.h
+++ b/src/lj_ircall.h
@@ -63,7 +63,7 @@ typedef struct CCallInfo {
/* Helpers for conditional function definitions. */
#define IRCALLCOND_ANY(x) x
-#if LJ_TARGET_X86ORX64
+#if LJ_TARGET_X86ORX64 || LJ_TARGET_ARM64
#define IRCALLCOND_FPMATH(x) NULL
#else
#define IRCALLCOND_FPMATH(x) x
diff --git a/src/lj_iropt.h b/src/lj_iropt.h
index 458a5511..a71a717b 100644
--- a/src/lj_iropt.h
+++ b/src/lj_iropt.h
@@ -56,6 +56,12 @@ LJ_FUNC TRef lj_ir_ktrace(jit_State *J);
#define lj_ir_kintp(J, k) lj_ir_kint(J, (int32_t)(k))
#endif
+#if LJ_GC64
+#define lj_ir_kintpgc lj_ir_kintp
+#else
+#define lj_ir_kintpgc lj_ir_kint
+#endif
+
static LJ_AINLINE TRef lj_ir_knum(jit_State *J, lua_Number n)
{
TValue tv;
diff --git a/src/lj_lex.c b/src/lj_lex.c
index f0cd64e5..b2fa5647 100644
--- a/src/lj_lex.c
+++ b/src/lj_lex.c
@@ -415,6 +415,7 @@ int lj_lex_setup(lua_State *L, LexState *ls)
ls->linenumber = 1;
ls->lastline = 1;
ls->endmark = 0;
+ ls->fr2 = LJ_FR2; /* Generate native bytecode by default. */
lex_next(ls); /* Read-ahead first char. */
if (ls->c == 0xef && ls->p + 2 <= ls->pe && (uint8_t)ls->p[0] == 0xbb &&
(uint8_t)ls->p[1] == 0xbf) { /* Skip UTF-8 BOM (if buffered). */
diff --git a/src/lj_lex.h b/src/lj_lex.h
index e46fbd89..2ef7fc77 100644
--- a/src/lj_lex.h
+++ b/src/lj_lex.h
@@ -74,6 +74,7 @@ typedef struct LexState {
MSize sizebcstack; /* Size of bytecode stack. */
uint32_t level; /* Syntactical nesting level. */
int endmark; /* Trust bytecode end marker, even if not at EOF. */
+ int fr2; /* Generate bytecode for LJ_FR2 mode. */
} LexState;
LJ_FUNC int lj_lex_setup(lua_State *L, LexState *ls);
diff --git a/src/lj_lib.c b/src/lj_lib.c
index ebe0dc78..06ae4fcf 100644
--- a/src/lj_lib.c
+++ b/src/lj_lib.c
@@ -62,6 +62,7 @@ static const uint8_t *lib_read_lfunc(lua_State *L, const uint8_t *p, GCtab *tab)
ls.pe = (const char *)~(uintptr_t)0;
ls.c = -1;
ls.level = (BCDUMP_F_STRIP|(LJ_BE*BCDUMP_F_BE));
+ ls.fr2 = LJ_FR2;
ls.chunkname = name;
pt = lj_bcread_proto(&ls);
pt->firstline = ~(BCLine)0;
@@ -266,6 +267,23 @@ GCfunc *lj_lib_checkfunc(lua_State *L, int narg)
return funcV(o);
}
+GCproto *lj_lib_checkLproto(lua_State *L, int narg, int nolua)
+{
+ TValue *o = L->base + narg-1;
+ if (L->top > o) {
+ if (tvisproto(o)) {
+ return protoV(o);
+ } else if (tvisfunc(o)) {
+ if (isluafunc(funcV(o)))
+ return funcproto(funcV(o));
+ else if (nolua)
+ return NULL;
+ }
+ }
+ lj_err_argt(L, narg, LUA_TFUNCTION);
+ return NULL; /* unreachable */
+}
+
GCtab *lj_lib_checktab(lua_State *L, int narg)
{
TValue *o = L->base + narg-1;
diff --git a/src/lj_lib.h b/src/lj_lib.h
index 6c3a1c83..a48e3c98 100644
--- a/src/lj_lib.h
+++ b/src/lj_lib.h
@@ -42,6 +42,7 @@ LJ_FUNC lua_Number lj_lib_checknum(lua_State *L, int narg);
LJ_FUNC int32_t lj_lib_checkint(lua_State *L, int narg);
LJ_FUNC int32_t lj_lib_optint(lua_State *L, int narg, int32_t def);
LJ_FUNC GCfunc *lj_lib_checkfunc(lua_State *L, int narg);
+LJ_FUNC GCproto *lj_lib_checkLproto(lua_State *L, int narg, int nolua);
LJ_FUNC GCtab *lj_lib_checktab(lua_State *L, int narg);
LJ_FUNC GCtab *lj_lib_checktabornil(lua_State *L, int narg);
LJ_FUNC int lj_lib_checkopt(lua_State *L, int narg, int def, const char *lst);
diff --git a/src/lj_load.c b/src/lj_load.c
index 0c0f08e7..29e45a54 100644
--- a/src/lj_load.c
+++ b/src/lj_load.c
@@ -34,14 +34,28 @@ static TValue *cpparser(lua_State *L, lua_CFunction dummy, void *ud)
UNUSED(dummy);
cframe_errfunc(L->cframe) = -1; /* Inherit error function. */
bc = lj_lex_setup(L, ls);
- if (ls->mode && !strchr(ls->mode, bc ? 'b' : 't')) {
- setstrV(L, L->top++, lj_err_str(L, LJ_ERR_XMODE));
- lj_err_throw(L, LUA_ERRSYNTAX);
+ if (ls->mode) {
+ int xmode = 1;
+ const char *mode = ls->mode;
+ char c;
+ while ((c = *mode++)) {
+ if (c == (bc ? 'b' : 't')) xmode = 0;
+ if (c == (LJ_FR2 ? 'W' : 'X')) ls->fr2 = !LJ_FR2;
+ }
+ if (xmode) {
+ setstrV(L, L->top++, lj_err_str(L, LJ_ERR_XMODE));
+ lj_err_throw(L, LUA_ERRSYNTAX);
+ }
}
pt = bc ? lj_bcread(ls) : lj_parse(ls);
- fn = lj_func_newL_empty(L, pt, tabref(L->env));
- /* Don't combine above/below into one statement. */
- setfuncV(L, L->top++, fn);
+ if (ls->fr2 == LJ_FR2) {
+ fn = lj_func_newL_empty(L, pt, tabref(L->env));
+ /* Don't combine above/below into one statement. */
+ setfuncV(L, L->top++, fn);
+ } else {
+ /* Non-native generation returns a dumpable, but non-runnable prototype. */
+ setprotoV(L, L->top++, pt);
+ }
return NULL;
}
@@ -245,9 +259,10 @@ LUALIB_API int luaL_loadstring(lua_State *L, const char *s)
LUA_API int lua_dump(lua_State *L, lua_Writer writer, void *data)
{
cTValue *o = L->top-1;
+ uint32_t flags = LJ_FR2*BCDUMP_F_FR2; /* Default mode for legacy C API. */
lj_checkapi(L->top > L->base, "top slot empty");
if (tvisfunc(o) && isluafunc(funcV(o)))
- return lj_bcwrite(L, funcproto(funcV(o)), writer, data, 0);
+ return lj_bcwrite(L, funcproto(funcV(o)), writer, data, flags);
else
return 1;
}
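/* Illustrative sketch, not part of the patch: the mode string is reachable
** for embedders through the Lua 5.2-style loader API. "t" restricts a
** chunk to source code; per the cpparser() change above, adding 'W' or 'X'
** instead selects the non-native frame format and leaves a dumpable but
** non-runnable prototype on the stack. Chunk contents are illustrative.
*/
#include <stdio.h>
#include <string.h>
#include <lua.h>
#include <lauxlib.h>

static void load_source_only(lua_State *L)
{
  const char *src = "return 1+2";
  if (luaL_loadbufferx(L, src, strlen(src), "=demo", "t") != 0)
    fprintf(stderr, "load failed: %s\n", lua_tostring(L, -1));
}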
diff --git a/src/lj_mcode.c b/src/lj_mcode.c
index 94767937..8a4851dd 100644
--- a/src/lj_mcode.c
+++ b/src/lj_mcode.c
@@ -29,6 +29,11 @@
#include <valgrind/valgrind.h>
#endif
+#if LJ_TARGET_WINDOWS
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
#if LJ_TARGET_IOS
void sys_icache_invalidate(void *start, size_t len);
#endif
@@ -41,6 +46,8 @@ void lj_mcode_sync(void *start, void *end)
#endif
#if LJ_TARGET_X86ORX64
UNUSED(start); UNUSED(end);
+#elif LJ_TARGET_WINDOWS
+ FlushInstructionCache(GetCurrentProcess(), start, (char *)end-(char *)start);
#elif LJ_TARGET_IOS
sys_icache_invalidate(start, (char *)end-(char *)start);
#elif LJ_TARGET_PPC
@@ -58,9 +65,6 @@ void lj_mcode_sync(void *start, void *end)
#if LJ_TARGET_WINDOWS
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-
#define MCPROT_RW PAGE_READWRITE
#define MCPROT_RX PAGE_EXECUTE_READ
#define MCPROT_RWX PAGE_EXECUTE_READWRITE
@@ -363,7 +367,7 @@ void lj_mcode_limiterr(jit_State *J, size_t need)
sizemcode = (size_t)J->param[JIT_P_sizemcode] << 10;
sizemcode = (sizemcode + LJ_PAGESIZE-1) & ~(size_t)(LJ_PAGESIZE - 1);
maxmcode = (size_t)J->param[JIT_P_maxmcode] << 10;
- if ((size_t)need > sizemcode)
+ if (need * sizeof(MCode) > sizemcode)
lj_trace_err(J, LJ_TRERR_MCODEOV); /* Too long for any area. */
if (J->szallmcarea + sizemcode > maxmcode)
lj_trace_err(J, LJ_TRERR_MCODEAL);
diff --git a/src/lj_opt_dce.c b/src/lj_opt_dce.c
index c6c3e1bc..e6fcc552 100644
--- a/src/lj_opt_dce.c
+++ b/src/lj_opt_dce.c
@@ -44,12 +44,12 @@ static void dce_propagate(jit_State *J)
IRIns *ir = IR(ins);
if (irt_ismarked(ir->t)) {
irt_clearmark(ir->t);
- pchain[ir->o] = &ir->prev;
} else if (!ir_sideeff(ir)) {
*pchain[ir->o] = ir->prev; /* Reroute original instruction chain. */
lj_ir_nop(ir);
continue;
}
+ pchain[ir->o] = &ir->prev;
if (ir->op1 >= REF_FIRST) irt_setmark(IR(ir->op1)->t);
if (ir->op2 >= REF_FIRST) irt_setmark(IR(ir->op2)->t);
}
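/* Illustrative sketch, not part of the patch: a self-contained model of
** the chain invariant fixed above. Instructions of one opcode are linked
** through prev[]; pchain must always point at the link slot of the most
** recent *kept* instruction. Moving the re-anchor out of the marked
** branch also covers instructions kept only for side effects.
*/
#include <stdio.h>

int main(void)
{
  enum { N = 5 };
  int prev[N+1], marked[N+1] = {0, 1, 0, 0, 1, 0};
  int sideeff[N+1] = {0, 0, 0, 1, 0, 0};
  int head = N, *pchain = &head, i;
  for (i = 1; i <= N; i++) prev[i] = i-1;
  for (i = N; i >= 1; i--) {  /* Sweep backwards, as dce_propagate does. */
    if (marked[i]) {
      /* Keep: live instruction. */
    } else if (!sideeff[i]) {
      *pchain = prev[i];  /* Unlink dead instruction from the chain. */
      continue;
    }
    pchain = &prev[i];  /* Re-anchor for ANY surviving instruction. */
  }
  for (i = head; i; i = prev[i]) printf("%d ", i);  /* prints: 4 3 1 */
  putchar('\n');
  return 0;
}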
diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c
index 48effb8a..ce78505b 100644
--- a/src/lj_opt_fold.c
+++ b/src/lj_opt_fold.c
@@ -377,10 +377,10 @@ static uint64_t kfold_int64arith(jit_State *J, uint64_t k1, uint64_t k2,
case IR_BOR: k1 |= k2; break;
case IR_BXOR: k1 ^= k2; break;
case IR_BSHL: k1 <<= (k2 & 63); break;
- case IR_BSHR: k1 = (int32_t)((uint32_t)k1 >> (k2 & 63)); break;
- case IR_BSAR: k1 >>= (k2 & 63); break;
- case IR_BROL: k1 = (int32_t)lj_rol((uint32_t)k1, (k2 & 63)); break;
- case IR_BROR: k1 = (int32_t)lj_ror((uint32_t)k1, (k2 & 63)); break;
+ case IR_BSHR: k1 >>= (k2 & 63); break;
+ case IR_BSAR: k1 = (uint64_t)((int64_t)k1 >> (k2 & 63)); break;
+ case IR_BROL: k1 = lj_rol(k1, (k2 & 63)); break;
+ case IR_BROR: k1 = lj_ror(k1, (k2 & 63)); break;
default: lj_assertJ(0, "bad IR op %d", op); break;
}
#else
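/* Illustrative sketch, not part of the patch: the old 64-bit folds above
** truncated through uint32_t, so constant-folding e.g. a BSHR produced a
** different value than the same shift executed at runtime:
*/
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint64_t k1 = 0x100000000ull;  /* 2^32 */
  uint64_t old = (uint64_t)(int32_t)((uint32_t)k1 >> 1);  /* 0 (wrong) */
  uint64_t fix = k1 >> 1;                                 /* 0x80000000 */
  printf("%llx %llx\n", (unsigned long long)old, (unsigned long long)fix);
  return 0;
}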
@@ -1972,7 +1972,10 @@ LJFOLD(NE any any)
LJFOLDF(comm_equal)
{
/* For non-numbers only: x == x ==> drop; x ~= x ==> fail */
- if (fins->op1 == fins->op2 && !irt_isnum(fins->t))
+ if (fins->op1 == fins->op2 &&
+ (!irt_isnum(fins->t) ||
+ (fleft->o == IR_CONV && /* Converted integers cannot be NaN. */
+ (uint32_t)(fleft->op2 & IRCONV_SRCMASK) - (uint32_t)IRT_I8 <= (uint32_t)(IRT_U64 - IRT_U8))))
return CONDFOLD(fins->o == IR_EQ);
return fold_comm_swap(J);
}
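/* Illustrative sketch, not part of the patch: the new guard in comm_equal
** uses the standard single-compare range check. Subtracting the low bound
** wraps to a huge unsigned value for anything below it, so one unsigned
** comparison tests lo <= x <= hi -- here, that the CONV source type is
** one of the integer IRTs, which can never produce a NaN:
*/
#include <stdint.h>

static int in_range(uint32_t x, uint32_t lo, uint32_t hi)
{
  return x - lo <= hi - lo;  /* wraps below lo, so a single compare works */
}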
@@ -2131,8 +2134,26 @@ LJFOLDX(lj_opt_fwd_uload)
LJFOLD(ALEN any any)
LJFOLDX(lj_opt_fwd_alen)
+/* Try to merge UREFO/UREFC into referenced instruction. */
+static TRef merge_uref(jit_State *J, IRRef ref, IRIns* ir)
+{
+ if (ir->o == IR_UREFO && irt_isguard(ir->t)) {
+ /* Might be pointing to some other coroutine's stack.
+ ** And GC might shrink said stack, thereby repointing the upvalue.
+ ** GC might even collect said coroutine, thereby closing the upvalue.
+ */
+ if (gcstep_barrier(J, ref))
+ return EMITFOLD; /* So cannot merge. */
+ /* Current fins wants a check, but ir doesn't have one. */
+ if ((irt_t(fins->t) & (IRT_GUARD|IRT_TYPE)) == (IRT_GUARD|IRT_PGC) &&
+ irt_type(ir->t) == IRT_IGC)
+ ir->t.irt += IRT_PGC-IRT_IGC; /* So install a check. */
+ }
+ return ref; /* Not a TRef, but the caller doesn't care. */
+}
+
/* Upvalue refs are really loads, but there are no corresponding stores.
-** So CSE is ok for them, except for UREFO across a GC step (see below).
+** So CSE is ok for them, except for guarded UREFO across a GC step.
** If the referenced function is const, its upvalue addresses are const, too.
** This can be used to improve CSE by looking for the same address,
** even if the upvalues originate from a different function.
@@ -2150,9 +2171,7 @@ LJFOLDF(cse_uref)
if (irref_isk(ir->op1)) {
GCfunc *fn2 = ir_kfunc(IR(ir->op1));
if (gco2uv(gcref(fn2->l.uvptr[(ir->op2 >> 8)])) == uv) {
- if (fins->o == IR_UREFO && gcstep_barrier(J, ref))
- break;
- return ref;
+ return merge_uref(J, ref, ir);
}
}
ref = ir->prev;
@@ -2161,6 +2180,24 @@ LJFOLDF(cse_uref)
return EMITFOLD;
}
+/* Custom CSE for UREFO. */
+LJFOLD(UREFO any any)
+LJFOLDF(cse_urefo)
+{
+ if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
+ IRRef ref = J->chain[IR_UREFO];
+ IRRef lim = fins->op1;
+ IRRef2 op12 = (IRRef2)fins->op1 + ((IRRef2)fins->op2 << 16);
+ while (ref > lim) {
+ IRIns *ir = IR(ref);
+ if (ir->op12 == op12)
+ return merge_uref(J, ref, ir);
+ ref = ir->prev;
+ }
+ }
+ return EMITFOLD;
+}
+
LJFOLD(HREFK any any)
LJFOLDX(lj_opt_fwd_hrefk)
@@ -2381,14 +2418,9 @@ LJFOLDF(fold_base)
/* Write barriers are amenable to CSE, but not across any incremental
** GC steps.
-**
-** The same logic applies to open upvalue references, because a stack
-** may be resized during a GC step (not the current stack, but maybe that
-** of a coroutine).
*/
LJFOLD(TBAR any)
LJFOLD(OBAR any any)
-LJFOLD(UREFO any any)
LJFOLDF(barrier_tab)
{
TRef tr = lj_opt_cse(J);
diff --git a/src/lj_opt_mem.c b/src/lj_opt_mem.c
index 351d958c..29b33f29 100644
--- a/src/lj_opt_mem.c
+++ b/src/lj_opt_mem.c
@@ -217,25 +217,23 @@ static TRef fwd_ahload(jit_State *J, IRRef xref)
}
ref = store->prev;
}
- if (ir->o == IR_TNEW && !irt_isnil(fins->t))
- return 0; /* Type instability in loop-carried dependency. */
- if (irt_ispri(fins->t)) {
- return TREF_PRI(irt_type(fins->t));
- } else if (irt_isnum(fins->t) || (LJ_DUALNUM && irt_isint(fins->t)) ||
- irt_isstr(fins->t)) {
+ /* Simplified here: let loop_unroll() figure out any type instability. */
+ if (ir->o == IR_TNEW) {
+ return TREF_NIL;
+ } else {
TValue keyv;
cTValue *tv;
IRIns *key = IR(xr->op2);
if (key->o == IR_KSLOT) key = IR(key->op1);
lj_ir_kvalue(J->L, &keyv, key);
tv = lj_tab_get(J->L, ir_ktab(IR(ir->op1)), &keyv);
- if (itype2irt(tv) != irt_type(fins->t))
- return 0; /* Type instability in loop-carried dependency. */
- if (irt_isnum(fins->t))
+ if (tvispri(tv))
+ return TREF_PRI(itype2irt(tv));
+ else if (tvisnum(tv))
return lj_ir_knum_u64(J, tv->u64);
- else if (LJ_DUALNUM && irt_isint(fins->t))
+ else if (tvisint(tv))
return lj_ir_kint(J, intV(tv));
- else
+ else if (tvisgcv(tv))
return lj_ir_kstr(J, strV(tv));
}
/* Otherwise: don't intern as a constant. */
@@ -464,18 +462,23 @@ doemit:
*/
static AliasRet aa_uref(IRIns *refa, IRIns *refb)
{
- if (refa->o != refb->o)
- return ALIAS_NO; /* Different UREFx type. */
if (refa->op1 == refb->op1) { /* Same function. */
if (refa->op2 == refb->op2)
return ALIAS_MUST; /* Same function, same upvalue idx. */
else
return ALIAS_NO; /* Same function, different upvalue idx. */
} else { /* Different functions, check disambiguation hash values. */
- if (((refa->op2 ^ refb->op2) & 0xff))
+ if (((refa->op2 ^ refb->op2) & 0xff)) {
return ALIAS_NO; /* Upvalues with different hash values cannot alias. */
- else
- return ALIAS_MAY; /* No conclusion can be drawn for same hash value. */
+ } else if (refa->o != refb->o) {
+ /* Different UREFx type, but need to confirm the UREFO really is open. */
+ if (irt_type(refa->t) == IRT_IGC) refa->t.irt += IRT_PGC-IRT_IGC;
+ else if (irt_type(refb->t) == IRT_IGC) refb->t.irt += IRT_PGC-IRT_IGC;
+ return ALIAS_NO;
+ } else {
+ /* No conclusion can be drawn for same hash value and same UREFx type. */
+ return ALIAS_MAY;
+ }
}
}
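Note, not part of the patch: the in-place type bump above pairs with merge_uref() in lj_opt_fold.c. An IRT_IGC UREFO carries no runtime open-ness check, so before concluding ALIAS_NO against a UREFC the reference is promoted to the checked IRT_PGC form, which makes the emitted code actually verify that the upvalue is still open.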
@@ -957,6 +960,8 @@ int lj_opt_fwd_wasnonnil(jit_State *J, IROpT loadop, IRRef xref)
if (skref == xkref || !irref_isk(skref) || !irref_isk(xkref))
return 0; /* A nil store with same const key or var key MAY alias. */
/* Different const keys CANNOT alias. */
+ } else if (irt_isp32(IR(skref)->t) != irt_isp32(IR(xkref)->t)) {
+ return 0; /* HREF and HREFK MAY alias. */
} /* Different key types CANNOT alias. */
} /* Other non-nil stores MAY alias. */
ref = store->prev;
diff --git a/src/lj_parse.c b/src/lj_parse.c
index 5d369c08..2d2a73a2 100644
--- a/src/lj_parse.c
+++ b/src/lj_parse.c
@@ -670,19 +670,20 @@ static void bcemit_store(FuncState *fs, ExpDesc *var, ExpDesc *e)
/* Emit method lookup expression. */
static void bcemit_method(FuncState *fs, ExpDesc *e, ExpDesc *key)
{
- BCReg idx, func, obj = expr_toanyreg(fs, e);
+ BCReg idx, func, fr2, obj = expr_toanyreg(fs, e);
expr_free(fs, e);
func = fs->freereg;
- bcemit_AD(fs, BC_MOV, func+1+LJ_FR2, obj); /* Copy object to 1st argument. */
+ fr2 = fs->ls->fr2;
+ bcemit_AD(fs, BC_MOV, func+1+fr2, obj); /* Copy object to 1st argument. */
lj_assertFS(expr_isstrk(key), "bad usage");
idx = const_str(fs, key);
if (idx <= BCMAX_C) {
- bcreg_reserve(fs, 2+LJ_FR2);
+ bcreg_reserve(fs, 2+fr2);
bcemit_ABC(fs, BC_TGETS, func, obj, idx);
} else {
- bcreg_reserve(fs, 3+LJ_FR2);
- bcemit_AD(fs, BC_KSTR, func+2+LJ_FR2, idx);
- bcemit_ABC(fs, BC_TGETV, func, obj, func+2+LJ_FR2);
+ bcreg_reserve(fs, 3+fr2);
+ bcemit_AD(fs, BC_KSTR, func+2+fr2, idx);
+ bcemit_ABC(fs, BC_TGETV, func, obj, func+2+fr2);
fs->freereg--;
}
e->u.s.info = func;
@@ -1336,9 +1337,12 @@ static void fs_fixup_bc(FuncState *fs, GCproto *pt, BCIns *bc, MSize n)
{
BCInsLine *base = fs->bcbase;
MSize i;
+ BCIns op;
pt->sizebc = n;
- bc[0] = BCINS_AD((fs->flags & PROTO_VARARG) ? BC_FUNCV : BC_FUNCF,
- fs->framesize, 0);
+ if (fs->ls->fr2 != LJ_FR2) op = BC_NOT; /* Mark non-native prototype. */
+ else if ((fs->flags & PROTO_VARARG)) op = BC_FUNCV;
+ else op = BC_FUNCF;
+ bc[0] = BCINS_AD(op, fs->framesize, 0);
for (i = 1; i < n; i++)
bc[i] = base[i].ins;
}
@@ -1981,11 +1985,11 @@ static void parse_args(LexState *ls, ExpDesc *e)
lj_assertFS(e->k == VNONRELOC, "bad expr type %d", e->k);
base = e->u.s.info; /* Base register for call. */
if (args.k == VCALL) {
- ins = BCINS_ABC(BC_CALLM, base, 2, args.u.s.aux - base - 1 - LJ_FR2);
+ ins = BCINS_ABC(BC_CALLM, base, 2, args.u.s.aux - base - 1 - ls->fr2);
} else {
if (args.k != VVOID)
expr_tonextreg(fs, &args);
- ins = BCINS_ABC(BC_CALL, base, 2, fs->freereg - base - LJ_FR2);
+ ins = BCINS_ABC(BC_CALL, base, 2, fs->freereg - base - ls->fr2);
}
expr_init(e, VCALL, bcemit_INS(fs, ins));
e->u.s.aux = base;
@@ -2025,7 +2029,7 @@ static void expr_primary(LexState *ls, ExpDesc *v)
parse_args(ls, v);
} else if (ls->tok == '(' || ls->tok == TK_string || ls->tok == '{') {
expr_tonextreg(fs, v);
- if (LJ_FR2) bcreg_reserve(fs, 1);
+ if (ls->fr2) bcreg_reserve(fs, 1);
parse_args(ls, v);
} else {
break;
@@ -2610,7 +2614,7 @@ static void parse_for_iter(LexState *ls, GCstr *indexname)
line = ls->linenumber;
assign_adjust(ls, 3, expr_list(ls, &e), &e);
/* The iterator needs another 3 [4] slots (func [pc] | state ctl). */
- bcreg_bump(fs, 3+LJ_FR2);
+ bcreg_bump(fs, 3+ls->fr2);
isnext = (nvars <= 5 && predict_next(ls, fs, exprpc));
var_add(ls, 3); /* Hidden control variables. */
lex_check(ls, TK_do);
diff --git a/src/lj_record.c b/src/lj_record.c
index 9d0021a6..b7af5896 100644
--- a/src/lj_record.c
+++ b/src/lj_record.c
@@ -976,6 +976,7 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults)
emitir(IRTG(IR_RETF, IRT_PGC), trpt, trpc);
J->retdepth++;
J->needsnap = 1;
+ J->scev.idx = REF_NIL;
lj_assertJ(J->baseslot == 1+LJ_FR2, "bad baseslot for return");
/* Shift result slots up and clear the slots of the new frame below. */
memmove(J->base + cbase, J->base-1-LJ_FR2, sizeof(TRef)*nresults);
@@ -1599,10 +1600,16 @@ TRef lj_record_idx(jit_State *J, RecordIndex *ix)
lj_assertJ(!hasmm, "inconsistent metamethod handling");
if (oldv == niltvg(J2G(J))) { /* Need to insert a new key. */
TRef key = ix->key;
- if (tref_isinteger(key)) /* NEWREF needs a TValue as a key. */
+ if (tref_isinteger(key)) { /* NEWREF needs a TValue as a key. */
key = emitir(IRTN(IR_CONV), key, IRCONV_NUM_INT);
- else if (tref_isnumber(key) && tref_isk(key) && tvismzero(&ix->keyv))
- key = lj_ir_knum_zero(J); /* Canonicalize -0.0 to +0.0. */
+ } else if (tref_isnum(key)) {
+ if (tref_isk(key)) {
+ if (tvismzero(&ix->keyv))
+ key = lj_ir_knum_zero(J); /* Canonicalize -0.0 to +0.0. */
+ } else {
+ emitir(IRTG(IR_EQ, IRT_NUM), key, key); /* Check for !NaN. */
+ }
+ }
xref = emitir(IRT(IR_NEWREF, IRT_PGC), ix->tab, key);
keybarrier = 0; /* NEWREF already takes care of the key barrier. */
#ifdef LUAJIT_ENABLE_TABLE_BUMP
@@ -1766,16 +1773,16 @@ noconstify:
/* Note: this effectively limits LJ_MAX_UPVAL to 127. */
uv = (uv << 8) | (hashrot(uvp->dhash, uvp->dhash + HASH_BIAS) & 0xff);
if (!uvp->closed) {
- uref = tref_ref(emitir(IRTG(IR_UREFO, IRT_PGC), fn, uv));
/* In current stack? */
if (uvval(uvp) >= tvref(J->L->stack) &&
uvval(uvp) < tvref(J->L->maxstack)) {
int32_t slot = (int32_t)(uvval(uvp) - (J->L->base - J->baseslot));
if (slot >= 0) { /* Aliases an SSA slot? */
+ uref = tref_ref(emitir(IRT(IR_UREFO, IRT_PGC), fn, uv));
emitir(IRTG(IR_EQ, IRT_PGC),
REF_BASE,
emitir(IRT(IR_ADD, IRT_PGC), uref,
- lj_ir_kint(J, (slot - 1 - LJ_FR2) * -8)));
+ lj_ir_kintpgc(J, (slot - 1 - LJ_FR2) * -8)));
slot -= (int32_t)J->baseslot; /* Note: slot number may be negative! */
if (val == 0) {
return getslot(J, slot);
@@ -1786,12 +1793,21 @@ noconstify:
}
}
}
+ /* IR_UREFO+IRT_IGC is not checked for open-ness at runtime.
+ ** Always marked as a guard, since it might get promoted to IRT_PGC later.
+ */
+ uref = emitir(IRTG(IR_UREFO, tref_isgcv(val) ? IRT_PGC : IRT_IGC), fn, uv);
+ uref = tref_ref(uref);
emitir(IRTG(IR_UGT, IRT_PGC),
emitir(IRT(IR_SUB, IRT_PGC), uref, REF_BASE),
- lj_ir_kint(J, (J->baseslot + J->maxslot) * 8));
+ lj_ir_kintpgc(J, (J->baseslot + J->maxslot) * 8));
} else {
+ /* If fn is constant, then so is the GCupval*, and the upvalue cannot
+ ** transition back to open, so no guard is required in this case.
+ */
+ IRType t = (tref_isk(fn) ? 0 : IRT_GUARD) | IRT_PGC;
+ uref = tref_ref(emitir(IRT(IR_UREFC, t), fn, uv));
needbarrier = 1;
- uref = tref_ref(emitir(IRTG(IR_UREFC, IRT_PGC), fn, uv));
}
if (val == 0) { /* Upvalue load */
IRType t = itype2irt(uvval(uvp));
@@ -1966,7 +1982,8 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults)
emitir(IRTGI(IR_EQ), fr,
lj_ir_kint(J, (int32_t)frame_ftsz(J->L->base-1)));
vbase = emitir(IRT(IR_SUB, IRT_IGC), REF_BASE, fr);
- vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase, lj_ir_kint(J, frofs-8*(1+LJ_FR2)));
+ vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase,
+ lj_ir_kintpgc(J, frofs-8*(1+LJ_FR2)));
for (i = 0; i < nload; i++) {
IRType t = itype2irt(&J->L->base[i-1-LJ_FR2-nvararg]);
J->base[dst+i] = lj_record_vload(J, vbase, (MSize)i, t);
@@ -1985,8 +2002,11 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults)
TRef tr = TREF_NIL;
ptrdiff_t idx = lj_ffrecord_select_mode(J, tridx, &J->L->base[dst-1]);
if (idx < 0) goto nyivarg;
- if (idx != 0 && !tref_isinteger(tridx))
+ if (idx != 0 && !tref_isinteger(tridx)) {
+ if (tref_isstr(tridx))
+ tridx = emitir(IRTG(IR_STRTO, IRT_NUM), tridx, 0);
tridx = emitir(IRTGI(IR_CONV), tridx, IRCONV_INT_NUM|IRCONV_INDEX);
+ }
if (idx != 0 && tref_isk(tridx)) {
emitir(IRTGI(idx <= nvararg ? IR_GE : IR_LT),
fr, lj_ir_kint(J, frofs+8*(int32_t)idx));
@@ -2014,7 +2034,7 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults)
IRType t;
TRef aref, vbase = emitir(IRT(IR_SUB, IRT_IGC), REF_BASE, fr);
vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase,
- lj_ir_kint(J, frofs-(8<<LJ_FR2)));
+ lj_ir_kintpgc(J, frofs-(8<<LJ_FR2)));
t = itype2irt(&J->L->base[idx-2-LJ_FR2-nvararg]);
aref = emitir(IRT(IR_AREF, IRT_PGC), vbase, tridx);
tr = lj_record_vload(J, aref, 0, t);
diff --git a/src/lj_snap.c b/src/lj_snap.c
index 68de208f..f3645e87 100644
--- a/src/lj_snap.c
+++ b/src/lj_snap.c
@@ -453,6 +453,7 @@ static TRef snap_replay_const(jit_State *J, IRIns *ir)
case IR_KNUM: case IR_KINT64:
return lj_ir_k64(J, (IROp)ir->o, ir_k64(ir)->u64);
case IR_KPTR: return lj_ir_kptr(J, ir_kptr(ir)); /* Continuation. */
+ case IR_KNULL: return lj_ir_knull(J, irt_type(ir->t));
default: lj_assertJ(0, "bad IR constant op %d", ir->o); return TREF_NIL;
}
}
@@ -557,13 +558,15 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
IRRef refp = snap_ref(sn);
IRIns *ir = &T->ir[refp];
if (regsp_reg(ir->r) == RID_SUNK) {
+ uint8_t m;
if (J->slot[snap_slot(sn)] != snap_slot(sn)) continue;
pass23 = 1;
lj_assertJ(ir->o == IR_TNEW || ir->o == IR_TDUP ||
ir->o == IR_CNEW || ir->o == IR_CNEWI,
"sunk parent IR %04d has bad op %d", refp - REF_BIAS, ir->o);
- if (ir->op1 >= T->nk) snap_pref(J, T, map, nent, seen, ir->op1);
- if (ir->op2 >= T->nk) snap_pref(J, T, map, nent, seen, ir->op2);
+ m = lj_ir_mode[ir->o];
+ if (irm_op1(m) == IRMref) snap_pref(J, T, map, nent, seen, ir->op1);
+ if (irm_op2(m) == IRMref) snap_pref(J, T, map, nent, seen, ir->op2);
if (LJ_HASFFI && ir->o == IR_CNEWI) {
if (LJ_32 && refp+1 < T->nins && (ir+1)->o == IR_HIOP)
snap_pref(J, T, map, nent, seen, (ir+1)->op2);
@@ -591,14 +594,16 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
IRIns *ir = &T->ir[refp];
if (regsp_reg(ir->r) == RID_SUNK) {
TRef op1, op2;
+ uint8_t m;
if (J->slot[snap_slot(sn)] != snap_slot(sn)) { /* De-dup allocs. */
J->slot[snap_slot(sn)] = J->slot[J->slot[snap_slot(sn)]];
continue;
}
op1 = ir->op1;
- if (op1 >= T->nk) op1 = snap_pref(J, T, map, nent, seen, op1);
+ m = lj_ir_mode[ir->o];
+ if (irm_op1(m) == IRMref) op1 = snap_pref(J, T, map, nent, seen, op1);
op2 = ir->op2;
- if (op2 >= T->nk) op2 = snap_pref(J, T, map, nent, seen, op2);
+ if (irm_op2(m) == IRMref) op2 = snap_pref(J, T, map, nent, seen, op2);
if (LJ_HASFFI && ir->o == IR_CNEWI) {
if (LJ_32 && refp+1 < T->nins && (ir+1)->o == IR_HIOP) {
lj_needsplit(J); /* Emit joining HIOP. */
@@ -624,9 +629,25 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
if (irr->o == IR_HREFK || irr->o == IR_AREF) {
IRIns *irf = &T->ir[irr->op1];
tmp = emitir(irf->ot, tmp, irf->op2);
+ } else if (irr->o == IR_NEWREF) {
+ IRRef allocref = tref_ref(tr);
+ IRRef keyref = tref_ref(key);
+ IRRef newref_ref = J->chain[IR_NEWREF];
+ IRIns *newref = &J->cur.ir[newref_ref];
+ lj_assertJ(irref_isk(keyref),
+ "sunk store for parent IR %04d with bad key %04d",
+ refp - REF_BIAS, keyref - REF_BIAS);
+ if (newref_ref > allocref && newref->op2 == keyref) {
+ lj_assertJ(newref->op1 == allocref,
+ "sunk store for parent IR %04d with bad tab %04d",
+ refp - REF_BIAS, allocref - REF_BIAS);
+ tmp = newref_ref;
+ goto skip_newref;
+ }
}
}
tmp = emitir(irr->ot, tmp, key);
+ skip_newref:
val = snap_pref(J, T, map, nent, seen, irs->op2);
if (val == 0) {
IRIns *irc = &T->ir[irs->op2];
@@ -882,9 +903,13 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex,
if (irk->o == IR_FREF) {
switch (irk->op2) {
case IRFL_TAB_META:
- snap_restoreval(J, T, ex, snapno, rfilt, irs->op2, &tmp);
- /* NOBARRIER: The table is new (marked white). */
- setgcref(t->metatable, obj2gco(tabV(&tmp)));
+ if (T->ir[irs->op2].o == IR_KNULL) {
+ setgcrefnull(t->metatable);
+ } else {
+ snap_restoreval(J, T, ex, snapno, rfilt, irs->op2, &tmp);
+ /* NOBARRIER: The table is new (marked white). */
+ setgcref(t->metatable, obj2gco(tabV(&tmp)));
+ }
break;
case IRFL_TAB_NOMM:
/* Negative metamethod cache invalidated by lj_tab_set() below. */
diff --git a/src/lj_state.c b/src/lj_state.c
index b45a2043..af17e4b5 100644
--- a/src/lj_state.c
+++ b/src/lj_state.c
@@ -102,20 +102,49 @@ void lj_state_shrinkstack(lua_State *L, MSize used)
/* Try to grow stack. */
void LJ_FASTCALL lj_state_growstack(lua_State *L, MSize need)
{
- MSize n;
- if (L->stacksize > LJ_STACK_MAXEX) /* Overflow while handling overflow? */
- lj_err_throw(L, LUA_ERRERR);
- n = L->stacksize + need;
- if (n > LJ_STACK_MAX) {
- n += 2*LUA_MINSTACK;
- } else if (n < 2*L->stacksize) {
- n = 2*L->stacksize;
- if (n >= LJ_STACK_MAX)
- n = LJ_STACK_MAX;
+ MSize n = L->stacksize + need;
+ if (LJ_LIKELY(n < LJ_STACK_MAX)) { /* The stack can grow as requested. */
+ if (n < 2 * L->stacksize) { /* Try to double the size. */
+ n = 2 * L->stacksize;
+ if (n > LJ_STACK_MAX)
+ n = LJ_STACK_MAX;
+ }
+ resizestack(L, n);
+ } else { /* Request would overflow. Raise a stack overflow error. */
+ if (LJ_HASJIT) {
+ TValue *base = tvref(G(L)->jit_base);
+ if (base) L->base = base;
+ }
+ if (curr_funcisL(L)) {
+ L->top = curr_topL(L);
+ if (L->top > tvref(L->maxstack)) {
+ /* The current Lua frame violates the stack, so replace it with a
+ ** dummy. This can happen when BC_IFUNCF is trying to grow the stack.
+ */
+ L->top = L->base;
+ setframe_gc(L->base - 1 - LJ_FR2, obj2gco(L), LJ_TTHREAD);
+ }
+ }
+ if (L->stacksize <= LJ_STACK_MAXEX) {
+ /* An error handler might want to inspect the stack overflow error, but
+ ** will need some stack space to run in. We give it a stack size beyond
+ ** the normal limit in order to do so, then rely on lj_state_relimitstack
+ ** calls during unwinding to bring us back to a conventional stack size.
+ ** The + 1 is space for the error message, and 2 * LUA_MINSTACK is for
+ ** the lj_state_checkstack() call in lj_err_run().
+ */
+ resizestack(L, LJ_STACK_MAX + 1 + 2 * LUA_MINSTACK);
+ lj_err_stkov(L); /* May invoke an error handler. */
+ } else {
+ /* If we're here, then the stack overflow error handler is requesting
+ ** to grow the stack even further. We have no choice but to abort the
+ ** error handler.
+ */
+ GCstr *em = lj_err_str(L, LJ_ERR_STKOV); /* Might OOM. */
+ setstrV(L, L->top++, em); /* There is always space to push an error. */
+ lj_err_throw(L, LUA_ERRERR); /* Does not invoke an error handler. */
+ }
}
- resizestack(L, n);
- if (L->stacksize >= LJ_STACK_MAXEX)
- lj_err_msg(L, LJ_ERR_STKOV);
}
void LJ_FASTCALL lj_state_growstack1(lua_State *L)
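/* Illustrative sketch, not part of the patch: the regular (non-overflow)
** growth policy above, with the error paths elided -- grow to at least
** cur+need, prefer doubling, and clamp at the hard limit.
*/
#include <stddef.h>

static size_t grow_size(size_t cur, size_t need, size_t max)
{
  size_t n = cur + need;  /* the caller has already checked n < max */
  if (n < 2*cur) {
    n = 2*cur;
    if (n > max) n = max;
  }
  return n;
}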
@@ -123,6 +152,18 @@ void LJ_FASTCALL lj_state_growstack1(lua_State *L)
lj_state_growstack(L, 1);
}
+static TValue *cpgrowstack(lua_State *co, lua_CFunction dummy, void *ud)
+{
+ UNUSED(dummy);
+ lj_state_growstack(co, *(MSize *)ud);
+ return NULL;
+}
+
+int LJ_FASTCALL lj_state_cpgrowstack(lua_State *L, MSize need)
+{
+ return lj_vm_cpcall(L, NULL, &need, cpgrowstack);
+}
+
/* Allocate basic stack for new state. */
static void stack_init(lua_State *L1, lua_State *L)
{
@@ -327,8 +368,11 @@ void LJ_FASTCALL lj_state_free(global_State *g, lua_State *L)
lj_assertG(L != mainthread(g), "free of main thread");
if (obj2gco(L) == gcref(g->cur_L))
setgcrefnull(g->cur_L);
- lj_func_closeuv(L, tvref(L->stack));
- lj_assertG(gcref(L->openupval) == NULL, "stale open upvalues");
+ if (gcref(L->openupval) != NULL) {
+ lj_func_closeuv(L, tvref(L->stack));
+ lj_trace_abort(g); /* For aa_uref soundness. */
+ lj_assertG(gcref(L->openupval) == NULL, "stale open upvalues");
+ }
lj_mem_freevec(g, tvref(L->stack), L->stacksize, TValue);
lj_mem_freet(g, L);
}
diff --git a/src/lj_state.h b/src/lj_state.h
index db67f03b..3850e5a1 100644
--- a/src/lj_state.h
+++ b/src/lj_state.h
@@ -18,6 +18,7 @@ LJ_FUNC void lj_state_relimitstack(lua_State *L);
LJ_FUNC void lj_state_shrinkstack(lua_State *L, MSize used);
LJ_FUNCA void LJ_FASTCALL lj_state_growstack(lua_State *L, MSize need);
LJ_FUNC void LJ_FASTCALL lj_state_growstack1(lua_State *L);
+LJ_FUNC int LJ_FASTCALL lj_state_cpgrowstack(lua_State *L, MSize need);
static LJ_AINLINE void lj_state_checkstack(lua_State *L, MSize need)
{
diff --git a/src/lj_strfmt_num.c b/src/lj_strfmt_num.c
index 79ec0263..c6e776aa 100644
--- a/src/lj_strfmt_num.c
+++ b/src/lj_strfmt_num.c
@@ -454,7 +454,8 @@ static char *lj_strfmt_wfnum(SBuf *sb, SFormat sf, lua_Number n, char *p)
prec--;
if (!i) {
if (ndlo == ndhi) { prec = 0; break; }
- lj_strfmt_wuint9(tail, nd[++ndlo]);
+ ndlo = (ndlo + 1) & 0x3f;
+ lj_strfmt_wuint9(tail, nd[ndlo]);
i = 9;
}
}
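/* Note, not part of the patch: the added mask makes the index advance
** consistent with nd[] being used as a 64-entry ring buffer -- the old
** "nd[++ndlo]" could step past the end of the array when the ndlo..ndhi
** digit window wrapped around. Sketch of the invariant:
*/
#define ND_NEXT(i) (((i) + 1) & 0x3f)  /* successor index in the ring */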
diff --git a/src/lj_target.h b/src/lj_target.h
index 09d19bd9..e7322c07 100644
--- a/src/lj_target.h
+++ b/src/lj_target.h
@@ -58,9 +58,13 @@ typedef uint32_t RegSP;
#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
typedef uint64_t RegSet;
#define RSET_BITS 6
+#define rset_picktop_(rs) ((Reg)lj_fls64(rs))
+#define rset_pickbot_(rs) ((Reg)lj_ffs64(rs))
#else
typedef uint32_t RegSet;
#define RSET_BITS 5
+#define rset_picktop_(rs) ((Reg)lj_fls(rs))
+#define rset_pickbot_(rs) ((Reg)lj_ffs(rs))
#endif
#define RID2RSET(r) (((RegSet)1) << (r))
@@ -71,13 +75,6 @@ typedef uint32_t RegSet;
#define rset_set(rs, r) (rs |= RID2RSET(r))
#define rset_clear(rs, r) (rs &= ~RID2RSET(r))
#define rset_exclude(rs, r) (rs & ~RID2RSET(r))
-#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
-#define rset_picktop_(rs) ((Reg)(__builtin_clzll(rs)^63))
-#define rset_pickbot_(rs) ((Reg)__builtin_ctzll(rs))
-#else
-#define rset_picktop_(rs) ((Reg)lj_fls(rs))
-#define rset_pickbot_(rs) ((Reg)lj_ffs(rs))
-#endif
/* -- Register allocation cost -------------------------------------------- */
diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h
index 65a14307..c34f1e59 100644
--- a/src/lj_target_arm64.h
+++ b/src/lj_target_arm64.h
@@ -234,6 +234,8 @@ typedef enum A64Ins {
A64I_MOVZx = 0xd2800000,
A64I_MOVNw = 0x12800000,
A64I_MOVNx = 0x92800000,
+ A64I_ADR = 0x10000000,
+ A64I_ADRP = 0x90000000,
A64I_LDRB = 0x39400000,
A64I_LDRH = 0x79400000,
diff --git a/src/lj_trace.c b/src/lj_trace.c
index f311d54b..a5e316e1 100644
--- a/src/lj_trace.c
+++ b/src/lj_trace.c
@@ -613,21 +613,27 @@ static int trace_abort(jit_State *J)
J->cur.link = 0;
J->cur.linktype = LJ_TRLINK_NONE;
lj_vmevent_send(L, TRACE,
- TValue *frame;
+ cTValue *bot = tvref(L->stack)+LJ_FR2;
+ cTValue *frame;
const BCIns *pc;
- GCfunc *fn;
+ BCPos pos = 0;
setstrV(L, L->top++, lj_str_newlit(L, "abort"));
setintV(L->top++, traceno);
/* Find original Lua function call to generate a better error message. */
- frame = J->L->base-1;
- pc = J->pc;
- while (!isluafunc(frame_func(frame))) {
- pc = (frame_iscont(frame) ? frame_contpc(frame) : frame_pc(frame)) - 1;
- frame = frame_prev(frame);
+ for (frame = J->L->base-1, pc = J->pc; ; frame = frame_prev(frame)) {
+ if (isluafunc(frame_func(frame))) {
+ pos = proto_bcpos(funcproto(frame_func(frame)), pc);
+ break;
+ } else if (frame_prev(frame) <= bot) {
+ break;
+ } else if (frame_iscont(frame)) {
+ pc = frame_contpc(frame) - 1;
+ } else {
+ pc = frame_pc(frame) - 1;
+ }
}
- fn = frame_func(frame);
- setfuncV(L, L->top++, fn);
- setintV(L->top++, proto_bcpos(funcproto(fn), pc));
+ setfuncV(L, L->top++, frame_func(frame));
+ setintV(L->top++, pos);
copyTV(L, L->top++, restorestack(L, errobj));
copyTV(L, L->top++, &J->errinfo);
);
@@ -922,7 +928,7 @@ int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr)
} else if (G(L)->gc.state == GCSatomic || G(L)->gc.state == GCSfinalize) {
if (!(G(L)->hookmask & HOOK_GC))
lj_gc_step(L); /* Exited because of GC: drive GC forward. */
- } else {
+ } else if ((J->flags & JIT_F_ON)) {
trace_hotside(J, pc);
}
/* Return MULTRES or 0 or -17. */
diff --git a/src/luajit_rolling.h b/src/luajit_rolling.h
index e564477a..2d04402c 100644
--- a/src/luajit_rolling.h
+++ b/src/luajit_rolling.h
@@ -76,4 +76,5 @@ LUA_API const char *luaJIT_profile_dumpstack(lua_State *L, const char *fmt,
/* Enforce (dynamic) linker error for version mismatches. Call from main. */
LUA_API void LUAJIT_VERSION_SYM(void);
+#error "DO NOT USE luajit_rolling.h -- only include build-generated luajit.h"
#endif
diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat
index 5a93d3e4..9a2e8b13 100644
--- a/src/msvcbuild.bat
+++ b/src/msvcbuild.bat
@@ -16,6 +16,7 @@
@rem Add more debug flags here, e.g. DEBUGCFLAGS=/DLUA_USE_APICHECK
@set DEBUGCFLAGS= /DLUA_USE_APICHECK /DLUA_USE_ASSERT /DLUAJIT_USE_SYSMALLOC /fsanitize=address
@set LJCOMPILE=cl /nologo /c /O2 /W3 /D_CRT_SECURE_NO_DEPRECATE /D_CRT_STDIO_INLINE=__declspec(dllexport)__inline /DLUAJIT_NUMMODE=2
+@set LJDYNBUILD=/MD /DLUA_BUILD_AS_DLL
@set LJLINK=link /nologo
@set LJMT=mt /nologo
@set LJLIB=lib /nologo /nodefaultlib
@@ -27,39 +28,52 @@
@set BUILDTYPE=release
@set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c lib_buffer.c
+@setlocal
+@call :SETHOSTVARS
%LJCOMPILE% host\minilua.c
@if errorlevel 1 goto :BAD
%LJLINK% /out:minilua.exe minilua.obj
@if errorlevel 1 goto :BAD
if exist minilua.exe.manifest^
%LJMT% -manifest minilua.exe.manifest -outputresource:minilua.exe
+@endlocal
-@set DASMFLAGS=-D WIN -D JIT -D FFI -D P64
+@set DASMFLAGS=-D WIN -D JIT -D FFI -D ENDIAN_LE -D FPU -D P64
@set LJARCH=x64
@minilua
-@if errorlevel 8 goto :X64
+@if errorlevel 8 goto :NO32
@set DASC=vm_x86.dasc
-@set DASMFLAGS=-D WIN -D JIT -D FFI
+@set DASMFLAGS=-D WIN -D JIT -D FFI -D ENDIAN_LE -D FPU
@set LJARCH=x86
@set LJCOMPILE=%LJCOMPILE% /arch:SSE2
+@goto :DA
+:NO32
+@if "%VSCMD_ARG_TGT_ARCH%" neq "arm64" goto :X64
+@set DASC=vm_arm64.dasc
+@set DASMTARGET=-D LUAJIT_TARGET=LUAJIT_ARCH_ARM64
+@set LJARCH=arm64
+@goto :DA
:X64
-@if "%1" neq "nogc64" goto :GC64
+@if "%1" neq "nogc64" goto :DA
@shift
@set DASC=vm_x86.dasc
@set LJCOMPILE=%LJCOMPILE% /DLUAJIT_DISABLE_GC64
-:GC64
+:DA
minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h %DASC%
@if errorlevel 1 goto :BAD
if exist ..\.git ( git show -s --format=%%ct >luajit_relver.txt ) else ( type ..\.relver >luajit_relver.txt )
minilua host\genversion.lua
-%LJCOMPILE% /I "." /I %DASMDIR% host\buildvm*.c
+@setlocal
+@call :SETHOSTVARS
+%LJCOMPILE% /I "." /I %DASMDIR% %DASMTARGET% host\buildvm*.c
@if errorlevel 1 goto :BAD
%LJLINK% /out:buildvm.exe buildvm*.obj
@if errorlevel 1 goto :BAD
if exist buildvm.exe.manifest^
%LJMT% -manifest buildvm.exe.manifest -outputresource:buildvm.exe
+@endlocal
buildvm -m peobj -o lj_vm.obj
@if errorlevel 1 goto :BAD
@@ -80,12 +94,13 @@ buildvm -m folddef -o lj_folddef.h lj_opt_fold.c
@shift
@set BUILDTYPE=debug
@set LJCOMPILE=%LJCOMPILE% /Od /Zi %DEBUGCFLAGS%
+@set LJDYNBUILD=/MDd /DLUA_BUILD_AS_DLL
@set LJLINK=%LJLINK% /opt:ref /opt:icf /incremental:no
:NODEBUG
@set LJLINK=%LJLINK% /%BUILDTYPE%
@if "%1"=="amalg" goto :AMALGDLL
@if "%1"=="static" goto :STATIC
-%LJCOMPILE% /MD /DLUA_BUILD_AS_DLL lj_*.c lib_*.c
+%LJCOMPILE% %LJDYNBUILD% lj_*.c lib_*.c
@if errorlevel 1 goto :BAD
%LJLINK% /DLL /out:%LJDLLNAME% lj_*.obj lib_*.obj
@if errorlevel 1 goto :BAD
@@ -97,7 +112,7 @@ buildvm -m folddef -o lj_folddef.h lj_opt_fold.c
@if errorlevel 1 goto :BAD
@goto :MTDLL
:AMALGDLL
-%LJCOMPILE% /MD /DLUA_BUILD_AS_DLL ljamalg.c
+%LJCOMPILE% %LJDYNBUILD% ljamalg.c
@if errorlevel 1 goto :BAD
%LJLINK% /DLL /out:%LJDLLNAME% ljamalg.obj lj_vm.obj
@if errorlevel 1 goto :BAD
@@ -118,6 +133,12 @@ if exist luajit.exe.manifest^
@echo.
@echo === Successfully built LuaJIT for Windows/%LJARCH%[%BUILDTYPE%] ===
+@goto :END
+:SETHOSTVARS
+@if "%VSCMD_ARG_HOST_ARCH%_%VSCMD_ARG_TGT_ARCH%" equ "x64_arm64" (
+ call "%VSINSTALLDIR%Common7\Tools\VsDevCmd.bat" -arch=%VSCMD_ARG_HOST_ARCH% -no_logo
+ echo on
+)
@goto :END
:BAD
@echo.
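Note, not part of the patch: with these changes msvcbuild.bat selects the ARM64 VM automatically when run from a Visual Studio Developer Command Prompt targeting arm64 (VSCMD_ARG_TGT_ARCH=arm64). The :SETHOSTVARS helper temporarily re-enters VsDevCmd.bat for the x64 host architecture so the intermediate minilua.exe and buildvm.exe remain runnable on the build machine, and LJDYNBUILD lets debug DLL builds use /MDd instead of the previously hard-coded /MD.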
diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc
index 0d1ea95f..fc08c658 100644
--- a/src/vm_arm.dasc
+++ b/src/vm_arm.dasc
@@ -1195,8 +1195,11 @@ static void build_subroutines(BuildCtx *ctx)
|//-- Base library: catch errors ----------------------------------------
|
|.ffunc pcall
+ | ldr RB, L->maxstack
+ | add INS, BASE, NARGS8:RC
| ldrb RA, [DISPATCH, #DISPATCH_GL(hookmask)]
| cmp NARGS8:RC, #8
+ | cmphs RB, INS
| blo ->fff_fallback
| tst RA, #HOOK_ACTIVE // Remember active hook before pcall.
| mov RB, BASE
@@ -1207,7 +1210,11 @@ static void build_subroutines(BuildCtx *ctx)
| b ->vm_call_dispatch
|
|.ffunc_2 xpcall
+ | ldr RB, L->maxstack
+ | add INS, BASE, NARGS8:RC
| ldrb RA, [DISPATCH, #DISPATCH_GL(hookmask)]
+ | cmp RB, INS
+ | blo ->fff_fallback
| checkfunc CARG4, ->fff_fallback // Traceback must be a function.
| mov RB, BASE
| strd CARG12, [BASE, #8] // Swap function and traceback.
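/* Illustrative sketch, not part of the patch: the guard added to the
** pcall/xpcall fast paths here and in the other VMs below. The call's
** argument window (nargs8 bytes above BASE) must end at or below
** L->maxstack; anything larger falls back to the stack-checking C
** handler instead of reading past the stack limit.
*/
#include <stdint.h>

static int args_fit(const char *base, uint64_t nargs8, const char *maxstack)
{
  return base + nargs8 <= maxstack;
}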
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc
index 698b4210..a6ce0507 100644
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -113,13 +113,37 @@
|
|.define TMPDofs, #24
|
+|.if WIN
+|// Windows unwind data is suited to r1 stored first.
+|.macro stp_unwind, r1, r2, where
+| stp r1, r2, where
+|.endmacro
+|.macro ldp_unwind, r1, r2, where
+| ldp r1, r2, where
+|.endmacro
+|.macro ldp_unwind, r1, r2, where, post_index
+| ldp r1, r2, where, post_index
+|.endmacro
+|.else
+|// Otherwise store r2 first for compact unwind info (OSX).
+|.macro stp_unwind, r1, r2, where
+| stp r2, r1, where
+|.endmacro
+|.macro ldp_unwind, r1, r2, where
+| ldp r2, r1, where
+|.endmacro
+|.macro ldp_unwind, r1, r2, where, post_index
+| ldp r2, r1, where, post_index
+|.endmacro
+|.endif
+|
|.macro save_, gpr1, gpr2, fpr1, fpr2
-| stp d..fpr2, d..fpr1, [sp, # SAVE_FPR_+(14-fpr1)*8]
-| stp x..gpr2, x..gpr1, [sp, # SAVE_GPR_+(27-gpr1)*8]
+| stp_unwind d..fpr1, d..fpr2, [sp, # SAVE_FPR_+(14-fpr1)*8]
+| stp_unwind x..gpr1, x..gpr2, [sp, # SAVE_GPR_+(27-gpr1)*8]
|.endmacro
|.macro rest_, gpr1, gpr2, fpr1, fpr2
-| ldp d..fpr2, d..fpr1, [sp, # SAVE_FPR_+(14-fpr1)*8]
-| ldp x..gpr2, x..gpr1, [sp, # SAVE_GPR_+(27-gpr1)*8]
+| ldp_unwind d..fpr1, d..fpr2, [sp, # SAVE_FPR_+(14-fpr1)*8]
+| ldp_unwind x..gpr1, x..gpr2, [sp, # SAVE_GPR_+(27-gpr1)*8]
|.endmacro
|
|.macro saveregs
@@ -127,14 +151,14 @@
| sub sp, sp, # CFRAME_SPACE
| stp fp, lr, [sp, # SAVE_FP_LR_]
| add fp, sp, # SAVE_FP_LR_
-| stp x20, x19, [sp, # SAVE_GPR_+(27-19)*8]
+| stp_unwind x19, x20, [sp, # SAVE_GPR_+(27-19)*8]
| save_ 21, 22, 8, 9
| save_ 23, 24, 10, 11
| save_ 25, 26, 12, 13
| save_ 27, 28, 14, 15
|.endmacro
|.macro restoreregs
-| ldp x20, x19, [sp, # SAVE_GPR_+(27-19)*8]
+| ldp_unwind x19, x20, [sp, # SAVE_GPR_+(27-19)*8]
| rest_ 21, 22, 8, 9
| rest_ 23, 24, 10, 11
| rest_ 25, 26, 12, 13
@@ -267,8 +291,17 @@
| blo target
|.endmacro
|
+|.macro init_constants
+| movn TISNIL, #0
+| movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
+| movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
+|.endmacro
+|
|.macro mov_false, reg; movn reg, #0x8000, lsl #32; .endmacro
|.macro mov_true, reg; movn reg, #0x0001, lsl #48; .endmacro
+|.macro mov_nil, reg; mov reg, TISNIL; .endmacro
+|.macro cmp_nil, reg; cmp reg, TISNIL; .endmacro
+|.macro add_TISNUM, dst, src; add dst, src, TISNUM; .endmacro
|
#define GL_J(field) (GG_G2J + (int)offsetof(jit_State, field))
|
@@ -406,26 +439,26 @@ static void build_subroutines(BuildCtx *ctx)
|
|->vm_unwind_c: // Unwind C stack, return from vm_pcall.
| // (void *cframe, int errcode)
+ | add fp, CARG1, # SAVE_FP_LR_
| mov sp, CARG1
| mov CRET1, CARG2
- |->vm_unwind_c_eh: // Landing pad for external unwinder.
| ldr L, SAVE_L
- | mv_vmstate TMP0w, C
| ldr GL, L->glref
+ |->vm_unwind_c_eh: // Landing pad for external unwinder.
+ | mv_vmstate TMP0w, C
| st_vmstate TMP0w
| b ->vm_leave_unw
|
|->vm_unwind_ff: // Unwind C stack, return from ff pcall.
| // (void *cframe)
- | and sp, CARG1, #CFRAME_RAWMASK
- |->vm_unwind_ff_eh: // Landing pad for external unwinder.
+ | add fp, CARG1, # SAVE_FP_LR_
+ | mov sp, CARG1
| ldr L, SAVE_L
- | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
- | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
- | movn TISNIL, #0
+ | init_constants
+ | ldr GL, L->glref // Setup pointer to global state.
+ |->vm_unwind_ff_eh: // Landing pad for external unwinder.
| mov RC, #16 // 2 results: false + error message.
| ldr BASE, L->base
- | ldr GL, L->glref // Setup pointer to global state.
| mov_false TMP0
| sub RA, BASE, #8 // Results start at BASE-8.
| ldr PC, [BASE, FRAME_PC] // Fetch PC of previous frame.
@@ -486,11 +519,9 @@ static void build_subroutines(BuildCtx *ctx)
| str L, GL->cur_L
| mov RA, BASE
| ldp BASE, CARG1, L->base
- | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
- | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
+ | init_constants
| ldr PC, [BASE, FRAME_PC]
| strb wzr, L->status
- | movn TISNIL, #0
| sub RC, CARG1, BASE
| ands CARG1, PC, #FRAME_TYPE
| add RC, RC, #8
@@ -526,10 +557,8 @@ static void build_subroutines(BuildCtx *ctx)
|3: // Entry point for vm_cpcall/vm_resume (BASE = base, PC = ftype).
| str L, GL->cur_L
| ldp RB, CARG1, L->base // RB = old base (for vmeta_call).
- | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
- | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
| add PC, PC, BASE
- | movn TISNIL, #0
+ | init_constants
| sub PC, PC, RB // PC = frame delta + frame type
| sub NARGS8:RC, CARG1, BASE
| st_vmstate ST_INTERP
@@ -638,7 +667,7 @@ static void build_subroutines(BuildCtx *ctx)
| b >1
|
|->vmeta_tgetb: // RB = table, RC = index
- | add RC, RC, TISNUM
+ | add_TISNUM RC, RC
| add CARG2, BASE, RB, lsl #3
| add CARG3, sp, TMPDofs
| str RC, TMPD
@@ -673,7 +702,7 @@ static void build_subroutines(BuildCtx *ctx)
| sxtw CARG2, TMP1w
| bl extern lj_tab_getinth // (GCtab *t, int32_t key)
| // Returns cTValue * or NULL.
- | mov TMP0, TISNIL
+ | mov_nil TMP0
| cbz CRET1, ->BC_TGETR_Z
| ldr TMP0, [CRET1]
| b ->BC_TGETR_Z
@@ -696,7 +725,7 @@ static void build_subroutines(BuildCtx *ctx)
| b >1
|
|->vmeta_tsetb: // RB = table, RC = index
- | add RC, RC, TISNUM
+ | add_TISNUM RC, RC
| add CARG2, BASE, RB, lsl #3
| add CARG3, sp, TMPDofs
| str RC, TMPD
@@ -1010,7 +1039,7 @@ static void build_subroutines(BuildCtx *ctx)
|1: // Field metatable must be at same offset for GCtab and GCudata!
| ldr TAB:RB, TAB:CARG1->metatable
|2:
- | mov CARG1, TISNIL
+ | mov_nil CARG1
| ldr STR:RC, GL->gcroot[GCROOT_MMNAME+MM_metatable]
| cbz TAB:RB, ->fff_restv
| ldr TMP1w, TAB:RB->hmask
@@ -1032,7 +1061,7 @@ static void build_subroutines(BuildCtx *ctx)
| movk CARG1, #(LJ_TTAB>>1)&0xffff, lsl #48
| b ->fff_restv
|5:
- | cmp TMP0, TISNIL
+ | cmp_nil TMP0
| bne ->fff_restv
| b <4
|
@@ -1132,8 +1161,8 @@ static void build_subroutines(BuildCtx *ctx)
| cbnz TAB:CARG2, ->fff_fallback
#endif
| mov RC, #(3+1)*8
- | stp CARG1, TISNIL, [BASE, #-8]
- | str CFUNC:CARG4, [BASE, #-16]
+ | stp CFUNC:CARG4, CARG1, [BASE, #-16]
+ | str TISNIL, [BASE]
| b ->fff_res
|
|.ffunc_2 ipairs_aux
@@ -1145,14 +1174,14 @@ static void build_subroutines(BuildCtx *ctx)
| add CARG2w, CARG2w, #1
| cmp CARG2w, TMP1w
| ldr PC, [BASE, FRAME_PC]
- | add TMP2, CARG2, TISNUM
+ | add_TISNUM TMP2, CARG2
| mov RC, #(0+1)*8
| str TMP2, [BASE, #-16]
| bhs >2 // Not in array part?
| ldr TMP0, [CARG3, CARG2, lsl #3]
|1:
| mov TMP1, #(2+1)*8
- | cmp TMP0, TISNIL
+ | cmp_nil TMP0
| str TMP0, [BASE, #-8]
| csel RC, RC, TMP1, eq
| b ->fff_res
@@ -1175,13 +1204,17 @@ static void build_subroutines(BuildCtx *ctx)
| cbnz TAB:CARG2, ->fff_fallback
#endif
| mov RC, #(3+1)*8
- | stp CARG1, TISNUM, [BASE, #-8]
- | str CFUNC:CARG4, [BASE, #-16]
+ | stp CFUNC:CARG4, CARG1, [BASE, #-16]
+ | str TISNUM, [BASE]
| b ->fff_res
|
|//-- Base library: catch errors ----------------------------------------
|
|.ffunc pcall
+ | ldr TMP1, L->maxstack
+ | add TMP2, BASE, NARGS8:RC
+ | cmp TMP1, TMP2
+ | blo ->fff_fallback
| cmp NARGS8:RC, #8
| ldrb TMP0w, GL->hookmask
| blo ->fff_fallback
@@ -1201,6 +1234,10 @@ static void build_subroutines(BuildCtx *ctx)
| b ->vm_call_dispatch
|
|.ffunc xpcall
+ | ldr TMP1, L->maxstack
+ | add TMP2, BASE, NARGS8:RC
+ | cmp TMP1, TMP2
+ | blo ->fff_fallback
| ldp CARG1, CARG2, [BASE]
| ldrb TMP0w, GL->hookmask
| subs NARGS8:TMP1, NARGS8:RC, #16
@@ -1366,7 +1403,7 @@ static void build_subroutines(BuildCtx *ctx)
| eor CARG2w, CARG1w, CARG1w, asr #31
| movz CARG3, #0x41e0, lsl #48 // 2^31.
| subs CARG1w, CARG2w, CARG1w, asr #31
- | add CARG1, CARG1, TISNUM
+ | add_TISNUM CARG1, CARG1
| csel CARG1, CARG1, CARG3, pl
| // Fallthrough.
|
@@ -1457,7 +1494,7 @@ static void build_subroutines(BuildCtx *ctx)
| ldr PC, [BASE, FRAME_PC]
| str d0, [BASE, #-16]
| mov RC, #(2+1)*8
- | add CARG2, CARG2, TISNUM
+ | add_TISNUM CARG2, CARG2
| str CARG2, [BASE, #-8]
| b ->fff_res
|
@@ -1523,7 +1560,7 @@ static void build_subroutines(BuildCtx *ctx)
| bne ->fff_fallback
| ldrb TMP0w, STR:CARG1[1] // Access is always ok (NUL at end).
| ldr CARG3w, STR:CARG1->len
- | add TMP0, TMP0, TISNUM
+ | add_TISNUM TMP0, TMP0
| str TMP0, [BASE, #-16]
| mov RC, #(0+1)*8
| cbz CARG3, ->fff_res
@@ -1669,17 +1706,17 @@ static void build_subroutines(BuildCtx *ctx)
|.ffunc_bit tobit
| mov TMP0w, CARG1w
|9: // Label reused by .ffunc_bit_op users.
- | add CARG1, TMP0, TISNUM
+ | add_TISNUM CARG1, TMP0
| b ->fff_restv
|
|.ffunc_bit bswap
| rev TMP0w, CARG1w
- | add CARG1, TMP0, TISNUM
+ | add_TISNUM CARG1, TMP0
| b ->fff_restv
|
|.ffunc_bit bnot
| mvn TMP0w, CARG1w
- | add CARG1, TMP0, TISNUM
+ | add_TISNUM CARG1, TMP0
| b ->fff_restv
|
|.macro .ffunc_bit_sh, name, ins, shmod
@@ -1700,7 +1737,7 @@ static void build_subroutines(BuildCtx *ctx)
| checkint CARG1, ->vm_tobit_fb
|2:
| ins TMP0w, CARG1w, TMP1w
- | add CARG1, TMP0, TISNUM
+ | add_TISNUM CARG1, TMP0
| b ->fff_restv
|.endmacro
|
@@ -1889,8 +1926,7 @@ static void build_subroutines(BuildCtx *ctx)
| and CARG3, CARG3, #LJ_GCVMASK
| beq >2
|1: // Move results down.
- | ldr CARG1, [RA]
- | add RA, RA, #8
+ | ldr CARG1, [RA], #8
| subs RB, RB, #8
| str CARG1, [BASE, RC, lsl #3]
| add RC, RC, #1
@@ -2005,13 +2041,11 @@ static void build_subroutines(BuildCtx *ctx)
|.if JIT
| ldr L, SAVE_L
|1:
+ | init_constants
| cmn CARG1w, #LUA_ERRERR
| bhs >9 // Check for error from exit.
- | lsl RC, CARG1, #3
| ldr LFUNC:CARG2, [BASE, FRAME_FUNC]
- | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
- | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
- | movn TISNIL, #0
+ | lsl RC, CARG1, #3
| and LFUNC:CARG2, CARG2, #LJ_GCVMASK
| str RCw, SAVE_MULTRES
| str BASE, L->base
@@ -2162,7 +2196,7 @@ static void build_subroutines(BuildCtx *ctx)
|//-----------------------------------------------------------------------
|
|// Handler for callback functions.
- |// Saveregs already performed. Callback slot number in [sp], g in r12.
+ |// Saveregs already performed. Callback slot number in w9, g in x10.
|->vm_ffi_callback:
|.if FFI
|.type CTSTATE, CTState, PC
@@ -2186,9 +2220,7 @@ static void build_subroutines(BuildCtx *ctx)
| bl extern lj_ccallback_enter // (CTState *cts, void *cf)
| // Returns lua_State *.
| ldp BASE, RC, L:CRET1->base
- | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
- | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
- | movn TISNIL, #0
+ | init_constants
| mov L, CRET1
| ldr LFUNC:CARG3, [BASE, FRAME_FUNC]
| sub RC, RC, BASE
@@ -2215,7 +2247,7 @@ static void build_subroutines(BuildCtx *ctx)
|.if FFI
| .type CCSTATE, CCallState, x19
| sp_auth
- | stp x20, CCSTATE, [sp, #-32]!
+ | stp_unwind CCSTATE, x20, [sp, #-32]!
| stp fp, lr, [sp, #16]
| add fp, sp, #16
| mov CCSTATE, x0
@@ -2247,7 +2279,7 @@ static void build_subroutines(BuildCtx *ctx)
| stp d0, d1, CCSTATE->fpr[0]
| stp d2, d3, CCSTATE->fpr[2]
| ldp fp, lr, [sp, #16]
- | ldp x20, CCSTATE, [sp], #32
+ | ldp_unwind CCSTATE, x20, [sp], #32
| ret_auth
|.endif
|// Note: vm_ffi_call must be the last function in this object file!
@@ -2567,7 +2599,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| bne >5
| negs TMP0w, TMP0w
| movz CARG3, #0x41e0, lsl #48 // 2^31.
- | add TMP0, TMP0, TISNUM
+ | add_TISNUM TMP0, TMP0
| csel TMP0, TMP0, CARG3, vc
|5:
| str TMP0, [BASE, RA, lsl #3]
@@ -2582,7 +2614,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| bne >2
| ldr CARG1w, STR:CARG1->len
|1:
- | add CARG1, CARG1, TISNUM
+ | add_TISNUM CARG1, CARG1
| str CARG1, [BASE, RA, lsl #3]
| ins_next
|
@@ -2690,7 +2722,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| intins CARG1w, CARG1w, CARG2w
| ins_arithfallback bvs
|.endif
- | add CARG1, CARG1, TISNUM
+ | add_TISNUM CARG1, CARG1
| str CARG1, [BASE, RA, lsl #3]
|4:
| ins_next
@@ -2783,7 +2815,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
case BC_KSHORT:
| // RA = dst, RC = int16_literal
| sxth RCw, RCw
- | add TMP0, RC, TISNUM
+ | add_TISNUM TMP0, RC
| str TMP0, [BASE, RA, lsl #3]
| ins_next
break;
@@ -3006,7 +3038,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| cmp TMP1w, CARG1w // In array part?
| bhs ->vmeta_tgetv
| ldr TMP0, [CARG3]
- | cmp TMP0, TISNIL
+ | cmp_nil TMP0
| beq >5
|1:
| str TMP0, [BASE, RA, lsl #3]
@@ -3049,7 +3081,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| ldr NODE:CARG3, NODE:CARG3->next
| cmp CARG1, CARG4
| bne >4
- | cmp TMP0, TISNIL
+ | cmp_nil TMP0
| beq >5
|3:
| str TMP0, [BASE, RA, lsl #3]
@@ -3058,7 +3090,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|4: // Follow hash chain.
| cbnz NODE:CARG3, <1
| // End of hash chain: key not found, nil result.
- | mov TMP0, TISNIL
+ | mov_nil TMP0
|
|5: // Check for __index if table value is nil.
| ldr TAB:CARG1, TAB:CARG2->metatable
@@ -3079,7 +3111,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| cmp RCw, CARG1w // In array part?
| bhs ->vmeta_tgetb
| ldr TMP0, [CARG3]
- | cmp TMP0, TISNIL
+ | cmp_nil TMP0
| beq >5
|1:
| str TMP0, [BASE, RA, lsl #3]
@@ -3126,7 +3158,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| ldr TMP1, [CARG3]
| ldr TMP0, [BASE, RA, lsl #3]
| ldrb TMP2w, TAB:CARG2->marked
- | cmp TMP1, TISNIL // Previous value is nil?
+ | cmp_nil TMP1 // Previous value is nil?
| beq >5
|1:
| str TMP0, [CARG3]
@@ -3178,7 +3210,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| cmp CARG1, CARG4
| bne >5
| ldr TMP0, [BASE, RA, lsl #3]
- | cmp TMP1, TISNIL // Previous value is nil?
+ | cmp_nil TMP1 // Previous value is nil?
| beq >4
|2:
| str TMP0, NODE:CARG3->val
@@ -3237,7 +3269,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| ldr TMP1, [CARG3]
| ldr TMP0, [BASE, RA, lsl #3]
| ldrb TMP2w, TAB:CARG2->marked
- | cmp TMP1, TISNIL // Previous value is nil?
+ | cmp_nil TMP1 // Previous value is nil?
| beq >5
|1:
| str TMP0, [CARG3]
@@ -3336,9 +3368,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|->BC_CALL_Z:
| mov RB, BASE // Save old BASE for vmeta_call.
| add BASE, BASE, RA, lsl #3
- | ldr CARG3, [BASE]
+ | ldr CARG3, [BASE], #16
| sub NARGS8:RC, NARGS8:RC, #8
- | add BASE, BASE, #16
| checkfunc CARG3, ->vmeta_call
| ins_call
break;
@@ -3354,9 +3385,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| // RA = base, (RB = 0,) RC = (nargs+1)*8
|->BC_CALLT1_Z:
| add RA, BASE, RA, lsl #3
- | ldr TMP1, [RA]
+ | ldr TMP1, [RA], #16
| sub NARGS8:RC, NARGS8:RC, #8
- | add RA, RA, #16
| checktp CARG3, TMP1, LJ_TFUNC, ->vmeta_callt
| ldr PC, [BASE, FRAME_PC]
|->BC_CALLT2_Z:
@@ -3436,10 +3466,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| add CARG3, CARG2, CARG1, lsl #3
| bhs >5 // Index points after array part?
| ldr TMP0, [CARG3]
- | cmp TMP0, TISNIL
+ | cmp_nil TMP0
| cinc CARG1, CARG1, eq // Skip holes in array part.
| beq <1
- | add CARG1, CARG1, TISNUM
+ | add_TISNUM CARG1, CARG1
| stp CARG1, TMP0, [RA]
| add CARG1, CARG1, #1
|3:
@@ -3457,7 +3487,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| add NODE:CARG3, NODE:RB, CARG1, lsl #3 // node = tab->node + idx*3*8
| bhi <4
| ldp TMP0, CARG1, NODE:CARG3->val
- | cmp TMP0, TISNIL
+ | cmp_nil TMP0
| add RC, RC, #1
| beq <6 // Skip holes in hash part.
| stp CARG1, TMP0, [RA]
@@ -3475,8 +3505,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| checkfunc CFUNC:CARG1, >5
| asr TMP0, TAB:CARG3, #47
| ldrb TMP1w, CFUNC:CARG1->ffid
- | cmn TMP0, #-LJ_TTAB
- | ccmp CARG4, TISNIL, #0, eq
+ | cmp_nil CARG4
+ | ccmn TMP0, #-LJ_TTAB, #0, eq
| ccmp TMP1w, #FF_next_N, #0, eq
| bne >5
| mov TMP0w, #0xfffe7fff // LJ_KEYINDEX
@@ -3516,51 +3546,51 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| and RC, RC, #255
| // RA = base, RB = (nresults+1), RC = numparams
| ldr TMP1, [BASE, FRAME_PC]
- | add RC, BASE, RC, lsl #3
- | add RA, BASE, RA, lsl #3
- | add RC, RC, #FRAME_VARG
- | add TMP2, RA, RB, lsl #3
- | sub RC, RC, TMP1 // RC = vbase
- | // Note: RC may now be even _above_ BASE if nargs was < numparams.
+ | add TMP0, BASE, RC, lsl #3
+ | add RC, BASE, RA, lsl #3 // RC = destination
+ | add TMP0, TMP0, #FRAME_VARG
+ | add TMP2, RC, RB, lsl #3
+ | sub RA, TMP0, TMP1 // RA = vbase
+ | // Note: RA may now be even _above_ BASE if nargs was < numparams.
| sub TMP3, BASE, #16 // TMP3 = vtop
| cbz RB, >5
| sub TMP2, TMP2, #16
|1: // Copy vararg slots to destination slots.
- | cmp RC, TMP3
- | ldr TMP0, [RC], #8
- | csel TMP0, TMP0, TISNIL, lo
- | cmp RA, TMP2
- | str TMP0, [RA], #8
+ | cmp RA, TMP3
+ | ldr TMP0, [RA], #8
+ | csinv TMP0, TMP0, xzr, lo // TISNIL = ~xzr
+ | cmp RC, TMP2
+ | str TMP0, [RC], #8
| blo <1
|2:
| ins_next
|
|5: // Copy all varargs.
| ldr TMP0, L->maxstack
- | subs TMP2, TMP3, RC
+ | subs TMP2, TMP3, RA
| csel RB, xzr, TMP2, le // MULTRES = (max(vtop-vbase,0)+1)*8
| add RB, RB, #8
- | add TMP1, RA, TMP2
+ | add TMP1, RC, TMP2
| str RBw, SAVE_MULTRES
| ble <2 // Nothing to copy.
| cmp TMP1, TMP0
| bhi >7
|6:
- | ldr TMP0, [RC], #8
- | str TMP0, [RA], #8
- | cmp RC, TMP3
+ | ldr TMP0, [RA], #8
+ | str TMP0, [RC], #8
+ | cmp RA, TMP3
| blo <6
| b <2
|
|7: // Grow stack for varargs.
| lsr CARG2, TMP2, #3
- | stp BASE, RA, L->base
+ | stp BASE, RC, L->base
| mov CARG1, L
- | sub RC, RC, BASE // Need delta, because BASE may change.
+ | sub RA, RA, BASE // Need delta, because BASE may change.
| str PC, SAVE_PC
| bl extern lj_state_growstack // (lua_State *L, int n)
- | ldp BASE, RA, L->base
- | add RC, BASE, RC
+ | ldp BASE, RC, L->base
+ | add RA, BASE, RA
| sub TMP3, BASE, #16
| b <6
break;
@@ -3704,7 +3734,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
} else {
| adds CARG1w, CARG1w, CARG3w
| bvs >2
- | add TMP0, CARG1, TISNUM
+ | add_TISNUM TMP0, CARG1
| tbnz CARG3w, #31, >4
| cmp CARG1w, CARG2w
}
@@ -3783,7 +3813,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| // RA = base, RC = target
| ldr CARG1, [BASE, RA, lsl #3]
| add TMP1, BASE, RA, lsl #3
- | cmp CARG1, TISNIL
+ | cmp_nil CARG1
| beq >1 // Stop if iterator returned nil.
if (op == BC_JITERL) {
| str CARG1, [TMP1, #-8]
@@ -3816,9 +3846,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|.if JIT
| // RA = base (ignored), RC = traceno
| ldr CARG1, [GL, #GL_J(trace)]
- | mov CARG2w, #0 // Traces on ARM64 don't store the trace #, so use 0.
+ | st_vmstate wzr // Traces on ARM64 don't store the trace #, so use 0.
| ldr TRACE:RC, [CARG1, RC, lsl #3]
- | st_vmstate CARG2w
|.if PAUTH
| ldr RA, TRACE:RC->mcauth
|.else
@@ -3893,6 +3922,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| add TMP2, BASE, RC
| add LFUNC:CARG3, CARG3, TMP0, lsl #47
| add RA, RA, RC
+ | sub CARG1, CARG1, #8
| add TMP0, RC, #16+FRAME_VARG
| str LFUNC:CARG3, [TMP2], #8 // Store (tagged) copy of LFUNC.
| ldr KBASE, [PC, #-4+PC2PROTO(k)]
diff --git a/src/vm_mips.dasc b/src/vm_mips.dasc
index f276745c..8760a1f6 100644
--- a/src/vm_mips.dasc
+++ b/src/vm_mips.dasc
@@ -1374,9 +1374,13 @@ static void build_subroutines(BuildCtx *ctx)
|//-- Base library: catch errors ----------------------------------------
|
|.ffunc pcall
+ | lw TMP1, L->maxstack
+ | addu TMP2, BASE, NARGS8:RC
| lbu TMP3, DISPATCH_GL(hookmask)(DISPATCH)
| beqz NARGS8:RC, ->fff_fallback
- | move TMP2, BASE
+ |. sltu AT, TMP1, TMP2
+ | bnez AT, ->fff_fallback
+ |. move TMP2, BASE
| addiu BASE, BASE, 8
| // Remember active hook before pcall.
| srl TMP3, TMP3, HOOK_ACTIVE_SHIFT
@@ -1386,8 +1390,12 @@ static void build_subroutines(BuildCtx *ctx)
|. addiu NARGS8:RC, NARGS8:RC, -8
|
|.ffunc xpcall
+ | lw TMP1, L->maxstack
+ | addu TMP2, BASE, NARGS8:RC
| sltiu AT, NARGS8:RC, 16
| lw CARG4, 8+HI(BASE)
+ | sltu TMP1, TMP1, TMP2
+ | or AT, AT, TMP1
| bnez AT, ->fff_fallback
|. lw CARG3, 8+LO(BASE)
| lw CARG1, LO(BASE)
diff --git a/src/vm_mips64.dasc b/src/vm_mips64.dasc
index 6c215f2b..a8d20413 100644
--- a/src/vm_mips64.dasc
+++ b/src/vm_mips64.dasc
@@ -1415,8 +1415,12 @@ static void build_subroutines(BuildCtx *ctx)
|//-- Base library: catch errors ----------------------------------------
|
|.ffunc pcall
+ | ld TMP1, L->maxstack
+ | daddu TMP2, BASE, NARGS8:RC
+ | sltu AT, TMP1, TMP2
+ | bnez AT, ->fff_fallback
+ |. lbu TMP3, DISPATCH_GL(hookmask)(DISPATCH)
| daddiu NARGS8:RC, NARGS8:RC, -8
- | lbu TMP3, DISPATCH_GL(hookmask)(DISPATCH)
| bltz NARGS8:RC, ->fff_fallback
|. move TMP2, BASE
| daddiu BASE, BASE, 16
@@ -1437,8 +1441,12 @@ static void build_subroutines(BuildCtx *ctx)
|. nop
|
|.ffunc xpcall
+ | ld TMP1, L->maxstack
+ | daddu TMP2, BASE, NARGS8:RC
+ | sltu AT, TMP1, TMP2
+ | bnez AT, ->fff_fallback
+ |. ld CARG1, 0(BASE)
| daddiu NARGS8:TMP0, NARGS8:RC, -16
- | ld CARG1, 0(BASE)
| ld CARG2, 8(BASE)
| bltz NARGS8:TMP0, ->fff_fallback
|. lbu TMP1, DISPATCH_GL(hookmask)(DISPATCH)
@@ -5396,6 +5404,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| settp LFUNC:RB, TMP0
| daddu TMP0, RA, RC
| sd LFUNC:RB, 0(TMP1) // Store (tagged) copy of LFUNC.
+ | daddiu TMP2, TMP2, -8
| daddiu TMP3, RC, 16+FRAME_VARG
| sltu AT, TMP0, TMP2
| ld KBASE, -4+PC2PROTO(k)(PC)
diff --git a/src/vm_ppc.dasc b/src/vm_ppc.dasc
index f2e5a08f..abcc03e5 100644
--- a/src/vm_ppc.dasc
+++ b/src/vm_ppc.dasc
@@ -1735,8 +1735,12 @@ static void build_subroutines(BuildCtx *ctx)
|//-- Base library: catch errors ----------------------------------------
|
|.ffunc pcall
+ | lwz TMP1, L->maxstack
+ | add TMP2, BASE, NARGS8:RC
| cmplwi NARGS8:RC, 8
| lbz TMP3, DISPATCH_GL(hookmask)(DISPATCH)
+ | cmplw cr1, TMP1, TMP2
+ | cror 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
| blt ->fff_fallback
| mr TMP2, BASE
| la BASE, 8(BASE)
@@ -1747,14 +1751,19 @@ static void build_subroutines(BuildCtx *ctx)
| b ->vm_call_dispatch
|
|.ffunc xpcall
+ | lwz TMP1, L->maxstack
+ | add TMP2, BASE, NARGS8:RC
| cmplwi NARGS8:RC, 16
| lwz CARG3, 8(BASE)
+ | cmplw cr1, TMP1, TMP2
|.if FPU
| lfd FARG2, 8(BASE)
+ | cror 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
| lfd FARG1, 0(BASE)
|.else
| lwz CARG1, 0(BASE)
| lwz CARG2, 4(BASE)
+ | cror 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
| lwz CARG4, 12(BASE)
|.endif
| blt ->fff_fallback
diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc
index 3635ba28..8c46ea59 100644
--- a/src/vm_x64.dasc
+++ b/src/vm_x64.dasc
@@ -1463,6 +1463,9 @@ static void build_subroutines(BuildCtx *ctx)
|//-- Base library: catch errors ----------------------------------------
|
|.ffunc_1 pcall
+ | mov L:RB, SAVE_L
+ | lea RA, [BASE+NARGS:RD*8]
+ | cmp RA, L:RB->maxstack; ja ->fff_fallback
| lea RA, [BASE+16]
| sub NARGS:RDd, 1
| mov PCd, 16+FRAME_PCALL
@@ -1481,6 +1484,9 @@ static void build_subroutines(BuildCtx *ctx)
| jmp ->vm_call_dispatch
|
|.ffunc_2 xpcall
+ | mov L:RB, SAVE_L
+ | lea RA, [BASE+NARGS:RD*8]
+ | cmp RA, L:RB->maxstack; ja ->fff_fallback
| mov LFUNC:RA, [BASE+8]
| checktp_nc LFUNC:RA, LJ_TFUNC, ->fff_fallback
| mov LFUNC:RB, [BASE] // Swap function and traceback.
diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc
index c44a24ff..9c5ae384 100644
--- a/src/vm_x86.dasc
+++ b/src/vm_x86.dasc
@@ -1369,7 +1369,7 @@ static void build_subroutines(BuildCtx *ctx)
| mov LFUNC:RB, [RA-8]
| add NARGS:RD, 1
| // This is fragile. L->base must not move, KBASE must always be defined.
- |.if x64
+ |.if X64
| cmp KBASEa, rdx // Continue with CALLT if flag set.
|.else
| cmp KBASE, BASE // Continue with CALLT if flag set.
@@ -1793,6 +1793,9 @@ static void build_subroutines(BuildCtx *ctx)
|//-- Base library: catch errors ----------------------------------------
|
|.ffunc_1 pcall
+ | mov L:RB, SAVE_L
+ | lea RA, [BASE+NARGS:RD*8]
+ | cmp RA, L:RB->maxstack; ja ->fff_fallback
| lea RA, [BASE+8]
| sub NARGS:RD, 1
| mov PC, 8+FRAME_PCALL
@@ -1804,6 +1807,9 @@ static void build_subroutines(BuildCtx *ctx)
| jmp ->vm_call_dispatch
|
|.ffunc_2 xpcall
+ | mov L:RB, SAVE_L
+ | lea RA, [BASE+NARGS:RD*8]
+ | cmp RA, L:RB->maxstack; ja ->fff_fallback
| cmp dword [BASE+12], LJ_TFUNC; jne ->fff_fallback
| mov RB, [BASE+4] // Swap function and traceback.
| mov [BASE+12], RB