From 03b03ef68315b40d75c1888a933391744e19b359 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 6 Sep 2016 19:37:30 +0200 Subject: [PATCH 01/94] Windows/x86: Add MSVC flags for debug build with exception interop. --- src/msvcbuild.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat index f977a249..4334bbde 100644 --- a/src/msvcbuild.bat +++ b/src/msvcbuild.bat @@ -67,7 +67,7 @@ buildvm -m folddef -o lj_folddef.h lj_opt_fold.c @if "%1" neq "debug" goto :NODEBUG @shift @set LJCOMPILE=%LJCOMPILE% /Zi -@set LJLINK=%LJLINK% /debug +@set LJLINK=%LJLINK% /debug /opt:ref /opt:icf /incremental:no :NODEBUG @if "%1"=="amalg" goto :AMALGDLL @if "%1"=="static" goto :STATIC From 9910dedae4d0fab1518ff25036e339dee16f48c5 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 19 Sep 2016 21:17:58 +0200 Subject: [PATCH 02/94] Initialize uv->immutable for upvalues of loaded chunks. Thanks to Peter Cawley. --- src/lj_func.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lj_func.c b/src/lj_func.c index 431a56d7..3f6626b3 100644 --- a/src/lj_func.c +++ b/src/lj_func.c @@ -140,7 +140,9 @@ GCfunc *lj_func_newL_empty(lua_State *L, GCproto *pt, GCtab *env) /* NOBARRIER: The GCfunc is new (marked white). */ for (i = 0; i < nuv; i++) { GCupval *uv = func_emptyuv(L); - uv->dhash = (uint32_t)(uintptr_t)pt ^ ((uint32_t)proto_uv(pt)[i] << 24); + int32_t v = proto_uv(pt)[i]; + uv->immutable = ((v / PROTO_UV_IMMUTABLE) & 1); + uv->dhash = (uint32_t)(uintptr_t)pt ^ (v << 24); setgcref(fn->l.uvptr[i], obj2gco(uv)); } fn->l.nupvalues = (uint8_t)nuv; From 8ada57eb49fb03d4d1c3cb37e534b97fbb92a5e6 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 19 Sep 2016 21:22:19 +0200 Subject: [PATCH 03/94] Looks like COLORTERM has gone out of fashion. --- src/jit/dump.lua | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/jit/dump.lua b/src/jit/dump.lua index 7b776422..f70926ab 100644 --- a/src/jit/dump.lua +++ b/src/jit/dump.lua @@ -644,7 +644,8 @@ end local function dumpon(opt, outfile) if active then dumpoff() end - local colormode = os.getenv("COLORTERM") and "A" or "T" + local term = os.getenv("TERM") + local colormode = (term and term:match("color") or os.getenv("COLORTERM")) and "A" or "T" if opt then opt = gsub(opt, "[TAH]", function(mode) colormode = mode; return ""; end) end From fcc824489914bd4f23c3a49ff9f32fb23cd53c48 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 2 Oct 2016 14:24:04 +0200 Subject: [PATCH 04/94] ARM: Fix BLX encoding for Thumb interworking calls. Thanks to Charles Baylis. --- src/lj_emit_arm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_emit_arm.h b/src/lj_emit_arm.h index 7f73698d..610bc6c5 100644 --- a/src/lj_emit_arm.h +++ b/src/lj_emit_arm.h @@ -273,7 +273,7 @@ static void emit_call(ASMState *as, void *target) ptrdiff_t delta = ((char *)target - (char *)p) - 8; if ((((delta>>2) + 0x00800000) >> 24) == 0) { if ((delta & 1)) - *p = ARMI_BLX | ((uint32_t)(delta>>2) & 0x00ffffffu) | ((delta&2) << 27); + *p = ARMI_BLX | ((uint32_t)(delta>>2) & 0x00ffffffu) | ((delta&2) << 23); else *p = ARMI_BL | ((uint32_t)(delta>>2) & 0x00ffffffu); } else { /* Target out of range: need indirect call. But don't use R0-R3. */ From cf80edbbbade52a842eb70ab4c5ad2b61cf152df Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 2 Oct 2016 14:33:31 +0200 Subject: [PATCH 05/94] Fix -jp=a mode for builtins. 
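
For builtins the sample key has no trailing ":line" part, so the
k:match("^(.*):(%d+)$") below yields nil and the files table would then
be indexed with a nil key; the added fallback charges such samples to
the raw key at line 0.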
--- src/jit/p.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/src/jit/p.lua b/src/jit/p.lua index 09b3b9fe..c9b6f307 100644 --- a/src/jit/p.lua +++ b/src/jit/p.lua @@ -156,6 +156,7 @@ local function prof_annotate(count1, samples) ms = math.max(ms, v) if pct >= prof_min then local file, line = k:match("^(.*):(%d+)$") + if not file then file = k; line = 0 end local fl = files[file] if not fl then fl = {}; files[file] = fl; files[#files+1] = file end line = tonumber(line) From 63465fe71d2605499ce5f3355db3629fb8283cb2 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 8 Oct 2016 11:30:01 +0200 Subject: [PATCH 06/94] LJ_GC64: Fix jit.on/off. --- src/lj_dispatch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_dispatch.c b/src/lj_dispatch.c index e5aa495d..ad70cba6 100644 --- a/src/lj_dispatch.c +++ b/src/lj_dispatch.c @@ -267,7 +267,7 @@ int luaJIT_setmode(lua_State *L, int idx, int mode) case LUAJIT_MODE_FUNC: case LUAJIT_MODE_ALLFUNC: case LUAJIT_MODE_ALLSUBFUNC: { - cTValue *tv = idx == 0 ? frame_prev(L->base-1) : + cTValue *tv = idx == 0 ? frame_prev(L->base-1)-LJ_FR2 : idx > 0 ? L->base + (idx-1) : L->top + idx; GCproto *pt; if ((idx == 0 || tvisfunc(tv)) && isluafunc(&gcval(tv)->fn)) From 54b78e7c66bff326a0223aac1fd5373515ca3014 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 12 Oct 2016 17:36:45 +0200 Subject: [PATCH 07/94] LJ_GC64: Various fixes. Contributed by Peter Cawley. --- src/lj_record.c | 2 ++ src/lj_target_x86.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lj_record.c b/src/lj_record.c index 76699a9f..48018f42 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -1765,6 +1765,8 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults) int32_t numparams = J->pt->numparams; ptrdiff_t nvararg = frame_delta(J->L->base-1) - numparams - 1 - LJ_FR2; lua_assert(frame_isvarg(J->L->base-1)); + if (LJ_FR2 && dst > J->maxslot) + J->base[dst-1] = 0; /* Prevent resurrection of unrelated slot. */ if (J->framedepth > 0) { /* Simple case: varargs defined on-trace. */ ptrdiff_t i; if (nvararg < 0) nvararg = 0; diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h index d5429597..c7be59ad 100644 --- a/src/lj_target_x86.h +++ b/src/lj_target_x86.h @@ -31,7 +31,7 @@ enum { FPRDEF(RIDENUM) /* Floating-point registers (FPRs). */ RID_MAX, RID_MRM = RID_MAX, /* Pseudo-id for ModRM operand. */ - RID_RIP = RID_MAX+1, /* Pseudo-id for RIP (x64 only). */ + RID_RIP = RID_MAX+5, /* Pseudo-id for RIP (x64 only), rm bits = 5. */ /* Calling conventions. */ RID_SP = RID_ESP, From a68c4118572529e0223cad3d4f2d214a54b1ab7a Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 13 Oct 2016 18:36:39 +0200 Subject: [PATCH 08/94] Fix GC step size calculation. Thanks to Igor Ehrlich. --- src/lj_gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_gc.c b/src/lj_gc.c index c5ff1f04..2c447c79 100644 --- a/src/lj_gc.c +++ b/src/lj_gc.c @@ -307,7 +307,7 @@ static size_t propagatemark(global_State *g) if (gc_traverse_tab(g, t) > 0) black2gray(o); /* Keep weak tables gray. */ return sizeof(GCtab) + sizeof(TValue) * t->asize + - sizeof(Node) * (t->hmask + 1); + (t->hmask ? sizeof(Node) * (t->hmask + 1) : 0); } else if (LJ_LIKELY(gct == ~LJ_TFUNC)) { GCfunc *fn = gco2func(o); gc_traverse_func(g, fn); From 6a25014c1c33448cabdc013ccb9e5c4fc98a0238 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 16 Oct 2016 21:04:38 +0200 Subject: [PATCH 09/94] LJ_FR2: Fix slot 1 handling. Contributed by Peter Cawley. 
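
Background: with LJ_FR2 a call frame occupies two stack slots and slot 1
holds the upper half of the base frame, not a live TValue. The changes
below stop asserting that slot 1 is never seen, snapshot it only when a
tailcall has turned it into a frame marker (as SNAP_FRAME|SNAP_NORESTORE
with REF_NIL), and replay that special snapshot entry as an empty slot.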
--- src/jit/dump.lua | 2 ++ src/lj_record.c | 4 ++-- src/lj_snap.c | 16 ++++++++++++---- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/jit/dump.lua b/src/jit/dump.lua index fbadccec..a8bc2af2 100644 --- a/src/jit/dump.lua +++ b/src/jit/dump.lua @@ -338,6 +338,8 @@ local function formatk(tr, idx, sn) elseif t == 21 then -- int64_t s = sub(tostring(k), 1, -3) if sub(s, 1, 1) ~= "-" then s = "+"..s end + elseif sn == 0x1057fff then -- SNAP(1, SNAP_FRAME | SNAP_NORESTORE, REF_NIL) + return "----" -- Special case for LJ_FR2 slot 1. else s = tostring(k) -- For primitives. end diff --git a/src/lj_record.c b/src/lj_record.c index 48018f42..a858ffa9 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -105,7 +105,7 @@ static void rec_check_slots(jit_State *J) lua_assert(tref_isfunc(tr)); #if LJ_FR2 } else if (s == 1) { - lua_assert(0); + lua_assert((tr & ~TREF_FRAME) == 0); #endif } else if ((tr & TREF_FRAME)) { GCfunc *fn = gco2func(frame_gc(tv)); @@ -747,7 +747,7 @@ void lj_record_tailcall(jit_State *J, BCReg func, ptrdiff_t nargs) } /* Move func + args down. */ if (LJ_FR2 && J->baseslot == 2) - J->base[func+1] = 0; + J->base[func+1] = TREF_FRAME; memmove(&J->base[-1-LJ_FR2], &J->base[func], sizeof(TRef)*(J->maxslot+1+LJ_FR2)); /* Note: the new TREF_FRAME is now at J->base[-1] (even for slot #0). */ /* Tailcalls can form a loop, so count towards the loop unroll limit. */ diff --git a/src/lj_snap.c b/src/lj_snap.c index 48259972..8ca6deb7 100644 --- a/src/lj_snap.c +++ b/src/lj_snap.c @@ -69,9 +69,13 @@ static MSize snapshot_slots(jit_State *J, SnapEntry *map, BCReg nslots) TRef tr = J->slot[s]; IRRef ref = tref_ref(tr); #if LJ_FR2 - if (s == 1) continue; + if (s == 1) { /* Ignore slot 1 in LJ_FR2 mode, except if tailcalled. */ + if ((tr & TREF_FRAME)) + map[n++] = SNAP(1, SNAP_FRAME | SNAP_NORESTORE, REF_NIL); + continue; + } if ((tr & (TREF_FRAME | TREF_CONT)) && !ref) { - TValue *base = J->L->base - J->baseslot; + cTValue *base = J->L->base - J->baseslot; tr = J->slot[s] = (tr & 0xff0000) | lj_ir_k64(J, IR_KNUM, base[s].u64); ref = tref_ref(tr); } @@ -470,7 +474,11 @@ void lj_snap_replay(jit_State *J, GCtrace *T) goto setslot; bloomset(seen, ref); if (irref_isk(ref)) { - tr = snap_replay_const(J, ir); + /* See special treatment of LJ_FR2 slot 1 in snapshot_slots() above. */ + if (LJ_FR2 && (sn == SNAP(1, SNAP_FRAME | SNAP_NORESTORE, REF_NIL))) + tr = 0; + else + tr = snap_replay_const(J, ir); } else if (!regsp_used(ir->prev)) { pass23 = 1; lua_assert(s != 0); @@ -484,7 +492,7 @@ void lj_snap_replay(jit_State *J, GCtrace *T) } setslot: J->slot[s] = tr | (sn&(SNAP_CONT|SNAP_FRAME)); /* Same as TREF_* flags. */ - J->framedepth += ((sn & (SNAP_CONT|SNAP_FRAME)) && s); + J->framedepth += ((sn & (SNAP_CONT|SNAP_FRAME)) && (s != LJ_FR2)); if ((sn & SNAP_FRAME)) J->baseslot = s+1; } From bdcaf4bfd97d61461a5bc892d5fed3b0ac7ff256 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 19 Oct 2016 09:48:38 +0200 Subject: [PATCH 10/94] LJ_GC64: Fix HREF for pointers. Contributed by Peter Cawley. 
--- src/lj_asm_x86.h | 11 +++++++++++ src/lj_tab.c | 1 - 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 50784daa..7931ffb5 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -1246,7 +1246,18 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) #endif } else { emit_rr(as, XO_MOV, tmp, key); +#if LJ_GC64 + emit_gri(as, XG_ARITHi(XOg_XOR), dest, irt_toitype(kt) << 15); + if ((as->flags & JIT_F_BMI2)) { + emit_i8(as, 32); + emit_mrm(as, XV_RORX|VEX_64, dest, key); + } else { + emit_shifti(as, XOg_SHR|REX_64, dest, 32); + emit_rr(as, XO_MOV, dest|REX_64, key|REX_64); + } +#else emit_rmro(as, XO_LEA, dest, key, HASH_BIAS); +#endif } } } diff --git a/src/lj_tab.c b/src/lj_tab.c index 8011212f..71be0a90 100644 --- a/src/lj_tab.c +++ b/src/lj_tab.c @@ -28,7 +28,6 @@ static LJ_AINLINE Node *hashmask(const GCtab *t, uint32_t hash) #define hashlohi(t, lo, hi) hashmask((t), hashrot((lo), (hi))) #define hashnum(t, o) hashlohi((t), (o)->u32.lo, ((o)->u32.hi << 1)) -#define hashptr(t, p) hashlohi((t), u32ptr(p), u32ptr(p) + HASH_BIAS) #if LJ_GC64 #define hashgcref(t, r) \ hashlohi((t), (uint32_t)gcrefu(r), (uint32_t)(gcrefu(r) >> 32)) From 716f2daef8019ce53d75d2c376c74b8f478fd5c5 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 20 Oct 2016 20:55:12 +0200 Subject: [PATCH 11/94] LJ_GC64: Various followup fixes. Contributed by Peter Cawley. --- src/lj_asm_x86.h | 17 +++++++++-------- src/lj_record.c | 2 ++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 7931ffb5..1b94371e 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -1247,14 +1247,15 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) } else { emit_rr(as, XO_MOV, tmp, key); #if LJ_GC64 - emit_gri(as, XG_ARITHi(XOg_XOR), dest, irt_toitype(kt) << 15); - if ((as->flags & JIT_F_BMI2)) { - emit_i8(as, 32); - emit_mrm(as, XV_RORX|VEX_64, dest, key); - } else { - emit_shifti(as, XOg_SHR|REX_64, dest, 32); - emit_rr(as, XO_MOV, dest|REX_64, key|REX_64); - } + checkmclim(as); + emit_gri(as, XG_ARITHi(XOg_XOR), dest, irt_toitype(kt) << 15); + if ((as->flags & JIT_F_BMI2)) { + emit_i8(as, 32); + emit_mrm(as, XV_RORX|VEX_64, dest, key); + } else { + emit_shifti(as, XOg_SHR|REX_64, dest, 32); + emit_rr(as, XO_MOV, dest|REX_64, key|REX_64); + } #else emit_rmro(as, XO_LEA, dest, key, HASH_BIAS); #endif diff --git a/src/lj_record.c b/src/lj_record.c index a858ffa9..448db0cf 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -2263,6 +2263,8 @@ void lj_record_ins(jit_State *J) rc = lj_ir_kint(J, (int32_t)(int16_t)rc); break; case BC_KNIL: + if (LJ_FR2 && ra > J->maxslot) + J->base[ra-1] = 0; while (ra <= rc) J->base[ra++] = TREF_NIL; if (rc >= J->maxslot) J->maxslot = rc+1; From 7a58a8fb3d3d5808c54d096ab772113bf9024ae8 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 13 Nov 2016 20:03:01 +0100 Subject: [PATCH 12/94] Report parent of stitched trace. Thanks to Nick Zavaritsky. --- src/jit/dump.lua | 2 +- src/jit/v.lua | 2 +- src/lj_trace.c | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/jit/dump.lua b/src/jit/dump.lua index a8bc2af2..1adf7095 100644 --- a/src/jit/dump.lua +++ b/src/jit/dump.lua @@ -556,7 +556,7 @@ local function dump_trace(what, tr, func, pc, otr, oex) if what == "start" then if dumpmode.H then out:write('
<pre class="ljdump">\n') end
     out:write("---- TRACE ", tr, " ", what)
-    if otr then out:write(" ", otr, "/", oex) end
+    if otr then out:write(" ", otr, "/", oex == -1 and "stitch" or oex) end
     out:write(" ", fmtfunc(func, pc), "\n")
   elseif what == "stop" or what == "abort" then
     out:write("---- TRACE ", tr, " ", what)
diff --git a/src/jit/v.lua b/src/jit/v.lua
index 60c8b05a..b07ec7c0 100644
--- a/src/jit/v.lua
+++ b/src/jit/v.lua
@@ -99,7 +99,7 @@ end
 local function dump_trace(what, tr, func, pc, otr, oex)
   if what == "start" then
     startloc = fmtfunc(func, pc)
-    startex = otr and "("..otr.."/"..oex..") " or ""
+    startex = otr and "("..otr.."/"..(oex == -1 and "stitch" or oex)..") " or ""
   else
     if what == "abort" then
       local loc = fmtfunc(func, pc)
diff --git a/src/lj_trace.c b/src/lj_trace.c
index 87146832..11e54d97 100644
--- a/src/lj_trace.c
+++ b/src/lj_trace.c
@@ -446,6 +446,12 @@ static void trace_start(jit_State *J)
     if (J->parent) {
       setintV(L->top++, J->parent);
       setintV(L->top++, J->exitno);
+    } else {
+      BCOp op = bc_op(*J->pc);
+      if (op == BC_CALLM || op == BC_CALL || op == BC_ITERC) {
+	setintV(L->top++, J->exitno);  /* Parent of stitched trace. */
+	setintV(L->top++, -1);
+      }
     }
   );
   lj_record_setup(J);
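
With this change the start line of a stitched trace names its parent,
e.g. something like (trace numbers and location invented for
illustration):

  ---- TRACE 12 start 11/stitch app.lua:42

The pseudo exit number -1 is rendered as "stitch" by dump.lua and v.lua.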

From 5400c1e42469cdb3cb5df691baa877b762b27704 Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Wed, 16 Nov 2016 11:18:10 +0100
Subject: [PATCH 13/94] MIPS: Fix TSETR barrier.

Thanks to tongwell.
---
 src/vm_mips.dasc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/vm_mips.dasc b/src/vm_mips.dasc
index 6f5a83dd..7dc42905 100644
--- a/src/vm_mips.dasc
+++ b/src/vm_mips.dasc
@@ -4317,7 +4317,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  ins_next2
     |
     |7:  // Possible table write barrier for the value. Skip valiswhite check.
-    |  barrierback TAB:RB, TMP3, TMP0, <2
+    |  barrierback TAB:CARG2, TMP3, TMP0, <2
     break;
 
   case BC_TSETM:

From e577db52c543303543c9e30e8ebe0c244e1b85c8 Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Sat, 19 Nov 2016 19:53:46 +0100
Subject: [PATCH 14/94] Increase range of GG_State loads via IR_FLOAD with
 REF_NIL.

Require 32 bit alignment and store offset/4 instead.
Otherwise this can overflow the 10 bit limit for the FOLD op2 key.
---
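A sketch of the new contract, using the names from the diffs below (not
a verbatim quote of any one hunk):

  lua_assert((ofs & 3) == 0);  /* GG_State field must be 32 bit aligned. */
  ofs >>= 2;                   /* The IR op2 field stores the offset/4... */
  lua_assert(ofs >= IRFL__MAX && ofs <= 0x3ff);  /* ...to fit the 10 bit FOLD key. */
  /* Backends recover the byte offset as (ir->op2 << 2), minus their base bias. */

This grows the reachable range from 1023 to 4092 bytes into GG_State.
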
 src/lj_asm_mips.h | 2 +-
 src/lj_asm_ppc.h  | 2 +-
 src/lj_asm_x86.h  | 4 ++--
 src/lj_ir.c       | 6 ++++--
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h
index cf446346..0ae5e287 100644
--- a/src/lj_asm_mips.h
+++ b/src/lj_asm_mips.h
@@ -901,7 +901,7 @@ static void asm_fload(ASMState *as, IRIns *ir)
   int32_t ofs;
   if (ir->op1 == REF_NIL) {
     idx = RID_JGL;
-    ofs = ir->op2 - 32768;
+    ofs = (ir->op2 << 2) - 32768;
   } else {
     idx = ra_alloc1(as, ir->op1, RSET_GPR);
     if (ir->op2 == IRFL_TAB_ARRAY) {
diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h
index 46821515..1ac882ca 100644
--- a/src/lj_asm_ppc.h
+++ b/src/lj_asm_ppc.h
@@ -809,7 +809,7 @@ static void asm_fload(ASMState *as, IRIns *ir)
   int32_t ofs;
   if (ir->op1 == REF_NIL) {
     idx = RID_JGL;
-    ofs = ir->op2 - 32768;
+    ofs = (ir->op2 << 2) - 32768;
   } else {
     idx = ra_alloc1(as, ir->op1, RSET_GPR);
     if (ir->op2 == IRFL_TAB_ARRAY) {
diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h
index 1b94371e..381eac9c 100644
--- a/src/lj_asm_x86.h
+++ b/src/lj_asm_x86.h
@@ -234,10 +234,10 @@ static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow)
   as->mrm.idx = RID_NONE;
   if (ir->op1 == REF_NIL) {
 #if LJ_GC64
-    as->mrm.ofs = (int32_t)ir->op2 - GG_OFS(dispatch);
+    as->mrm.ofs = (int32_t)(ir->op2 << 2) - GG_OFS(dispatch);
     as->mrm.base = RID_DISPATCH;
 #else
-    as->mrm.ofs = (int32_t)ir->op2 + ptr2addr(J2GG(as->J));
+    as->mrm.ofs = (int32_t)(ir->op2 << 2) + ptr2addr(J2GG(as->J));
     as->mrm.base = RID_NONE;
 #endif
     return;
diff --git a/src/lj_ir.c b/src/lj_ir.c
index 87fd0f4d..c5c521be 100644
--- a/src/lj_ir.c
+++ b/src/lj_ir.c
@@ -145,10 +145,12 @@ TRef lj_ir_call(jit_State *J, IRCallID id, ...)
   return emitir(CCI_OPTYPE(ci), tr, id);
 }
 
-/* Load field of type t from GG_State + offset. */
+/* Load field of type t from GG_State + offset. Must be 32 bit aligned. */
 LJ_FUNC TRef lj_ir_ggfload(jit_State *J, IRType t, uintptr_t ofs)
 {
-  lua_assert(ofs >= IRFL__MAX && ofs < REF_BIAS);
+  lua_assert((ofs & 3) == 0);
+  ofs >>= 2;
+  lua_assert(ofs >= IRFL__MAX && ofs <= 0x3ff);  /* 10 bit FOLD key limit. */
   lj_ir_set(J, IRT(IR_FLOAD, t), REF_NIL, ofs);
   return lj_opt_fold(J);
 }

From 202713a63808856b07136d7639324a6dd548d37e Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Sat, 19 Nov 2016 20:53:31 +0100
Subject: [PATCH 15/94] Fix amalgamated build.

---
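The #if guard on the declaration is brought in line with the conditions
under which lj_state.c actually defines lj_state_newstate(); the
amalgamated build, which compiles everything as a single translation
unit, evidently trips over the mismatch otherwise.
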
 src/lj_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lj_state.h b/src/lj_state.h
index e128d321..1e5c8b34 100644
--- a/src/lj_state.h
+++ b/src/lj_state.h
@@ -28,7 +28,7 @@ static LJ_AINLINE void lj_state_checkstack(lua_State *L, MSize need)
 
 LJ_FUNC lua_State *lj_state_new(lua_State *L);
 LJ_FUNC void LJ_FASTCALL lj_state_free(global_State *g, lua_State *L);
-#if LJ_64
+#if LJ_64 && !LJ_GC64 && !(defined(LUAJIT_USE_VALGRIND) && defined(LUAJIT_USE_SYSMALLOC))
 LJ_FUNC lua_State *lj_state_newstate(lua_Alloc f, void *ud);
 #endif
 

From 13642b75ac37957d9e2a37b35ebec69d6d4b3bc1 Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Sun, 20 Nov 2016 22:14:09 +0100
Subject: [PATCH 16/94] Whitespace.

---
 src/lj_strfmt_num.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lj_strfmt_num.c b/src/lj_strfmt_num.c
index 04769258..7b33f930 100644
--- a/src/lj_strfmt_num.c
+++ b/src/lj_strfmt_num.c
@@ -138,7 +138,7 @@ static uint32_t nd_mul2k(uint32_t* nd, uint32_t ndhi, uint32_t k,
     }
     if (carry_in) {
       nd[++ndhi] = carry_in; carry_in = 0;
-      if(start++ == ndlo) ++ndlo;
+      if (start++ == ndlo) ++ndlo;
     }
     k -= ND_MUL2K_MAX_SHIFT;
   }

From 04b60707d7d117da22b40736a353e2a10179108a Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Sun, 20 Nov 2016 22:16:08 +0100
Subject: [PATCH 17/94] ARM64: Add JIT compiler backend.

Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
Sponsored by Cisco Systems, Inc.
---
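This is a GC64-only port: LJ_TARGET_GC64 was already set in lj_arch.h,
and the LJ_ARCH_NOJIT placeholder is removed below. The patch adds the
instruction emitter (lj_emit_arm64.h), the IR assembler (lj_asm_arm64.h),
GDB JIT support, and a standalone dis_arm64.lua disassembler covering
most user-mode AArch64 instructions (Advanced SIMD and VFP are NYI, per
the module header).
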
 Makefile              |    4 +-
 src/jit/dis_arm64.lua | 1215 +++++++++++++++++++++++++++
 src/lj_arch.h         |    1 -
 src/lj_asm.c          |    4 +
 src/lj_asm_arm64.h    | 1823 +++++++++++++++++++++++++++++++++++++++++
 src/lj_ccall.c        |    2 +-
 src/lj_dispatch.h     |    1 +
 src/lj_emit_arm64.h   |  397 +++++++++
 src/lj_gdbjit.c       |   12 +
 src/lj_target.h       |    4 +-
 src/lj_target_arm64.h |  221 ++++-
 src/vm_arm64.dasc     |  227 ++++-
 12 files changed, 3887 insertions(+), 24 deletions(-)
 create mode 100644 src/jit/dis_arm64.lua
 create mode 100644 src/lj_asm_arm64.h
 create mode 100644 src/lj_emit_arm64.h

diff --git a/Makefile b/Makefile
index 6dfbbde4..5e640d94 100644
--- a/Makefile
+++ b/Makefile
@@ -86,8 +86,8 @@ FILE_MAN= luajit.1
 FILE_PC= luajit.pc
 FILES_INC= lua.h lualib.h lauxlib.h luaconf.h lua.hpp luajit.h
 FILES_JITLIB= bc.lua bcsave.lua dump.lua p.lua v.lua zone.lua \
-	      dis_x86.lua dis_x64.lua dis_arm.lua dis_ppc.lua \
-	      dis_mips.lua dis_mipsel.lua vmdef.lua
+	      dis_x86.lua dis_x64.lua dis_arm.lua dis_arm64.lua \
+	      dis_ppc.lua dis_mips.lua dis_mipsel.lua vmdef.lua
 
 ifeq (,$(findstring Windows,$(OS)))
   HOST_SYS:= $(shell uname -s)
diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua
new file mode 100644
index 00000000..909b33bc
--- /dev/null
+++ b/src/jit/dis_arm64.lua
@@ -0,0 +1,1215 @@
+----------------------------------------------------------------------------
+-- LuaJIT ARM64 disassembler module.
+--
+-- Copyright (C) 2005-2016 Mike Pall. All rights reserved.
+-- Released under the MIT license. See Copyright Notice in luajit.h
+--
+-- Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+-- Sponsored by Cisco Systems, Inc.
+----------------------------------------------------------------------------
+-- This is a helper module used by the LuaJIT machine code dumper module.
+--
+-- It disassembles most user-mode AArch64 instructions.
+-- NYI: Advanced SIMD and VFP instructions.
+------------------------------------------------------------------------------
+
+local type, tonumber = type, tonumber
+local sub, byte, format = string.sub, string.byte, string.format
+local match, gmatch, gsub = string.match, string.gmatch, string.gsub
+local rep = string.rep
+local concat = table.concat
+local bit = require("bit")
+local band, bor, bxor, tohex = bit.band, bit.bor, bit.bxor, bit.tohex
+local lshift, rshift, arshift = bit.lshift, bit.rshift, bit.arshift
+local ror = bit.ror
+
+------------------------------------------------------------------------------
+-- Opcode maps
+------------------------------------------------------------------------------
+
+local map_adr = { -- PC-relative addressing.
+  shift = 31, mask = 1,
+  [0] = "adrDBx", "adrpDBx"
+}
+
+local map_addsubi = { -- Add/subtract immediate.
+  shift = 29, mask = 3,
+  [0] = "add|movDNIg", "adds|cmnD0NIg", "subDNIg", "subs|cmpD0NIg",
+}
+
+local map_logi = { -- Logical immediate.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 22, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "andDNig", "orr|movDN0ig", "eorDNig", "ands|tstD0Nig"
+    },
+    false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = "andDNig", "orr|movDN0ig", "eorDNig", "ands|tstD0Nig"
+  }
+}
+
+local map_movwi = { -- Move wide immediate.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 22, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "movnDWRg", false, "movz|movDYRg", "movkDWRg"
+    }, false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = "movnDWRg", false, "movz|movDYRg", "movkDWRg"
+  },
+}
+
+local map_bitf = { -- Bitfield.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 22, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "sbfm|sbfiz|sbfx|asr|sxtw|sxth|sxtbDN12w",
+      "bfm|bfi|bfxilDN13w",
+      "ubfm|ubfiz|ubfx|lsr|lsl|uxth|uxtbDN12w"
+    }
+  },
+  {
+    shift = 22, mask = 1,
+    {
+      shift = 29, mask = 3,
+      [0] = "sbfm|sbfiz|sbfx|asr|sxtw|sxth|sxtbDN12x",
+      "bfm|bfi|bfxilDN13x",
+      "ubfm|ubfiz|ubfx|lsr|lsl|uxth|uxtbDN12x"
+    }
+  }
+}
+
+local map_datai = { -- Data processing - immediate.
+  shift = 23, mask = 7,
+  [0] = map_adr, map_adr, map_addsubi, false,
+  map_logi, map_movwi, map_bitf,
+  {
+    shift = 15, mask = 0x1c0c1,
+    [0] = "extr|rorDNM4w", [0x10080] = "extr|rorDNM4x",
+    [0x10081] = "extr|rorDNM4x"
+  }
+}
+
+local map_logsr = { -- Logical, shifted register.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 15, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = {
+	shift = 21, mask = 7,
+	[0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
+	"andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
+      },
+      {
+	shift = 21, mask = 7,
+	[0] ="orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
+	     "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
+      },
+      {
+	shift = 21, mask = 7,
+	[0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
+	"eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
+      },
+      {
+	shift = 21, mask = 7,
+	[0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
+	"ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
+      }
+    },
+    false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = {
+      shift = 21, mask = 7,
+      [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
+      "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
+      "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
+      "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
+      "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
+    }
+  }
+}
+
+local map_assh = {
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 15, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = {
+	shift = 22, mask = 3,
+	[0] = "addDNMSg", "addDNMSg", "addDNMSg", "addDNMg"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "adds|cmnD0NMSg", "adds|cmnD0NMSg",
+	      "adds|cmnD0NMSg", "adds|cmnD0NMg"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0Mg"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0MzSg",
+	      "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0Mzg"
+      },
+    },
+    false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "addDNMSg", "addDNMSg", "addDNMSg", "addDNMg"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "adds|cmnD0NMSg", "adds|cmnD0NMSg", "adds|cmnD0NMSg",
+	    "adds|cmnD0NMg"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0Mg"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0MzSg",
+	    "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0Mzg"
+    }
+  }
+}
+
+local map_addsubsh = { -- Add/subtract, shifted register.
+  shift = 22, mask = 3,
+  [0] = map_assh, map_assh, map_assh
+}
+
+local map_addsubex = { -- Add/subtract, extended register.
+  shift = 22, mask = 3,
+  [0] = {
+    shift = 29, mask = 3,
+    [0] = "addDNMXg", "adds|cmnD0NMXg", "subDNMXg", "subs|cmpD0NMzXg",
+  }
+}
+
+local map_addsubc = { -- Add/subtract, with carry.
+  shift = 10, mask = 63,
+  [0] = {
+    shift = 29, mask = 3,
+    [0] = "adcDNMg", "adcsDNMg", "sbc|ngcDN0Mg", "sbcs|ngcsDN0Mg",
+  }
+}
+
+local map_ccomp = {
+  shift = 4, mask = 1,
+  [0] = {
+    shift = 10, mask = 3,
+    [0] = { -- Conditional compare register.
+      shift = 29, mask = 3,
+      "ccmnNMVCg", false, "ccmpNMVCg",
+    },
+    [2] = {  -- Conditional compare immediate.
+      shift = 29, mask = 3,
+      "ccmnN5VCg", false, "ccmpN5VCg",
+    }
+  }
+}
+
+local map_csel = { -- Conditional select.
+  shift = 11, mask = 1,
+  [0] = {
+    shift = 10, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "cselDNMzCg", false, "csinv|cinv|csetmDNMcg", false,
+    },
+    {
+      shift = 29, mask = 3,
+      [0] = "csinc|cinc|csetDNMcg", false, "csneg|cnegDNMcg", false,
+    }
+  }
+}
+
+local map_data1s = { -- Data processing, 1 source.
+  shift = 29, mask = 1,
+  [0] = {
+    shift = 31, mask = 1,
+    [0] = {
+      shift = 10, mask = 0x7ff,
+      [0] = "rbitDNg", "rev16DNg", "revDNw", false, "clzDNg", "clsDNg"
+    },
+    {
+      shift = 10, mask = 0x7ff,
+      [0] = "rbitDNg", "rev16DNg", "rev32DNx", "revDNx", "clzDNg", "clsDNg"
+    }
+  }
+}
+
+local map_data2s = { -- Data processing, 2 sources.
+  shift = 29, mask = 1,
+  [0] = {
+    shift = 10, mask = 63,
+    false, "udivDNMg", "sdivDNMg", false, false, false, false, "lslDNMg",
+    "lsrDNMg", "asrDNMg", "rorDNMg"
+  }
+}
+
+local map_data3s = { -- Data processing, 3 sources.
+  shift = 29, mask = 7,
+  [0] = {
+    shift = 21, mask = 7,
+    [0] = {
+      shift = 15, mask = 1,
+      [0] = "madd|mulDNMA0g", "msub|mnegDNMA0g"
+    }
+  }, false, false, false,
+  {
+    shift = 15, mask = 1,
+    [0] = {
+      shift = 21, mask = 7,
+      [0] = "madd|mulDNMA0g", "smaddl|smullDxNMwA0x", "smulhDNMx", false,
+      false, "umaddl|umullDxNMwA0x", "umulhDNMx"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "msub|mnegDNMA0g", "smsubl|smneglDxNMwA0x", false, false,
+      false, "umsubl|umneglDxNMwA0x"
+    }
+  }
+}
+
+local map_datar = { -- Data processing, register.
+  shift = 28, mask = 1,
+  [0] = {
+    shift = 24, mask = 1,
+    [0] = map_logsr,
+    {
+      shift = 21, mask = 1,
+      [0] = map_addsubsh, map_addsubex
+    }
+  },
+  {
+    shift = 21, mask = 15,
+    [0] = map_addsubc, false, map_ccomp, false, map_csel, false,
+    {
+      shift = 30, mask = 1,
+      [0] = map_data2s, map_data1s
+    },
+    false, map_data3s, map_data3s, map_data3s, map_data3s, map_data3s,
+    map_data3s, map_data3s, map_data3s
+  }
+}
+
+local map_lrl = { -- Load register, literal.
+  shift = 26, mask = 1,
+  [0] = {
+    shift = 30, mask = 3,
+    [0] = "ldrDwB", "ldrDxB", "ldrswDxB"
+  },
+  {
+    shift = 30, mask = 3,
+    [0] = "ldrDsB", "ldrDdB"
+  }
+}
+
+local map_lsriind = { -- Load/store register, immediate pre/post-indexed.
+  shift = 30, mask = 3,
+  [0] = {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strbDwzL", "ldrbDwzL", "ldrsbDxzL", "ldrsbDwzL"
+    }
+  },
+  {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strhDwzL", "ldrhDwzL", "ldrshDxzL", "ldrshDwzL"
+    }
+  },
+  {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strDwzL", "ldrDwzL", "ldrswDxzL"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "strDszL", "ldrDszL"
+    }
+  },
+  {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strDxzL", "ldrDxzL"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "strDdzL", "ldrDdzL"
+    }
+  }
+}
+
+local map_lsriro = {
+  shift = 21, mask = 1,
+  [0] = {  -- Load/store register immediate.
+    shift = 10, mask = 3,
+    [0] = { -- Unscaled immediate.
+      shift = 26, mask = 1,
+      [0] = {
+	shift = 30, mask = 3,
+	[0] = {
+	  shift = 22, mask = 3,
+	  [0] = "sturbDwK", "ldurbDwK"
+	},
+	{
+	  shift = 22, mask = 3,
+	  [0] = "sturhDwK", "ldurhDwK"
+	},
+	{
+	  shift = 22, mask = 3,
+	  [0] = "sturDwK", "ldurDwK"
+	},
+	{
+	  shift = 22, mask = 3,
+	  [0] = "sturDxK", "ldurDxK"
+	}
+      }
+    }, map_lsriind, false, map_lsriind
+  },
+  {  -- Load/store register, register offset.
+    shift = 10, mask = 3,
+    [2] = {
+      shift = 26, mask = 1,
+      [0] = {
+	shift = 30, mask = 3,
+	[1] = {
+	  shift = 22, mask = 3,
+	  [0] = "strhDwO", "ldrhDwO", "ldrshDwO", "ldrshDxO"
+	},
+	[2] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDwO", "ldrDwO", "ldrswDxO"
+	},
+	[3] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDxO", "ldrDxO"
+	}
+      },
+      {
+	shift = 30, mask = 3,
+	[2] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDsO", "ldrDsO"
+	},
+	[3] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDdO", "ldrDdO"
+	}
+      }
+    }
+  }
+}
+
+local map_lsp = { -- Load/store register pair, offset.
+  shift = 22, mask = 1,
+  [0] = {
+    shift = 30, mask = 3,
+    [0] = {
+      shift = 26, mask = 1,
+      [0] = "stpDzAzwP", "stpDzAzsP",
+    },
+    {
+      shift = 26, mask = 1,
+      "stpDzAzdP"
+    },
+    {
+      shift = 26, mask = 1,
+      [0] = "stpDzAzxP"
+    }
+  },
+  {
+    shift = 30, mask = 3,
+    [0] = {
+      shift = 26, mask = 1,
+      [0] = "ldpDzAzwP", "ldpDzAzsP",
+    },
+    {
+      shift = 26, mask = 1,
+      [0] = "ldpswDAxP", "ldpDzAzdP"
+    },
+    {
+      shift = 26, mask = 1,
+      [0] = "ldpDzAzxP"
+    }
+  }
+}
+
+local map_ls = { -- Loads and stores.
+  shift = 24, mask = 0x31,
+  [0x10] = map_lrl, [0x30] = map_lsriro,
+  [0x20] = {
+    shift = 23, mask = 3,
+    map_lsp, map_lsp, map_lsp
+  },
+  [0x21] = {
+    shift = 23, mask = 3,
+    map_lsp, map_lsp, map_lsp
+  },
+  [0x31] = {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 30, mask = 3,
+      [0] = {
+	shift = 22, mask = 3,
+	[0] = "strbDwzU", "ldrbDwzU"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "strhDwzU", "ldrhDwzU"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "strDwzU", "ldrDwzU"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "strDxzU", "ldrDxzU"
+      }
+    },
+    {
+      shift = 30, mask = 3,
+      [2] = {
+	shift = 22, mask = 3,
+	[0] = "strDszU", "ldrDszU"
+      },
+      [3] = {
+	shift = 22, mask = 3,
+	[0] = "strDdzU", "ldrDdzU"
+      }
+    }
+  },
+}
+
+local map_datafp = { -- Data processing, SIMD and FP.
+  shift = 28, mask = 7,
+  { -- 001
+    shift = 24, mask = 1,
+    [0] = {
+      shift = 21, mask = 1,
+      {
+	shift = 10, mask = 3,
+	[0] = {
+	  shift = 12, mask = 1,
+	  [0] = {
+	    shift = 13, mask = 1,
+	    [0] = {
+	      shift = 14, mask = 1,
+	      [0] = {
+		shift = 15, mask = 1,
+		[0] = { -- FP/int conversion.
+		  shift = 31, mask = 1,
+		  [0] = {
+		    shift = 16, mask = 0xff,
+		    [0x20] = "fcvtnsDwNs", [0x21] = "fcvtnuDwNs",
+		    [0x22] = "scvtfDsNw", [0x23] = "ucvtfDsNw",
+		    [0x24] = "fcvtasDwNs", [0x25] = "fcvtauDwNs",
+		    [0x26] = "fmovDwNs", [0x27] = "fmovDsNw",
+		    [0x28] = "fcvtpsDwNs", [0x29] = "fcvtpuDwNs",
+		    [0x30] = "fcvtmsDwNs", [0x31] = "fcvtmuDwNs",
+		    [0x38] = "fcvtzsDwNs", [0x39] = "fcvtzuDwNs",
+		    [0x60] = "fcvtnsDwNd", [0x61] = "fcvtnuDwNd",
+		    [0x62] = "scvtfDdNw", [0x63] = "ucvtfDdNw",
+		    [0x64] = "fcvtasDwNd", [0x65] = "fcvtauDwNd",
+		    [0x68] = "fcvtpsDwNd", [0x69] = "fcvtpuDwNd",
+		    [0x70] = "fcvtmsDwNd", [0x71] = "fcvtmuDwNd",
+		    [0x78] = "fcvtzsDwNd", [0x79] = "fcvtzuDwNd"
+		  },
+		  {
+		    shift = 16, mask = 0xff,
+		    [0x20] = "fcvtnsDxNs", [0x21] = "fcvtnuDxNs",
+		    [0x22] = "scvtfDsNx", [0x23] = "ucvtfDsNx",
+		    [0x24] = "fcvtasDxNs", [0x25] = "fcvtauDxNs",
+		    [0x28] = "fcvtpsDxNs", [0x29] = "fcvtpuDxNs",
+		    [0x30] = "fcvtmsDxNs", [0x31] = "fcvtmuDxNs",
+		    [0x38] = "fcvtzsDxNs", [0x39] = "fcvtzuDxNs",
+		    [0x60] = "fcvtnsDxNd", [0x61] = "fcvtnuDxNd",
+		    [0x62] = "scvtfDdNx", [0x63] = "ucvtfDdNx",
+		    [0x64] = "fcvtasDxNd", [0x65] = "fcvtauDxNd",
+		    [0x66] = "fmovDxNd", [0x67] = "fmovDdNx",
+		    [0x68] = "fcvtpsDxNd", [0x69] = "fcvtpuDxNd",
+		    [0x70] = "fcvtmsDxNd", [0x71] = "fcvtmuDxNd",
+		    [0x78] = "fcvtzsDxNd", [0x79] = "fcvtzuDxNd"
+		  }
+		}
+	      },
+	      { -- FP data-processing, 1 source.
+		shift = 31, mask = 1,
+		[0] = {
+		  shift = 22, mask = 3,
+		  [0] = {
+		    shift = 15, mask = 63,
+		    [0] = "fmovDNf", "fabsDNf", "fnegDNf",
+		    "fsqrtDNf", false, "fcvtDdNs", false, false,
+		    "frintnDNf", "frintpDNf", "frintmDNf", "frintzDNf",
+		    "frintaDNf", false, "frintxDNf", "frintiDNf",
+		  },
+		  {
+		    shift = 15, mask = 63,
+		    [0] = "fmovDNf", "fabsDNf", "fnegDNf",
+		    "fsqrtDNf", "fcvtDsNd", false, false, false,
+		    "frintnDNf", "frintpDNf", "frintmDNf", "frintzDNf",
+		    "frintaDNf", false, "frintxDNf", "frintiDNf",
+		  }
+		}
+	      }
+	    },
+	    { -- FP compare.
+	      shift = 31, mask = 1,
+	      [0] = {
+		shift = 14, mask = 3,
+		[0] = {
+		  shift = 23, mask = 1,
+		  [0] = {
+		    shift = 0, mask = 31,
+		    [0] = "fcmpNMf", [8] = "fcmpNZf",
+		    [16] = "fcmpeNMf", [24] = "fcmpeNZf",
+		  }
+		}
+	      }
+	    }
+	  },
+	  { -- FP immediate.
+	    shift = 31, mask = 1,
+	    [0] = {
+	      shift = 5, mask = 31,
+	      [0] = {
+		shift = 23, mask = 1,
+		[0] = "fmovDFf"
+	      }
+	    }
+	  }
+	},
+	{ -- FP conditional compare.
+	  shift = 31, mask = 1,
+	  [0] = {
+	    shift = 23, mask = 1,
+	    [0] = {
+	      shift = 4, mask = 1,
+	      [0] = "fccmpNMVCf", "fccmpeNMVCf"
+	    }
+	  }
+	},
+	{ -- FP data-processing, 2 sources.
+	  shift = 31, mask = 1,
+	  [0] = {
+	    shift = 23, mask = 1,
+	    [0] = {
+	      shift = 12, mask = 15,
+	      [0] = "fmulDNMf", "fdivDNMf", "faddDNMf", "fsubDNMf",
+	      "fmaxDNMf", "fminDNMf", "fmaxnmDNMf", "fminnmDNMf",
+	      "fnmulDNMf"
+	    }
+	  }
+	},
+	{ -- FP conditional select.
+	  shift = 31, mask = 1,
+	  [0] = {
+	    shift = 23, mask = 1,
+	    [0] = "fcselDNMCf"
+	  }
+	}
+      }
+    },
+    { -- FP data-processing, 3 sources.
+      shift = 31, mask = 1,
+      [0] = {
+	shift = 15, mask = 1,
+	[0] = {
+	  shift = 21, mask = 5,
+	  [0] = "fmaddDNMAf", "fnmaddDNMAf"
+	},
+	{
+	  shift = 21, mask = 5,
+	  [0] = "fmsubDNMAf", "fnmsubDNMAf"
+	}
+      }
+    }
+  }
+}
+
+local map_br = { -- Branches, exception generating and system instructions.
+  shift = 29, mask = 7,
+  [0] = "bB",
+  { -- Compare & branch, immediate.
+    shift = 24, mask = 3,
+    [0] = "cbzDBg", "cbnzDBg", "tbzDTBw", "tbnzDTBw"
+  },
+  { -- Conditional branch, immediate.
+    shift = 24, mask = 3,
+    [0] = {
+      shift = 4, mask = 1,
+      [0] = {
+	shift = 0, mask = 15,
+	[0] = "beqB", "bneB", "bhsB", "bloB", "bmiB", "bplB", "bvsB", "bvcB",
+	"bhiB", "blsB", "bgeB", "bltB", "bgtB", "bleB", "balB"
+      }
+    }
+  }, false, "blB",
+  { -- Compare & branch, immediate.
+    shift = 24, mask = 3,
+    [0] = "cbzDBg", "cbnzDBg", "tbzDTBx", "tbnzDTBx"
+  },
+  {
+    shift = 24, mask = 3,
+    [0] = { -- Exception generation.
+      shift = 0, mask = 0xe0001f,
+      [0x200000] = "brkW"
+    },
+    { -- System instructions.
+      shift = 0, mask = 0x3fffff,
+      [0x03201f] = "nop"
+    },
+    { -- Unconditional branch, register.
+      shift = 0, mask = 0xfffc1f,
+      [0x1f0000] = "brNx", [0x3f0000] = "blrNx",
+      [0x5f0000] = "retNx"
+    },
+  }
+}
+
+local map_init = {
+  shift = 25, mask = 15,
+  [0] = false, false, false, false, map_ls, map_datar, map_ls, map_datafp,
+  map_datai, map_datai, map_br, map_br, map_ls, map_datar, map_ls, map_datafp
+}
+
+------------------------------------------------------------------------------
+
+local map_regs = { x = {}, w = {}, d = {}, s = {} }
+
+for i=0,30 do
+  map_regs.x[i] = "x"..i
+  map_regs.w[i] = "w"..i
+  map_regs.d[i] = "d"..i
+  map_regs.s[i] = "s"..i
+end
+map_regs.x[31] = "sp"
+map_regs.w[31] = "wsp"
+map_regs.d[31] = "d31"
+map_regs.s[31] = "s31"
+
+local map_cond = {
+  [0] = "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
+  "hi", "ls", "ge", "lt", "gt", "le", "al",
+}
+
+local map_shift = { [0] = "lsl", "lsr", "asr", }
+
+local map_extend = {
+  [0] = "uxtb", "uxth", "uxtw", "uxtx", "sxtb", "sxth", "sxtw", "sxtx",
+}
+
+------------------------------------------------------------------------------
+
+-- Output a nicely formatted line with an opcode and operands.
+local function putop(ctx, text, operands)
+  local pos = ctx.pos
+  local extra = ""
+  if ctx.rel then
+    local sym = ctx.symtab[ctx.rel]
+    if sym then
+      extra = "\t->"..sym
+    end
+  end
+  if ctx.hexdump > 0 then
+    ctx.out(format("%08x  %s  %-5s %s%s\n",
+      ctx.addr+pos, tohex(ctx.op), text, concat(operands, ", "), extra))
+  else
+    ctx.out(format("%08x  %-5s %s%s\n",
+      ctx.addr+pos, text, concat(operands, ", "), extra))
+  end
+  ctx.pos = pos + 4
+end
+
+-- Fallback for unknown opcodes.
+local function unknown(ctx)
+  return putop(ctx, ".long", { "0x"..tohex(ctx.op) })
+end
+
+local function match_reg(p, pat, regnum)
+  return map_regs[match(pat, p.."%w-([xwds])")][regnum]
+end
+
+local function fmt_hex32(x)
+  if x < 0 then
+    return tohex(x)
+  else
+    return format("%x", x)
+  end
+end
+
+local imm13_rep = { 0x55555555, 0x11111111, 0x01010101, 0x00010001, 0x00000001 }
+
+local function decode_imm13(op)
+  local imms = band(rshift(op, 10), 63)
+  local immr = band(rshift(op, 16), 63)
+  if band(op, 0x00400000) == 0 then
+    local len = 5
+    if imms >= 56 then
+      if imms >= 60 then len = 1 else len = 2 end
+    elseif imms >= 48 then len = 3 elseif imms >= 32 then len = 4 end
+    local l = lshift(1, len)-1
+    local s = band(imms, l)
+    local r = band(immr, l)
+    local imm = ror(rshift(-1, 31-s), r)
+    if len ~= 5 then imm = band(imm, lshift(1, l)-1) + rshift(imm, 31-l) end
+    imm = imm * imm13_rep[len]
+    local ix = fmt_hex32(imm)
+    if rshift(op, 31) ~= 0 then
+      return ix..tohex(imm)
+    else
+      return ix
+    end
+  else
+    local lo, hi = -1, 0
+    if imms < 32 then lo = rshift(-1, 31-imms) else hi = rshift(-1, 63-imms) end
+    if immr ~= 0 then
+      lo, hi = ror(lo, immr), ror(hi, immr)
+      local x = immr == 32 and 0 or band(bxor(lo, hi), lshift(-1, 32-immr))
+      lo, hi = bxor(lo, x), bxor(hi, x)
+      if immr >= 32 then lo, hi = hi, lo end
+    end
+    if hi ~= 0 then
+      return fmt_hex32(hi)..tohex(lo)
+    else
+      return fmt_hex32(lo)
+    end
+  end
+end
+
+local function parse_immpc(op, name)
+  if name == "b" or name == "bl" then
+    return arshift(lshift(op, 6), 4)
+  elseif name == "adr" or name == "adrp" then
+    local immlo = band(rshift(op, 29), 3)
+    local immhi = lshift(arshift(lshift(op, 8), 13), 2)
+    return bor(immhi, immlo)
+  elseif name == "tbz" or name == "tbnz" then
+    return lshift(arshift(lshift(op, 13), 18), 2)
+  else
+    return lshift(arshift(lshift(op, 8), 13), 2)
+  end
+end
+
+local function parse_fpimm8(op)
+  local sign = band(op, 0x100000) == 0 and 1 or -1
+  local exp = bxor(rshift(arshift(lshift(op, 12), 5), 24), 0x80) - 131
+  local frac = 16+band(rshift(op, 13), 15)
+  return sign * frac * 2^exp
+end
+
+local function prefer_bfx(sf, uns, imms, immr)
+  if imms < immr or imms == 31 or imms == 63 then
+    return false
+  end
+  if immr == 0 then
+    if sf == 0 and (imms == 7 or imms == 15) then
+      return false
+    end
+    if sf ~= 0 and uns == 0 and (imms == 7 or imms == 15 or imms == 31) then
+      return false
+    end
+  end
+  return true
+end
+
+-- Disassemble a single instruction.
+local function disass_ins(ctx)
+  local pos = ctx.pos
+  local b0, b1, b2, b3 = byte(ctx.code, pos+1, pos+4)
+  local op = bor(lshift(b3, 24), lshift(b2, 16), lshift(b1, 8), b0)
+  local operands = {}
+  local suffix = ""
+  local last, name, pat
+  local vr
+  local map_reg
+  ctx.op = op
+  ctx.rel = nil
+  last = nil
+  local opat
+  opat = map_init[band(rshift(op, 25), 15)]
+  while type(opat) ~= "string" do
+    if not opat then return unknown(ctx) end
+    opat = opat[band(rshift(op, opat.shift), opat.mask)] or opat._
+  end
+  name, pat = match(opat, "^([a-z0-9]*)(.*)")
+  local altname, pat2 = match(pat, "|([a-z0-9_.|]*)(.*)")
+  if altname then pat = pat2 end
+  if sub(pat, 1, 1) == "." then
+    local s2, p2 = match(pat, "^([a-z0-9.]*)(.*)")
+    suffix = suffix..s2
+    pat = p2
+  end
+
+  local rt = match(pat, "[gf]")
+  if rt then
+    if rt == "g" then
+      map_reg = band(op, 0x80000000) ~= 0 and map_regs.x or map_regs.w
+    else
+      map_reg = band(op, 0x400000) ~= 0 and map_regs.d or map_regs.s
+    end
+  end
+
+  local second0, immr
+
+  for p in gmatch(pat, ".") do
+    local x = nil
+    if p == "D" then
+      local regnum = band(op, 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "N" then
+      local regnum = band(rshift(op, 5), 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "M" then
+      local regnum = band(rshift(op, 16), 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "A" then
+      local regnum = band(rshift(op, 10), 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "B" then
+      local addr = ctx.addr + pos + parse_immpc(op, name)
+      ctx.rel = addr
+      x = "0x"..tohex(addr)
+    elseif p == "T" then
+      x = bor(band(rshift(op, 26), 32), band(rshift(op, 19), 31))
+    elseif p == "V" then
+      x = band(op, 15)
+    elseif p == "C" then
+      x = map_cond[band(rshift(op, 12), 15)]
+    elseif p == "c" then
+      local rn = band(rshift(op, 5), 31)
+      local rm = band(rshift(op, 16), 31)
+      local cond = band(rshift(op, 12), 15)
+      local invc = bxor(cond, 1)
+      x = map_cond[cond]
+      if altname and cond ~= 14 and cond ~= 15 then
+	local a1, a2 = match(altname, "([^|]*)|(.*)")
+	if rn == rm then
+	  local n = #operands
+	  operands[n] = nil
+	  x = map_cond[invc]
+	  if rn ~= 31 then
+	    if a1 then name = a1 else name = altname end
+	  else
+	    operands[n-1] = nil
+	    name = a2
+	  end
+	end
+      end
+    elseif p == "W" then
+      x = band(rshift(op, 5), 0xffff)
+    elseif p == "Y" then
+      x = band(rshift(op, 5), 0xffff)
+      local hw = band(rshift(op, 21), 3)
+      if altname and (hw == 0 or x ~= 0) then
+	name = altname
+      end
+    elseif p == "L" then
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local imm9 = arshift(lshift(op, 11), 23)
+      if band(op, 0x800) ~= 0 then
+	x = "["..rn..", #"..imm9.."]!"
+      else
+	x = "["..rn.."], #"..imm9
+      end
+    elseif p == "U" then
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local sz = band(rshift(op, 30), 3)
+      local imm12 = lshift(arshift(lshift(op, 10), 20), sz)
+      if imm12 ~= 0 then
+	x = "["..rn..", #"..imm12.."]"
+      else
+	x = "["..rn.."]"
+      end
+    elseif p == "K" then
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local imm9 = arshift(lshift(op, 11), 23)
+      if imm9 ~= 0 then
+	x = "["..rn..", #"..imm9.."]"
+      else
+	x = "["..rn.."]"
+      end
+    elseif p == "O" then
+      local rn, rm = map_regs.x[band(rshift(op, 5), 31)]
+      local m = band(rshift(op, 13), 1)
+      if m == 0 then
+	rm = map_regs.w[band(rshift(op, 16), 31)]
+      else
+	rm = map_regs.x[band(rshift(op, 16), 31)]
+      end
+      x = "["..rn..", "..rm
+      local opt = band(rshift(op, 13), 7)
+      local s = band(rshift(op, 12), 1)
+      local sz = band(rshift(op, 30), 3)
+      -- extension to be applied
+      if opt == 3 then
+       if s == 0 then x = nil
+       else x = x..", lsl #"..sz.."]" end
+      elseif opt == 2 or opt == 6 or opt == 7 then
+	if s == 0 then x = x..", "..map_extend[opt].."]"
+	else x = x..", "..map_extend[opt].." #"..sz.."]" end
+      else
+	x = x.."]"
+      end
+    elseif p == "P" then
+      local opcv, sh = rshift(op, 26), 2
+      if opcv >= 0x2a then sh = 4 elseif opcv >= 0x1b then sh = 3 end
+      local imm7 = lshift(arshift(lshift(op, 10), 25), sh)
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local ind = band(rshift(op, 23), 3)
+      if ind == 1 then
+	x = "["..rn.."], #"..imm7
+      elseif ind == 2 then
+	if imm7 == 0 then
+	  x = "["..rn.."]"
+	else
+	  x = "["..rn..", #"..imm7.."]"
+	end
+      elseif ind == 3 then
+	x = "["..rn..", #"..imm7.."]!"
+      end
+    elseif p == "I" then
+      local shf = band(rshift(op, 22), 3)
+      local imm12 = band(rshift(op, 10), 0x0fff)
+      local n = #operands
+      local rn, rd = band(rshift(op, 5), 31), band(op, 31)
+      if altname == "mov" and shf == 0 and imm12 == 0 and (rn == 31 or rd == 31) then
+	name = altname
+	x = nil
+      elseif shf == 0 then
+	x = imm12
+      elseif shf == 1 then
+	x = imm12..", lsl #12"
+      end
+    elseif p == "i" then
+      x = "#0x"..decode_imm13(op)
+    elseif p == "1" then
+      immr = band(rshift(op, 16), 63)
+      x = immr
+    elseif p == "2" then
+      x = band(rshift(op, 10), 63)
+      if altname then
+	local a1, a2, a3, a4, a5, a6 =
+	  match(altname, "([^|]*)|([^|]*)|([^|]*)|([^|]*)|([^|]*)|(.*)")
+	local sf = band(rshift(op, 26), 32)
+	local uns = band(rshift(op, 30), 1)
+	if prefer_bfx(sf, uns, x, immr) then
+	  name = a2
+	  x = x - immr + 1
+	elseif immr == 0 and x == 7 then
+	  local n = #operands
+	  operands[n] = nil
+	  if sf ~= 0 then
+	    operands[n-1] = gsub(operands[n-1], "x", "w")
+	  end
+	  last = operands[n-1]
+	  name = a6
+	  x = nil
+	elseif immr == 0 and x == 15 then
+	  local n = #operands
+	  operands[n] = nil
+	  if sf ~= 0 then
+	    operands[n-1] = gsub(operands[n-1], "x", "w")
+	  end
+	  last = operands[n-1]
+	  name = a5
+	  x = nil
+	elseif x == 31 or x == 63 then
+	  if x == 31 and immr == 0 and name == "sbfm" then
+	    name = a4
+	    local n = #operands
+	    operands[n] = nil
+	    if sf ~= 0 then
+	      operands[n-1] = gsub(operands[n-1], "x", "w")
+	    end
+	    last = operands[n-1]
+	  else
+	    name = a3
+	  end
+	  x = nil
+	elseif band(x, 31) ~= 31 and immr == x+1 and name == "ubfm" then
+	  name = a4
+	  last = "#"..(sf+32 - immr)
+	  operands[#operands] = last
+	  x = nil
+	elseif x < immr then
+	  name = a1
+	  last = "#"..(sf+32 - immr)
+	  operands[#operands] = last
+	  x = x + 1
+	end
+      end
+    elseif p == "3" then
+      x = band(rshift(op, 10), 63)
+      if altname then
+	local a1, a2 = match(altname, "([^|]*)|(.*)")
+	if x < immr then
+	  name = a1
+	  local sf = band(rshift(op, 26), 32)
+	  last = "#"..(sf+32 - immr)
+	  operands[#operands] = last
+	  x = x + 1
+	elseif x >= immr then
+	  name = a2
+	  x = x - immr + 1
+	end
+      end
+    elseif p == "4" then
+      x = band(rshift(op, 10), 63)
+      local rn = band(rshift(op, 5), 31)
+      local rm = band(rshift(op, 16), 31)
+      if altname and rn == rm then
+	local n = #operands
+	operands[n] = nil
+	last = operands[n-1]
+	name = altname
+      end
+    elseif p == "5" then
+      x = band(rshift(op, 16), 31)
+    elseif p == "S" then
+      x = band(rshift(op, 10), 63)
+      if x == 0 then x = nil
+      else x = map_shift[band(rshift(op, 22), 3)].." #"..x end
+    elseif p == "X" then
+      local opt = band(rshift(op, 13), 7)
+      -- Width specifier <R>.
+      if opt ~= 3 and opt ~= 7 then
+	last = map_regs.w[band(rshift(op, 16), 31)]
+	operands[#operands] = last
+      end
+      x = band(rshift(op, 10), 7)
+      -- Extension.
+      if opt == 2 + band(rshift(op, 31), 1) and
+	 band(rshift(op, second0 and 5 or 0), 31) == 31 then
+	if x == 0 then x = nil
+	else x = "lsl #"..x end
+      else
+	if x == 0 then x = map_extend[band(rshift(op, 13), 7)]
+	else x = map_extend[band(rshift(op, 13), 7)].." #"..x end
+      end
+    elseif p == "R" then
+      x = band(rshift(op,21), 3)
+      if x == 0 then x = nil
+      else x = "lsl #"..x*16 end
+    elseif p == "z" then
+      local n = #operands
+      if operands[n] == "sp" then operands[n] = "xzr"
+      elseif operands[n] == "wsp" then operands[n] = "wzr"
+      end
+    elseif p == "Z" then
+      x = 0
+    elseif p == "F" then
+      x = parse_fpimm8(op)
+    elseif p == "g" or p == "f" or p == "x" or p == "w" or
+	   p == "d" or p == "s" then
+      -- These are handled in D/N/M/A.
+    elseif p == "0" then
+      if last == "sp" or last == "wsp" then
+	local n = #operands
+	operands[n] = nil
+	last = operands[n-1]
+	if altname then
+	  local a1, a2 = match(altname, "([^|]*)|(.*)")
+	  if not a1 then
+	    name = altname
+	  elseif second0 then
+	    name, altname = a2, a1
+	  else
+	    name, altname = a1, a2
+	  end
+	end
+      end
+      second0 = true
+    else
+      assert(false)
+    end
+    if x then
+      last = x
+      if type(x) == "number" then x = "#"..x end
+      operands[#operands+1] = x
+    end
+  end
+
+  return putop(ctx, name..suffix, operands)
+end
+
+------------------------------------------------------------------------------
+
+-- Disassemble a block of code.
+local function disass_block(ctx, ofs, len)
+  if not ofs then ofs = 0 end
+  local stop = len and ofs+len or #ctx.code
+  ctx.pos = ofs
+  ctx.rel = nil
+  while ctx.pos < stop do disass_ins(ctx) end
+end
+
+-- Extended API: create a disassembler context. Then call ctx:disass(ofs, len).
+local function create(code, addr, out)
+  local ctx = {}
+  ctx.code = code
+  ctx.addr = addr or 0
+  ctx.out = out or io.write
+  ctx.symtab = {}
+  ctx.disass = disass_block
+  ctx.hexdump = 8
+  return ctx
+end
+
+-- Simple API: disassemble code (a string) at address and output via out.
+local function disass(code, addr, out)
+  create(code, addr, out):disass()
+end
+
+-- Return register name for RID.
+local function regname(r)
+  if r < 32 then return map_regs.x[r] end
+  return map_regs.d[r-32]
+end
+
+-- Public module functions.
+return {
+  create = create,
+  disass = disass,
+  regname = regname
+}
+
diff --git a/src/lj_arch.h b/src/lj_arch.h
index cc5a0a66..3df602e3 100644
--- a/src/lj_arch.h
+++ b/src/lj_arch.h
@@ -226,7 +226,6 @@
 #define LJ_TARGET_UNIFYROT	2	/* Want only IR_BROR. */
 #define LJ_TARGET_GC64		1
 #define LJ_ARCH_NUMMODE		LJ_NUMMODE_DUAL
-#define LJ_ARCH_NOJIT		1	/* NYI */
 
 #define LJ_ARCH_VERSION		80
 
diff --git a/src/lj_asm.c b/src/lj_asm.c
index 7ce58924..2cb5abea 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -171,6 +171,8 @@ IRFLDEF(FLOFS)
 #include "lj_emit_x86.h"
 #elif LJ_TARGET_ARM
 #include "lj_emit_arm.h"
+#elif LJ_TARGET_ARM64
+#include "lj_emit_arm64.h"
 #elif LJ_TARGET_PPC
 #include "lj_emit_ppc.h"
 #elif LJ_TARGET_MIPS
@@ -1563,6 +1565,8 @@ static void asm_loop(ASMState *as)
 #include "lj_asm_x86.h"
 #elif LJ_TARGET_ARM
 #include "lj_asm_arm.h"
+#elif LJ_TARGET_ARM64
+#include "lj_asm_arm64.h"
 #elif LJ_TARGET_PPC
 #include "lj_asm_ppc.h"
 #elif LJ_TARGET_MIPS
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h
new file mode 100644
index 00000000..0a2f5306
--- /dev/null
+++ b/src/lj_asm_arm64.h
@@ -0,0 +1,1823 @@
+/*
+** ARM64 IR assembler (SSA IR -> machine code).
+** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
+**
+** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+** Sponsored by Cisco Systems, Inc.
+*/
+
+/* -- Register allocator extensions --------------------------------------- */
+
+/* Allocate a register with a hint. */
+static Reg ra_hintalloc(ASMState *as, IRRef ref, Reg hint, RegSet allow)
+{
+  Reg r = IR(ref)->r;
+  if (ra_noreg(r)) {
+    if (!ra_hashint(r) && !iscrossref(as, ref))
+      ra_sethint(IR(ref)->r, hint);  /* Propagate register hint. */
+    r = ra_allocref(as, ref, allow);
+  }
+  ra_noweak(as, r);
+  return r;
+}
+
+/* Allocate two source registers for three-operand instructions. */
+static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow)
+{
+  IRIns *irl = IR(ir->op1), *irr = IR(ir->op2);
+  Reg left = irl->r, right = irr->r;
+  if (ra_hasreg(left)) {
+    ra_noweak(as, left);
+    if (ra_noreg(right))
+      right = ra_allocref(as, ir->op2, rset_exclude(allow, left));
+    else
+      ra_noweak(as, right);
+  } else if (ra_hasreg(right)) {
+    ra_noweak(as, right);
+    left = ra_allocref(as, ir->op1, rset_exclude(allow, right));
+  } else if (ra_hashint(right)) {
+    right = ra_allocref(as, ir->op2, allow);
+    left = ra_alloc1(as, ir->op1, rset_exclude(allow, right));
+  } else {
+    left = ra_allocref(as, ir->op1, allow);
+    right = ra_alloc1(as, ir->op2, rset_exclude(allow, left));
+  }
+  return left | (right << 8);
+}
+
+/* -- Guard handling ------------------------------------------------------ */
+
+/* Generate an exit stub group at the bottom of the reserved MCode memory. */
+static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
+{
+  MCode *mxp = as->mcbot;
+  int i;
+  if (mxp + 3*4+4*EXITSTUBS_PER_GROUP >= as->mctop)
+    asm_mclimit(as);
+  /* str lr, [sp]; bl ->vm_exit_handler; .long group. */
+  *mxp++ = A64I_STRx | A64F_D(RID_LR) | A64F_N(RID_SP);
+  *mxp = A64I_BL | (((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu);
+  mxp++;
+  *mxp++ = group*EXITSTUBS_PER_GROUP;
+  for (i = 0; i < EXITSTUBS_PER_GROUP; i++)
+    *mxp++ = A64I_B | ((-3-i)&0x03ffffffu);
+  lj_mcode_sync(as->mcbot, mxp);
+  lj_mcode_commitbot(as->J, mxp);
+  as->mcbot = mxp;
+  as->mclim = as->mcbot + MCLIM_REDZONE;
+  return mxp - EXITSTUBS_PER_GROUP;
+}
+
+/* Setup all needed exit stubs. */
+static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
+{
+  ExitNo i;
+  if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR)
+    lj_trace_err(as->J, LJ_TRERR_SNAPOV);
+  for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++)
+    if (as->J->exitstubgroup[i] == NULL)
+      as->J->exitstubgroup[i] = asm_exitstub_gen(as, i);
+}
+
+/* Emit conditional branch to exit for guard. */
+static void asm_guardcc(ASMState *as, A64CC cc)
+{
+  MCode *target = exitstub_addr(as->J, as->snapno);
+  MCode *p = as->mcp;
+  if (LJ_UNLIKELY(p == as->invmcp)) {
+    as->loopinv = 1;
+    *p = A64I_BL | ((target-p) & 0x03ffffffu);
+    emit_cond_branch(as, cc^1, p-1);
+    return;
+  }
+  /* No conditional calls. Emit b.cc/bl instead. */
+  /* That's a bad idea. NYI: emit per-trace exit stubs instead, see PPC. */
+  emit_branch(as, A64I_BL, target);
+  emit_cond_branch(as, cc^1, p);
+}
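+/* Note on the pair above: since the assembler emits backwards, the final
+** machine code reads b.<cc^1> +8; bl <exitstub>, i.e. the call to the
+** exit stub is skipped as long as the guard condition still holds.
+*/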
+
+/* -- Operand fusion ------------------------------------------------------ */
+
+/* Limit linear search to this distance. Avoids O(n^2) behavior. */
+#define CONFLICT_SEARCH_LIM	31
+
+static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
+{
+  if (irref_isk(ref)) {
+    IRIns *ir = IR(ref);
+    if (ir->o == IR_KNULL || !irt_is64(ir->t)) {
+      *k = ir->i;
+      return 1;
+    } else if (checki32((int64_t)ir_k64(ir)->u64)) {
+      *k = (int32_t)ir_k64(ir)->u64;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/* Check if there's no conflicting instruction between curins and ref. */
+static int noconflict(ASMState *as, IRRef ref, IROp conflict)
+{
+  IRIns *ir = as->ir;
+  IRRef i = as->curins;
+  if (i > ref + CONFLICT_SEARCH_LIM)
+    return 0;  /* Give up, ref is too far away. */
+  while (--i > ref)
+    if (ir[i].o == conflict)
+      return 0;  /* Conflict found. */
+  return 1;  /* Ok, no conflict. */
+}
+
+/* Fuse the array base of colocated arrays. */
+static int32_t asm_fuseabase(ASMState *as, IRRef ref)
+{
+  IRIns *ir = IR(ref);
+  if (ir->o == IR_TNEW && ir->op1 <= LJ_MAX_COLOSIZE &&
+      !neverfuse(as) && noconflict(as, ref, IR_NEWREF))
+    return (int32_t)sizeof(GCtab);
+  return 0;
+}
+
+#define FUSE_REG	0x40000000
+
+/* Fuse array/hash/upvalue reference into register+offset operand. */
+static Reg asm_fuseahuref(ASMState *as, IRRef ref, int32_t *ofsp, RegSet allow,
+			  A64Ins ins)
+{
+  IRIns *ir = IR(ref);
+  if (ra_noreg(ir->r)) {
+    if (ir->o == IR_AREF) {
+      if (mayfuse(as, ref)) {
+	if (irref_isk(ir->op2)) {
+	  IRRef tab = IR(ir->op1)->op1;
+	  int32_t ofs = asm_fuseabase(as, tab);
+	  IRRef refa = ofs ? tab : ir->op1;
+	  ofs += 8*IR(ir->op2)->i;
+	  if (emit_checkofs(ins, ofs)) {
+	    *ofsp = ofs;
+	    return ra_alloc1(as, refa, allow);
+	  }
+	} else {
+	  Reg base = ra_alloc1(as, ir->op1, allow);
+	  *ofsp = FUSE_REG|ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base));
+	  return base;
+	}
+      }
+    } else if (ir->o == IR_HREFK) {
+      if (mayfuse(as, ref)) {
+	int32_t ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
+	if (emit_checkofs(ins, ofs)) {
+	  *ofsp = ofs;
+	  return ra_alloc1(as, ir->op1, allow);
+	}
+      }
+    } else if (ir->o == IR_UREFC) {
+      if (irref_isk(ir->op1)) {
+	GCfunc *fn = ir_kfunc(IR(ir->op1));
+	GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv;
+	int64_t ofs = glofs(as, &uv->tv);
+	if (emit_checkofs(ins, ofs)) {
+	  *ofsp = (int32_t)ofs;
+	  return RID_GL;
+	}
+      }
+    }
+  }
+  *ofsp = 0;
+  return ra_alloc1(as, ref, allow);
+}
+
+/* Fuse m operand into arithmetic/logic instructions. */
+static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow)
+{
+  IRIns *ir = IR(ref);
+  if (ra_hasreg(ir->r)) {
+    ra_noweak(as, ir->r);
+    return A64F_M(ir->r);
+  } else if (irref_isk(ref)) {
+    uint32_t m;
+    int64_t k = get_k64val(ir);
+    if ((ai & 0x1f000000) == 0x0a000000)
+      m = emit_isk13(k, irt_is64(ir->t));
+    else
+      m = emit_isk12(k);
+    if (m)
+      return m;
+  } else if (mayfuse(as, ref)) {
+    if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR && irref_isk(ir->op2)) ||
+	(ir->o == IR_ADD && ir->op1 == ir->op2)) {
+      A64Shift sh = ir->o == IR_BSHR ? A64SH_LSR :
+		    ir->o == IR_BSAR ? A64SH_ASR : A64SH_LSL;
+      int shift = ir->o == IR_ADD ? 1 :
+		    (IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31));
+      IRIns *irl = IR(ir->op1);
+      if (sh == A64SH_LSL &&
+	  irl->o == IR_CONV &&
+	  irl->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT) &&
+	  shift <= 4 &&
+	  canfuse(as, irl)) {
+	Reg m = ra_alloc1(as, irl->op1, allow);
+	return A64F_M(m) | A64F_EXSH(A64EX_SXTW, shift);
+      } else {
+	Reg m = ra_alloc1(as, ir->op1, allow);
+	return A64F_M(m) | A64F_SH(sh, shift);
+      }
+    } else if (ir->o == IR_CONV &&
+	       ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)) {
+      Reg m = ra_alloc1(as, ir->op1, allow);
+      return A64F_M(m) | A64F_EX(A64EX_SXTW);
+    }
+  }
+  return A64F_M(ra_allocref(as, ref, allow));
+}
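+/* The returned m operand is one of: a plain register, a 12 bit add/sub
+** or 13 bit logical immediate (chosen by the instruction class test
+** above), a shifted register, or an sxtw-extended register for a fused
+** 32-to-64 bit sign extension.
+*/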
+
+/* Fuse XLOAD/XSTORE reference into load/store operand. */
+static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref,
+			 RegSet allow)
+{
+  IRIns *ir = IR(ref);
+  Reg base;
+  int32_t ofs = 0;
+  if (ra_noreg(ir->r) && canfuse(as, ir)) {
+    if (ir->o == IR_ADD) {
+      if (asm_isk32(as, ir->op2, &ofs) && emit_checkofs(ai, ofs))
+	ref = ir->op1;
+      /* NYI: Fuse add with two registers. */
+    } else if (ir->o == IR_STRREF) {
+      if (asm_isk32(as, ir->op2, &ofs)) {
+	ref = ir->op1;
+      } else if (asm_isk32(as, ir->op1, &ofs)) {
+	ref = ir->op2;
+      } else {
+	/* NYI: Fuse ADD with constant. */
+	Reg rn = ra_alloc1(as, ir->op1, allow);
+	uint32_t m = asm_fuseopm(as, 0, ir->op2, rset_exclude(allow, rn));
+	emit_lso(as, ai, rd, rd, sizeof(GCstr));
+	emit_dn(as, A64I_ADDx^m, rd, rn);
+	return;
+      }
+      ofs += sizeof(GCstr);
+      if (!emit_checkofs(ai, ofs)) {
+	Reg rn = ra_alloc1(as, ref, allow);
+	Reg rm = ra_allock(as, ofs, rset_exclude(allow, rn));
+	emit_dnm(as, (ai ^ 0x01204800), rd, rn, rm);
+	return;
+      }
+    }
+  }
+  base = ra_alloc1(as, ref, allow);
+  emit_lso(as, ai, (rd & 31), base, ofs);
+}
+
+/* -- Calls --------------------------------------------------------------- */
+
+/* Generate a call to a C function. */
+static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
+{
+  uint32_t n, nargs = CCI_XNARGS(ci);
+  int32_t ofs = 0;
+  Reg gpr, fpr = REGARG_FIRSTFPR;
+  if ((void *)ci->func)
+    emit_call(as, (void *)ci->func);
+  for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++)
+    as->cost[gpr] = REGCOST(~0u, ASMREF_L);
+  gpr = REGARG_FIRSTGPR;
+  for (n = 0; n < nargs; n++) { /* Setup args. */
+    IRRef ref = args[n];
+    IRIns *ir = IR(ref);
+    if (ref) {
+      if (irt_isfp(ir->t)) {
+	if (fpr <= REGARG_LASTFPR) {
+	  lua_assert(rset_test(as->freeset, fpr)); /* Must have been evicted. */
+	  ra_leftov(as, fpr, ref);
+	  fpr++;
+	} else {
+	  Reg r = ra_alloc1(as, ref, RSET_FPR);
+	  emit_spstore(as, ir, r, ofs);
+	  ofs += 8;
+	}
+      } else {
+	if (gpr <= REGARG_LASTGPR) {
+	  lua_assert(rset_test(as->freeset, gpr)); /* Must have been evicted. */
+	  ra_leftov(as, gpr, ref);
+	  gpr++;
+	} else {
+	  Reg r = ra_alloc1(as, ref, RSET_GPR);
+	  emit_spstore(as, ir, r, ofs);
+	  ofs += 8;
+	}
+      }
+    }
+  }
+}
+
+/* Setup result reg/sp for call. Evict scratch regs. */
+static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
+{
+  RegSet drop = RSET_SCRATCH;
+  if (ra_hasreg(ir->r))
+    rset_clear(drop, ir->r); /* Dest reg handled below. */
+  ra_evictset(as, drop); /* Evictions must be performed first. */
+  if (ra_used(ir)) {
+    lua_assert(!irt_ispri(ir->t));
+    if (irt_isfp(ir->t)) {
+      if (ci->flags & CCI_CASTU64) {
+	Reg dest = ra_dest(as, ir, RSET_FPR) & 31;
+	emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D_R : A64I_FMOV_S_R,
+		dest, RID_RET);
+      } else {
+	ra_destreg(as, ir, RID_FPRET);
+      }
+    } else {
+      ra_destreg(as, ir, RID_RET);
+    }
+  }
+  UNUSED(ci);
+}
+
+static void asm_callx(ASMState *as, IRIns *ir)
+{
+  IRRef args[CCI_NARGS_MAX*2];
+  CCallInfo ci;
+  IRRef func;
+  IRIns *irf;
+  ci.flags = asm_callx_flags(as, ir);
+  asm_collectargs(as, ir, &ci, args);
+  asm_setupresult(as, ir, &ci);
+  func = ir->op2; irf = IR(func);
+  if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }
+  if (irref_isk(func)) {  /* Call to constant address. */
+    ci.func = (ASMFunction)(ir_k64(irf)->u64);
+  } else {  /* Need a non-argument register for indirect calls. */
+    Reg freg = ra_alloc1(as, func, RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
+    emit_n(as, A64I_BLR, freg);
+    ci.func = (ASMFunction)(void *)0;
+  }
+  asm_gencall(as, &ci, args);
+}
+
+/* -- Returns ------------------------------------------------------------- */
+
+/* Return to lower frame. Guard that it goes to the right spot. */
+static void asm_retf(ASMState *as, IRIns *ir)
+{
+  Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
+  void *pc = ir_kptr(IR(ir->op2));
+  int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1));
+  as->topslot -= (BCReg)delta;
+  if ((int32_t)as->topslot < 0) as->topslot = 0;
+  irt_setmark(IR(REF_BASE)->t);  /* Children must not coalesce with BASE reg. */
+  /* Need to force a spill on REF_BASE now to update the stack slot. */
+  emit_lso(as, A64I_STRx, base, RID_SP, ra_spill(as, IR(REF_BASE)));
+  emit_setgl(as, base, jit_base);
+  emit_addptr(as, base, -8*delta);
+  asm_guardcc(as, CC_NE);
+  emit_nm(as, A64I_CMPx, RID_TMP,
+	  ra_allock(as, i64ptr(pc), rset_exclude(RSET_GPR, base)));
+  emit_lso(as, A64I_LDRx, RID_TMP, base, -8);
+}
+
+/* -- Type conversions ---------------------------------------------------- */
+
+static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
+{
+  Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  asm_guardcc(as, CC_NE);
+  emit_nm(as, A64I_FCMPd, (tmp & 31), (left & 31));
+  emit_dn(as, A64I_FCVT_F64_S32, (tmp & 31), dest);
+  emit_dn(as, A64I_FCVT_S32_F64, dest, (left & 31));
+}
+
+static void asm_tobit(ASMState *as, IRIns *ir)
+{
+  RegSet allow = RSET_FPR;
+  Reg left = ra_alloc1(as, ir->op1, allow);
+  Reg right = ra_alloc1(as, ir->op2, rset_clear(allow, left));
+  Reg tmp = ra_scratch(as, rset_clear(allow, right));
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  emit_dn(as, A64I_FMOV_R_S, dest, (tmp & 31));
+  emit_dnm(as, A64I_FADDd, (tmp & 31), (left & 31), (right & 31));
+}
+
+static void asm_conv(ASMState *as, IRIns *ir)
+{
+  IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
+  int st64 = (st == IRT_I64 || st == IRT_U64 || st == IRT_P64);
+  int stfp = (st == IRT_NUM || st == IRT_FLOAT);
+  IRRef lref = ir->op1;
+  lua_assert(irt_type(ir->t) != st);
+  if (irt_isfp(ir->t)) {
+    Reg dest = ra_dest(as, ir, RSET_FPR);
+    if (stfp) {  /* FP to FP conversion. */
+      emit_dn(as, st == IRT_NUM ? A64I_FCVT_F32_F64 : A64I_FCVT_F64_F32,
+	      (dest & 31), (ra_alloc1(as, lref, RSET_FPR) & 31));
+    } else {  /* Integer to FP conversion. */
+      Reg left = ra_alloc1(as, lref, RSET_GPR);
+      A64Ins ai = irt_isfloat(ir->t) ?
+	(((IRT_IS64 >> st) & 1) ?
+	 (st == IRT_I64 ? A64I_FCVT_F32_S64 : A64I_FCVT_F32_U64) :
+	 (st == IRT_INT ? A64I_FCVT_F32_S32 : A64I_FCVT_F32_U32)) :
+	(((IRT_IS64 >> st) & 1) ?
+	 (st == IRT_I64 ? A64I_FCVT_F64_S64 : A64I_FCVT_F64_U64) :
+	 (st == IRT_INT ? A64I_FCVT_F64_S32 : A64I_FCVT_F64_U32));
+      emit_dn(as, ai, (dest & 31), left);
+    }
+  } else if (stfp) {  /* FP to integer conversion. */
+    if (irt_isguard(ir->t)) {
+      /* Checked conversions are only supported from number to int. */
+      lua_assert(irt_isint(ir->t) && st == IRT_NUM);
+      asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
+    } else {
+      Reg left = ra_alloc1(as, lref, RSET_FPR);
+      Reg dest = ra_dest(as, ir, RSET_GPR);
+      A64Ins ai = irt_is64(ir->t) ?
+	(st == IRT_NUM ?
+	 (irt_isi64(ir->t) ? A64I_FCVT_S64_F64 : A64I_FCVT_U64_F64) :
+	 (irt_isi64(ir->t) ? A64I_FCVT_S64_F32 : A64I_FCVT_U64_F32)) :
+	(st == IRT_NUM ?
+	 (irt_isint(ir->t) ? A64I_FCVT_S32_F64 : A64I_FCVT_U32_F64) :
+	 (irt_isint(ir->t) ? A64I_FCVT_S32_F32 : A64I_FCVT_U32_F32));
+      emit_dn(as, ai, dest, (left & 31));
+    }
+  } else if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    Reg left = ra_alloc1(as, lref, RSET_GPR);
+    A64Ins ai = st == IRT_I8 ? A64I_SXTBw :
+		st == IRT_U8 ? A64I_UXTBw :
+		st == IRT_I16 ? A64I_SXTHw : A64I_UXTHw;
+    lua_assert(irt_isint(ir->t) || irt_isu32(ir->t));
+    emit_dn(as, ai, dest, left);
+  } else {
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    if (irt_is64(ir->t)) {
+      if (st64 || !(ir->op2 & IRCONV_SEXT)) {
+	/* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
+	ra_leftov(as, dest, lref);  /* Do nothing, but may need to move regs. */
+      } else {  /* 32 to 64 bit sign extension. */
+	Reg left = ra_alloc1(as, lref, RSET_GPR);
+	emit_dn(as, A64I_SXTW, dest, left);
+      }
+    } else {
+      if (st64) {
+	/* This is either a 32 bit reg/reg mov which zeroes the hiword
+	** or a load of the loword from a 64 bit address.
+	*/
+	Reg left = ra_alloc1(as, lref, RSET_GPR);
+	emit_dm(as, A64I_MOVw, dest, left);
+      } else {  /* 32/32 bit no-op (cast). */
+	ra_leftov(as, dest, lref);  /* Do nothing, but may need to move regs. */
+      }
+    }
+  }
+}
+
+static void asm_strto(ASMState *as, IRIns *ir)
+{
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
+  IRRef args[2];
+  Reg dest = 0, tmp;
+  int destused = ra_used(ir);
+  int32_t ofs = 0;
+  ra_evictset(as, RSET_SCRATCH);
+  if (destused) {
+    if (ra_hasspill(ir->s)) {
+      ofs = sps_scale(ir->s);
+      destused = 0;
+      if (ra_hasreg(ir->r)) {
+	ra_free(as, ir->r);
+	ra_modified(as, ir->r);
+	emit_spload(as, ir, ir->r, ofs);
+      }
+    } else {
+      dest = ra_dest(as, ir, RSET_FPR);
+    }
+  }
+  asm_guardcc(as, CC_EQ);
+  if (destused)
+    emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0);
+  emit_n(as, (A64I_CMPw^A64I_K12)|A64F_U12(0), RID_RET);
+  args[0] = ir->op1; /* GCstr *str */
+  args[1] = ASMREF_TMP1; /* TValue *n  */
+  asm_gencall(as, ci, args);
+  tmp = ra_releasetmp(as, ASMREF_TMP1);
+  emit_opk(as, A64I_ADDx, tmp, RID_SP, ofs, RSET_GPR);
+}
+
+/* -- Memory references --------------------------------------------------- */
+
+/* Get pointer to TValue. */
+static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
+{
+  IRIns *ir = IR(ref);
+  if (irt_isnum(ir->t)) {
+    if (irref_isk(ref)) {
+      /* Use the number constant itself as a TValue. */
+      ra_allockreg(as, i64ptr(ir_knum(ir)), dest);
+    } else {
+      /* Otherwise force a spill and use the spill slot. */
+      emit_opk(as, A64I_ADDx, dest, RID_SP, ra_spill(as, ir), RSET_GPR);
+    }
+  } else {
+    /* Otherwise use g->tmptv to hold the TValue. */
+    RegSet allow = rset_exclude(RSET_GPR, dest);
+    Reg src;
+    if (irref_isk(ref)) {
+      TValue k;
+      lj_ir_kvalue(as->J->L, &k, ir);
+      src = ra_allock(as, k.u64, allow);
+      emit_lso(as, A64I_STRx, src, dest, 0);
+    } else {
+      Reg type;
+      if (irt_ispri(ir->t)) {
+	src = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
+	emit_lso(as, A64I_STRx, src, dest, 0);
+      } else if (irt_isint(ir->t)) {
+	src = ra_alloc1(as, ref, allow);
+	type = ra_allock(as, (int64_t)irt_toitype(ir->t) << 47, allow);
+	emit_lso(as, A64I_STRx, RID_TMP, dest, 0);
+	emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), RID_TMP, type, src);
+      } else {
+	src = ra_alloc1(as, ref, allow);
+	type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+	emit_lso(as, A64I_STRx, RID_TMP, dest, 0);
+	emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), RID_TMP, src, type);
+      }
+    }
+    ra_allockreg(as, i64ptr(&J2G(as->J)->tmptv), dest);
+  }
+}
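+/* Note on the stores above: in LJ_GC64 mode a TValue keeps its type tag
+** in the upper 17 bits (47..63) and the payload (GC pointer or
+** zero-extended integer) in the lower 47 bits. Primitive types encode as
+** ~((int64_t)~itype << 47), i.e. the tag with all payload bits set.
+*/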
+
+static void asm_aref(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg idx, base;
+  if (irref_isk(ir->op2)) {
+    IRRef tab = IR(ir->op1)->op1;
+    int32_t ofs = asm_fuseabase(as, tab);
+    IRRef refa = ofs ? tab : ir->op1;
+    uint32_t k = emit_isk12(ofs + 8*IR(ir->op2)->i);
+    if (k) {
+      base = ra_alloc1(as, refa, RSET_GPR);
+      emit_dn(as, A64I_ADDx^k, dest, base);
+      return;
+    }
+  }
+  base = ra_alloc1(as, ir->op1, RSET_GPR);
+  idx = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base));
+  emit_dnm(as, A64I_ADDx | A64F_EXSH(A64EX_UXTW, 3), dest, base, idx);
+}
+
+/* Inlined hash lookup. Specialized for key type and for const keys.
+** The equivalent C code is:
+**   Node *n = hashkey(t, key);
+**   do {
+**     if (lj_obj_equal(&n->key, key)) return &n->val;
+**   } while ((n = nextnode(n)));
+**   return niltv(L);
+*/
+static void asm_href(ASMState *as, IRIns *ir, IROp merge)
+{
+  RegSet allow = RSET_GPR;
+  int destused = ra_used(ir);
+  Reg dest = ra_dest(as, ir, allow);
+  Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
+  Reg key = 0, tmp = RID_TMP;
+  IRRef refkey = ir->op2;
+  IRIns *irkey = IR(refkey);
+  int isk = irref_isk(ir->op2);
+  IRType1 kt = irkey->t;
+  uint32_t k = 0;
+  uint32_t khash;
+  MCLabel l_end, l_loop, l_next;
+  rset_clear(allow, tab);
+
+  if (!isk) {
+    key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
+    rset_clear(allow, key);
+    if (!irt_isstr(kt)) {
+      tmp = ra_scratch(as, allow);
+      rset_clear(allow, tmp);
+    }
+  } else if (irt_isnum(kt)) {
+    int64_t val = (int64_t)ir_knum(irkey)->u64;
+    if (!(k = emit_isk12(val))) {
+      key = ra_allock(as, val, allow);
+      rset_clear(allow, key);
+    }
+  } else if (!irt_ispri(kt)) {
+    if (!(k = emit_isk12(irkey->i))) {
+      key = ra_alloc1(as, refkey, allow);
+      rset_clear(allow, key);
+    }
+  }
+
+  /* Key not found in chain: jump to exit (if merged) or load niltv. */
+  l_end = emit_label(as);
+  as->invmcp = NULL;
+  if (merge == IR_NE)
+    asm_guardcc(as, CC_AL);
+  else if (destused)
+    emit_loada(as, dest, niltvg(J2G(as->J)));
+
+  /* Follow hash chain until the end. */
+  l_loop = --as->mcp;
+  emit_n(as, A64I_CMPx^A64I_K12^0, dest);
+  emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
+  l_next = emit_label(as);
+
+  /* Type and value comparison. */
+  if (merge == IR_EQ)
+    asm_guardcc(as, CC_EQ);
+  else
+    emit_cond_branch(as, CC_EQ, l_end);
+
+  if (irt_isnum(kt)) {
+    if (isk) {
+      /* Assumes -0.0 is already canonicalized to +0.0. */
+      if (k)
+	emit_n(as, A64I_CMPx^k, tmp);
+      else
+	emit_nm(as, A64I_CMPx, key, tmp);
+      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
+    } else {
+      Reg tisnum = ra_allock(as, LJ_TISNUM << 15, allow);
+      Reg ftmp = ra_scratch(as, rset_exclude(RSET_FPR, key));
+      rset_clear(allow, tisnum);
+      emit_nm(as, A64I_FCMPd, key, ftmp);
+      emit_dn(as, A64I_FMOV_D_R, (ftmp & 31), (tmp & 31));
+      emit_cond_branch(as, CC_LO, l_next);
+      emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), tisnum, tmp);
+      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.n));
+    }
+  } else if (irt_isaddr(kt)) {
+    Reg scr;
+    if (isk) {
+      int64_t kk = ((int64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64;
+      scr = ra_allock(as, kk, allow);
+      emit_nm(as, A64I_CMPx, scr, tmp);
+      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
+    } else {
+      scr = ra_scratch(as, allow);
+      emit_nm(as, A64I_CMPx, tmp, scr);
+      emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key.u64));
+    }
+    rset_clear(allow, scr);
+  } else {
+    Reg type, scr;
+    lua_assert(irt_ispri(kt) && !irt_isnil(kt));
+    type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
+    scr = ra_scratch(as, rset_clear(allow, type));
+    rset_clear(allow, scr);
+    emit_nm(as, A64I_CMPw, scr, type);
+    emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key));
+  }
+
+  *l_loop = A64I_BCC | A64F_S19((as->mcp-l_loop) & 0x0007ffffu) | CC_NE;
+  if (!isk && irt_isaddr(kt)) {
+    Reg type = ra_allock(as, (int32_t)irt_toitype(kt), allow);
+    emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, key, type);
+    rset_clear(allow, type);
+  }
+  /* Load main position relative to tab->node into dest. */
+  khash = isk ? ir_khash(irkey) : 1;
+  if (khash == 0) {
+    emit_lso(as, A64I_LDRx, dest, tab, offsetof(GCtab, node));
+  } else {
+    emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 3), dest, tmp, dest);
+    emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 1), dest, dest, dest);
+    emit_lso(as, A64I_LDRx, tmp, tab, offsetof(GCtab, node));
+    if (isk) {
+      Reg tmphash = ra_allock(as, khash, allow);
+      emit_dnm(as, A64I_ANDw, dest, dest, tmphash);
+      emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
+    } else if (irt_isstr(kt)) {
+      /* Fetch of str->hash is cheaper than ra_allock. */
+      emit_dnm(as, A64I_ANDw, dest, dest, tmp);
+      emit_lso(as, A64I_LDRw, tmp, key, offsetof(GCstr, hash));
+      emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
+    } else {  /* Must match with hash*() in lj_tab.c. */
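+      /* This is hashrot() from lj_tab.c in reverse emit order:
+      **   hi ^= lo; hi = lj_rol(hi, HASH_ROT1);
+      **   lo -= hi; hi = lj_rol(hi, HASH_ROT2);
+      **   hi ^= lo; hi -= lj_rol(lo, HASH_ROT3);
+      ** Here hi lives in dest and lo in tmp.
+      */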
+      emit_dnm(as, A64I_ANDw, dest, dest, tmp);
+      emit_lso(as, A64I_LDRw, tmp, tab, offsetof(GCtab, hmask));
+      emit_dnm(as, A64I_SUBw, dest, dest, tmp);
+      emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT3)), tmp, tmp, tmp);
+      emit_dnm(as, A64I_EORw, dest, dest, tmp);
+      emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT2)), dest, dest, dest);
+      emit_dnm(as, A64I_SUBw, tmp, tmp, dest);
+      emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT1)), dest, dest, dest);
+      emit_dnm(as, A64I_EORw, tmp, tmp, dest);
+      if (irt_isnum(kt)) {
+	emit_dnm(as, A64I_ADDw, dest, dest, dest);
+	emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
+	emit_dm(as, A64I_MOVw, tmp, dest);
+	emit_dn(as, A64I_FMOV_R_D, dest, (key & 31));
+      } else {
+	checkmclim(as);
+	emit_dm(as, A64I_MOVw, tmp, key);
+	emit_dnm(as, A64I_EORw, dest, dest,
+		 ra_allock(as, irt_toitype(kt) << 15, allow));
+	emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
+	emit_dm(as, A64I_MOVx, dest, key);
+      }
+    }
+  }
+}
+
+static void asm_hrefk(ASMState *as, IRIns *ir)
+{
+  IRIns *kslot = IR(ir->op2);
+  IRIns *irkey = IR(kslot->op1);
+  int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
+  int32_t kofs = ofs + (int32_t)offsetof(Node, key);
+  int bigofs = !emit_checkofs(A64I_LDRx, ofs);
+  RegSet allow = RSET_GPR;
+  Reg dest = (ra_used(ir) || bigofs) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
+  Reg node = ra_alloc1(as, ir->op1, allow);
+  Reg key = ra_scratch(as, rset_clear(allow, node));
+  Reg idx = node;
+  uint64_t k;
+  lua_assert(ofs % sizeof(Node) == 0);
+  rset_clear(allow, key);
+  if (bigofs) {
+    idx = dest;
+    rset_clear(allow, dest);
+    kofs = (int32_t)offsetof(Node, key);
+  } else if (ra_hasreg(dest)) {
+    emit_opk(as, A64I_ADDx, dest, node, ofs, allow);
+  }
+  asm_guardcc(as, CC_NE);
+  if (irt_ispri(irkey->t)) {
+    k = ~((int64_t)~irt_toitype(irkey->t) << 47);
+  } else if (irt_isnum(irkey->t)) {
+    k = ir_knum(irkey)->u64;
+  } else {
+    k = ((uint64_t)irt_toitype(irkey->t) << 47) | (uint64_t)ir_kgc(irkey);
+  }
+  emit_nm(as, A64I_CMPx, key, ra_allock(as, k, allow));
+  emit_lso(as, A64I_LDRx, key, idx, kofs);
+  if (bigofs)
+    emit_opk(as, A64I_ADDx, dest, node, ofs, RSET_GPR);
+}
+
+static void asm_uref(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  if (irref_isk(ir->op1)) {
+    GCfunc *fn = ir_kfunc(IR(ir->op1));
+    MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
+    emit_lsptr(as, A64I_LDRx, dest, v);
+  } else {
+    Reg uv = ra_scratch(as, RSET_GPR);
+    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
+    if (ir->o == IR_UREFC) {
+      asm_guardcc(as, CC_NE);
+      emit_n(as, (A64I_CMPx^A64I_K12) | A64F_U12(1), RID_TMP);
+      emit_opk(as, A64I_ADDx, dest, uv,
+	       (int32_t)offsetof(GCupval, tv), RSET_GPR);
+      emit_lso(as, A64I_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
+    } else {
+      emit_lso(as, A64I_LDRx, dest, uv, (int32_t)offsetof(GCupval, v));
+    }
+    emit_lso(as, A64I_LDRx, uv, func,
+	     (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8));
+  }
+}
+
+static void asm_fref(ASMState *as, IRIns *ir)
+{
+  UNUSED(as); UNUSED(ir);
+  lua_assert(!ra_used(ir));
+}
+
+static void asm_strref(ASMState *as, IRIns *ir)
+{
+  RegSet allow = RSET_GPR;
+  Reg dest = ra_dest(as, ir, allow);
+  Reg base = ra_alloc1(as, ir->op1, allow);
+  IRIns *irr = IR(ir->op2);
+  int32_t ofs = sizeof(GCstr);
+  uint32_t m;
+  rset_clear(allow, base);
+  if (irref_isk(ir->op2) && (m = emit_isk12(ofs + irr->i))) {
+    emit_dn(as, A64I_ADDx^m, dest, base);
+  } else {
+    emit_dn(as, (A64I_ADDx^A64I_K12) | A64F_U12(ofs), dest, dest);
+    emit_dnm(as, A64I_ADDx, dest, base, ra_alloc1(as, ir->op2, allow));
+  }
+}
+
+/* -- Loads and stores ---------------------------------------------------- */
+
+static A64Ins asm_fxloadins(IRIns *ir)
+{
+  switch (irt_type(ir->t)) {
+  case IRT_I8: return A64I_LDRB ^ A64I_LS_S;
+  case IRT_U8: return A64I_LDRB;
+  case IRT_I16: return A64I_LDRH ^ A64I_LS_S;
+  case IRT_U16: return A64I_LDRH;
+  case IRT_NUM: return A64I_LDRd;
+  case IRT_FLOAT: return A64I_LDRs;
+  default: return irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw;
+  }
+}
+
+static A64Ins asm_fxstoreins(IRIns *ir)
+{
+  switch (irt_type(ir->t)) {
+  case IRT_I8: case IRT_U8: return A64I_STRB;
+  case IRT_I16: case IRT_U16: return A64I_STRH;
+  case IRT_NUM: return A64I_STRd;
+  case IRT_FLOAT: return A64I_STRs;
+  default: return irt_is64(ir->t) ? A64I_STRx : A64I_STRw;
+  }
+}
+
+static void asm_fload(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg idx;
+  A64Ins ai = asm_fxloadins(ir);
+  int32_t ofs;
+  if (ir->op1 == REF_NIL) {
+    idx = RID_GL;
+    ofs = (ir->op2 << 2) - GG_OFS(g);
+  } else {
+    idx = ra_alloc1(as, ir->op1, RSET_GPR);
+    if (ir->op2 == IRFL_TAB_ARRAY) {
+      ofs = asm_fuseabase(as, ir->op1);
+      if (ofs) {  /* Turn the t->array load into an add for colocated arrays. */
+	emit_dn(as, (A64I_ADDx^A64I_K12) | A64F_U12(ofs), dest, idx);
+	return;
+      }
+    }
+    ofs = field_ofs[ir->op2];
+  }
+  emit_lso(as, ai, (dest & 31), idx, ofs);
+}
+
+static void asm_fstore(ASMState *as, IRIns *ir)
+{
+  if (ir->r != RID_SINK) {
+    Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
+    IRIns *irf = IR(ir->op1);
+    Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src));
+    int32_t ofs = field_ofs[irf->op2];
+    emit_lso(as, asm_fxstoreins(ir), (src & 31), idx, ofs);
+  }
+}
+
+static void asm_xload(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
+  lua_assert(!(ir->op2 & IRXLOAD_UNALIGNED));
+  asm_fusexref(as, asm_fxloadins(ir), dest, ir->op1, RSET_GPR);
+}
+
+static void asm_xstore(ASMState *as, IRIns *ir)
+{
+  if (ir->r != RID_SINK) {
+    Reg src = ra_alloc1(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
+    asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1,
+		 rset_exclude(RSET_GPR, src));
+  }
+}
+
+static void asm_ahuvload(ASMState *as, IRIns *ir)
+{
+  Reg idx, tmp, type;
+  int32_t ofs = 0;
+  RegSet gpr = RSET_GPR, allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
+  lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
+	     irt_isint(ir->t));
+  if (ra_used(ir)) {
+    Reg dest = ra_dest(as, ir, allow);
+    tmp = irt_isnum(ir->t) ? ra_scratch(as, rset_clear(gpr, dest)) : dest;
+    if (irt_isaddr(ir->t)) {
+      emit_dn(as, A64I_ANDx^emit_isk13(LJ_GCVMASK, 1), dest, dest);
+    } else if (irt_isnum(ir->t)) {
+      emit_dn(as, A64I_FMOV_D_R, (dest & 31), tmp);
+    } else if (irt_isint(ir->t)) {
+      emit_dm(as, A64I_MOVw, dest, dest);
+    }
+  } else {
+    tmp = ra_scratch(as, gpr);
+  }
+  type = ra_scratch(as, rset_clear(gpr, tmp));
+  idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, type), A64I_LDRx);
+  /* Always do the type check, even if the load result is unused. */
+  asm_guardcc(as, irt_isnum(ir->t) ? CC_LS : CC_NE);
+  if (irt_type(ir->t) >= IRT_NUM) {
+    lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t));
+    emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
+	    ra_allock(as, LJ_TISNUM << 15, rset_exclude(gpr, idx)), tmp);
+  } else if (irt_isaddr(ir->t)) {
+    emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), type);
+    emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
+  } else if (irt_isnil(ir->t)) {
+    emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
+  } else {
+    emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
+	    ra_allock(as, (irt_toitype(ir->t) << 15) | 0x7fff, allow), tmp);
+  }
+  if (ofs & FUSE_REG)
+    emit_dnm(as, (A64I_LDRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31));
+  else
+    emit_lso(as, A64I_LDRx, tmp, idx, ofs);
+}
+
+static void asm_ahustore(ASMState *as, IRIns *ir)
+{
+  if (ir->r != RID_SINK) {
+    RegSet allow = RSET_GPR;
+    Reg idx, src = RID_NONE, tmp = RID_TMP, type = RID_NONE;
+    int32_t ofs = 0;
+    if (irt_isnum(ir->t)) {
+      src = ra_alloc1(as, ir->op2, RSET_FPR);
+      idx = asm_fuseahuref(as, ir->op1, &ofs, allow, A64I_STRd);
+      if (ofs & FUSE_REG)
+	emit_dnm(as, (A64I_STRd^A64I_LS_R)|A64I_LS_UXTWx, (src & 31), idx,
+		 (ofs & 31));
+      else
+	emit_lso(as, A64I_STRd, (src & 31), idx, ofs);
+    } else {
+      if (!irt_ispri(ir->t)) {
+	src = ra_alloc1(as, ir->op2, allow);
+	rset_clear(allow, src);
+	if (irt_isinteger(ir->t))
+	  type = ra_allock(as, (int64_t)LJ_TISNUM << 47, allow);
+	else
+	  type = ra_allock(as, irt_toitype(ir->t), allow);
+      } else {
+	tmp = type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t)<<47), allow);
+      }
+      idx = asm_fuseahuref(as, ir->op1, &ofs, rset_exclude(allow, type),
+			   A64I_STRx);
+      if (ofs & FUSE_REG)
+	emit_dnm(as, (A64I_STRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31));
+      else
+	emit_lso(as, A64I_STRx, tmp, idx, ofs);
+      if (ra_hasreg(src)) {
+	if (irt_isinteger(ir->t)) {
+	  emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), tmp, type, src);
+	} else {
+	  emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, src, type);
+	}
+      }
+    }
+  }
+}
+
+static void asm_sload(ASMState *as, IRIns *ir)
+{
+  int32_t ofs = 8*((int32_t)ir->op1-2);
+  IRType1 t = ir->t;
+  Reg dest = RID_NONE, base;
+  RegSet allow = RSET_GPR;
+  lua_assert(!(ir->op2 & IRSLOAD_PARENT));  /* Handled by asm_head_side(). */
+  lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK));
+  if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
+    dest = ra_scratch(as, RSET_FPR);
+    asm_tointg(as, ir, dest);
+    t.irt = IRT_NUM;  /* Continue with a regular number type check. */
+  } else if (ra_used(ir)) {
+    Reg tmp = RID_NONE;
+    if ((ir->op2 & IRSLOAD_CONVERT))
+      tmp = ra_scratch(as, irt_isint(t) ? RSET_FPR : RSET_GPR);
+    lua_assert((irt_isnum(t)) || irt_isint(t) || irt_isaddr(t));
+    dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : allow);
+    base = ra_alloc1(as, REF_BASE, rset_clear(allow, dest));
+    if (irt_isaddr(t)) {
+      emit_dn(as, A64I_ANDx^emit_isk13(LJ_GCVMASK, 1), dest, dest);
+    } else if ((ir->op2 & IRSLOAD_CONVERT)) {
+      if (irt_isint(t)) {
+	emit_dn(as, A64I_FCVT_S32_F64, dest, (tmp & 31));
+	/* If value is already loaded for type check, move it to FPR. */
+	if ((ir->op2 & IRSLOAD_TYPECHECK))
+	  emit_dn(as, A64I_FMOV_D_R, (tmp & 31), dest);
+	else
+	  dest = tmp;
+	t.irt = IRT_NUM;  /* Check for original type. */
+      } else {
+	emit_dn(as, A64I_FCVT_F64_S32, (dest & 31), tmp);
+	dest = tmp;
+	t.irt = IRT_INT;  /* Check for original type. */
+      }
+    } else if (irt_isint(t) && (ir->op2 & IRSLOAD_TYPECHECK)) {
+      emit_dm(as, A64I_MOVw, dest, dest);
+    }
+    goto dotypecheck;
+  }
+  base = ra_alloc1(as, REF_BASE, allow);
+dotypecheck:
+  rset_clear(allow, base);
+  if ((ir->op2 & IRSLOAD_TYPECHECK)) {
+    Reg tmp;
+    if (ra_hasreg(dest) && rset_test(RSET_GPR, dest)) {
+      tmp = dest;
+    } else {
+      tmp = ra_scratch(as, allow);
+      rset_clear(allow, tmp);
+    }
+    if (irt_isnum(t) && !(ir->op2 & IRSLOAD_CONVERT))
+      emit_dn(as, A64I_FMOV_D_R, (dest & 31), tmp);
+    /* Need type check, even if the load result is unused. */
+    asm_guardcc(as, irt_isnum(t) ? CC_LS : CC_NE);
+    if (irt_type(t) >= IRT_NUM) {
+      lua_assert(irt_isinteger(t) || irt_isnum(t));
+      emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
+	      ra_allock(as, LJ_TISNUM << 15, allow), tmp);
+    } else if (irt_isnil(t)) {
+      emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
+    } else if (irt_ispri(t)) {
+      emit_nm(as, A64I_CMPx,
+	      ra_allock(as, ~((int64_t)~irt_toitype(t) << 47) , allow), tmp);
+    } else {
+      Reg type = ra_scratch(as, allow);
+      emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), type);
+      emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
+    }
+    emit_lso(as, A64I_LDRx, tmp, base, ofs);
+    return;
+  }
+  if (ra_hasreg(dest)) {
+    emit_lso(as, irt_isnum(t) ? A64I_LDRd :
+	     (irt_isint(t) ? A64I_LDRw : A64I_LDRx), (dest & 31), base, ofs);
+  }
+}
+
+/* -- Allocations --------------------------------------------------------- */
+
+#if LJ_HASFFI
+static void asm_cnew(ASMState *as, IRIns *ir)
+{
+  CTState *cts = ctype_ctsG(J2G(as->J));
+  CTypeID id = (CTypeID)IR(ir->op1)->i;
+  CTSize sz;
+  CTInfo info = lj_ctype_info(cts, id, &sz);
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
+  IRRef args[4];
+  RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
+  lua_assert(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL));
+
+  as->gcsteps++;
+  asm_setupresult(as, ir, ci);  /* GCcdata * */
+  /* Initialize immutable cdata object. */
+  if (ir->o == IR_CNEWI) {
+    int32_t ofs = sizeof(GCcdata);
+    Reg r = ra_alloc1(as, ir->op2, allow);
+    lua_assert(sz == 4 || sz == 8);
+    emit_lso(as, sz == 8 ? A64I_STRx : A64I_STRw, r, RID_RET, ofs);
+  } else if (ir->op2 != REF_NIL) {  /* Create VLA/VLS/aligned cdata. */
+    ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv];
+    args[0] = ASMREF_L;     /* lua_State *L */
+    args[1] = ir->op1;      /* CTypeID id   */
+    args[2] = ir->op2;      /* CTSize sz    */
+    args[3] = ASMREF_TMP1;  /* CTSize align */
+    asm_gencall(as, ci, args);
+    emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info));
+    return;
+  }
+
+  /* Initialize gct and ctypeid. lj_mem_newgco() already sets marked. */
+  {
+    Reg r = (id < 65536) ? RID_X1 : ra_allock(as, id, allow);
+    emit_lso(as, A64I_STRB, RID_TMP, RID_RET, offsetof(GCcdata, gct));
+    emit_lso(as, A64I_STRH, r, RID_RET, offsetof(GCcdata, ctypeid));
+    emit_d(as, A64I_MOVZw | A64F_U16(~LJ_TCDATA), RID_TMP);
+    if (id < 65536) emit_d(as, A64I_MOVZw | A64F_U16(id), RID_X1);
+  }
+  args[0] = ASMREF_L;     /* lua_State *L */
+  args[1] = ASMREF_TMP1;  /* MSize size   */
+  asm_gencall(as, ci, args);
+  ra_allockreg(as, (int32_t)(sz+sizeof(GCcdata)),
+	       ra_releasetmp(as, ASMREF_TMP1));
+}
+#else
+#define asm_cnew(as, ir)	((void)0)
+#endif
+
+/* -- Write barriers ------------------------------------------------------ */
+
+static void asm_tbar(ASMState *as, IRIns *ir)
+{
+  Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
+  Reg link = ra_scratch(as, rset_exclude(RSET_GPR, tab));
+  Reg gr = ra_allock(as, i64ptr(J2G(as->J)),
+		     rset_exclude(rset_exclude(RSET_GPR, tab), link));
+  Reg mark = RID_TMP;
+  MCLabel l_end = emit_label(as);
+  emit_lso(as, A64I_STRx, link, tab, (int32_t)offsetof(GCtab, gclist));
+  emit_lso(as, A64I_STRB, mark, tab, (int32_t)offsetof(GCtab, marked));
+  emit_lso(as, A64I_STRx, tab, gr,
+	   (int32_t)offsetof(global_State, gc.grayagain));
+  emit_dn(as, A64I_ANDw^emit_isk13(~LJ_GC_BLACK, 0), mark, mark);
+  emit_lso(as, A64I_LDRx, link, gr,
+	   (int32_t)offsetof(global_State, gc.grayagain));
+  emit_cond_branch(as, CC_EQ, l_end);
+  emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), mark);
+  emit_lso(as, A64I_LDRB, mark, tab, (int32_t)offsetof(GCtab, marked));
+}
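+/* In executed order the barrier loads t->marked, branches to the end
+** unless the table is black, then clears its black bit and links the
+** table into g->gc.grayagain (backward barrier for table stores).
+*/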
+
+static void asm_obar(ASMState *as, IRIns *ir)
+{
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
+  IRRef args[2];
+  MCLabel l_end;
+  RegSet allow = RSET_GPR;
+  Reg obj, val, tmp;
+  /* No need for other object barriers (yet). */
+  lua_assert(IR(ir->op1)->o == IR_UREFC);
+  ra_evictset(as, RSET_SCRATCH);
+  l_end = emit_label(as);
+  args[0] = ASMREF_TMP1;  /* global_State *g */
+  args[1] = ir->op1;      /* TValue *tv      */
+  asm_gencall(as, ci, args);
+  ra_allockreg(as, i64ptr(J2G(as->J)), ra_releasetmp(as, ASMREF_TMP1));
+  obj = IR(ir->op1)->r;
+  tmp = ra_scratch(as, rset_exclude(allow, obj));
+  emit_cond_branch(as, CC_EQ, l_end);
+  emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), tmp);
+  emit_cond_branch(as, CC_EQ, l_end);
+  emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_WHITES, 0), RID_TMP);
+  val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj));
+  emit_lso(as, A64I_LDRB, tmp, obj,
+	   (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
+  emit_lso(as, A64I_LDRB, RID_TMP, val, (int32_t)offsetof(GChead, marked));
+}
+
+/* -- Arithmetic and logic operations ------------------------------------- */
+
+static void asm_fparith(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  Reg dest = ra_dest(as, ir, RSET_FPR);
+  Reg right, left = ra_alloc2(as, ir, RSET_FPR);
+  right = (left >> 8); left &= 255;
+  emit_dnm(as, ai, (dest & 31), (left & 31), (right & 31));
+}
+
+static void asm_fpunary(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  Reg dest = ra_dest(as, ir, RSET_FPR);
+  Reg left = ra_hintalloc(as, ir->op1, dest, RSET_FPR);
+  emit_dn(as, ai, (dest & 31), (left & 31));
+}
+
+static void asm_fpmath(ASMState *as, IRIns *ir)
+{
+  IRFPMathOp fpm = (IRFPMathOp)ir->op2;
+  if (fpm == IRFPM_SQRT) {
+    asm_fpunary(as, ir, A64I_FSQRTd);
+  } else if (fpm <= IRFPM_TRUNC) {
+    asm_fpunary(as, ir, fpm == IRFPM_FLOOR ? A64I_FRINTMd :
+			fpm == IRFPM_CEIL ? A64I_FRINTPd : A64I_FRINTZd);
+  } else if (fpm == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) {
+    return;
+  } else {
+    asm_callid(as, ir, IRCALL_lj_vm_floor + fpm);
+  }
+}
+
+static int asm_swapops(ASMState *as, IRRef lref, IRRef rref)
+{
+  IRIns *ir;
+  if (irref_isk(rref))
+    return 0;  /* Don't swap constants to the left. */
+  if (irref_isk(lref))
+    return 1;  /* But swap constants to the right. */
+  ir = IR(rref);
+  if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
+      (ir->o == IR_ADD && ir->op1 == ir->op2) ||
+      (ir->o == IR_CONV &&
+       ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
+    return 0;  /* Don't swap fusable operands to the left. */
+  ir = IR(lref);
+  if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
+      (ir->o == IR_ADD && ir->op1 == ir->op2) ||
+      (ir->o == IR_CONV &&
+       ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
+    return 1;  /* But swap fusable operands to the right. */
+  return 0;  /* Otherwise don't swap. */
+}
+
+static void asm_intop(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  IRRef lref = ir->op1, rref = ir->op2;
+  Reg left, dest = ra_dest(as, ir, RSET_GPR);
+  uint32_t m;
+  if ((ai & ~A64I_S) != A64I_SUBw && asm_swapops(as, lref, rref)) {
+    IRRef tmp = lref; lref = rref; rref = tmp;
+  }
+  left = ra_hintalloc(as, lref, dest, RSET_GPR);
+  if (irt_is64(ir->t)) ai |= A64I_X;
+  m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left));
+  if (irt_isguard(ir->t)) {  /* For IR_ADDOV etc. */
+    asm_guardcc(as, CC_VS);
+    ai |= A64I_S;
+  }
+  emit_dn(as, ai^m, dest, left);
+}
+
+static void asm_intop_s(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  if (as->flagmcp == as->mcp) {  /* Drop cmp r, #0. */
+    as->flagmcp = NULL;
+    as->mcp++;
+    ai |= A64I_S;
+  }
+  asm_intop(as, ir, ai);
+}
+
+static void asm_intneg(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
+  emit_dm(as, irt_is64(ir->t) ? A64I_NEGx : A64I_NEGw, dest, left);
+}
+
+/* NYI: use add/shift for MUL(OV) with constants. FOLD only does 2^k. */
+static void asm_intmul(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg left = ra_alloc1(as, ir->op1, rset_exclude(RSET_GPR, dest));
+  Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
+  if (irt_isguard(ir->t)) {  /* IR_MULOV */
+    asm_guardcc(as, CC_NE);
+    emit_dm(as, A64I_MOVw, dest, dest);  /* Zero-extend. */
+    emit_nm(as, A64I_CMPw | A64F_SH(A64SH_ASR, 31), RID_TMP, dest);
+    emit_dn(as, A64I_ASRx | A64F_IMMR(32), RID_TMP, dest);
+    emit_dnm(as, A64I_SMULL, dest, right, left);
+  } else {
+    emit_dnm(as, irt_is64(ir->t) ? A64I_MULx : A64I_MULw, dest, left, right);
+  }
+}
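+/* For IR_MULOV the full 64 bit product is computed with smull; the guard
+** exits unless the upper 32 bits equal the sign extension (asr #31) of
+** the 32 bit result, which is then zero-extended into dest.
+*/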
+
+static void asm_add(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t)) {
+    asm_fparith(as, ir, A64I_FADDd);
+    return;
+  }
+  asm_intop_s(as, ir, A64I_ADDw);
+}
+
+static void asm_sub(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t)) {
+    asm_fparith(as, ir, A64I_FSUBd);
+    return;
+  }
+  asm_intop_s(as, ir, A64I_SUBw);
+}
+
+static void asm_mul(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t)) {
+    asm_fparith(as, ir, A64I_FMULd);
+    return;
+  }
+  asm_intmul(as, ir);
+}
+
+static void asm_div(ASMState *as, IRIns *ir)
+{
+#if LJ_HASFFI
+  if (!irt_isnum(ir->t))
+    asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 :
+					  IRCALL_lj_carith_divu64);
+  else
+#endif
+    asm_fparith(as, ir, A64I_FDIVd);
+}
+
+static void asm_pow(ASMState *as, IRIns *ir)
+{
+#if LJ_HASFFI
+  if (!irt_isnum(ir->t))
+    asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
+					  IRCALL_lj_carith_powu64);
+  else
+#endif
+    asm_callid(as, ir, IRCALL_lj_vm_powi);
+}
+
+#define asm_addov(as, ir)	asm_add(as, ir)
+#define asm_subov(as, ir)	asm_sub(as, ir)
+#define asm_mulov(as, ir)	asm_mul(as, ir)
+
+#define asm_abs(as, ir)		asm_fpunary(as, ir, A64I_FABS)
+#define asm_atan2(as, ir)	asm_callid(as, ir, IRCALL_atan2)
+#define asm_ldexp(as, ir)	asm_callid(as, ir, IRCALL_ldexp)
+
+static void asm_mod(ASMState *as, IRIns *ir)
+{
+#if LJ_HASFFI
+  if (!irt_isint(ir->t))
+    asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 :
+					  IRCALL_lj_carith_modu64);
+  else
+#endif
+    asm_callid(as, ir, IRCALL_lj_vm_modi);
+}
+
+static void asm_neg(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t)) {
+    asm_fpunary(as, ir, A64I_FNEGd);
+    return;
+  }
+  asm_intneg(as, ir);
+}
+
+static void asm_bitop(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  if (as->flagmcp == as->mcp && ai == A64I_ANDw) {
+    /* Try to drop cmp r, #0. */
+    as->flagmcp = NULL;
+    as->mcp++;
+    ai += A64I_ANDSw - A64I_ANDw;
+  }
+  if (ir->op2 == 0) {
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    uint32_t m = asm_fuseopm(as, ai, ir->op1, RSET_GPR);
+    if (irt_is64(ir->t)) ai |= A64I_X;
+    emit_d(as, ai^m, dest);
+  } else {
+    asm_intop(as, ir, ai);
+  }
+}
+
+#define asm_bnot(as, ir)	asm_bitop(as, ir, A64I_MVNw)
+#define asm_band(as, ir)	asm_bitop(as, ir, A64I_ANDw)
+#define asm_bor(as, ir)		asm_bitop(as, ir, A64I_ORRw)
+#define asm_bxor(as, ir)	asm_bitop(as, ir, A64I_EORw)
+
+static void asm_bswap(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
+  emit_dn(as, irt_is64(ir->t) ? A64I_REVx : A64I_REVw, dest, left);
+}
+
+static void asm_bitshift(ASMState *as, IRIns *ir, A64Ins ai, A64Shift sh)
+{
+  int shmask = irt_is64(ir->t) ? 63 : 31;
+  if (irref_isk(ir->op2)) {  /* Constant shifts. */
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
+    int32_t shift = (IR(ir->op2)->i & shmask);
+
+    if (shmask == 63) ai += A64I_UBFMx - A64I_UBFMw;
+    switch (sh) {
+    case A64SH_LSL:
+      emit_dn(as, ai | A64F_IMMS(shmask-shift) |
+		  A64F_IMMR(shmask-shift+1), dest, left);
+      break;
+    case A64SH_LSR: case A64SH_ASR:
+      emit_dn(as, ai | A64F_IMMS(shmask) | A64F_IMMR(shift), dest, left);
+      break;
+    case A64SH_ROR:
+      emit_dnm(as, ai | A64F_IMMS(shift), dest, left, left);
+      break;
+    }
+  } else {  /* Variable-length shifts. */
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
+    Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
+    emit_dnm(as, (shmask == 63 ? A64I_SHRx : A64I_SHRw) | A64F_BSH(sh),
+	     dest, left, right);
+  }
+}
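+/* The constant cases rely on the standard A64 bitfield aliases: lsl #n is
+** ubfm #(width-n), #(width-1-n), lsr/asr #n are ubfm/sbfm #n, #(width-1),
+** and ror #n is extr with both source registers the same.
+*/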
+
+#define asm_bshl(as, ir)	asm_bitshift(as, ir, A64I_UBFMw, A64SH_LSL)
+#define asm_bshr(as, ir)	asm_bitshift(as, ir, A64I_UBFMw, A64SH_LSR)
+#define asm_bsar(as, ir)	asm_bitshift(as, ir, A64I_SBFMw, A64SH_ASR)
+#define asm_bror(as, ir)	asm_bitshift(as, ir, A64I_EXTRw, A64SH_ROR)
+#define asm_brol(as, ir)	lua_assert(0)
+
+static void asm_intmin_max(ASMState *as, IRIns *ir, A64CC cc)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
+  Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
+  emit_dnm(as, A64I_CSELw|A64F_CC(cc), dest, left, right);
+  emit_nm(as, A64I_CMPw, left, right);
+}
+
+static void asm_fpmin_max(ASMState *as, IRIns *ir, A64CC fcc)
+{
+  Reg dest = (ra_dest(as, ir, RSET_FPR) & 31);
+  Reg right, left = ra_alloc2(as, ir, RSET_FPR);
+  right = ((left >> 8) & 31); left &= 31;
+  emit_dnm(as, A64I_FCSELd | A64F_CC(fcc), dest, left, right);
+  emit_nm(as, A64I_FCMPd, left, right);
+}
+
+static void asm_min_max(ASMState *as, IRIns *ir, A64CC cc, A64CC fcc)
+{
+  if (irt_isnum(ir->t))
+    asm_fpmin_max(as, ir, fcc);
+  else
+    asm_intmin_max(as, ir, cc);
+}
+
+#define asm_max(as, ir)		asm_min_max(as, ir, CC_GT, CC_HI)
+#define asm_min(as, ir)		asm_min_max(as, ir, CC_LT, CC_LO)
+
+/* -- Comparisons --------------------------------------------------------- */
+
+/* Map of comparisons to flags. ORDER IR. */
+static const uint8_t asm_compmap[IR_ABC+1] = {
+  /* op  FP swp  int cc   FP cc */
+  /* LT       */ CC_GE + (CC_HS << 4),
+  /* GE    x  */ CC_LT + (CC_HI << 4),
+  /* LE       */ CC_GT + (CC_HI << 4),
+  /* GT    x  */ CC_LE + (CC_HS << 4),
+  /* ULT   x  */ CC_HS + (CC_LS << 4),
+  /* UGE      */ CC_LO + (CC_LO << 4),
+  /* ULE   x  */ CC_HI + (CC_LO << 4),
+  /* UGT      */ CC_LS + (CC_LS << 4),
+  /* EQ       */ CC_NE + (CC_NE << 4),
+  /* NE       */ CC_EQ + (CC_EQ << 4),
+  /* ABC      */ CC_LS + (CC_LS << 4)  /* Same as UGT. */
+};
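+/* Each entry packs two condition codes: the low nibble is the negated
+** integer condition passed to asm_guardcc(), the high nibble the FP
+** condition. E.g. IR_LT guards with CC_GE on integers and CC_HS on
+** doubles, so the trace exits whenever the recorded comparison fails
+** (NaN operands count as failing for the ordered comparisons).
+*/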
+
+/* FP comparisons. */
+static void asm_fpcomp(ASMState *as, IRIns *ir)
+{
+  Reg left, right;
+  A64Ins ai;
+  int swp = ((ir->o ^ (ir->o >> 2)) & ~(ir->o >> 3) & 1);
+  if (!swp && irref_isk(ir->op2) && ir_knum(IR(ir->op2))->u64 == 0) {
+    left = (ra_alloc1(as, ir->op1, RSET_FPR) & 31);
+    right = 0;
+    ai = A64I_FCMPZd;
+  } else {
+    left = ra_alloc2(as, ir, RSET_FPR);
+    if (swp) {
+      right = (left & 31); left = ((left >> 8) & 31);
+    } else {
+      right = ((left >> 8) & 31); left &= 31;
+    }
+    ai = A64I_FCMPd;
+  }
+  asm_guardcc(as, (asm_compmap[ir->o] >> 4));
+  emit_nm(as, ai, left, right);
+}
+
+/* Integer comparisons. */
+static void asm_intcomp(ASMState *as, IRIns *ir)
+{
+  A64CC oldcc, cc = (asm_compmap[ir->o] & 15);
+  A64Ins ai = irt_is64(ir->t) ? A64I_CMPx : A64I_CMPw;
+  IRRef lref = ir->op1, rref = ir->op2;
+  Reg left;
+  uint32_t m;
+  int cmpprev0 = 0;
+  lua_assert(irt_is64(ir->t) || irt_isint(ir->t) ||
+	     irt_isu32(ir->t) || irt_isaddr(ir->t) || irt_isu8(ir->t));
+  if (asm_swapops(as, lref, rref)) {
+    IRRef tmp = lref; lref = rref; rref = tmp;
+    if (cc >= CC_GE) cc ^= 7;  /* LT <-> GT, LE <-> GE */
+    else if (cc > CC_NE) cc ^= 11;  /* LO <-> HI, LS <-> HS */
+  }
+  oldcc = cc;
+  if (irref_isk(rref) && IR(rref)->i == 0) {
+    IRIns *irl = IR(lref);
+    if (cc == CC_GE) cc = CC_PL;
+    else if (cc == CC_LT) cc = CC_MI;
+    else if (cc > CC_NE) goto notst;  /* Other conds don't work with tst. */
+    cmpprev0 = (irl+1 == ir);
+    /* Combine comp(BAND(left, right), 0) into tst left, right. */
+    if (cmpprev0 && irl->o == IR_BAND && !ra_used(irl)) {
+      IRRef blref = irl->op1, brref = irl->op2;
+      uint32_t m2 = 0;
+      Reg bleft;
+      if (asm_swapops(as, blref, brref)) {
+	Reg tmp = blref; blref = brref; brref = tmp;
+      }
+      if (irref_isk(brref)) {
+	/* NYI: use tbz/tbnz, if applicable. */
+	m2 = emit_isk13(IR(brref)->i, irt_is64(irl->t));
+	if (!m2)
+	  goto notst;  /* Not beneficial if we miss a constant operand. */
+      }
+      bleft = ra_alloc1(as, blref, RSET_GPR);
+      ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw);
+      if (!m2)
+	m2 = asm_fuseopm(as, ai, brref, rset_exclude(RSET_GPR, bleft));
+      asm_guardcc(as, cc);
+      emit_n(as, ai^m2, bleft);
+      return;
+    }
+    /* NYI: use cbz/cbnz for EQ/NE 0. */
+  }
+notst:
+  left = ra_alloc1(as, lref, RSET_GPR);
+  m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left));
+  asm_guardcc(as, cc);
+  emit_n(as, ai^m, left);
+  /* Signed comparison with zero and referencing previous ins? */
+  if (cmpprev0 && (oldcc <= CC_NE || oldcc >= CC_GE))
+    as->flagmcp = as->mcp;  /* Allow elimination of the compare. */
+}
+
+static void asm_comp(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t))
+    asm_fpcomp(as, ir);
+  else
+    asm_intcomp(as, ir);
+}
+
+#define asm_equal(as, ir)	asm_comp(as, ir)
+
+/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
+
+/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
+static void asm_hiop(ASMState *as, IRIns *ir)
+{
+  UNUSED(as); UNUSED(ir); lua_assert(0);  /* Unused on 64 bit. */
+}
+
+/* -- Profiling ----------------------------------------------------------- */
+
+static void asm_prof(ASMState *as, IRIns *ir)
+{
+  uint32_t k = emit_isk13(HOOK_PROFILE, 0);
+  lua_assert(k != 0);
+  UNUSED(ir);
+  asm_guardcc(as, CC_NE);
+  emit_n(as, A64I_TSTw^k, RID_TMP);
+  emit_lsptr(as, A64I_LDRB, RID_TMP, (void *)&J2G(as->J)->hookmask);
+}
+
+/* -- Stack handling ------------------------------------------------------ */
+
+/* Check Lua stack size for overflow. Use exit handler as fallback. */
+static void asm_stack_check(ASMState *as, BCReg topslot,
+			    IRIns *irp, RegSet allow, ExitNo exitno)
+{
+  Reg pbase;
+  uint32_t k;
+  if (irp) {
+    if (!ra_hasspill(irp->s)) {
+      pbase = irp->r;
+      lua_assert(ra_hasreg(pbase));
+    } else if (allow) {
+      pbase = rset_pickbot(allow);
+    } else {
+      pbase = RID_RET;
+      emit_lso(as, A64I_LDRx, RID_RET, RID_SP, 0);  /* Restore temp register. */
+    }
+  } else {
+    pbase = RID_BASE;
+  }
+  emit_branch(as, A64I_BL, exitstub_addr(as->J, exitno));
+  emit_cond_branch(as, CC_LS^1, as->mcp+1);
+  k = emit_isk12((8*topslot));
+  lua_assert(k);
+  emit_n(as, A64I_CMPx^k, RID_TMP);
+  emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, pbase);
+  emit_lso(as, A64I_LDRx, RID_TMP, RID_TMP,
+	   (int32_t)offsetof(lua_State, maxstack));
+  if (irp) {  /* Must not spill arbitrary registers in head of side trace. */
+    if (ra_hasspill(irp->s))
+      emit_lso(as, A64I_LDRx, pbase, RID_SP, sps_scale(irp->s));
+    emit_lso(as, A64I_LDRx, RID_TMP, RID_GL, glofs(as, &J2G(as->J)->cur_L));
+    if (ra_hasspill(irp->s) && !allow)
+      emit_lso(as, A64I_STRx, RID_RET, RID_SP, 0);  /* Save temp register. */
+  } else {
+    emit_getgl(as, RID_TMP, cur_L);
+  }
+}
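+/* In executed order the check reads: load g->cur_L, load its maxstack,
+** subtract BASE, compare the remaining space with the 8*topslot bytes the
+** trace may need, and call the exit stub if it is lower or same.
+*/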
+
+/* Restore Lua stack from on-trace state. */
+static void asm_stack_restore(ASMState *as, SnapShot *snap)
+{
+  SnapEntry *map = &as->T->snapmap[snap->mapofs];
+#ifdef LUA_USE_ASSERT
+  SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2];
+#endif
+  MSize n, nent = snap->nent;
+  /* Store the value of all modified slots to the Lua stack. */
+  for (n = 0; n < nent; n++) {
+    SnapEntry sn = map[n];
+    BCReg s = snap_slot(sn);
+    int32_t ofs = 8*((int32_t)s-1-LJ_FR2);
+    IRRef ref = snap_ref(sn);
+    IRIns *ir = IR(ref);
+    if ((sn & SNAP_NORESTORE))
+      continue;
+    if (irt_isnum(ir->t)) {
+      Reg src = ra_alloc1(as, ref, RSET_FPR);
+      emit_lso(as, A64I_STRd, (src & 31), RID_BASE, ofs);
+    } else {
+      RegSet allow = rset_exclude(RSET_GPR, RID_BASE);
+      lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t) || irt_isinteger(ir->t));
+      if (!irref_isk(ref)) {
+	Reg type, src;
+	if (irt_is64(ir->t)) {
+	  type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+	  src = ra_alloc1(as, ref, rset_exclude(allow, type));
+	  emit_lso(as, A64I_STRx, RID_TMP, RID_BASE, ofs);
+	  emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), RID_TMP, src, type);
+	} else if (irt_isinteger(ir->t)) {
+	  type = ra_allock(as, (int64_t)LJ_TISNUM << 47, allow);
+	  src = ra_alloc1(as, ref, rset_exclude(allow, type));
+	  emit_lso(as, A64I_STRx, RID_TMP, RID_BASE, ofs);
+	  emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), RID_TMP, type, src);
+	} else {
+	  type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
+	  emit_lso(as, A64I_STRx, type, RID_BASE, ofs);
+	}
+      } else {
+	TValue k;
+	lj_ir_kvalue(as->J->L, &k, ir);
+	emit_lso(as, A64I_STRx,
+		 ra_allock(as, tvisnil(&k) ? -1 : (int64_t)k.u64, allow),
+		 RID_BASE, ofs);
+      }
+    }
+    checkmclim(as);
+  }
+  lua_assert(map + nent == flinks);
+}
+
+/* -- GC handling --------------------------------------------------------- */
+
+/* Check GC threshold and do one or more GC steps. */
+static void asm_gc_check(ASMState *as)
+{
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
+  IRRef args[2];
+  MCLabel l_end;
+  Reg tmp1, tmp2;
+  ra_evictset(as, RSET_SCRATCH);
+  l_end = emit_label(as);
+  /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
+  asm_guardcc(as, CC_NE);  /* Assumes asm_snap_prep() already done. */
+  emit_n(as, A64I_CMPx^A64I_K12, RID_RET);
+  args[0] = ASMREF_TMP1;  /* global_State *g */
+  args[1] = ASMREF_TMP2;  /* MSize steps     */
+  asm_gencall(as, ci, args);
+  tmp1 = ra_releasetmp(as, ASMREF_TMP1);
+  tmp2 = ra_releasetmp(as, ASMREF_TMP2);
+  emit_loadi(as, tmp2, as->gcsteps);
+  /* Jump around GC step if GC total < GC threshold. */
+  emit_cond_branch(as, CC_LS, l_end);
+  emit_nm(as, A64I_CMPx, RID_TMP, tmp2);
+  emit_lso(as, A64I_LDRx, tmp2, tmp1,
+	   (int32_t)offsetof(global_State, gc.threshold));
+  emit_lso(as, A64I_LDRx, RID_TMP, tmp1,
+	   (int32_t)offsetof(global_State, gc.total));
+  ra_allockreg(as, i64ptr(J2G(as->J)), tmp1);
+  as->gcsteps = 0;
+  checkmclim(as);
+}
+
+/* -- Loop handling ------------------------------------------------------- */
+
+/* Fixup the loop branch. */
+static void asm_loop_fixup(ASMState *as)
+{
+  MCode *p = as->mctop;
+  MCode *target = as->mcp;
+  if (as->loopinv) {  /* Inverted loop branch? */
+    ptrdiff_t delta = target - (p - 2);
+    lua_assert(((delta + 0x40000) >> 19) == 0);
+    /* asm_guardcc already inverted the b.cc and patched the final bl. */
+    p[-2] |= ((uint32_t)delta & 0x7ffff) << 5;
+  } else {
+    ptrdiff_t delta = target - (p - 1);
+    p[-1] = A64I_B | ((uint32_t)(delta) & 0x03ffffffu);
+  }
+}
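+/* The two cases mirror the A64 branch ranges: a conditional b.cc has a
+** signed 19 bit word offset (hence the 0x40000 range assertion), while an
+** unconditional b has the full 26 bit range.
+*/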
+
+/* -- Head of trace ------------------------------------------------------- */
+
+/* Reload L register from g->cur_L. */
+static void asm_head_lreg(ASMState *as)
+{
+  IRIns *ir = IR(ASMREF_L);
+  if (ra_used(ir)) {
+    Reg r = ra_dest(as, ir, RSET_GPR);
+    emit_getgl(as, r, cur_L);
+    ra_evictk(as);
+  }
+}
+
+/* Coalesce BASE register for a root trace. */
+static void asm_head_root_base(ASMState *as)
+{
+  IRIns *ir;
+  asm_head_lreg(as);
+  ir = IR(REF_BASE);
+  if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
+    ra_spill(as, ir);
+  ra_destreg(as, ir, RID_BASE);
+}
+
+/* Coalesce BASE register for a side trace. */
+static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
+{
+  IRIns *ir;
+  asm_head_lreg(as);
+  ir = IR(REF_BASE);
+  if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
+    ra_spill(as, ir);
+  if (ra_hasspill(irp->s)) {
+    rset_clear(allow, ra_dest(as, ir, allow));
+  } else {
+    Reg r = irp->r;
+    lua_assert(ra_hasreg(r));
+    rset_clear(allow, r);
+    if (r != ir->r && !rset_test(as->freeset, r))
+      ra_restore(as, regcost_ref(as->cost[r]));
+    ra_destreg(as, ir, r);
+  }
+  return allow;
+}
+
+/* -- Tail of trace ------------------------------------------------------- */
+
+/* Fixup the tail code. */
+static void asm_tail_fixup(ASMState *as, TraceNo lnk)
+{
+  MCode *p = as->mctop;
+  MCode *target;
+  /* Undo the sp adjustment in BC_JLOOP when exiting to the interpreter. */
+  int32_t spadj = as->T->spadjust + (lnk ? 0 : sps_scale(SPS_FIXED));
+  if (spadj == 0) {
+    as->mctop = --p;
+  } else {
+    /* Patch stack adjustment. */
+    uint32_t k = emit_isk12(spadj);
+    lua_assert(k);
+    p[-2] = (A64I_ADDx^k) | A64F_D(RID_SP) | A64F_N(RID_SP);
+  }
+  /* Patch exit branch. */
+  target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp;
+  p[-1] = A64I_B | (((target-p)+1)&0x03ffffffu);
+}
+
+/* Prepare tail of code. */
+static void asm_tail_prep(ASMState *as)
+{
+  MCode *p = as->mctop - 1;  /* Leave room for exit branch. */
+  if (as->loopref) {
+    as->invmcp = as->mcp = p;
+  } else {
+    as->mcp = p-1;  /* Leave room for stack pointer adjustment. */
+    as->invmcp = NULL;
+  }
+  *p = 0;  /* Prevent load/store merging. */
+}
+
+/* -- Trace setup --------------------------------------------------------- */
+
+/* Ensure there are enough stack slots for call arguments. */
+static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
+{
+  IRRef args[CCI_NARGS_MAX*2];
+  uint32_t i, nargs = CCI_XNARGS(ci);
+  int nslots = 0, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
+  asm_collectargs(as, ir, ci, args);
+  for (i = 0; i < nargs; i++) {
+    if (args[i] && irt_isfp(IR(args[i])->t)) {
+      if (nfpr > 0) nfpr--; else nslots += 2;
+    } else {
+      if (ngpr > 0) ngpr--; else nslots += 2;
+    }
+  }
+  if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
+    as->evenspill = nslots;
+  return REGSP_HINT(RID_RET);
+}
+
+static void asm_setup_target(ASMState *as)
+{
+  /* May need extra exit for asm_stack_check on side traces. */
+  asm_exitstub_setup(as, as->T->nsnap + (as->parent ? 1 : 0));
+}
+
+/* -- Trace patching ------------------------------------------------------ */
+
+/* Patch exit jumps of existing machine code to a new target. */
+void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
+{
+  MCode *p = T->mcode;
+  MCode *pe = (MCode *)((char *)p + T->szmcode);
+  MCode *cstart = NULL, *cend = p;
+  MCode *mcarea = lj_mcode_patch(J, p, 0);
+  MCode *px = exitstub_addr(J, exitno);
+  for (; p < pe; p++) {
+    /* Look for bl exitstub, replace with b target. */
+    uint32_t ins = *p;
+    if ((ins & 0xfc000000u) == 0x94000000u &&
+	((ins ^ (px-p)) & 0x03ffffffu) == 0) {
+      *p = (ins & 0x7c000000u) | ((target-p) & 0x03ffffffu);
+      cend = p+1;
+      if (!cstart) cstart = p;
+    }
+  }
+  lua_assert(cstart != NULL);
+  lj_mcode_sync(cstart, cend);
+  lj_mcode_patch(J, mcarea, 1);
+}
+
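
A quick aside on the scan above (a standalone sketch, not part of the patch; the helper names are made up for illustration): an A64 b/bl stores a signed 26 bit word offset in its low bits, which is how the patcher computes and rewrites exit targets.

#include <stdint.h>
#include <stddef.h>

/* Decode the target of an A64 b/bl at p. */
static uint32_t *a64_branch_target(uint32_t *p)
{
  int32_t delta = (int32_t)(*p << 6) >> 6;  /* Sign-extend the 26 bit field. */
  return p + delta;  /* Pointer arithmetic is in 4-byte words. */
}

/* Encode an unconditional branch from p to target (A64I_B == 0x14000000). */
static uint32_t a64_make_b(uint32_t *p, uint32_t *target)
{
  ptrdiff_t delta = target - p;
  return 0x14000000u | ((uint32_t)delta & 0x03ffffffu);
}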
diff --git a/src/lj_ccall.c b/src/lj_ccall.c
index b599be33..a3ae8b05 100644
--- a/src/lj_ccall.c
+++ b/src/lj_ccall.c
@@ -331,7 +331,7 @@
 
 #define CCALL_HANDLE_COMPLEXARG \
   /* Pass complex by value in separate (!) FPRs or on stack. */ \
-  isfp = ctr->size == 2*sizeof(float) ? 2 : 1;
+  isfp = sz == 2*sizeof(float) ? 2 : 1;
 
 #define CCALL_HANDLE_REGARG \
   if (LJ_TARGET_IOS && isva) { \
diff --git a/src/lj_dispatch.h b/src/lj_dispatch.h
index 82708077..362d6202 100644
--- a/src/lj_dispatch.h
+++ b/src/lj_dispatch.h
@@ -107,6 +107,7 @@ typedef struct GG_State {
 #define J2G(J)		(&J2GG(J)->g)
 #define G2J(gl)		(&G2GG(gl)->J)
 #define L2J(L)		(&L2GG(L)->J)
+#define GG_G2J		(GG_OFS(J) - GG_OFS(g))
 #define GG_G2DISP	(GG_OFS(dispatch) - GG_OFS(g))
 #define GG_DISP2G	(GG_OFS(g) - GG_OFS(dispatch))
 #define GG_DISP2J	(GG_OFS(J) - GG_OFS(dispatch))
diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h
new file mode 100644
index 00000000..eb8f7fc7
--- /dev/null
+++ b/src/lj_emit_arm64.h
@@ -0,0 +1,397 @@
+/*
+** ARM64 instruction emitter.
+** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
+**
+** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+** Sponsored by Cisco Systems, Inc.
+*/
+
+/* -- Constant encoding --------------------------------------------------- */
+
+static uint64_t get_k64val(IRIns *ir)
+{
+  if (ir->o == IR_KINT64) {
+    return ir_kint64(ir)->u64;
+  } else if (ir->o == IR_KGC) {
+    return (uint64_t)ir_kgc(ir);
+  } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
+    return (uint64_t)ir_kptr(ir);
+  } else {
+    lua_assert(ir->o == IR_KINT || ir->o == IR_KNULL);
+    return ir->i;  /* Sign-extended. */
+  }
+}
+
+/* Encode constant in K12 format for data processing instructions. */
+static uint32_t emit_isk12(int64_t n)
+{
+  uint64_t k = (n < 0) ? -n : n;
+  uint32_t m = (n < 0) ? 0x40000000 : 0;
+  if (k < 0x1000) {
+    return A64I_K12|m|A64F_U12(k);
+  } else if ((k & 0xfff000) == k) {
+    return A64I_K12|m|0x400000|A64F_U12(k>>12);
+  }
+  return 0;
+}
+
+#define emit_clz64(n)	__builtin_clzll(n)
+#define emit_ctz64(n)	__builtin_ctzll(n)
+
+/* Encode constant in K13 format for logical data processing instructions. */
+static uint32_t emit_isk13(uint64_t n, int is64)
+{
+  int inv = 0, w = 128, lz, tz;
+  if (n & 1) { n = ~n; w = 64; inv = 1; }  /* Avoid wrap-around of ones. */
+  if (!n) return 0;  /* Neither all-zero nor all-ones are allowed. */
+  do {  /* Find the repeat width. */
+    if (is64 && (uint32_t)(n^(n>>32))) break;
+    n = (uint32_t)n; w = 32; if ((n^(n>>16)) & 0xffff) break;
+    n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break;
+    n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break;
+    n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break;
+    n = n & 0x3; w = 2;
+  } while (0);
+  lz = emit_clz64(n);
+  tz = emit_ctz64(n);
+  if ((int64_t)(n << lz) >> (lz+tz) != -1ll) return 0; /* Non-contiguous? */
+  if (inv)
+    return A64I_K13 | (((lz-w) & 127) << 16) | (((lz+tz-w-1) & 63) << 10);
+  else
+    return A64I_K13 | ((w-tz) << 16) | (((63-lz-tz-w-w) & 63) << 10);
+}
+
+static uint32_t emit_isfpk64(uint64_t n)
+{
+  uint64_t etop9 = ((n >> 54) & 0x1ff);
+  if ((n << 16) == 0 && (etop9 == 0x100 || etop9 == 0x0ff)) {
+    return (uint32_t)(((n >> 48) & 0x7f) | ((n >> 56) & 0x80));
+  }
+  return ~0u;
+}
+
+/* -- Emit basic instructions --------------------------------------------- */
+
+static void emit_dnm(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm)
+{
+  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm);
+}
+
+static void emit_dm(ASMState *as, A64Ins ai, Reg rd, Reg rm)
+{
+  *--as->mcp = ai | A64F_D(rd) | A64F_M(rm);
+}
+
+static void emit_dn(ASMState *as, A64Ins ai, Reg rd, Reg rn)
+{
+  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn);
+}
+
+static void emit_nm(ASMState *as, A64Ins ai, Reg rn, Reg rm)
+{
+  *--as->mcp = ai | A64F_N(rn) | A64F_M(rm);
+}
+
+static void emit_d(ASMState *as, A64Ins ai, Reg rd)
+{
+  *--as->mcp = ai | A64F_D(rd);
+}
+
+static void emit_n(ASMState *as, A64Ins ai, Reg rn)
+{
+  *--as->mcp = ai | A64F_N(rn);
+}
+
+static int emit_checkofs(A64Ins ai, int64_t ofs)
+{
+  int scale = (ai >> 30) & 3;
+  if (ofs < 0 || (ofs & ((1<<scale)-1))) {
+    return (ofs >= -256 && ofs <= 255) ? -1 : 0;
+  } else {
+    return (ofs < (4096<<scale)) ? 1 : 0;
+  }
+}
+
+static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
+{
+  int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3;
+  lua_assert(ot);
+  /* Combine LDR/STR pairs to LDP/STP. */
+  if ((sc == 2 || sc == 3) &&
+      (!(ai & 0x400000) || rd != rn) &&
+      as->mcp != as->mcloop) {
+    uint32_t prev = *as->mcp & ~A64F_D(31);
+    int ofsm = ofs - (1<<sc), ofsp = ofs + (1<<sc);
+    A64Ins aip;
+    if (prev == (ai | A64F_N(rn) | A64F_U12(ofsm>>sc)) ||
+	prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) {
+      aip = (A64F_A(rd) | A64F_D(*as->mcp & 31));
+    } else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) ||
+	       prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) {
+      aip = (A64F_D(rd) | A64F_A(*as->mcp & 31));
+      ofsm = ofs;
+    } else {
+      goto nopair;
+    }
+    if (ofsm >= (-64<<sc) && ofsm <= (63<<sc)) {
+      *as->mcp = aip | A64F_N(rn) | ((ofsm >> sc) << 15) |
+	(ai ^ ((ai == A64I_LDRx || ai == A64I_STRx) ? 0x50000000 : 0x90000000));
+      return;
+    }
+  }
+nopair:
+  if (ot == 1)
+    *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_U12(ofs >> sc);
+  else
+    *--as->mcp = (ai^A64I_LS_U) | A64F_D(rd) | A64F_N(rn) | A64F_S9(ofs & 0x1ff);
+}
+
+/* -- Emit loads/stores --------------------------------------------------- */
+
+/* Prefer rematerialization of BASE/L from global_State over spills. */
+#define emit_canremat(ref)	((ref) <= ASMREF_L)
+
+/* Try to find an N-step delta relative to other consts with N < lim. */
+static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim)
+{
+  RegSet work = ~as->freeset & RSET_GPR;
+  if (lim <= 1) return 0;  /* Can't beat that. */
+  while (work) {
+    Reg r = rset_picktop(work);
+    IRRef ref = regcost_ref(as->cost[r]);
+    lua_assert(r != rd);
+    if (ref < REF_TRUE) {
+      uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) :
+				     get_k64val(IR(ref));
+      int64_t delta = (int64_t)(k - kx);
+      if (delta == 0) {
+	emit_dm(as, A64I_MOVx, rd, r);
+	return 1;
+      } else {
+	uint32_t k12 = emit_isk12(delta < 0 ? -delta : delta);
+	if (k12) {
+	  emit_dn(as, (delta < 0 ? A64I_SUBx : A64I_ADDx)^k12, rd, r);
+	  return 1;
+	}
+	/* Do other ops or multi-step deltas pay off? Probably not.
+	** E.g. XOR rarely helps with pointer consts.
+	*/
+      }
+    }
+    rset_clear(work, r);
+  }
+  return 0;  /* Failed. */
+}
+
+static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64)
+{
+  uint32_t k13 = emit_isk13(u64, is64);
+  if (k13) {  /* Can the constant be represented as a bitmask immediate? */
+    emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO);
+  } else {
+    int i, zeros = 0, ones = 0, neg;
+    if (!is64) u64 = (int64_t)(int32_t)u64;  /* Sign-extend. */
+    /* Count homogeneous 16 bit fragments. */
+    for (i = 0; i < 4; i++) {
+      uint64_t frag = (u64 >> i*16) & 0xffff;
+      zeros += (frag == 0);
+      ones += (frag == 0xffff);
+    }
+    neg = ones > zeros;  /* Use MOVN if it pays off. */
+    if (!emit_kdelta(as, rd, u64, 4 - (neg ? ones : zeros))) {
+      int shift = 0, lshift = 0;
+      uint64_t n64 = neg ? ~u64 : u64;
+      if (n64 != 0) {
+	/* Find first/last fragment to be filled. */
+	shift = (63-emit_clz64(n64)) & ~15;
+	lshift = emit_ctz64(n64) & ~15;
+      }
+      /* MOVK requires the original value (u64). */
+      while (shift > lshift) {
+	uint32_t u16 = (u64 >> shift) & 0xffff;
+	/* Skip fragments that are correctly filled by MOVN/MOVZ. */
+	if (u16 != (neg ? 0xffff : 0))
+	  emit_d(as, is64 | A64I_MOVKw | A64F_U16(u16) | A64F_LSL16(shift), rd);
+	shift -= 16;
+      }
+      /* But MOVN needs an inverted value (n64). */
+      emit_d(as, (neg ? A64I_MOVNx : A64I_MOVZx) |
+		 A64F_U16((n64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd);
+    }
+  }
+}
+
+/* Load a 32 bit constant into a GPR. */
+#define emit_loadi(as, rd, i)	emit_loadk(as, rd, i, 0)
+
+/* Load a 64 bit constant into a GPR. */
+#define emit_loadu64(as, rd, i)	emit_loadk(as, rd, i, A64I_X)
+
+#define emit_loada(as, r, addr)	emit_loadu64(as, (r), (uintptr_t)(addr))
+
+#define glofs(as, k) \
+  ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
+#define mcpofs(as, k) \
+  ((intptr_t)((uintptr_t)(k) - (uintptr_t)as->mcp))
+#define checkmcpofs(as, k) \
+  ((((mcpofs(as, k)>>2) + 0x00040000) >> 19) == 0)
+
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
+
+/* Get/set from constant pointer. */
+static void emit_lsptr(ASMState *as, A64Ins ai, Reg r, void *p)
+{
+  /* First, check if ip + offset is in range. */
+  if ((ai & 0x00400000) && checkmcpofs(as, p)) {
+    emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, p)>>2), r);
+  } else {
+    Reg base = RID_GL;  /* Next, try GL + offset. */
+    int64_t ofs = glofs(as, p);
+    if (!emit_checkofs(ai, ofs)) {  /* Else split up into base reg + offset. */
+      int64_t i64 = i64ptr(p);
+      base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r));
+      ofs = i64 & 0x7fffull;
+    }
+    emit_lso(as, ai, r, base, ofs);
+  }
+}
+
+/* Load 64 bit IR constant into register. */
+static void emit_loadk64(ASMState *as, Reg r, IRIns *ir)
+{
+  const uint64_t *k = &ir_k64(ir)->u64;
+  int64_t ofs;
+  if (r >= RID_MAX_GPR) {
+    uint32_t fpk = emit_isfpk64(*k);
+    if (fpk != ~0u) {
+      emit_d(as, A64I_FMOV_DI | A64F_FP8(fpk), (r & 31));
+      return;
+    }
+  }
+  ofs = glofs(as, k);
+  if (emit_checkofs(A64I_LDRx, ofs)) {
+    emit_lso(as, r >= RID_MAX_GPR ? A64I_LDRd : A64I_LDRx,
+	     (r & 31), RID_GL, ofs);
+  } else {
+    if (r >= RID_MAX_GPR) {
+      emit_dn(as, A64I_FMOV_D_R, (r & 31), RID_TMP);
+      r = RID_TMP;
+    }
+    if (checkmcpofs(as, k))
+      emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, k)>>2), r);
+    else
+      emit_loadu64(as, r, *k);
+  }
+}
+
+/* Get/set global_State fields. */
+#define emit_getgl(as, r, field) \
+  emit_lsptr(as, A64I_LDRx, (r), (void *)&J2G(as->J)->field)
+#define emit_setgl(as, r, field) \
+  emit_lsptr(as, A64I_STRx, (r), (void *)&J2G(as->J)->field)
+
+/* Trace number is determined from pc of exit instruction. */
+#define emit_setvmstate(as, i)	UNUSED(i)
+
+/* -- Emit control-flow instructions -------------------------------------- */
+
+/* Label for internal jumps. */
+typedef MCode *MCLabel;
+
+/* Return label pointing to current PC. */
+#define emit_label(as)		((as)->mcp)
+
+static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target)
+{
+  MCode *p = as->mcp;
+  ptrdiff_t delta = target - (p - 1);
+  lua_assert(((delta + 0x40000) >> 19) == 0);
+  *--p = A64I_BCC | A64F_S19((uint32_t)delta & 0x7ffff) | cond;
+  as->mcp = p;
+}
+
+static void emit_branch(ASMState *as, A64Ins ai, MCode *target)
+{
+  MCode *p = as->mcp;
+  ptrdiff_t delta = target - (p - 1);
+  lua_assert(((delta + 0x02000000) >> 26) == 0);
+  *--p = ai | ((uint32_t)delta & 0x03ffffffu);
+  as->mcp = p;
+}
+
+#define emit_jmp(as, target)	emit_branch(as, A64I_B, (target))
+
+static void emit_call(ASMState *as, void *target)
+{
+  MCode *p = --as->mcp;
+  ptrdiff_t delta = (char *)target - (char *)p;
+  if ((((delta>>2) + 0x02000000) >> 26) == 0) {
+    *p = A64I_BL | ((uint32_t)(delta>>2) & 0x03ffffffu);
+  } else {  /* Target out of range: need indirect call. But don't use R0-R7. */
+    Reg r = ra_allock(as, i64ptr(target),
+		      RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
+    *p = A64I_BLR | A64F_N(r);
+  }
+}
+
+/* -- Emit generic operations --------------------------------------------- */
+
+/* Generic move between two regs. */
+static void emit_movrr(ASMState *as, IRIns *ir, Reg dst, Reg src)
+{
+  if (dst >= RID_MAX_GPR) {
+    emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D : A64I_FMOV_S,
+	    (dst & 31), (src & 31));
+    return;
+  }
+  if (as->mcp != as->mcloop) {  /* Swap early registers for loads/stores. */
+    MCode ins = *as->mcp, swp = (src^dst);
+    if ((ins & 0xbf800000) == 0xb9000000) {
+      if (!((ins ^ (dst << 5)) & 0x000003e0))
+	*as->mcp = ins ^ (swp << 5);  /* Swap N in load/store. */
+      if (!(ins & 0x00400000) && !((ins ^ dst) & 0x0000001f))
+	*as->mcp = ins ^ swp;  /* Swap D in store. */
+    }
+  }
+  emit_dm(as, A64I_MOVx, dst, src);
+}
+
+/* Generic load of register with base and (small) offset address. */
+static void emit_loadofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
+{
+  if (r >= RID_MAX_GPR)
+    emit_lso(as, irt_isnum(ir->t) ? A64I_LDRd : A64I_LDRs, (r & 31), base, ofs);
+  else
+    emit_lso(as, irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw, r, base, ofs);
+}
+
+/* Generic store of register with base and (small) offset address. */
+static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
+{
+  if (r >= RID_MAX_GPR)
+    emit_lso(as, irt_isnum(ir->t) ? A64I_STRd : A64I_STRs, (r & 31), base, ofs);
+  else
+    emit_lso(as, irt_is64(ir->t) ? A64I_STRx : A64I_STRw, r, base, ofs);
+}
+
+/* Emit an arithmetic operation with a constant operand. */
+static void emit_opk(ASMState *as, A64Ins ai, Reg dest, Reg src,
+		     int32_t i, RegSet allow)
+{
+  uint32_t k = emit_isk12(i);
+  if (k)
+    emit_dn(as, ai^k, dest, src);
+  else
+    emit_dnm(as, ai, dest, src, ra_allock(as, i, allow));
+}
+
+/* Add offset to pointer. */
+static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
+{
+  if (ofs)
+    emit_opk(as, ofs < 0 ? A64I_SUBx : A64I_ADDx, r, r,
+		 ofs < 0 ? -ofs : ofs, rset_exclude(RSET_GPR, r));
+}
+
+#define emit_spsub(as, ofs)	emit_addptr(as, RID_SP, -(ofs))
+
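
To see the MOVZ/MOVN selection in emit_loadk() above at work, here is a standalone sketch (not part of the patch) of its fragment-counting heuristic for u64 = 0xfffffffffffffffb, i.e. -5: three of the four 16 bit fragments are 0xffff, so MOVN wins and the constant loads with a single movn x0, #4 (because ~(-5) == 4).

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint64_t u64 = 0xfffffffffffffffbULL;  /* -5 */
  int i, zeros = 0, ones = 0;
  for (i = 0; i < 4; i++) {  /* Count homogeneous 16 bit fragments. */
    uint64_t frag = (u64 >> i*16) & 0xffff;
    zeros += (frag == 0);
    ones += (frag == 0xffff);
  }
  /* ones == 3 > zeros == 0, so the inverted value ~u64 == 4 is emitted
  ** with MOVN and no MOVK fixups are needed for the 0xffff fragments.
  */
  printf("zeros=%d ones=%d -> %s\n", zeros, ones, ones > zeros ? "MOVN" : "MOVZ");
  return 0;
}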
diff --git a/src/lj_gdbjit.c b/src/lj_gdbjit.c
index 8b72be7d..8bc2474c 100644
--- a/src/lj_gdbjit.c
+++ b/src/lj_gdbjit.c
@@ -296,6 +296,9 @@ enum {
 #elif LJ_TARGET_ARM
   DW_REG_SP = 13,
   DW_REG_RA = 14,
+#elif LJ_TARGET_ARM64
+  DW_REG_SP = 31,
+  DW_REG_RA = 30,
 #elif LJ_TARGET_PPC
   DW_REG_SP = 1,
   DW_REG_RA = 65,
@@ -374,6 +377,8 @@ static const ELFheader elfhdr_template = {
   .machine = 62,
 #elif LJ_TARGET_ARM
   .machine = 40,
+#elif LJ_TARGET_ARM64
+  .machine = 183,
 #elif LJ_TARGET_PPC
   .machine = 20,
 #elif LJ_TARGET_MIPS
@@ -563,6 +568,13 @@ static void LJ_FASTCALL gdbjit_ehframe(GDBJITctx *ctx)
       int i;
       for (i = 11; i >= 4; i--) { DB(DW_CFA_offset|i); DUV(2+(11-i)); }
     }
+#elif LJ_TARGET_ARM64
+    {
+      int i;
+      DB(DW_CFA_offset|31); DUV(2);
+      for (i = 28; i >= 19; i--) { DB(DW_CFA_offset|i); DUV(3+(28-i)); }
+      for (i = 15; i >= 8; i--) { DB(DW_CFA_offset|32|i); DUV(28-i); }
+    }
 #elif LJ_TARGET_PPC
     {
       int i;
diff --git a/src/lj_target.h b/src/lj_target.h
index abea8d5b..c069eb95 100644
--- a/src/lj_target.h
+++ b/src/lj_target.h
@@ -55,7 +55,7 @@ typedef uint32_t RegSP;
 /* Bitset for registers. 32 registers suffice for most architectures.
 ** Note that one set holds bits for both GPRs and FPRs.
 */
-#if LJ_TARGET_PPC || LJ_TARGET_MIPS
+#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
 typedef uint64_t RegSet;
 #else
 typedef uint32_t RegSet;
@@ -69,7 +69,7 @@ typedef uint32_t RegSet;
 #define rset_set(rs, r)		(rs |= RID2RSET(r))
 #define rset_clear(rs, r)	(rs &= ~RID2RSET(r))
 #define rset_exclude(rs, r)	(rs & ~RID2RSET(r))
-#if LJ_TARGET_PPC || LJ_TARGET_MIPS
+#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
 #define rset_picktop(rs)	((Reg)(__builtin_clzll(rs)^63))
 #define rset_pickbot(rs)	((Reg)__builtin_ctzll(rs))
 #else
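
ARM64 needs the wider set because its 32 GPRs plus 32 FPRs share one RegSet. A standalone check (not part of the patch) that the 64 bit pick macros behave as their names suggest:

#include <stdint.h>
#include <assert.h>

int main(void)
{
  uint64_t rs = (1ull << 3) | (1ull << 40);  /* Registers 3 and 40 are live. */
  assert((__builtin_clzll(rs) ^ 63) == 40);  /* rset_picktop: highest id. */
  assert(__builtin_ctzll(rs) == 3);          /* rset_pickbot: lowest id. */
  return 0;
}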
diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h
index 57ab134f..0cef06d5 100644
--- a/src/lj_target_arm64.h
+++ b/src/lj_target_arm64.h
@@ -55,7 +55,8 @@ enum {
 
 /* Make use of all registers, except for x18, fp, lr and sp. */
 #define RSET_FIXED \
-  (RID2RSET(RID_X18)|RID2RSET(RID_FP)|RID2RSET(RID_LR)|RID2RSET(RID_SP))
+  (RID2RSET(RID_X18)|RID2RSET(RID_FP)|RID2RSET(RID_LR)|RID2RSET(RID_SP)|\
+   RID2RSET(RID_GL))
 #define RSET_GPR	(RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR) - RSET_FIXED)
 #define RSET_FPR	RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR)
 #define RSET_ALL	(RSET_GPR|RSET_FPR)
@@ -73,25 +74,235 @@ enum {
 #define REGARG_LASTFPR		RID_D7
 #define REGARG_NUMFPR		8
 
+/* -- Spill slots --------------------------------------------------------- */
+
+/* Spill slots are 32 bit wide. An even/odd pair is used for FPRs.
+**
+** SPS_FIXED: Available fixed spill slots in interpreter frame.
+** This definition must match with the vm_arm64.dasc file.
+** Pre-allocate some slots to avoid sp adjust in every root trace.
+**
+** SPS_FIRST: First spill slot for general use. Reserve min. two 32 bit slots.
+*/
+#define SPS_FIXED	4
+#define SPS_FIRST	2
+
+#define SPOFS_TMP	0
+
+#define sps_scale(slot)		(4 * (int32_t)(slot))
+#define sps_align(slot)		(((slot) - SPS_FIXED + 3) & ~3)
+
+/* -- Exit state ---------------------------------------------------------- */
+
+/* This definition must match with the *.dasc file(s). */
+typedef struct {
+  lua_Number fpr[RID_NUM_FPR];	/* Floating-point registers. */
+  intptr_t gpr[RID_NUM_GPR];	/* General-purpose registers. */
+  int32_t spill[256];		/* Spill slots. */
+} ExitState;
+
+/* PC after instruction that caused an exit. Used to find the trace number. */
+#define EXITSTATE_PCREG		RID_LR
+/* Highest exit + 1 indicates stack check. */
+#define EXITSTATE_CHECKEXIT	1
+
+#define EXITSTUB_SPACING	4
+#define EXITSTUBS_PER_GROUP	32
+
+
 /* -- Instructions -------------------------------------------------------- */
 
 /* Instruction fields. */
 #define A64F_D(r)	(r)
-#define A64F_N(r)       ((r) << 5)
-#define A64F_A(r)       ((r) << 10)
-#define A64F_M(r)       ((r) << 16)
+#define A64F_N(r)	((r) << 5)
+#define A64F_A(r)	((r) << 10)
+#define A64F_M(r)	((r) << 16)
+#define A64F_IMMS(x)	((x) << 10)
+#define A64F_IMMR(x)	((x) << 16)
 #define A64F_U16(x)	((x) << 5)
+#define A64F_U12(x)	((x) << 10)
 #define A64F_S26(x)	(x)
 #define A64F_S19(x)	((x) << 5)
+#define A64F_S9(x)	((x) << 12)
+#define A64F_SH(sh, x)	(((sh) << 22) | ((x) << 10))
+#define A64F_EX(ex)	(A64I_EX | ((ex) << 13))
+#define A64F_EXSH(ex,x)	(A64I_EX | ((ex) << 13) | ((x) << 10))
+#define A64F_FP8(x)	((x) << 13)
+#define A64F_CC(cc)	((cc) << 12)
+#define A64F_LSL16(x)	(((x) / 16) << 21)
+#define A64F_BSH(sh)	((sh) << 10)
 
 typedef enum A64Ins {
+  A64I_S = 0x20000000,
+  A64I_X = 0x80000000,
+  A64I_EX = 0x00200000,
+  A64I_K12 = 0x1a000000,
+  A64I_K13 = 0x18000000,
+  A64I_LS_U = 0x01000000,
+  A64I_LS_S = 0x00800000,
+  A64I_LS_R = 0x01200800,
+  A64I_LS_UXTWx = 0x00005000,
+  A64I_LS_LSLx = 0x00007000,
+
+  A64I_ADDw = 0x0b000000,
+  A64I_ADDx = 0x8b000000,
+  A64I_ADDSw = 0x2b000000,
+  A64I_ADDSx = 0xab000000,
+  A64I_NEGw = 0x4b0003e0,
+  A64I_NEGx = 0xcb0003e0,
+  A64I_SUBw = 0x4b000000,
+  A64I_SUBx = 0xcb000000,
+  A64I_SUBSw = 0x6b000000,
+  A64I_SUBSx = 0xeb000000,
+
+  A64I_MULw = 0x1b007c00,
+  A64I_MULx = 0x9b007c00,
+  A64I_SMULL = 0x9b207c00,
+
+  A64I_ANDw = 0x0a000000,
+  A64I_ANDx = 0x8a000000,
+  A64I_ANDSw = 0x6a000000,
+  A64I_ANDSx = 0xea000000,
+  A64I_EORw = 0x4a000000,
+  A64I_EORx = 0xca000000,
+  A64I_ORRw = 0x2a000000,
+  A64I_ORRx = 0xaa000000,
+  A64I_TSTw  = 0x6a00001f,
+  A64I_TSTx  = 0xea00001f,
+
+  A64I_CMPw = 0x6b00001f,
+  A64I_CMPx = 0xeb00001f,
+  A64I_CMNw = 0x2b00001f,
+  A64I_CMNx = 0xab00001f,
+  A64I_CCMPw = 0x7a400000,
+  A64I_CCMPx = 0xfa400000,
+  A64I_CSELw = 0x1a800000,
+  A64I_CSELx = 0x9a800000,
+
+  A64I_ASRw = 0x13007c00,
+  A64I_ASRx = 0x9340fc00,
+  A64I_LSLx = 0xd3400000,
+  A64I_LSRx = 0xd340fc00,
+  A64I_SHRw = 0x1ac02000,
+  A64I_SHRx = 0x9ac02000,	/* lsl/lsr/asr/ror x0, x0, x0 */
+  A64I_REVw = 0x5ac00800,
+  A64I_REVx = 0xdac00c00,
+
+  A64I_EXTRw = 0x13800000,
+  A64I_EXTRx = 0x93c00000,
+  A64I_SBFMw = 0x13000000,
+  A64I_SBFMx = 0x93400000,
+  A64I_SXTBw = 0x13001c00,
+  A64I_SXTHw = 0x13003c00,
+  A64I_SXTW = 0x93407c00,
+  A64I_UBFMw = 0x53000000,
+  A64I_UBFMx = 0xd3400000,
+  A64I_UXTBw = 0x53001c00,
+  A64I_UXTHw = 0x53003c00,
+
+  A64I_MOVw = 0x2a0003e0,
+  A64I_MOVx = 0xaa0003e0,
+  A64I_MVNw = 0x2a2003e0,
+  A64I_MVNx = 0xaa2003e0,
+  A64I_MOVKw = 0x72800000,
+  A64I_MOVKx = 0xf2800000,
   A64I_MOVZw = 0x52800000,
   A64I_MOVZx = 0xd2800000,
+  A64I_MOVNw = 0x12800000,
+  A64I_MOVNx = 0x92800000,
+
+  A64I_LDRB = 0x39400000,
+  A64I_LDRH = 0x79400000,
+  A64I_LDRw = 0xb9400000,
+  A64I_LDRx = 0xf9400000,
   A64I_LDRLw = 0x18000000,
   A64I_LDRLx = 0x58000000,
-  A64I_NOP = 0xd503201f,
+  A64I_STRB = 0x39000000,
+  A64I_STRH = 0x79000000,
+  A64I_STRw = 0xb9000000,
+  A64I_STRx = 0xf9000000,
+  A64I_STPw = 0x29000000,
+  A64I_STPx = 0xa9000000,
+  A64I_LDPw = 0x29400000,
+  A64I_LDPx = 0xa9400000,
+
   A64I_B = 0x14000000,
+  A64I_BCC = 0x54000000,
+  A64I_BL = 0x94000000,
   A64I_BR = 0xd61f0000,
+  A64I_BLR = 0xd63f0000,
+
+  A64I_NOP = 0xd503201f,
+
+  /* FP */
+  A64I_FADDd = 0x1e602800,
+  A64I_FSUBd = 0x1e603800,
+  A64I_FMADDd = 0x1f400000,
+  A64I_FMSUBd = 0x1f408000,
+  A64I_FNMADDd = 0x1f600000,
+  A64I_FNMSUBd = 0x1f608000,
+  A64I_FMULd = 0x1e600800,
+  A64I_FDIVd = 0x1e601800,
+  A64I_FNEGd = 0x1e614000,
+  A64I_FABS = 0x1e60c000,
+  A64I_FSQRTd = 0x1e61c000,
+  A64I_LDRs = 0xbd400000,
+  A64I_LDRd = 0xfd400000,
+  A64I_STRs = 0xbd000000,
+  A64I_STRd = 0xfd000000,
+  A64I_LDPs = 0x2d400000,
+  A64I_LDPd = 0x6d400000,
+  A64I_STPs = 0x2d000000,
+  A64I_STPd = 0x6d000000,
+  A64I_FCMPd = 0x1e602000,
+  A64I_FCMPZd = 0x1e602008,
+  A64I_FCSELd = 0x1e600c00,
+  A64I_FRINTMd = 0x1e654000,
+  A64I_FRINTPd = 0x1e64c000,
+  A64I_FRINTZd = 0x1e65c000,
+
+  A64I_FCVT_F32_F64 = 0x1e624000,
+  A64I_FCVT_F64_F32 = 0x1e22c000,
+  A64I_FCVT_F32_S32 = 0x1e220000,
+  A64I_FCVT_F64_S32 = 0x1e620000,
+  A64I_FCVT_F32_U32 = 0x1e230000,
+  A64I_FCVT_F64_U32 = 0x1e630000,
+  A64I_FCVT_F32_S64 = 0x9e220000,
+  A64I_FCVT_F64_S64 = 0x9e620000,
+  A64I_FCVT_F32_U64 = 0x9e230000,
+  A64I_FCVT_F64_U64 = 0x9e630000,
+  A64I_FCVT_S32_F64 = 0x1e780000,
+  A64I_FCVT_S32_F32 = 0x1e380000,
+  A64I_FCVT_U32_F64 = 0x1e790000,
+  A64I_FCVT_U32_F32 = 0x1e390000,
+  A64I_FCVT_S64_F64 = 0x9e780000,
+  A64I_FCVT_S64_F32 = 0x9e380000,
+  A64I_FCVT_U64_F64 = 0x9e790000,
+  A64I_FCVT_U64_F32 = 0x9e390000,
+
+  A64I_FMOV_S = 0x1e204000,
+  A64I_FMOV_D = 0x1e604000,
+  A64I_FMOV_R_S = 0x1e260000,
+  A64I_FMOV_S_R = 0x1e270000,
+  A64I_FMOV_R_D = 0x9e660000,
+  A64I_FMOV_D_R = 0x9e670000,
+  A64I_FMOV_DI = 0x1e601000,
 } A64Ins;
 
+typedef enum A64Shift {
+  A64SH_LSL, A64SH_LSR, A64SH_ASR, A64SH_ROR
+} A64Shift;
+
+typedef enum A64Extend {
+  A64EX_UXTB, A64EX_UXTH, A64EX_UXTW, A64EX_UXTX,
+  A64EX_SXTB, A64EX_SXTH, A64EX_SXTW, A64EX_SXTX,
+} A64Extend;
+
+/* ARM condition codes. */
+typedef enum A64CC {
+  CC_EQ, CC_NE, CC_CS, CC_CC, CC_MI, CC_PL, CC_VS, CC_VC,
+  CC_HI, CC_LS, CC_GE, CC_LT, CC_GT, CC_LE, CC_AL,
+  CC_HS = CC_CS, CC_LO = CC_CC
+} A64CC;
+
 #endif
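
Several of the enum values above are aliases rather than distinct instructions: a compare is a subtract that discards its result into the zero register (x31), and tst is likewise ands into x31. A standalone sanity check (not part of the patch):

#include <stdint.h>
#include <assert.h>

int main(void)
{
  uint32_t A64I_SUBSx = 0xeb000000u, A64I_CMPx = 0xeb00001fu;
  uint32_t A64I_ANDSx = 0xea000000u, A64I_TSTx = 0xea00001fu;
  assert(A64I_CMPx == (A64I_SUBSx | 31));  /* Rd occupies bits 0-4. */
  assert(A64I_TSTx == (A64I_ANDSx | 31));
  return 0;
}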
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc
index 7a881bdd..a6227bf7 100644
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -236,12 +236,17 @@
 |.macro mov_false, reg; movn reg, #0x8000, lsl #32; .endmacro
 |.macro mov_true, reg; movn reg, #0x0001, lsl #48; .endmacro
 |
-#define GL_J(field)	(GG_OFS(J) + (int)offsetof(jit_State, field))
+#define GL_J(field)	(GG_G2J + (int)offsetof(jit_State, field))
 |
 #define PC2PROTO(field)  ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
 |
 |.macro hotcheck, delta
-|  NYI
+|  lsr CARG1, PC, #1
+|  and CARG1, CARG1, #126
+|  add CARG1, CARG1, #GG_G2DISP+GG_DISP2HOT
+|  ldrh CARG2w, [GL, CARG1]
+|  subs CARG2, CARG2, #delta
+|  strh CARG2w, [GL, CARG1]
 |.endmacro
 |
 |.macro hotloop
@@ -869,7 +874,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  bl extern lj_meta_for	// (lua_State *L, TValue *base)
   |  ldr INSw, [PC, #-4]
   |.if JIT
-  |   uxtb TMP0, INS
+  |   uxtb TMP0w, INSw
   |.endif
   |  decode_RA RA, INS
   |  decode_RD RC, INS
@@ -1732,7 +1737,20 @@ static void build_subroutines(BuildCtx *ctx)
   |//-----------------------------------------------------------------------
   |
   |->vm_record:				// Dispatch target for recording phase.
-  |  NYI
+  |.if JIT
+  |  ldrb CARG1w, GL->hookmask
+  |  tst CARG1, #HOOK_VMEVENT		// No recording while in vmevent.
+  |  bne >5
+  |  // Decrement the hookcount for consistency, but always do the call.
+  |   ldr CARG2w, GL->hookcount
+  |  tst CARG1, #HOOK_ACTIVE
+  |  bne >1
+  |   sub CARG2w, CARG2w, #1
+  |  tst CARG1, #LUA_MASKLINE|LUA_MASKCOUNT
+  |  beq >1
+  |   str CARG2w, GL->hookcount
+  |  b >1
+  |.endif
   |
   |->vm_rethook:			// Dispatch target for return hooks.
   |  ldrb TMP2w, GL->hookmask
@@ -1774,7 +1792,21 @@ static void build_subroutines(BuildCtx *ctx)
   |  b <4
   |
   |->vm_hotloop:			// Hot loop counter underflow.
-  |  NYI
+  |.if JIT
+  |  ldr LFUNC:CARG3, [BASE, FRAME_FUNC]  // Same as curr_topL(L).
+  |   add CARG1, GL, #GG_G2DISP+GG_DISP2J
+  |  and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+  |   str PC, SAVE_PC
+  |  ldr CARG3, LFUNC:CARG3->pc
+  |   mov CARG2, PC
+  |   str L, [GL, #GL_J(L)]
+  |  ldrb CARG3w, [CARG3, #PC2PROTO(framesize)]
+  |   str BASE, L->base
+  |  add CARG3, BASE, CARG3, lsl #3
+  |  str CARG3, L->top
+  |  bl extern lj_trace_hot		// (jit_State *J, const BCIns *pc)
+  |  b <3
+  |.endif
   |
   |->vm_callhook:			// Dispatch target for call hooks.
   |  mov CARG2, PC
@@ -1804,7 +1836,54 @@ static void build_subroutines(BuildCtx *ctx)
   |  br CRET1
   |
   |->cont_stitch:			// Trace stitching.
-  |  NYI
+  |.if JIT
+  |  // RA = resultptr, CARG4 = meta base
+  |   ldr RB, SAVE_MULTRES
+  |  ldr INSw, [PC, #-4]
+  |    ldr TRACE:CARG3, [CARG4, #-40]	// Save previous trace.
+  |   subs RB, RB, #8
+  |  decode_RA RC, INS			// Call base.
+  |    and CARG3, CARG3, #LJ_GCVMASK
+  |   beq >2
+  |1:  // Move results down.
+  |  ldr CARG1, [RA]
+  |    add RA, RA, #8
+  |   subs RB, RB, #8
+  |  str CARG1, [BASE, RC, lsl #3]
+  |    add RC, RC, #1
+  |   bne <1
+  |2:
+  |   decode_RA RA, INS
+  |   decode_RB RB, INS
+  |   add RA, RA, RB
+  |3:
+  |   cmp RA, RC
+  |   bhi >9				// More results wanted?
+  |
+  |  ldrh RAw, TRACE:CARG3->traceno
+  |  ldrh RCw, TRACE:CARG3->link
+  |  cmp RCw, RAw
+  |  beq ->cont_nop			// Blacklisted.
+  |  cmp RCw, #0
+  |  bne =>BC_JLOOP			// Jump to stitched trace.
+  |
+  |  // Stitch a new trace to the previous trace.
+  |  mov CARG1, #GL_J(exitno)
+  |  str RA, [GL, CARG1]
+  |  mov CARG1, #GL_J(L)
+  |  str L, [GL, CARG1]
+  |  str BASE, L->base
+  |  add CARG1, GL, #GG_G2J
+  |  mov CARG2, PC
+  |  bl extern lj_dispatch_stitch	// (jit_State *J, const BCIns *pc)
+  |  ldr BASE, L->base
+  |  b ->cont_nop
+  |
+  |9:  // Fill up results with nil.
+  |  str TISNIL, [BASE, RC, lsl #3]
+  |  add RC, RC, #1
+  |  b <3
+  |.endif
   |
   |->vm_profhook:			// Dispatch target for profiler hook.
 #if LJ_HASPROFILE
@@ -1822,10 +1901,120 @@ static void build_subroutines(BuildCtx *ctx)
   |//-- Trace exit handler -------------------------------------------------
   |//-----------------------------------------------------------------------
   |
+  |.macro savex_, a, b
+  |  stp d..a, d..b, [sp, #a*8]
+  |  stp x..a, x..b, [sp, #32*8+a*8]
+  |.endmacro
+  |
   |->vm_exit_handler:
-  |  NYI
+  |.if JIT
+  |  sub     sp, sp, #(64*8)
+  |  savex_, 0, 1
+  |  savex_, 2, 3
+  |  savex_, 4, 5
+  |  savex_, 6, 7
+  |  savex_, 8, 9
+  |  savex_, 10, 11
+  |  savex_, 12, 13
+  |  savex_, 14, 15
+  |  savex_, 16, 17
+  |  savex_, 18, 19
+  |  savex_, 20, 21
+  |  savex_, 22, 23
+  |  savex_, 24, 25
+  |  savex_, 26, 27
+  |  savex_, 28, 29
+  |  stp d30, d31, [sp, #30*8]
+  |  ldr CARG1, [sp, #64*8]	// Load original value of lr.
+  |   add CARG3, sp, #64*8	// Recompute original value of sp.
+  |   mv_vmstate CARG4, EXIT
+  |  ldr CARG2w, [CARG1, #-4]!	// Get exit instruction.
+  |   stp CARG1, CARG3, [sp, #62*8]	// Store exit pc/sp in RID_LR/RID_SP.
+  |  lsl CARG2, CARG2, #38
+  |  add CARG1, CARG1, CARG2, asr #36
+  |   ldr CARG2w, [lr]		// Load exit stub group offset.
+  |   sub CARG1, CARG1, lr
+  |   sub CARG1, CARG1, #4
+  |  ldr L, GL->cur_L
+  |   add CARG1, CARG2, CARG1, lsr #2	// Compute exit number.
+  |    ldr BASE, GL->jit_base
+  |   st_vmstate CARG4
+  |   str CARG1w, [GL, #GL_J(exitno)]
+  |    str BASE, L->base
+  |  str L, [GL, #GL_J(L)]
+  |   str xzr, GL->jit_base
+  |  add CARG1, GL, #GG_G2J
+  |  mov CARG2, sp
+  |  bl extern lj_trace_exit		// (jit_State *J, ExitState *ex)
+  |  // Returns MULTRES (unscaled) or negated error code.
+  |  ldr CARG2, L->cframe
+  |   ldr BASE, L->base
+  |  and sp, CARG2, #CFRAME_RAWMASK
+  |   ldr PC, SAVE_PC			// Get SAVE_PC.
+  |  str L, SAVE_L			// Set SAVE_L (on-trace resume/yield).
+  |  b >1
+  |.endif
+  |
   |->vm_exit_interp:
-  |  NYI
+  |  // CARG1 = MULTRES or negated error code, BASE, PC and GL set.
+  |.if JIT
+  |  ldr L, SAVE_L
+  |1:
+  |  cmp CARG1w, #0
+  |  blt >9				// Check for error from exit.
+  |   lsl RC, CARG1, #3
+  |  ldr LFUNC:CARG2, [BASE, FRAME_FUNC]
+  |    movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
+  |    movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
+  |    movn TISNIL, #0
+  |  and LFUNC:CARG2, CARG2, #LJ_GCVMASK
+  |   str RC, SAVE_MULTRES
+  |   str BASE, L->base
+  |  ldr CARG2, LFUNC:CARG2->pc
+  |   str xzr, GL->jit_base
+  |    mv_vmstate CARG4, INTERP
+  |  ldr KBASE, [CARG2, #PC2PROTO(k)]
+  |  // Modified copy of ins_next which handles function header dispatch, too.
+  |  ldrb RBw, [PC]
+  |   ldr INSw, [PC], #4
+  |    st_vmstate CARG4
+  |  cmp RBw, #BC_FUNCC+2		// Fast function?
+  |   add TMP1, GL, INS, uxtb #3
+  |  bhs >4
+  |2:
+  |  cmp RBw, #BC_FUNCF			// Function header?
+  |  add TMP0, GL, RB, uxtb #3
+  |  ldr RB, [TMP0, #GG_G2DISP]
+  |   decode_RA RA, INS
+  |   lsr TMP0, INS, #16
+  |   csel RC, TMP0, RC, lo
+  |   blo >5
+  |   ldr CARG3, [BASE, FRAME_FUNC]
+  |   sub RC, RC, #8
+  |   add RA, BASE, RA, lsl #3	// Yes: RA = BASE+framesize*8, RC = nargs*8
+  |   and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+  |5:
+  |  br RB
+  |
+  |4:  // Check frame below fast function.
+  |  ldr CARG1, [BASE, FRAME_PC]
+  |  ands CARG2, CARG1, #FRAME_TYPE
+  |  bne <2			// Trace stitching continuation?
+  |  // Otherwise set KBASE for Lua function below fast function.
+  |  ldr CARG3, [CARG1, #-4]
+  |  decode_RA CARG1, CARG3
+  |  sub CARG2, BASE, CARG1, lsl #3
+  |  ldr LFUNC:CARG3, [CARG2, #-32]
+  |  and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+  |  ldr CARG3, LFUNC:CARG3->pc
+  |  ldr KBASE, [CARG3, #PC2PROTO(k)]
+  |  b <2
+  |
+  |9:  // Rethrow error from the right C frame.
+  |  neg CARG2, CARG1
+  |  mov CARG1, L
+  |  bl extern lj_err_throw		// (lua_State *L, int errcode)
+  |.endif
   |
   |//-----------------------------------------------------------------------
   |//-- Math helper functions ----------------------------------------------
@@ -3387,6 +3576,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     if (op == BC_FORI) {
       |  csel PC, RC, PC, gt
     } else if (op == BC_JFORI) {
+      |  mov PC, RC
       |  ldrh RCw, [RC, #-2]
     } else if (op == BC_IFORL) {
       |  csel PC, RC, PC, le
@@ -3488,7 +3678,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
 
   case BC_JLOOP:
     |.if JIT
-    |  NYI
+    |  // RA = base (ignored), RC = traceno
+    |  ldr CARG1, [GL, #GL_J(trace)]
+    |   mov CARG2, #0  // Traces on ARM64 don't store the trace #, so use 0.
+    |  ldr TRACE:RC, [CARG1, RC, lsl #3]
+    |   st_vmstate CARG2
+    |  ldr RA, TRACE:RC->mcode
+    |   str BASE, GL->jit_base
+    |   str L, GL->tmpbuf.L
+    |  sub sp, sp, #16	// See SPS_FIXED. Avoids sp adjust in every root trace.
+    |  br RA
     |.endif
     break;
 
@@ -3546,10 +3745,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
   case BC_IFUNCV:
     |  // BASE = new base, RA = BASE+framesize*8, CARG3 = LFUNC, RC = nargs*8
     |  ldr CARG1, L->maxstack
+    |   movn TMP0, #~LJ_TFUNC
     |   add TMP2, BASE, RC
+    |   add LFUNC:CARG3, CARG3, TMP0, lsl #47
     |  add RA, RA, RC
     |   add TMP0, RC, #16+FRAME_VARG
-    |   str LFUNC:CARG3, [TMP2], #8	// Store (untagged) copy of LFUNC.
+    |   str LFUNC:CARG3, [TMP2], #8	// Store (tagged) copy of LFUNC.
     |    ldr KBASE, [PC, #-4+PC2PROTO(k)]
     |  cmp RA, CARG1
     |   str TMP0, [TMP2], #8		// Store delta + FRAME_VARG.
@@ -3736,8 +3937,8 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.uleb128 0x1\n"
 	"\t.sleb128 -8\n"
 	"\t.byte 30\n"				/* Return address is in lr. */
-	"\t.uleb128 1\n"                        /* augmentation length */
-	"\t.byte 0x1b\n"                        /* pcrel|sdata4 */
+	"\t.uleb128 1\n"			/* augmentation length */
+	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
 	"\t.byte 0xc\n\t.uleb128 31\n\t.uleb128 0\n"	/* def_cfa sp */
 	"\t.align 3\n"
 	".LECIE2:\n\n");
@@ -3748,7 +3949,7 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.long .LASFDE3-.Lframe2\n"
 	"\t.long lj_vm_ffi_call-.\n"
 	"\t.long %d\n"
-	"\t.uleb128 0\n"                        /* augmentation length */
+	"\t.uleb128 0\n"			/* augmentation length */
 	"\t.byte 0xe\n\t.uleb128 32\n"		/* def_cfa_offset */
 	"\t.byte 0x9d\n\t.uleb128 4\n"		/* offset fp */
 	"\t.byte 0x9e\n\t.uleb128 3\n"		/* offset lr */
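
One detail of the emitter in this patch worth spelling out: emit_isk12() returns a value that is XORed into a register-form base opcode, flipping it to the immediate form. A standalone sketch (not part of any patch; the constants are copied from lj_target_arm64.h above):

#include <stdint.h>
#include <stdio.h>

#define A64I_ADDx   0x8b000000u  /* Register-form base opcode. */
#define A64I_K12    0x1a000000u  /* XOR delta to the immediate form. */
#define A64F_U12(x) ((x) << 10)
#define A64F_N(r)   ((r) << 5)

int main(void)
{
  uint32_t k = A64I_K12 | A64F_U12(16);        /* What emit_isk12(16) returns. */
  uint32_t ins = (A64I_ADDx ^ k) | A64F_N(1);  /* Rd == x0, Rn == x1. */
  printf("0x%08x\n", ins);  /* 0x91004020 == add x0, x1, #16 */
  return 0;
}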

From 7a0c3a1127229788fa4cfc436e1b3cbee36b7eb4 Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Sun, 20 Nov 2016 23:17:45 +0100
Subject: [PATCH 18/94] ARM64: Allow full VA range for mcode allocation.

---
 src/lj_mcode.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/lj_mcode.c b/src/lj_mcode.c
index 3eaee054..de785a09 100644
--- a/src/lj_mcode.c
+++ b/src/lj_mcode.c
@@ -206,6 +206,9 @@ static void mcode_protect(jit_State *J, int prot)
 
 #if LJ_TARGET_X64
 #define mcode_validptr(p)	((p) && (uintptr_t)(p) < (uintptr_t)1<<47)
+#elif LJ_TARGET_ARM64
+/* We have no clue about the valid VA range. It could be 39 - 52 bits. */
+#define mcode_validptr(p)	(p)
 #else
 #define mcode_validptr(p)	((p) && (uintptr_t)(p) < 0xffff0000)
 #endif
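
For contrast, the x64 check above encodes a fixed property: user-mode pointers fit in the positive half of a 48 bit VA. A hedged sketch of that test (not part of the patch):

#include <stdint.h>

/* x64: anything at or above 1<<47 cannot be a valid user-mode hint. */
static int mcode_validptr_x64(void *p)
{
  return p != NULL && (uintptr_t)p < ((uintptr_t)1 << 47);
}
/* ARM64 kernels may be configured with anywhere from 39 to 52 VA bits,
** so no fixed cutoff works and the patch accepts any non-NULL pointer.
*/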

From 2b77da35bc77e2d34062d9168884095d9145a993 Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Sun, 20 Nov 2016 23:32:17 +0100
Subject: [PATCH 19/94] ARM64: Reject special case in emit_isk13().

---
 src/lj_emit_arm64.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h
index eb8f7fc7..52e75559 100644
--- a/src/lj_emit_arm64.h
+++ b/src/lj_emit_arm64.h
@@ -46,7 +46,9 @@ static uint32_t emit_isk13(uint64_t n, int is64)
   if (!n) return 0;  /* Neither all-zero nor all-ones are allowed. */
   do {  /* Find the repeat width. */
     if (is64 && (uint32_t)(n^(n>>32))) break;
-    n = (uint32_t)n; w = 32; if ((n^(n>>16)) & 0xffff) break;
+    n = (uint32_t)n;
+    if (!n) return 0;  /* Ditto when passing n=0xffffffff and is64=0. */
+    w = 32; if ((n^(n>>16)) & 0xffff) break;
     n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break;
     n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break;
     n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break;
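
The early return matters because the inversion step interacts badly with the width truncation. A standalone walk-through (not part of the patch) of the rejected case emit_isk13(0xffffffff, 0):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint64_t n = 0xffffffffu;  /* is64 == 0 */
  if (n & 1) n = ~n;         /* Bit 0 is set: n == 0xffffffff00000000. */
  n = (uint32_t)n;           /* Truncate to the 32 bit width: n == 0. */
  /* Without the added check, n == 0 reached emit_clz64(0), that is
  ** __builtin_clzll(0), which is undefined behavior. All-ones has no
  ** valid logical-immediate encoding anyway, so rejecting is correct.
  */
  printf("n after truncation: %llu -> reject\n", (unsigned long long)n);
  return 0;
}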

From a56654460d9ca636536c8204a358207a698e625b Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Mon, 21 Nov 2016 15:43:17 +0100
Subject: [PATCH 20/94] Generalize deferred constant handling in backend to 64
 bit.

---
 src/lj_asm.c       | 34 ++++++++++++++++++++++++++++++----
 src/lj_emit_arm.h  |  2 +-
 src/lj_emit_mips.h |  4 ++--
 src/lj_emit_ppc.h  |  2 +-
 4 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/src/lj_asm.c b/src/lj_asm.c
index 2cb5abea..f0a11ca8 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -91,7 +91,7 @@ typedef struct ASMState {
   MCode *realign;	/* Realign loop if not NULL. */
 
 #ifdef RID_NUM_KREF
-  int32_t krefk[RID_NUM_KREF];
+  intptr_t krefk[RID_NUM_KREF];
 #endif
   IRRef1 phireg[RID_MAX];  /* PHI register references. */
   uint16_t parentmap[LJ_MAX_JSLOTS];  /* Parent instruction to RegSP map. */
@@ -144,7 +144,7 @@ static LJ_AINLINE void checkmclim(ASMState *as)
 #define ra_krefreg(ref)		((Reg)(RID_MIN_KREF + (Reg)(ref)))
 #define ra_krefk(as, ref)	(as->krefk[(ref)])
 
-static LJ_AINLINE void ra_setkref(ASMState *as, Reg r, int32_t k)
+static LJ_AINLINE void ra_setkref(ASMState *as, Reg r, intptr_t k)
 {
   IRRef ref = (IRRef)(r - RID_MIN_KREF);
   as->krefk[ref] = k;
@@ -324,7 +324,11 @@ static Reg ra_rematk(ASMState *as, IRRef ref)
     lua_assert(!rset_test(as->freeset, r));
     ra_free(as, r);
     ra_modified(as, r);
+#if LJ_64
+    emit_loadu64(as, r, ra_krefk(as, ref));
+#else
     emit_loadi(as, r, ra_krefk(as, ref));
+#endif
     return r;
   }
   ir = IR(ref);
@@ -526,7 +530,7 @@ static void ra_evictk(ASMState *as)
 
 #ifdef RID_NUM_KREF
 /* Allocate a register for a constant. */
-static Reg ra_allock(ASMState *as, int32_t k, RegSet allow)
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow)
 {
   /* First try to find a register which already holds the same constant. */
   RegSet pick, work = ~as->freeset & RSET_GPR;
@@ -535,9 +539,31 @@ static Reg ra_allock(ASMState *as, int32_t k, RegSet allow)
     IRRef ref;
     r = rset_pickbot(work);
     ref = regcost_ref(as->cost[r]);
+#if LJ_64
+    if (ref < ASMREF_L) {
+      if (ra_iskref(ref)) {
+	if (k == ra_krefk(as, ref))
+	  return r;
+      } else {
+	IRIns *ir = IR(ref);
+	if ((ir->o == IR_KINT64 && k == (int64_t)ir_kint64(ir)->u64) ||
+#if LJ_GC64
+	    (ir->o == IR_KINT && k == ir->i) ||
+	    (ir->o == IR_KGC && k == (intptr_t)ir_kgc(ir)) ||
+	    ((ir->o == IR_KPTR || ir->o == IR_KKPTR) &&
+	     k == (intptr_t)ir_kptr(ir))
+#else
+	    (ir->o != IR_KINT64 && k == ir->i)
+#endif
+	   )
+	  return r;
+      }
+    }
+#else
     if (ref < ASMREF_L &&
 	k == (ra_iskref(ref) ? ra_krefk(as, ref) : IR(ref)->i))
       return r;
+#endif
     rset_clear(work, r);
   }
   pick = as->freeset & allow;
@@ -557,7 +583,7 @@ static Reg ra_allock(ASMState *as, int32_t k, RegSet allow)
 }
 
 /* Allocate a specific register for a constant. */
-static void ra_allockreg(ASMState *as, int32_t k, Reg r)
+static void ra_allockreg(ASMState *as, intptr_t k, Reg r)
 {
   Reg kr = ra_allock(as, k, RID2RSET(r));
   if (kr != r) {
diff --git a/src/lj_emit_arm.h b/src/lj_emit_arm.h
index 496b37a3..f7c93d84 100644
--- a/src/lj_emit_arm.h
+++ b/src/lj_emit_arm.h
@@ -207,7 +207,7 @@ static void emit_loadi(ASMState *as, Reg r, int32_t i)
 
 #define emit_loada(as, r, addr)		emit_loadi(as, (r), i32ptr((addr)))
 
-static Reg ra_allock(ASMState *as, int32_t k, RegSet allow);
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
 
 /* Get/set from constant pointer. */
 static void emit_lsptr(ASMState *as, ARMIns ai, Reg r, void *p)
diff --git a/src/lj_emit_mips.h b/src/lj_emit_mips.h
index d35f830b..93c35ea4 100644
--- a/src/lj_emit_mips.h
+++ b/src/lj_emit_mips.h
@@ -94,8 +94,8 @@ static void emit_loadi(ASMState *as, Reg r, int32_t i)
 
 #define emit_loada(as, r, addr)		emit_loadi(as, (r), i32ptr((addr)))
 
-static Reg ra_allock(ASMState *as, int32_t k, RegSet allow);
-static void ra_allockreg(ASMState *as, int32_t k, Reg r);
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
+static void ra_allockreg(ASMState *as, intptr_t k, Reg r);
 
 /* Get/set from constant pointer. */
 static void emit_lsptr(ASMState *as, MIPSIns mi, Reg r, void *p, RegSet allow)
diff --git a/src/lj_emit_ppc.h b/src/lj_emit_ppc.h
index 5163012a..34a6efb7 100644
--- a/src/lj_emit_ppc.h
+++ b/src/lj_emit_ppc.h
@@ -98,7 +98,7 @@ static void emit_loadi(ASMState *as, Reg r, int32_t i)
 
 #define emit_loada(as, r, addr)		emit_loadi(as, (r), i32ptr((addr)))
 
-static Reg ra_allock(ASMState *as, int32_t k, RegSet allow);
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
 
 /* Get/set from constant pointer. */
 static void emit_lsptr(ASMState *as, PPCIns pi, Reg r, void *p, RegSet allow)
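
The widening to intptr_t in this patch is not cosmetic: with LJ_GC64, the constants parked in krefk[] are full 64 bit pointers, and the old int32_t field silently truncated them. A minimal demonstration (not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uintptr_t k = (uintptr_t)0x00007f00deadbeefULL;  /* Plausible 48 bit VA. */
  int32_t narrow = (int32_t)k;  /* Old field type: upper bits are lost. */
  intptr_t wide = (intptr_t)k;  /* New field type: round-trips exactly. */
  printf("narrow: 0x%016llx\n", (unsigned long long)(uintptr_t)narrow);
  printf("wide:   0x%016llx\n", (unsigned long long)(uintptr_t)wide);
  return 0;
}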

From c3cae04153213d9779c5563056b017a19da2d283 Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Mon, 21 Nov 2016 16:02:10 +0100
Subject: [PATCH 21/94] Update contact info.

---
 doc/contact.html | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/doc/contact.html b/doc/contact.html
index 9f358805..bb81cf06 100644
--- a/doc/contact.html
+++ b/doc/contact.html
@@ -59,8 +59,15 @@
 
 

+If you want to report bugs, propose fixes or suggest enhancements,
+please use the
+GitHub issue tracker.
+</p>
+<p>
 Please send general questions to the
 » LuaJIT mailing list.
+</p>
+<p>
 You can also send any questions you have directly to me:

From 81259898ea177bb7b4becebf3d7686603f6b373b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 24 Nov 2016 18:56:19 +0100 Subject: [PATCH 22/94] ARM64: Emit more efficient trace exits. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. --- src/lj_asm_arm64.h | 73 +++++++++++++++++++------------------------ src/lj_target_arm64.h | 14 ++++++--- src/vm_arm64.dasc | 31 +++++++++--------- 3 files changed, 57 insertions(+), 61 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 0a2f5306..19b3331d 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -47,53 +47,41 @@ static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow) /* -- Guard handling ------------------------------------------------------ */ -/* Generate an exit stub group at the bottom of the reserved MCode memory. */ -static MCode *asm_exitstub_gen(ASMState *as, ExitNo group) -{ - MCode *mxp = as->mcbot; - int i; - if (mxp + 3*4+4*EXITSTUBS_PER_GROUP >= as->mctop) - asm_mclimit(as); - /* str lr, [sp]; bl ->vm_exit_handler; .long group. */ - *mxp++ = A64I_STRx | A64F_D(RID_LR) | A64F_N(RID_SP); - *mxp = A64I_BL | (((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu); - mxp++; - *mxp++ = group*EXITSTUBS_PER_GROUP; - for (i = 0; i < EXITSTUBS_PER_GROUP; i++) - *mxp++ = A64I_B | ((-3-i)&0x03ffffffu); - lj_mcode_sync(as->mcbot, mxp); - lj_mcode_commitbot(as->J, mxp); - as->mcbot = mxp; - as->mclim = as->mcbot + MCLIM_REDZONE; - return mxp - EXITSTUBS_PER_GROUP; -} - /* Setup all needed exit stubs. */ static void asm_exitstub_setup(ASMState *as, ExitNo nexits) { ExitNo i; - if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR) - lj_trace_err(as->J, LJ_TRERR_SNAPOV); - for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++) - if (as->J->exitstubgroup[i] == NULL) - as->J->exitstubgroup[i] = asm_exitstub_gen(as, i); + MCode *mxp = as->mctop; + if (mxp - (nexits + 3 + MCLIM_REDZONE) < as->mclim) + asm_mclimit(as); + /* 1: str lr,[sp]; bl ->vm_exit_handler; movz w0,traceno; bl <1; bl <1; ... */ + for (i = nexits-1; (int32_t)i >= 0; i--) + *--mxp = A64I_BL|((-3-i)&0x03ffffffu); + *--mxp = A64I_MOVZw|A64F_U16(as->T->traceno); + mxp--; + *mxp = A64I_BL|(((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu); + *--mxp = A64I_STRx|A64F_D(RID_LR)|A64F_N(RID_SP); + as->mctop = mxp; +} + +static MCode *asm_exitstub_addr(ASMState *as, ExitNo exitno) +{ + /* Keep this in-sync with exitstub_trace_addr(). */ + return as->mctop + exitno + 3; } /* Emit conditional branch to exit for guard. */ static void asm_guardcc(ASMState *as, A64CC cc) { - MCode *target = exitstub_addr(as->J, as->snapno); + MCode *target = asm_exitstub_addr(as, as->snapno); MCode *p = as->mcp; if (LJ_UNLIKELY(p == as->invmcp)) { as->loopinv = 1; - *p = A64I_BL | ((target-p) & 0x03ffffffu); + *p = A64I_B | ((target-p) & 0x03ffffffu); emit_cond_branch(as, cc^1, p-1); return; } - /* No conditional calls. Emit b.cc/bl instead. */ - /* That's a bad idea. NYI: emit per-trace exit stubs instead, see PPC. 
*/ - emit_branch(as, A64I_BL, target); - emit_cond_branch(as, cc^1, p); + emit_cond_branch(as, cc, target); } /* -- Operand fusion ------------------------------------------------------ */ @@ -1568,8 +1556,7 @@ static void asm_stack_check(ASMState *as, BCReg topslot, } else { pbase = RID_BASE; } - emit_branch(as, A64I_BL, exitstub_addr(as->J, exitno)); - emit_cond_branch(as, CC_LS^1, as->mcp+1); + emit_cond_branch(as, CC_LS, asm_exitstub_addr(as, exitno)); k = emit_isk12((8*topslot)); lua_assert(k); emit_n(as, A64I_CMPx^k, RID_TMP); @@ -1744,7 +1731,8 @@ static void asm_tail_fixup(ASMState *as, TraceNo lnk) /* Undo the sp adjustment in BC_JLOOP when exiting to the interpreter. */ int32_t spadj = as->T->spadjust + (lnk ? 0 : sps_scale(SPS_FIXED)); if (spadj == 0) { - as->mctop = --p; + *--p = A64I_NOP; + as->mctop = p; } else { /* Patch stack adjustment. */ uint32_t k = emit_isk12(spadj); @@ -1805,13 +1793,18 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) MCode *pe = (MCode *)((char *)p + T->szmcode); MCode *cstart = NULL, *cend = p; MCode *mcarea = lj_mcode_patch(J, p, 0); - MCode *px = exitstub_addr(J, exitno); + MCode *px = exitstub_trace_addr(T, exitno); for (; p < pe; p++) { - /* Look for bl exitstub, replace with b target. */ + /* Look for bcc/b exitstub, replace with bcc/b target. */ uint32_t ins = *p; - if ((ins & 0xfc000000u) == 0x94000000u && - ((ins ^ (px-p)) & 0x03ffffffu) == 0) { - *p = (ins & 0x7c000000u) | ((target-p) & 0x03ffffffu); + if ((ins & 0xff000000u) == 0x54000000u && + ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) { + *p = (ins & 0xff00001fu) | (((target-p)<<5) & 0x00ffffe0u); + cend = p+1; + if (!cstart) cstart = p; + } else if ((ins & 0xfc000000u) == 0x14000000u && + ((ins ^ (px-p)) & 0x03ffffffu) == 0) { + *p = (ins & 0xfc000000u) | ((target-p) & 0x03ffffffu); cend = p+1; if (!cstart) cstart = p; } diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index 0cef06d5..1cd02fe8 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -101,14 +101,18 @@ typedef struct { int32_t spill[256]; /* Spill slots. */ } ExitState; -/* PC after instruction that caused an exit. Used to find the trace number. */ -#define EXITSTATE_PCREG RID_LR /* Highest exit + 1 indicates stack check. */ #define EXITSTATE_CHECKEXIT 1 -#define EXITSTUB_SPACING 4 -#define EXITSTUBS_PER_GROUP 32 - +/* Return the address of a per-trace exit stub. */ +static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno) +{ + while (*p == 0xd503201f) p++; /* Skip A64I_NOP. */ + return p + 3 + exitno; +} +/* Avoid dependence on lj_jit.h if only including lj_target.h. */ +#define exitstub_trace_addr(T, exitno) \ + exitstub_trace_addr_((MCode *)((char *)(T)->mcode + (T)->szmcode), (exitno)) /* -- Instructions -------------------------------------------------------- */ diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index a6227bf7..86c78fa5 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -1927,22 +1927,21 @@ static void build_subroutines(BuildCtx *ctx) | stp d30, d31, [sp, #30*8] | ldr CARG1, [sp, #64*8] // Load original value of lr. | add CARG3, sp, #64*8 // Recompute original value of sp. - | mv_vmstate CARG4, EXIT - | ldr CARG2w, [CARG1, #-4]! // Get exit instruction. - | stp CARG1, CARG3, [sp, #62*8] // Store exit pc/sp in RID_LR/RID_SP. - | lsl CARG2, CARG2, #38 - | add CARG1, CARG1, CARG2, asr #36 - | ldr CARG2w, [lr] // Load exit stub group offset. 
- | sub CARG1, CARG1, lr - | sub CARG1, CARG1, #4 - | ldr L, GL->cur_L - | add CARG1, CARG2, CARG1, lsr #2 // Compute exit number. - | ldr BASE, GL->jit_base - | st_vmstate CARG4 - | str CARG1w, [GL, #GL_J(exitno)] - | str BASE, L->base - | str L, [GL, #GL_J(L)] - | str xzr, GL->jit_base + | mv_vmstate CARG4, EXIT + | stp xzr, CARG3, [sp, #62*8] // Store 0/sp in RID_LR/RID_SP. + | sub CARG1, CARG1, lr + | ldr L, GL->cur_L + | lsr CARG1, CARG1, #2 + | ldr BASE, GL->jit_base + | sub CARG1, CARG1, #2 + | ldr CARG2w, [lr] // Load trace number. + | st_vmstate CARG4 + | str BASE, L->base + | ubfx CARG2w, CARG2w, #5, #16 + | str CARG1w, [GL, #GL_J(exitno)] + | str CARG2w, [GL, #GL_J(parent)] + | str L, [GL, #GL_J(L)] + | str xzr, GL->jit_base | add CARG1, GL, #GG_G2J | mov CARG2, sp | bl extern lj_trace_exit // (jit_State *J, ExitState *ex) From d7243e1de0cb86608956a9af107aff829ad99aeb Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 24 Nov 2016 19:14:17 +0100 Subject: [PATCH 23/94] Eliminate use of lightuserdata derived from static data pointers. Required for >47 bit VA, e.g. ARM64. --- src/lib_debug.c | 8 ++++---- src/lib_package.c | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/lib_debug.c b/src/lib_debug.c index cda7137e..6628d943 100644 --- a/src/lib_debug.c +++ b/src/lib_debug.c @@ -283,13 +283,13 @@ LJLIB_CF(debug_setuservalue) /* ------------------------------------------------------------------------ */ -static const char KEY_HOOK = 'h'; +#define KEY_HOOK ((void *)0x3004) static void hookf(lua_State *L, lua_Debug *ar) { static const char *const hooknames[] = {"call", "return", "line", "count", "tail return"}; - lua_pushlightuserdata(L, (void *)&KEY_HOOK); + lua_pushlightuserdata(L, KEY_HOOK); lua_rawget(L, LUA_REGISTRYINDEX); if (lua_isfunction(L, -1)) { lua_pushstring(L, hooknames[(int)ar->event]); @@ -334,7 +334,7 @@ LJLIB_CF(debug_sethook) count = luaL_optint(L, arg+3, 0); func = hookf; mask = makemask(smask, count); } - lua_pushlightuserdata(L, (void *)&KEY_HOOK); + lua_pushlightuserdata(L, KEY_HOOK); lua_pushvalue(L, arg+1); lua_rawset(L, LUA_REGISTRYINDEX); lua_sethook(L, func, mask, count); @@ -349,7 +349,7 @@ LJLIB_CF(debug_gethook) if (hook != NULL && hook != hookf) { /* external hook? */ lua_pushliteral(L, "external hook"); } else { - lua_pushlightuserdata(L, (void *)&KEY_HOOK); + lua_pushlightuserdata(L, KEY_HOOK); lua_rawget(L, LUA_REGISTRYINDEX); /* get hook */ } lua_pushstring(L, unmakemask(mask, buff)); diff --git a/src/lib_package.c b/src/lib_package.c index 8c336b02..898897b1 100644 --- a/src/lib_package.c +++ b/src/lib_package.c @@ -399,8 +399,7 @@ static int lj_cf_package_loader_preload(lua_State *L) /* ------------------------------------------------------------------------ */ -static const int sentinel_ = 0; -#define sentinel ((void *)&sentinel_) +#define sentinel ((void *)0x4004) static int lj_cf_package_require(lua_State *L) { From 6538c8a18711a6eb009def36050acd5f02e42aec Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 25 Nov 2016 09:23:08 +0100 Subject: [PATCH 24/94] Document 47 bit limit for lightuserdata. --- doc/status.html | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/status.html b/doc/status.html index d10033b0..0acf78c9 100644 --- a/doc/status.html +++ b/doc/status.html @@ -97,6 +97,17 @@ handled correctly. The error may fall through an on-trace lua_atpanic on x64. This issue will be fixed with the new garbage collector. +
  • +LuaJIT on 64 bit systems provides a limited range of 47 bits for the +legacy lightuserdata data type. +This is only relevant on x64 systems which use the negative part of the +virtual address space in user mode, e.g. Solaris/x64, and on ARM64 systems +configured with a 48 bit or 52 bit VA. +Avoid using lightuserdata to hold pointers that may point outside +of that range, e.g. variables on the stack. In general, avoid this data +type for new code and replace it with (much more performant) FFI bindings. +FFI cdata pointers can address the full 64 bit range. +

  • From 3ad2bbf58600f8ba2918b56b0a7ab305df19cfe5 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 29 Nov 2016 19:30:40 +0100 Subject: [PATCH 25/94] ARM64: Make use of tbz/tbnz and cbz/cbnz. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. --- src/lj_asm_arm64.h | 83 ++++++++++++++++++++++++++++++++++--------- src/lj_emit_arm64.h | 19 ++++++++++ src/lj_target_arm64.h | 6 ++++ 3 files changed, 91 insertions(+), 17 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 19b3331d..eea957b5 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -84,6 +84,34 @@ static void asm_guardcc(ASMState *as, A64CC cc) emit_cond_branch(as, cc, target); } +/* Emit test and branch instruction to exit for guard. */ +static void asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit) +{ + MCode *target = asm_exitstub_addr(as, as->snapno); + MCode *p = as->mcp; + if (LJ_UNLIKELY(p == as->invmcp)) { + as->loopinv = 1; + *p = A64I_B | ((target-p) & 0x03ffffffu); + emit_tnb(as, ai^0x01000000u, r, bit, p-1); + return; + } + emit_tnb(as, ai, r, bit, target); +} + +/* Emit compare and branch instruction to exit for guard. */ +static void asm_guardcnb(ASMState *as, A64Ins ai, Reg r) +{ + MCode *target = asm_exitstub_addr(as, as->snapno); + MCode *p = as->mcp; + if (LJ_UNLIKELY(p == as->invmcp)) { + as->loopinv = 1; + *p = A64I_B | ((target-p) & 0x03ffffffu); + emit_cnb(as, ai^0x01000000u, r, p-1); + return; + } + emit_cnb(as, ai, r, target); +} + /* -- Operand fusion ------------------------------------------------------ */ /* Limit linear search to this distance. Avoids O(n^2) behavior. */ @@ -482,10 +510,9 @@ static void asm_strto(ASMState *as, IRIns *ir) dest = ra_dest(as, ir, RSET_FPR); } } - asm_guardcc(as, CC_EQ); if (destused) emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0); - emit_n(as, (A64I_CMPw^A64I_K12)|A64F_U12(0), RID_RET); + asm_guardcnb(as, A64I_CBZ, RID_RET); args[0] = ir->op1; /* GCstr *str */ args[1] = ASMREF_TMP1; /* TValue *n */ asm_gencall(as, ci, args); @@ -1465,13 +1492,13 @@ static void asm_intcomp(ASMState *as, IRIns *ir) else if (cc > CC_NE) cc ^= 11; /* LO <-> HI, LS <-> HS */ } oldcc = cc; - if (irref_isk(rref) && IR(rref)->i == 0) { + if (irref_isk(rref) && get_k64val(IR(rref)) == 0) { IRIns *irl = IR(lref); if (cc == CC_GE) cc = CC_PL; else if (cc == CC_LT) cc = CC_MI; - else if (cc > CC_NE) goto notst; /* Other conds don't work with tst. */ + else if (cc > CC_NE) goto nocombine; /* Other conds don't work with tst. */ cmpprev0 = (irl+1 == ir); - /* Combine comp(BAND(left, right), 0) into tst left, right. */ + /* Combine and-cmp-bcc into tbz/tbnz or and-cmp into tst. */ if (cmpprev0 && irl->o == IR_BAND && !ra_used(irl)) { IRRef blref = irl->op1, brref = irl->op2; uint32_t m2 = 0; @@ -1480,10 +1507,13 @@ static void asm_intcomp(ASMState *as, IRIns *ir) Reg tmp = blref; blref = brref; brref = tmp; } if (irref_isk(brref)) { - /* NYI: use tbz/tbnz, if applicable. */ - m2 = emit_isk13(IR(brref)->i, irt_is64(irl->t)); - if (!m2) - goto notst; /* Not beneficial if we miss a constant operand. */ + uint64_t k = get_k64val(IR(brref)); + if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE)) { + asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ, + ra_alloc1(as, blref, RSET_GPR), emit_ctz64(k)); + return; + } + m2 = emit_isk13(k, irt_is64(irl->t)); } bleft = ra_alloc1(as, blref, RSET_GPR); ai = (irt_is64(irl->t) ? 
A64I_TSTx : A64I_TSTw); @@ -1493,9 +1523,15 @@ static void asm_intcomp(ASMState *as, IRIns *ir) emit_n(as, ai^m2, bleft); return; } - /* NYI: use cbz/cbnz for EQ/NE 0. */ + if (cc == CC_EQ || cc == CC_NE) { + /* Combine cmp-bcc into cbz/cbnz. */ + ai = cc == CC_EQ ? A64I_CBZ : A64I_CBNZ; + if (irt_is64(ir->t)) ai |= A64I_X; + asm_guardcnb(as, ai, ra_alloc1(as, lref, RSET_GPR)); + return; + } } -notst: +nocombine: left = ra_alloc1(as, lref, RSET_GPR); m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left)); asm_guardcc(as, cc); @@ -1638,8 +1674,7 @@ static void asm_gc_check(ASMState *as) ra_evictset(as, RSET_SCRATCH); l_end = emit_label(as); /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */ - asm_guardcc(as, CC_NE); /* Assumes asm_snap_prep() already done. */ - emit_n(as, A64I_CMPx^A64I_K12, RID_RET); + asm_guardcnb(as, A64I_CBNZ, RID_RET); /* Assumes asm_snap_prep() is done. */ args[0] = ASMREF_TMP1; /* global_State *g */ args[1] = ASMREF_TMP2; /* MSize steps */ asm_gencall(as, ci, args); @@ -1666,10 +1701,10 @@ static void asm_loop_fixup(ASMState *as) MCode *p = as->mctop; MCode *target = as->mcp; if (as->loopinv) { /* Inverted loop branch? */ + uint32_t mask = (p[-2] & 0x7e000000) == 0x36000000 ? 0x3fffu : 0x7ffffu; ptrdiff_t delta = target - (p - 2); - lua_assert(((delta + 0x40000) >> 19) == 0); - /* asm_guardcc already inverted the b.cc and patched the final bl. */ - p[-2] |= ((uint32_t)delta & 0x7ffff) << 5; + /* asm_guard* already inverted the bcc/tnb/cnb and patched the final b. */ + p[-2] |= ((uint32_t)delta & mask) << 5; } else { ptrdiff_t delta = target - (p - 1); p[-1] = A64I_B | ((uint32_t)(delta) & 0x03ffffffu); @@ -1795,18 +1830,32 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) MCode *mcarea = lj_mcode_patch(J, p, 0); MCode *px = exitstub_trace_addr(T, exitno); for (; p < pe; p++) { - /* Look for bcc/b exitstub, replace with bcc/b target. */ + /* Look for exitstub branch, replace with branch to target. */ uint32_t ins = *p; if ((ins & 0xff000000u) == 0x54000000u && ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) { + /* Patch bcc exitstub. */ *p = (ins & 0xff00001fu) | (((target-p)<<5) & 0x00ffffe0u); cend = p+1; if (!cstart) cstart = p; } else if ((ins & 0xfc000000u) == 0x14000000u && ((ins ^ (px-p)) & 0x03ffffffu) == 0) { + /* Patch b exitstub. */ *p = (ins & 0xfc000000u) | ((target-p) & 0x03ffffffu); cend = p+1; if (!cstart) cstart = p; + } else if ((ins & 0x7e000000u) == 0x34000000u && + ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) { + /* Patch cbz/cbnz exitstub. */ + *p = (ins & 0xff00001f) | (((target-p)<<5) & 0x00ffffe0u); + cend = p+1; + if (!cstart) cstart = p; + } else if ((ins & 0x7e000000u) == 0x36000000u && + ((ins ^ ((px-p)<<5)) & 0x0007ffe0u) == 0) { + /* Patch tbz/tbnz exitstub. 
*/ + *p = (ins & 0xfff8001fu) | (((target-p)<<5) & 0x0007ffe0u); + cend = p+1; + if (!cstart) cstart = p; } } lua_assert(cstart != NULL); diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 52e75559..1eb14204 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -321,6 +321,25 @@ static void emit_branch(ASMState *as, A64Ins ai, MCode *target) as->mcp = p; } +static void emit_tnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit, MCode *target) +{ + MCode *p = as->mcp; + ptrdiff_t delta = target - (p - 1); + lua_assert(bit < 63 && ((delta + 0x2000) >> 14) == 0); + if (bit > 31) ai |= A64I_X; + *--p = ai | A64F_BIT(bit & 31) | A64F_S14((uint32_t)delta & 0x3fffu) | r; + as->mcp = p; +} + +static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target) +{ + MCode *p = as->mcp; + ptrdiff_t delta = target - (p - 1); + lua_assert(((delta + 0x40000) >> 19) == 0); + *--p = ai | A64F_S19((uint32_t)delta & 0x7ffff) | r; + as->mcp = p; +} + #define emit_jmp(as, target) emit_branch(as, A64I_B, (target)) static void emit_call(ASMState *as, void *target) diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index 1cd02fe8..6c8771c6 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -127,7 +127,9 @@ static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno) #define A64F_U12(x) ((x) << 10) #define A64F_S26(x) (x) #define A64F_S19(x) ((x) << 5) +#define A64F_S14(x) ((x) << 5) #define A64F_S9(x) ((x) << 12) +#define A64F_BIT(x) ((x) << 19) #define A64F_SH(sh, x) (((sh) << 22) | ((x) << 10)) #define A64F_EX(ex) (A64I_EX | ((ex) << 13)) #define A64F_EXSH(ex,x) (A64I_EX | ((ex) << 13) | ((x) << 10)) @@ -235,6 +237,10 @@ typedef enum A64Ins { A64I_BL = 0x94000000, A64I_BR = 0xd61f0000, A64I_BLR = 0xd63f0000, + A64I_TBZ = 0x36000000, + A64I_TBNZ = 0x37000000, + A64I_CBZ = 0x34000000, + A64I_CBNZ = 0x35000000, A64I_NOP = 0xd503201f, From 22511fbe2b284d722f3a7ea901f1ae54992c9c5e Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Dec 2016 09:42:43 +0100 Subject: [PATCH 26/94] ARM64: Fix pc-relative loads of consts. Cleanup branch codegen. Thanks to Zhongwei Yao. 
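A quick cross-check of the test-and-branch word layout introduced above (emit_tnb plus the A64I_TBZ/A64I_TBNZ opcodes): a minimal plain-Lua sketch using the bit library, with the field shifts from lj_target_arm64.h; the register, bit number and displacement are made-up example values, not from any trace.

  local bit = require("bit")
  local band, bor, lshift = bit.band, bit.bor, bit.lshift

  local A64I_TBZ, A64I_X = 0x36000000, 0x80000000

  local function tbz_word(r, b, delta)  -- delta in instruction words
    assert(b <= 63 and delta >= -0x2000 and delta < 0x2000)
    local ai = A64I_TBZ
    if b > 31 then ai = bor(ai, A64I_X) end     -- high half of the bit number
    return bor(ai, lshift(band(b, 31), 19),     -- A64F_BIT
               lshift(band(delta, 0x3fff), 5),  -- A64F_S14
               r)
  end

  print(bit.tohex(tbz_word(3, 47, -4)))  --> b67fff83, i.e. tbz x3, #47, <pc-16>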
--- src/lj_emit_arm64.h | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 1eb14204..6686802b 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -234,7 +234,7 @@ static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64) #define glofs(as, k) \ ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g)) #define mcpofs(as, k) \ - ((intptr_t)((uintptr_t)(k) - (uintptr_t)as->mcp)) + ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1))) #define checkmcpofs(as, k) \ ((((mcpofs(as, k)>>2) + 0x00040000) >> 19) == 0) @@ -305,39 +305,35 @@ typedef MCode *MCLabel; static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target) { - MCode *p = as->mcp; - ptrdiff_t delta = target - (p - 1); + MCode *p = --as->mcp; + ptrdiff_t delta = target - p; lua_assert(((delta + 0x40000) >> 19) == 0); - *--p = A64I_BCC | A64F_S19((uint32_t)delta & 0x7ffff) | cond; - as->mcp = p; + *p = A64I_BCC | A64F_S19((uint32_t)delta & 0x7ffff) | cond; } static void emit_branch(ASMState *as, A64Ins ai, MCode *target) { - MCode *p = as->mcp; - ptrdiff_t delta = target - (p - 1); + MCode *p = --as->mcp; + ptrdiff_t delta = target - p; lua_assert(((delta + 0x02000000) >> 26) == 0); - *--p = ai | ((uint32_t)delta & 0x03ffffffu); - as->mcp = p; + *p = ai | ((uint32_t)delta & 0x03ffffffu); } static void emit_tnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit, MCode *target) { - MCode *p = as->mcp; - ptrdiff_t delta = target - (p - 1); + MCode *p = --as->mcp; + ptrdiff_t delta = target - p; lua_assert(bit < 63 && ((delta + 0x2000) >> 14) == 0); if (bit > 31) ai |= A64I_X; - *--p = ai | A64F_BIT(bit & 31) | A64F_S14((uint32_t)delta & 0x3fffu) | r; - as->mcp = p; + *p = ai | A64F_BIT(bit & 31) | A64F_S14((uint32_t)delta & 0x3fffu) | r; } static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target) { - MCode *p = as->mcp; - ptrdiff_t delta = target - (p - 1); + MCode *p = --as->mcp; + ptrdiff_t delta = target - p; lua_assert(((delta + 0x40000) >> 19) == 0); - *--p = ai | A64F_S19((uint32_t)delta & 0x7ffff) | r; - as->mcp = p; + *p = ai | A64F_S19((uint32_t)delta & 0x7ffff) | r; } #define emit_jmp(as, target) emit_branch(as, A64I_B, (target)) From 48b00297b3b37fe77f59f2f4b6fa63358b442f15 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Dec 2016 18:34:10 +0100 Subject: [PATCH 27/94] ARM64: Add missing ldrb/strb instructions to disassembler. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. 
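The one-line mcpofs change in the diff above carries the actual fix: instructions are emitted downwards, so a pc-relative load ends up at mcp-1 once written, and its offset must be measured from there. A toy sketch of the before/after computation, with made-up word-sized addresses:

  local function mcpofs_old(mcp, k) return k - mcp end        -- before
  local function mcpofs_new(mcp, k) return k - (mcp - 1) end  -- after

  local mcp, k = 1000, 900  -- next free slot and constant address, examples
  print(mcpofs_old(mcp, k), mcpofs_new(mcp, k))  --> -100  -99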
--- src/jit/dis_arm64.lua | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua index 909b33bc..8cb608e6 100644 --- a/src/jit/dis_arm64.lua +++ b/src/jit/dis_arm64.lua @@ -411,15 +411,19 @@ local map_lsriro = { shift = 26, mask = 1, [0] = { shift = 30, mask = 3, - [1] = { + [0] = { shift = 22, mask = 3, - [0] = "strhDwO", "ldrhDwO", "ldrshDwO", "ldrshDxO" + [0] = "strbDwO", "ldrbDwO", "ldrsbDxO", "ldrsbDwO" }, - [2] = { + { + shift = 22, mask = 3, + [0] = "strhDwO", "ldrhDwO", "ldrshDxO", "ldrshDwO" + }, + { shift = 22, mask = 3, [0] = "strDwO", "ldrDwO", "ldrswDxO" }, - [3] = { + { shift = 22, mask = 3, [0] = "strDxO", "ldrDxO" } @@ -982,7 +986,7 @@ local function disass_ins(ctx) local sz = band(rshift(op, 30), 3) -- extension to be applied if opt == 3 then - if s == 0 then x = nil + if s == 0 then x = x.."]" else x = x..", lsl #"..sz.."]" end elseif opt == 2 or opt == 6 or opt == 7 then if s == 0 then x = x..", "..map_extend[opt].."]" From 2ac2cd4699d2e3a2eaa55417eae901216204fb37 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Dec 2016 18:38:32 +0100 Subject: [PATCH 28/94] ARM64: Reorganize operand extension definitions. --- src/lj_asm_arm64.h | 8 ++++---- src/lj_target_arm64.h | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index eea957b5..fff0b3fd 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -277,7 +277,7 @@ static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref, if (!emit_checkofs(ai, ofs)) { Reg rn = ra_alloc1(as, ref, allow); Reg rm = ra_allock(as, ofs, rset_exclude(allow, rn)); - emit_dnm(as, (ai ^ 0x01204800), rd, rn, rm); + emit_dnm(as, (ai^A64I_LS_R)|A64I_LS_UXTWx, rd, rn, rm); return; } } @@ -936,7 +936,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) ra_allock(as, (irt_toitype(ir->t) << 15) | 0x7fff, allow), tmp); } if (ofs & FUSE_REG) - emit_dnm(as, (A64I_LDRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31)); + emit_dnm(as, (A64I_LDRx^A64I_LS_R)|A64I_LS_UXTWx|A64I_LS_SH, tmp, idx, (ofs & 31)); else emit_lso(as, A64I_LDRx, tmp, idx, ofs); } @@ -951,7 +951,7 @@ static void asm_ahustore(ASMState *as, IRIns *ir) src = ra_alloc1(as, ir->op2, RSET_FPR); idx = asm_fuseahuref(as, ir->op1, &ofs, allow, A64I_STRd); if (ofs & FUSE_REG) - emit_dnm(as, (A64I_STRd^A64I_LS_R)|A64I_LS_UXTWx, (src & 31), idx, (ofs &31)); + emit_dnm(as, (A64I_STRd^A64I_LS_R)|A64I_LS_UXTWx|A64I_LS_SH, (src & 31), idx, (ofs &31)); else emit_lso(as, A64I_STRd, (src & 31), idx, ofs); } else { @@ -968,7 +968,7 @@ static void asm_ahustore(ASMState *as, IRIns *ir) idx = asm_fuseahuref(as, ir->op1, &ofs, rset_exclude(allow, type), A64I_STRx); if (ofs & FUSE_REG) - emit_dnm(as, (A64I_STRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31)); + emit_dnm(as, (A64I_STRx^A64I_LS_R)|A64I_LS_UXTWx|A64I_LS_SH, tmp, idx, (ofs & 31)); else emit_lso(as, A64I_STRx, tmp, idx, ofs); if (ra_hasreg(src)) { diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index 6c8771c6..e1210045 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -147,8 +147,11 @@ typedef enum A64Ins { A64I_LS_U = 0x01000000, A64I_LS_S = 0x00800000, A64I_LS_R = 0x01200800, - A64I_LS_UXTWx = 0x00005000, - A64I_LS_LSLx = 0x00007000, + A64I_LS_SH = 0x00001000, + A64I_LS_UXTWx = 0x00004000, + A64I_LS_SXTWx = 0x0000c000, + A64I_LS_SXTXx = 0x0000e000, + A64I_LS_LSLx = 0x00006000, A64I_ADDw = 0x0b000000, A64I_ADDx = 0x8b000000, From bfeb1167cd77194c1d49368e3c1468f134be337c Mon Sep 
17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Dec 2016 18:40:31 +0100 Subject: [PATCH 29/94] ARM64: Fuse XLOAD/XSTORE with STRREF/ADD/BSHL/CONV. --- src/lj_asm_arm64.h | 53 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index fff0b3fd..c202bc82 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -232,7 +232,7 @@ static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow) irl->o == IR_CONV && irl->op2 == ((IRT_I64<op1)) { + !neverfuse(as)) { Reg m = ra_alloc1(as, irl->op1, allow); return A64F_M(m) | A64F_EXSH(A64EX_SXTW, shift); } else { @@ -257,19 +257,60 @@ static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref, int32_t ofs = 0; if (ra_noreg(ir->r) && canfuse(as, ir)) { if (ir->o == IR_ADD) { - if (asm_isk32(as, ir->op2, &ofs) && emit_checkofs(ai, ofs)) + if (asm_isk32(as, ir->op2, &ofs) && emit_checkofs(ai, ofs)) { ref = ir->op1; - /* NYI: Fuse add with two registers. */ + } else { + Reg rn, rm; + IRRef lref = ir->op1, rref = ir->op2; + IRIns *irl = IR(lref); + if (mayfuse(as, irl->op1)) { + unsigned int shift = 4; + if (irl->o == IR_BSHL && irref_isk(irl->op2)) { + shift = (IR(irl->op2)->i & 63); + } else if (irl->o == IR_ADD && irl->op1 == irl->op2) { + shift = 1; + } + if ((ai >> 30) == shift) { + lref = irl->op1; + irl = IR(lref); + ai |= A64I_LS_SH; + } + } + if (irl->o == IR_CONV && + irl->op2 == ((IRT_I64<op1; + ai |= A64I_LS_SXTWx; + } else { + ai |= A64I_LS_LSLx; + } + rm = ra_alloc1(as, lref, allow); + rn = ra_alloc1(as, rref, rset_exclude(allow, rm)); + emit_dnm(as, (ai^A64I_LS_R), rd, rn, rm); + return; + } } else if (ir->o == IR_STRREF) { if (asm_isk32(as, ir->op2, &ofs)) { ref = ir->op1; } else if (asm_isk32(as, ir->op1, &ofs)) { ref = ir->op2; } else { - /* NYI: Fuse ADD with constant. */ Reg rn = ra_alloc1(as, ir->op1, allow); - uint32_t m = asm_fuseopm(as, 0, ir->op2, rset_exclude(allow, rn)); - emit_lso(as, ai, rd, rd, sizeof(GCstr)); + IRIns *irr = IR(ir->op2); + uint32_t m; + if (irr+1 == ir && !ra_used(irr) && + irr->o == IR_ADD && irref_isk(irr->op2)) { + ofs = sizeof(GCstr) + IR(irr->op2)->i; + if (emit_checkofs(ai, ofs)) { + Reg rm = ra_alloc1(as, irr->op1, rset_exclude(allow, rn)); + m = A64F_M(rm) | A64F_EX(A64EX_SXTW); + goto skipopm; + } + } + m = asm_fuseopm(as, 0, ir->op2, rset_exclude(allow, rn)); + ofs = sizeof(GCstr); + skipopm: + emit_lso(as, ai, rd, rd, ofs); emit_dn(as, A64I_ADDx^m, rd, rn); return; } From 2772cbc36e13200d5b028585abf506a5d19daaba Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 8 Dec 2016 01:38:09 +0100 Subject: [PATCH 30/94] ARM64: Fuse FP multiply-add/sub. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. --- src/lj_asm_arm64.h | 27 +++++++++++++++++++++++++-- src/lj_emit_arm64.h | 5 +++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index c202bc82..25016f4a 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -327,6 +327,27 @@ static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref, emit_lso(as, ai, (rd & 31), base, ofs); } +/* Fuse FP multiply-add/sub. 
*/ +static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air) +{ + IRRef lref = ir->op1, rref = ir->op2; + IRIns *irm; + if (lref != rref && + ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) && + ra_noreg(irm->r)) || + (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) && + (rref = lref, ai = air, ra_noreg(irm->r))))) { + Reg dest = ra_dest(as, ir, RSET_FPR); + Reg add = ra_hintalloc(as, rref, dest, RSET_FPR); + Reg left = ra_alloc2(as, irm, + rset_exclude(rset_exclude(RSET_FPR, dest), add)); + Reg right = (left >> 8); left &= 255; + emit_dnma(as, ai, (dest & 31), (left & 31), (right & 31), (add & 31)); + return 1; + } + return 0; +} + /* -- Calls --------------------------------------------------------------- */ /* Generate a call to a C function. */ @@ -1308,7 +1329,8 @@ static void asm_intmul(ASMState *as, IRIns *ir) static void asm_add(ASMState *as, IRIns *ir) { if (irt_isnum(ir->t)) { - asm_fparith(as, ir, A64I_FADDd); + if (!asm_fusemadd(as, ir, A64I_FMADDd, A64I_FMADDd)) + asm_fparith(as, ir, A64I_FADDd); return; } asm_intop_s(as, ir, A64I_ADDw); @@ -1317,7 +1339,8 @@ static void asm_add(ASMState *as, IRIns *ir) static void asm_sub(ASMState *as, IRIns *ir) { if (irt_isnum(ir->t)) { - asm_fparith(as, ir, A64I_FSUBd); + if (!asm_fusemadd(as, ir, A64I_FNMSUBd, A64I_FMSUBd)) + asm_fparith(as, ir, A64I_FSUBd); return; } asm_intop_s(as, ir, A64I_SUBw); diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 6686802b..e0f43689 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -74,6 +74,11 @@ static uint32_t emit_isfpk64(uint64_t n) /* -- Emit basic instructions --------------------------------------------- */ +static void emit_dnma(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm, Reg ra) +{ + *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm) | A64F_A(ra); +} + static void emit_dnm(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm) { *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm); From 3975b6c9f4c59e2913e36f62a99653754fd33fe1 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 8 Dec 2016 04:09:29 +0100 Subject: [PATCH 31/94] ARM64: Fuse various BAND/BSHL/BSHR/BSAR combinations. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. --- src/lj_asm_arm64.h | 60 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 25016f4a..d14f0224 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -348,6 +348,36 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air) return 0; } +/* Fuse BAND + BSHL/BSHR into UBFM. */ +static int asm_fuseandshift(ASMState *as, IRIns *ir) +{ + lua_assert(ir->o == IR_BAND); + if (!neverfuse(as) && irref_isk(ir->op2)) { + uint64_t mask = get_k64val(IR(ir->op2)); + IRIns *irl = IR(ir->op1); + if (irref_isk(irl->op2) && (irl->o == IR_BSHR || irl->o == IR_BSHL)) { + int32_t shmask = irt_is64(irl->t) ? 63 : 31; + int32_t shift = (IR(irl->op2)->i & shmask); + int32_t imms = shift; + if (irl->o == IR_BSHL) { + mask >>= shift; + shift = (shmask-shift+1) & shmask; + imms = 0; + } + if (mask && !((mask+1) & mask)) { /* Contiguous 1-bits at the bottom. */ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_alloc1(as, irl->op1, RSET_GPR); + A64Ins ai = shmask == 63 ? 
A64I_UBFMx : A64I_UBFMw; + imms += 63 - emit_clz64(mask); + if (imms > shmask) imms = shmask; + emit_dn(as, ai | A64F_IMMS(imms) | A64F_IMMR(shift), dest, left); + return 1; + } + } + } + return 0; +} + /* -- Calls --------------------------------------------------------------- */ /* Generate a call to a C function. */ @@ -1423,8 +1453,14 @@ static void asm_bitop(ASMState *as, IRIns *ir, A64Ins ai) } } +static void asm_band(ASMState *as, IRIns *ir) +{ + if (asm_fuseandshift(as, ir)) + return; + asm_bitop(as, ir, A64I_ANDw); +} + #define asm_bnot(as, ir) asm_bitop(as, ir, A64I_MVNw) -#define asm_band(as, ir) asm_bitop(as, ir, A64I_ANDw) #define asm_bor(as, ir) asm_bitop(as, ir, A64I_ORRw) #define asm_bxor(as, ir) asm_bitop(as, ir, A64I_EORw) @@ -1437,16 +1473,28 @@ static void asm_bswap(ASMState *as, IRIns *ir) static void asm_bitshift(ASMState *as, IRIns *ir, A64Ins ai, A64Shift sh) { - int shmask = irt_is64(ir->t) ? 63 : 31; + int32_t shmask = irt_is64(ir->t) ? 63 : 31; if (irref_isk(ir->op2)) { /* Constant shifts. */ - Reg dest = ra_dest(as, ir, RSET_GPR); - Reg left = ra_alloc1(as, ir->op1, RSET_GPR); + Reg left, dest = ra_dest(as, ir, RSET_GPR); int32_t shift = (IR(ir->op2)->i & shmask); - if (shmask == 63) ai += A64I_UBFMx - A64I_UBFMw; + + /* Fuse BSHL + BSHR/BSAR into UBFM/SBFM aka UBFX/SBFX/UBFIZ/SBFIZ. */ + if (!neverfuse(as) && (sh == A64SH_LSR || sh == A64SH_ASR)) { + IRIns *irl = IR(ir->op1); + if (irl->o == IR_BSHL && irref_isk(irl->op2)) { + int32_t shift2 = (IR(irl->op2)->i & shmask); + shift = ((shift - shift2) & shmask); + shmask -= shift2; + ir = irl; + } + } + + left = ra_alloc1(as, ir->op1, RSET_GPR); switch (sh) { case A64SH_LSL: - emit_dn(as, ai | A64F_IMMS(shmask-shift) | A64F_IMMR(shmask-shift+1), dest, left); + emit_dn(as, ai | A64F_IMMS(shmask-shift) | + A64F_IMMR((shmask-shift+1)&shmask), dest, left); break; case A64SH_LSR: case A64SH_ASR: emit_dn(as, ai | A64F_IMMS(shmask) | A64F_IMMR(shift), dest, left); From 986854cbb2fa08514e10d9d4d5ded2b7f5f60445 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 8 Dec 2016 05:53:36 +0100 Subject: [PATCH 32/94] ARM64: Fix code generation for S19 offsets. Contributed by Zhongwei Yao. 
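The BAND+shift fusion above only fires when the mask has contiguous 1-bits at the bottom, i.e. (mask+1) & mask == 0, and then derives the UBFM immr/imms fields. A rough Lua sketch of that field computation for the 32 bit BSHR case; shift amount and masks are example values:

  local bit = require("bit")
  local band, rshift = bit.band, bit.rshift

  local function fuse_band_bshr(sh, mask)  -- for (x >> sh) & mask
    if mask == 0 or band(mask + 1, mask) ~= 0 then return nil end
    local width = 0
    while width < 32 and rshift(mask, width) ~= 0 do width = width + 1 end
    return sh, sh + width - 1  -- immr, imms: ubfx dst, src, #sh, #width
  end

  print(fuse_band_bshr(8, 0xff))  --> 8  15
  print(fuse_band_bshr(8, 0xf0))  --> nil (1-bits not at the bottom)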
--- src/lj_asm_arm64.h | 2 +- src/lj_emit_arm64.h | 4 ++-- src/lj_target_arm64.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index d14f0224..ab0de5cd 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -781,7 +781,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key)); } - *l_loop = A64I_BCC | A64F_S19((as->mcp-l_loop) & 0x0007ffffu) | CC_NE; + *l_loop = A64I_BCC | A64F_S19(as->mcp - l_loop) | CC_NE; if (!isk && irt_isaddr(kt)) { Reg type = ra_allock(as, (int32_t)irt_toitype(kt), allow); emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, key, type); diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index e0f43689..c7eb4d81 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -313,7 +313,7 @@ static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target) MCode *p = --as->mcp; ptrdiff_t delta = target - p; lua_assert(((delta + 0x40000) >> 19) == 0); - *p = A64I_BCC | A64F_S19((uint32_t)delta & 0x7ffff) | cond; + *p = A64I_BCC | A64F_S19(delta) | cond; } static void emit_branch(ASMState *as, A64Ins ai, MCode *target) @@ -338,7 +338,7 @@ static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target) MCode *p = --as->mcp; ptrdiff_t delta = target - p; lua_assert(((delta + 0x40000) >> 19) == 0); - *p = ai | A64F_S19((uint32_t)delta & 0x7ffff) | r; + *p = ai | A64F_S19(delta) | r; } #define emit_jmp(as, target) emit_branch(as, A64I_B, (target)) diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index e1210045..f77a58a0 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -126,7 +126,7 @@ static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno) #define A64F_U16(x) ((x) << 5) #define A64F_U12(x) ((x) << 10) #define A64F_S26(x) (x) -#define A64F_S19(x) ((x) << 5) +#define A64F_S19(x) (((uint32_t)(x) & 0x7ffffu) << 5) #define A64F_S14(x) ((x) << 5) #define A64F_S9(x) ((x) << 12) #define A64F_BIT(x) ((x) << 19) From ec2756ba786cd68a0af37ec8ffe806f3ce392d7d Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 8 Dec 2016 22:38:35 +0100 Subject: [PATCH 33/94] Add missing FOLD rule for 64 bit shift+BAND simplification. 
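The A64F_S19 change above matters for negative displacements: a backward branch sign-extends to 32 bits, and without the & 0x7ffff the high bits leak into the opcode and condition fields of the bcc word. A sketch with the opcode constant from lj_target_arm64.h and a made-up delta:

  local bit = require("bit")
  local band, bor, lshift, tohex = bit.band, bit.bor, bit.lshift, bit.tohex

  local A64I_BCC, CC_NE = 0x54000000, 1
  local function S19_unmasked(x) return lshift(x, 5) end
  local function S19_masked(x) return lshift(band(x, 0x7ffff), 5) end

  print(tohex(bor(A64I_BCC, S19_unmasked(-2), CC_NE)))  --> ffffffc1, clobbered
  print(tohex(bor(A64I_BCC, S19_masked(-2), CC_NE)))    --> 54ffffc1, b.ne ok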
--- src/lj_opt_fold.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 5f4b8810..a72aa440 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -347,6 +347,11 @@ static uint64_t kfold_int64arith(uint64_t k1, uint64_t k2, IROp op) case IR_BAND: k1 &= k2; break; case IR_BOR: k1 |= k2; break; case IR_BXOR: k1 ^= k2; break; + case IR_BSHL: k1 <<= (k2 & 63); break; + case IR_BSHR: k1 = (int32_t)((uint32_t)k1 >> (k2 & 63)); break; + case IR_BSAR: k1 >>= (k2 & 63); break; + case IR_BROL: k1 = (int32_t)lj_rol((uint32_t)k1, (k2 & 63)); break; + case IR_BROR: k1 = (int32_t)lj_ror((uint32_t)k1, (k2 & 63)); break; #endif default: UNUSED(k2); lua_assert(0); break; } @@ -1653,6 +1658,14 @@ LJFOLDF(simplify_shiftk_andk) fins->op2 = (IRRef1)lj_ir_kint(J, k); fins->ot = IRTI(IR_BAND); return RETRYFOLD; + } else if (irk->o == IR_KINT64) { + uint64_t k = kfold_int64arith(ir_k64(irk)->u64, fright->i, (IROp)fins->o); + IROpT ot = fleft->ot; + fins->op1 = fleft->op1; + fins->op1 = (IRRef1)lj_opt_fold(J); + fins->op2 = (IRRef1)lj_ir_kint64(J, k); + fins->ot = ot; + return RETRYFOLD; } return NEXTFOLD; } From 44b99ff14d0a543da54fce27793464edbbfacd16 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 9 Dec 2016 18:16:12 +0100 Subject: [PATCH 34/94] ARM64: Fuse BOR(BSHL, BSHR) into EXTR/ROR. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. --- src/lj_asm_arm64.h | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index ab0de5cd..b771d2f1 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -378,6 +378,34 @@ static int asm_fuseandshift(ASMState *as, IRIns *ir) return 0; } +/* Fuse BOR(BSHL, BSHR) into EXTR/ROR. */ +static int asm_fuseorshift(ASMState *as, IRIns *ir) +{ + IRIns *irl = IR(ir->op1), *irr = IR(ir->op2); + lua_assert(ir->o == IR_BOR); + if (!neverfuse(as) && ((irl->o == IR_BSHR && irr->o == IR_BSHL) || + (irl->o == IR_BSHL && irr->o == IR_BSHR))) { + if (irref_isk(irl->op2) && irref_isk(irr->op2)) { + IRRef lref = irl->op1, rref = irr->op1; + uint32_t lshift = IR(irl->op2)->i, rshift = IR(irr->op2)->i; + if (irl->o == IR_BSHR) { /* BSHR needs to be the right operand. */ + uint32_t tmp2; + IRRef tmp1 = lref; lref = rref; rref = tmp1; + tmp2 = lshift; lshift = rshift; rshift = tmp2; + } + if (rshift + lshift == (irt_is64(ir->t) ? 64 : 32)) { + A64Ins ai = irt_is64(ir->t) ? A64I_EXTRx : A64I_EXTRw; + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_alloc1(as, lref, RSET_GPR); + Reg right = ra_alloc1(as, rref, rset_exclude(RSET_GPR, left)); + emit_dnm(as, ai | A64F_IMMS(rshift), dest, left, right); + return 1; + } + } + } + return 0; +} + /* -- Calls --------------------------------------------------------------- */ /* Generate a call to a C function. */ @@ -1460,8 +1488,14 @@ static void asm_band(ASMState *as, IRIns *ir) asm_bitop(as, ir, A64I_ANDw); } +static void asm_bor(ASMState *as, IRIns *ir) +{ + if (asm_fuseorshift(as, ir)) + return; + asm_bitop(as, ir, A64I_ORRw); +} + #define asm_bnot(as, ir) asm_bitop(as, ir, A64I_MVNw) -#define asm_bor(as, ir) asm_bitop(as, ir, A64I_ORRw) #define asm_bxor(as, ir) asm_bitop(as, ir, A64I_EORw) static void asm_bswap(ASMState *as, IRIns *ir) From 4ccd876a65f7ea4c52a7b44330bc1c279dd8afff Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 9 Dec 2016 18:24:48 +0100 Subject: [PATCH 35/94] ARM64: Use the correct FUSE check. Oops, my bad. 
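The EXTR/ROR fusion above keys on the two constant shift amounts summing to the operand width; the identity it relies on can be spot-checked in plain Lua with the bit library, on an arbitrary operand and rotate count:

  local bit = require("bit")
  local bor, lshift, rshift, ror = bit.bor, bit.lshift, bit.rshift, bit.ror

  local x, r = 0x12345678, 12
  local l = 32 - r  -- the rshift + lshift == width condition asm_fuseorshift checks
  assert(bor(lshift(x, l), rshift(x, r)) == ror(x, r))
  print(bit.tohex(ror(x, r)))  --> 67812345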
--- src/lj_asm_arm64.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index b771d2f1..372429db 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -232,7 +232,7 @@ static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow) irl->o == IR_CONV && irl->op2 == ((IRT_I64<op1, allow); return A64F_M(m) | A64F_EXSH(A64EX_SXTW, shift); } else { @@ -278,7 +278,7 @@ static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref, } if (irl->o == IR_CONV && irl->op2 == ((IRT_I64<op1; ai |= A64I_LS_SXTWx; } else { @@ -351,10 +351,10 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air) /* Fuse BAND + BSHL/BSHR into UBFM. */ static int asm_fuseandshift(ASMState *as, IRIns *ir) { + IRIns *irl = IR(ir->op1); lua_assert(ir->o == IR_BAND); - if (!neverfuse(as) && irref_isk(ir->op2)) { + if (canfuse(as, irl) && irref_isk(ir->op2)) { uint64_t mask = get_k64val(IR(ir->op2)); - IRIns *irl = IR(ir->op1); if (irref_isk(irl->op2) && (irl->o == IR_BSHR || irl->o == IR_BSHL)) { int32_t shmask = irt_is64(irl->t) ? 63 : 31; int32_t shift = (IR(irl->op2)->i & shmask); @@ -383,8 +383,9 @@ static int asm_fuseorshift(ASMState *as, IRIns *ir) { IRIns *irl = IR(ir->op1), *irr = IR(ir->op2); lua_assert(ir->o == IR_BOR); - if (!neverfuse(as) && ((irl->o == IR_BSHR && irr->o == IR_BSHL) || - (irl->o == IR_BSHL && irr->o == IR_BSHR))) { + if (canfuse(as, irl) && canfuse(as, irr) && + ((irl->o == IR_BSHR && irr->o == IR_BSHL) || + (irl->o == IR_BSHL && irr->o == IR_BSHR))) { if (irref_isk(irl->op2) && irref_isk(irr->op2)) { IRRef lref = irl->op1, rref = irr->op1; uint32_t lshift = IR(irl->op2)->i, rshift = IR(irr->op2)->i; @@ -1511,11 +1512,11 @@ static void asm_bitshift(ASMState *as, IRIns *ir, A64Ins ai, A64Shift sh) if (irref_isk(ir->op2)) { /* Constant shifts. */ Reg left, dest = ra_dest(as, ir, RSET_GPR); int32_t shift = (IR(ir->op2)->i & shmask); + IRIns *irl = IR(ir->op1); if (shmask == 63) ai += A64I_UBFMx - A64I_UBFMw; /* Fuse BSHL + BSHR/BSAR into UBFM/SBFM aka UBFX/SBFX/UBFIZ/SBFIZ. */ - if (!neverfuse(as) && (sh == A64SH_LSR || sh == A64SH_ASR)) { - IRIns *irl = IR(ir->op1); + if ((sh == A64SH_LSR || sh == A64SH_ASR) && canfuse(as, irl)) { if (irl->o == IR_BSHL && irref_isk(irl->op2)) { int32_t shift2 = (IR(irl->op2)->i & shmask); shift = ((shift - shift2) & shmask); From 197380748052fcc5781fb357d3ac77dcee353004 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 13 Dec 2016 21:30:13 +0100 Subject: [PATCH 36/94] Add "proto" field to jit.util.funcinfo(). --- src/lib_jit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib_jit.c b/src/lib_jit.c index 592538bd..176488e8 100644 --- a/src/lib_jit.c +++ b/src/lib_jit.c @@ -204,6 +204,7 @@ LJLIB_CF(jit_util_funcinfo) lua_setfield(L, -2, "source"); lj_debug_pushloc(L, pt, pc); lua_setfield(L, -2, "loc"); + setprotoV(L, lj_tab_setstr(L, t, lj_str_newlit(L, "proto")), pt); } else { GCfunc *fn = funcV(L->base); GCtab *t; From fb61f7cbe3ec983dfc9087bde04496aa4bbaa31b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 15 Dec 2016 22:45:28 +0100 Subject: [PATCH 37/94] Add "proto" field to jit.util.funcinfo(). Backport. 
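A short usage sketch for the new "proto" field added above; any Lua function works, and the field sits alongside the existing "source"/"loc" entries:

  local jutil = require("jit.util")
  local function f() return 1 end
  local info = jutil.funcinfo(f)
  print(info.loc, info.proto)  -- source location and the new proto reference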
--- src/lib_jit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib_jit.c b/src/lib_jit.c index 921b84c8..868f697c 100644 --- a/src/lib_jit.c +++ b/src/lib_jit.c @@ -199,6 +199,7 @@ LJLIB_CF(jit_util_funcinfo) lua_setfield(L, -2, "source"); lj_debug_pushloc(L, pt, pc); lua_setfield(L, -2, "loc"); + setprotoV(L, lj_tab_setstr(L, t, lj_str_newlit(L, "proto")), pt); } else { GCfunc *fn = funcV(L->base); GCtab *t; From ebec2530befabb8777f3e46d22980112c942dba8 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 15 Dec 2016 22:47:40 +0100 Subject: [PATCH 38/94] ARM64: Fuse BOR/BXOR and BNOT into ORN/EON. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. --- src/lj_asm_arm64.h | 52 +++++++++++++++++++++++++++++-------------- src/lj_target_arm64.h | 1 + 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 372429db..28050bf8 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -1464,40 +1464,58 @@ static void asm_neg(ASMState *as, IRIns *ir) asm_intneg(as, ir); } -static void asm_bitop(ASMState *as, IRIns *ir, A64Ins ai) +static void asm_band(ASMState *as, IRIns *ir) { - if (as->flagmcp == as->mcp && ai == A64I_ANDw) { + A64Ins ai = A64I_ANDw; + if (asm_fuseandshift(as, ir)) + return; + if (as->flagmcp == as->mcp) { /* Try to drop cmp r, #0. */ as->flagmcp = NULL; as->mcp++; - ai += A64I_ANDSw - A64I_ANDw; + ai = A64I_ANDSw; } - if (ir->op2 == 0) { - Reg dest = ra_dest(as, ir, RSET_GPR); - uint32_t m = asm_fuseopm(as, ai, ir->op1, RSET_GPR); + asm_intop(as, ir, ai); +} + +static void asm_borbxor(ASMState *as, IRIns *ir, A64Ins ai) +{ + IRRef lref = ir->op1, rref = ir->op2; + IRIns *irl = IR(lref), *irr = IR(rref); + if ((canfuse(as, irl) && irl->o == IR_BNOT && !irref_isk(rref)) || + (canfuse(as, irr) && irr->o == IR_BNOT && !irref_isk(lref))) { + Reg left, dest = ra_dest(as, ir, RSET_GPR); + uint32_t m; + if (irl->o == IR_BNOT) { + IRRef tmp = lref; lref = rref; rref = tmp; + } + left = ra_alloc1(as, lref, RSET_GPR); + ai |= A64I_ON; if (irt_is64(ir->t)) ai |= A64I_X; - emit_d(as, ai^m, dest); + m = asm_fuseopm(as, ai, IR(rref)->op1, rset_exclude(RSET_GPR, left)); + emit_dn(as, ai^m, dest, left); } else { asm_intop(as, ir, ai); } } -static void asm_band(ASMState *as, IRIns *ir) -{ - if (asm_fuseandshift(as, ir)) - return; - asm_bitop(as, ir, A64I_ANDw); -} - static void asm_bor(ASMState *as, IRIns *ir) { if (asm_fuseorshift(as, ir)) return; - asm_bitop(as, ir, A64I_ORRw); + asm_borbxor(as, ir, A64I_ORRw); } -#define asm_bnot(as, ir) asm_bitop(as, ir, A64I_MVNw) -#define asm_bxor(as, ir) asm_bitop(as, ir, A64I_EORw) +#define asm_bxor(as, ir) asm_borbxor(as, ir, A64I_EORw) + +static void asm_bnot(ASMState *as, IRIns *ir) +{ + A64Ins ai = A64I_MVNw; + Reg dest = ra_dest(as, ir, RSET_GPR); + uint32_t m = asm_fuseopm(as, ai, ir->op1, RSET_GPR); + if (irt_is64(ir->t)) ai |= A64I_X; + emit_d(as, ai^m, dest); +} static void asm_bswap(ASMState *as, IRIns *ir) { diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index f77a58a0..9e9fbd01 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -142,6 +142,7 @@ typedef enum A64Ins { A64I_S = 0x20000000, A64I_X = 0x80000000, A64I_EX = 0x00200000, + A64I_ON = 0x00200000, A64I_K12 = 0x1a000000, A64I_K13 = 0x18000000, A64I_LS_U = 0x01000000, From 8e5d7bec0d110aa4ccd7e8492f697ff2a88a55ed Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 30 Dec 2016 17:54:10 +0100 Subject: [PATCH 39/94] ARM64: Remove unused variables in disassembler. 
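The ORN/EON fusion above saves the separate mvn for a negated operand; the identities orn(a,b) = a | ~b and eon(a,b) = a ^ ~b, spot-checked with the bit library on arbitrary values:

  local bit = require("bit")
  local bor, bxor, bnot, tohex = bit.bor, bit.bxor, bit.bnot, bit.tohex

  local a, b = 0x0000ffff, 0x00ff00ff
  print(tohex(bor(a, bnot(b))))   --> ff00ffff, one orn instead of mvn+orr
  print(tohex(bxor(a, bnot(b))))  --> ff0000ff, one eon instead of mvn+eor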
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thanks to François Perrad. --- src/jit/dis_arm64.lua | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua index 8cb608e6..6e3f9ff4 100644 --- a/src/jit/dis_arm64.lua +++ b/src/jit/dis_arm64.lua @@ -13,10 +13,9 @@ -- NYI: Advanced SIMD and VFP instructions. ------------------------------------------------------------------------------ -local type, tonumber = type, tonumber +local type = type local sub, byte, format = string.sub, string.byte, string.format local match, gmatch, gsub = string.match, string.gmatch, string.gsub -local rep = string.rep local concat = table.concat local bit = require("bit") local band, bor, bxor, tohex = bit.band, bit.bor, bit.bxor, bit.tohex @@ -864,7 +863,6 @@ local function disass_ins(ctx) local operands = {} local suffix = "" local last, name, pat - local vr local map_reg ctx.op = op ctx.rel = nil @@ -1014,7 +1012,6 @@ local function disass_ins(ctx) elseif p == "I" then local shf = band(rshift(op, 22), 3) local imm12 = band(rshift(op, 10), 0x0fff) - local n = #operands local rn, rd = band(rshift(op, 5), 31), band(op, 31) if altname == "mov" and shf == 0 and imm12 == 0 and (rn == 31 or rd == 31) then name = altname From a2013dd39abfc036fc02d277ae3f053209d2c4fd Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 17 Jan 2017 10:46:45 +0100 Subject: [PATCH 40/94] Fix cross-endian jit.bcsave for MIPS target. --- src/jit/bcsave.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua index 5c417c06..70b92aaf 100644 --- a/src/jit/bcsave.lua +++ b/src/jit/bcsave.lua @@ -239,7 +239,7 @@ typedef struct { hdr.type = f16(1) hdr.machine = f16(({ x86=3, x64=62, arm=40, ppc=20, ppcspe=20, mips=8, mipsel=8 })[ctx.arch]) if ctx.arch == "mips" or ctx.arch == "mipsel" then - hdr.flags = 0x50001006 + hdr.flags = f32(0x50001006) end hdr.version = f32(1) hdr.shofs = fofs(ffi.offsetof(o, "sect")) From a1e13fa6e4382bcae44f97bf5954f0e57cfad90c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 17 Jan 2017 10:55:31 +0100 Subject: [PATCH 41/94] Fix HTML formatting. --- doc/extensions.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/extensions.html b/doc/extensions.html index 1efaca9d..c361b5d5 100644 --- a/doc/extensions.html +++ b/doc/extensions.html @@ -251,8 +251,8 @@ enabled:
  • load(string|reader [, chunkname [,mode [,env]]]).
  • loadstring() is an alias for load().
  • loadfile(filename [,mode [,env]]).
-• math.log(x [,base]).
-• string.rep(s, n [,sep]).
+• math.log(x [,base]).
+• string.rep(s, n [,sep]).
• string.format(): %q reversible. %s checks __tostring. %a and %A added.
@@ -295,7 +295,7 @@ instead of true.
exit status.
  • debug.setmetatable() returns object.
  • debug.getuservalue() and debug.setuservalue().
-• Remove math.mod(), string.gfind().
+• Remove math.mod(), string.gfind().
  • Note: this provides only partial compatibility with Lua 5.2 at the From c1981676907cedde9ffe2bbdfb28d2f786ff69d9 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 17 Jan 2017 11:37:28 +0100 Subject: [PATCH 42/94] Add some more extensions from Lua 5.2/5.3. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contributed by François Perrad. --- doc/extensions.html | 4 ++++ src/Makefile.dep | 4 ++-- src/host/buildvm_libbc.h | 17 ++++++++++++++--- src/lib_base.c | 7 +++++++ src/lib_io.c | 11 +++++------ src/lib_package.c | 4 ++++ src/lib_table.c | 20 ++++++++++++++++++++ 7 files changed, 56 insertions(+), 11 deletions(-) diff --git a/doc/extensions.html b/doc/extensions.html index 70dc6995..b048f137 100644 --- a/doc/extensions.html +++ b/doc/extensions.html @@ -303,6 +303,7 @@ enabled:

  • os.exit(status|true|false [,close]).
  • package.searchpath(name, path [, sep [, rep]]).
  • package.loadlib(name, "*").
+• package.searchers.
  • debug.getinfo() returns nparams and isvararg for option "u".
  • debug.getlocal() accepts function instead of level.
  • @@ -350,6 +351,9 @@ LuaJIT supports some extensions from Lua 5.3:
    • Unicode escape '\u{XX...}' embeds the UTF-8 encoding in string literals.
    • The argument table arg can be read (and modified) by LUA_INIT and -e chunks.
+• io.read() and file:read() accept formats with or without a leading *.
+• table.move(a1, f, e, t [,a2]).
+• coroutine.isyieldable().

    C++ Exception Interoperability

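A quick usage sketch for the extensions listed above, assuming a build with this patch applied; the scratch file is only for demonstration:

  local t = table.move({1, 2, 3, 4}, 2, 4, 1, {})
  print(t[1], t[2], t[3])          --> 2  3  4

  print(coroutine.isyieldable())   --> false on the main coroutine

  local name = os.tmpname()
  local f = assert(io.open(name, "w")); f:write("42\n"); f:close()
  f = assert(io.open(name))
  print(f:read("n"))               --> 42, "n" now accepted like "*n"
  f:close(); os.remove(name)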
    diff --git a/src/Makefile.dep b/src/Makefile.dep index 4ef002e9..2b1cb5ef 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -3,8 +3,8 @@ lib_aux.o: lib_aux.c lua.h luaconf.h lauxlib.h lj_obj.h lj_def.h \ lj_dispatch.h lj_bc.h lj_traceerr.h lj_lib.h lj_alloc.h lib_base.o: lib_base.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \ lj_def.h lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_debug.h lj_str.h \ - lj_tab.h lj_meta.h lj_state.h lj_ctype.h lj_cconv.h lj_bc.h lj_ff.h \ - lj_ffdef.h lj_dispatch.h lj_jit.h lj_ir.h lj_char.h lj_strscan.h \ + lj_tab.h lj_meta.h lj_state.h lj_frame.h lj_bc.h lj_ctype.h lj_cconv.h \ + lj_ff.h lj_ffdef.h lj_dispatch.h lj_jit.h lj_ir.h lj_char.h lj_strscan.h \ lj_strfmt.h lj_lib.h lj_libdef.h lib_bit.o: lib_bit.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \ lj_arch.h lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h lj_strscan.h \ diff --git a/src/host/buildvm_libbc.h b/src/host/buildvm_libbc.h index 45f8f8cb..b2600bd5 100644 --- a/src/host/buildvm_libbc.h +++ b/src/host/buildvm_libbc.h @@ -15,7 +15,12 @@ static const uint8_t libbc_code[] = { 8,2,0,0,88,3,23,128,59,3,2,0,43,4,0,0,64,4,2,0,76,3,2,0,88,3,18,128,16,1,14, 0,41,3,1,0,3,3,1,0,88,3,14,128,3,1,2,0,88,3,12,128,59,3,1,0,22,4,1,1,18,5,2, 0,41,6,1,0,77,4,4,128,23,8,1,7,59,9,7,0,64,9,8,0,79,4,252,127,43,4,0,0,64,4, -2,0,76,3,2,0,75,0,1,0,0,2,0 +2,0,76,3,2,0,75,0,1,0,0,2,0,5,12,0,0,0,35,16,0,12,0,16,1,14,0,16,2,14,0,16, +3,14,0,11,4,0,0,88,5,1,128,18,4,0,0,16,4,12,0,3,1,2,0,88,5,24,128,33,5,1,3, +0,2,3,0,88,6,4,128,2,3,1,0,88,6,2,128,4,4,0,0,88,6,9,128,18,6,1,0,18,7,2,0, +41,8,1,0,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79,6,252,127,88,6,8,128, +18,6,2,0,18,7,1,0,41,8,255,255,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79, +6,252,127,76,4,2,0,0 #else 0,1,2,0,0,1,2,24,1,0,0,76,1,2,0,241,135,158,166,3,220,203,178,130,4,0,1,2,0, 0,1,2,24,1,0,0,76,1,2,0,243,244,148,165,20,198,190,199,252,3,0,1,2,0,0,0,3, @@ -28,7 +33,12 @@ static const uint8_t libbc_code[] = { 8,2,0,0,88,3,23,128,59,3,2,0,43,4,0,0,64,4,2,0,76,3,2,0,88,3,18,128,16,1,14, 0,41,3,1,0,3,3,1,0,88,3,14,128,3,1,2,0,88,3,12,128,59,3,1,0,22,4,1,1,18,5,2, 0,41,6,1,0,77,4,4,128,23,8,1,7,59,9,7,0,64,9,8,0,79,4,252,127,43,4,0,0,64,4, -2,0,76,3,2,0,75,0,1,0,0,2,0 +2,0,76,3,2,0,75,0,1,0,0,2,0,5,12,0,0,0,35,16,0,12,0,16,1,14,0,16,2,14,0,16, +3,14,0,11,4,0,0,88,5,1,128,18,4,0,0,16,4,12,0,3,1,2,0,88,5,24,128,33,5,1,3, +0,2,3,0,88,6,4,128,2,3,1,0,88,6,2,128,4,4,0,0,88,6,9,128,18,6,1,0,18,7,2,0, +41,8,1,0,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79,6,252,127,88,6,8,128, +18,6,2,0,18,7,1,0,41,8,255,255,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79, +6,252,127,76,4,2,0,0 #endif }; @@ -40,6 +50,7 @@ static const struct { const char *name; int ofs; } libbc_map[] = { {"table_foreach",136}, {"table_getn",207}, {"table_remove",226}, -{NULL,355} +{"table_move",355}, +{NULL,502} }; diff --git a/src/lib_base.c b/src/lib_base.c index 6107bde0..7c523241 100644 --- a/src/lib_base.c +++ b/src/lib_base.c @@ -23,6 +23,7 @@ #include "lj_tab.h" #include "lj_meta.h" #include "lj_state.h" +#include "lj_frame.h" #if LJ_HASFFI #include "lj_ctype.h" #include "lj_cconv.h" @@ -557,6 +558,12 @@ LJLIB_CF(coroutine_running) #endif } +LJLIB_CF(coroutine_isyieldable) +{ + setboolV(L->top++, cframe_canyield(L->cframe)); + return 1; +} + LJLIB_CF(coroutine_create) { lua_State *L1; diff --git a/src/lib_io.c b/src/lib_io.c index 31f0ea97..53c17d92 100644 --- a/src/lib_io.c +++ b/src/lib_io.c @@ -203,13 +203,12 @@ static int io_file_read(lua_State *L, FILE *fp, int start) for (n = start; 
nargs-- && ok; n++) { if (tvisstr(L->base+n)) { const char *p = strVdata(L->base+n); - if (p[0] != '*') - lj_err_arg(L, n+1, LJ_ERR_INVOPT); - if (p[1] == 'n') + if (p[0] == '*') p++; + if (p[0] == 'n') ok = io_file_readnum(L, fp); - else if ((p[1] & ~0x20) == 'L') - ok = io_file_readline(L, fp, (p[1] == 'l')); - else if (p[1] == 'a') + else if ((p[0] & ~0x20) == 'L') + ok = io_file_readline(L, fp, (p[0] == 'l')); + else if (p[0] == 'a') io_file_readall(L, fp); else lj_err_arg(L, n+1, LJ_ERR_INVFMT); diff --git a/src/lib_package.c b/src/lib_package.c index 898897b1..c0252b73 100644 --- a/src/lib_package.c +++ b/src/lib_package.c @@ -589,6 +589,10 @@ LUALIB_API int luaopen_package(lua_State *L) lj_lib_pushcf(L, package_loaders[i], 1); lua_rawseti(L, -2, i+1); } +#if LJ_52 + lua_pushvalue(L, -1); + lua_setfield(L, -3, "searchers"); +#endif lua_setfield(L, -2, "loaders"); lua_getfield(L, LUA_REGISTRYINDEX, "LUA_NOENV"); noenv = lua_toboolean(L, -1); diff --git a/src/lib_table.c b/src/lib_table.c index f9a3693d..0204f25d 100644 --- a/src/lib_table.c +++ b/src/lib_table.c @@ -129,6 +129,26 @@ LJLIB_LUA(table_remove) /* end */ +LJLIB_LUA(table_move) /* + function(a1, f, e, t, a2) + CHECK_tab(a1) + CHECK_int(f) + CHECK_int(e) + CHECK_int(t) + if a2 == nil then a2 = a1 end + CHECK_tab(a2) + if e >= f then + local d = t - f + if t > e or t <= f or a2 ~= a1 then + for i=f,e do a2[i+d] = a1[i] end + else + for i=e,f,-1 do a2[i+d] = a1[i] end + end + end + return a2 + end +*/ + LJLIB_CF(table_concat) LJLIB_REC(.) { GCtab *t = lj_lib_checktab(L, 1); From c94b921f924c1b37fea52e34f4e01ba8b37d77d0 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 17 Jan 2017 12:21:12 +0100 Subject: [PATCH 43/94] LJ_GC64: Add build options and install instructions. --- doc/extensions.html | 3 ++- doc/install.html | 15 ++++++++++----- src/Makefile | 3 +++ src/msvcbuild.bat | 8 +++++++- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/doc/extensions.html b/doc/extensions.html index b048f137..cb9be3f4 100644 --- a/doc/extensions.html +++ b/doc/extensions.html @@ -210,7 +210,8 @@ bytecode (e.g. from Lua 5.1) is incompatible and cannot be loaded.

Note: LJ_GC64 mode requires a different frame layout, which implies a different, incompatible bytecode format for ports that use this mode (e.g.
-ARM64). This may be rectified in the future.
+ARM64 or MIPS64) or when explicitly enabled for x64. This may be rectified in the future.

    table.new(narray, nhash) allocates a pre-sized table

    diff --git a/doc/install.html b/doc/install.html index efeda33c..230e9386 100644 --- a/doc/install.html +++ b/doc/install.html @@ -175,6 +175,14 @@ MSVC or WinSDK. Please read the instructions given in these files, before changing any settings.

+
+LuaJIT on x64 currently uses 32 bit GC objects by default.
+LJ_GC64 mode may be explicitly enabled: add XCFLAGS=-DLUAJIT_ENABLE_GC64 to the make command or run msvcbuild gc64 for MSVC/WinSDK.
+Please check the note about the bytecode format differences, too.

    POSIX Systems (Linux, OSX, *BSD etc.)

    Prerequisites

    @@ -584,14 +592,11 @@ intend to load Lua/C modules at runtime.
• If you're building a 64 bit application on OSX which links directly or
-indirectly against LuaJIT, you need to link your main executable
-with these flags:
+indirectly against LuaJIT which is not built for LJ_GC64 mode,
+you need to link your main executable with these flags:
     -pagezero_size 10000 -image_base 100000000
     
-Also, it's recommended to rebase all (self-compiled) shared libraries
-which are loaded at runtime on OSX/x64 (e.g. C extension modules for Lua).
-See: man rebase
  • Additional hints for initializing LuaJIT using the C API functions:

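Whether a given binary was built in LJ_GC64 mode can also be checked from Lua itself; a minimal sketch, assuming the 2.1 ffi.abi() "gc64" query is available in the build:

  local ffi = require("ffi")
  print(ffi.abi("64bit"), ffi.abi("gc64"))
  if ffi.abi("64bit") and not ffi.abi("gc64") then
    print("x64 build with 32 bit GC objects; the OSX link flags above apply")
  end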
    diff --git a/src/Makefile b/src/Makefile index 4e479ae5..21e16a22 100644 --- a/src/Makefile +++ b/src/Makefile @@ -110,6 +110,9 @@ XCFLAGS= #XCFLAGS+= -DLUAJIT_NUMMODE=1 #XCFLAGS+= -DLUAJIT_NUMMODE=2 # +# Enable GC64 mode for x64. +#XCFLAGS+= -DLUAJIT_ENABLE_GC64 +# ############################################################################## ############################################################################## diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat index 4334bbde..f7a1addc 100644 --- a/src/msvcbuild.bat +++ b/src/msvcbuild.bat @@ -20,6 +20,7 @@ @set LJLIB=lib /nologo /nodefaultlib @set DASMDIR=..\dynasm @set DASM=%DASMDIR%\dynasm.lua +@set DASC=vm_x86.dasc @set LJDLLNAME=lua51.dll @set LJLIBNAME=lua51.lib @set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c @@ -39,7 +40,12 @@ if exist minilua.exe.manifest^ @set LJARCH=x86 @set LJCOMPILE=%LJCOMPILE% /arch:SSE2 :X64 -minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h vm_x86.dasc +@if "%1" neq "gc64" goto :NOGC64 +@shift +@set DASC=vm_x64.dasc +@set LJCOMPILE=%LJCOMPILE% /DLUAJIT_ENABLE_GC64 +:NOGC64 +minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h %DASC% @if errorlevel 1 goto :BAD %LJCOMPILE% /I "." /I %DASMDIR% host\buildvm*.c From b93a1dd0c831cab22f98163d0dde792a493c0eef Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 17 Jan 2017 12:35:03 +0100 Subject: [PATCH 44/94] Bump copyright date to 2017. --- COPYRIGHT | 2 +- Makefile | 2 +- README | 2 +- doc/bluequad-print.css | 2 +- doc/bluequad.css | 2 +- doc/changes.html | 4 ++-- doc/contact.html | 6 +++--- doc/ext_c_api.html | 4 ++-- doc/ext_ffi.html | 4 ++-- doc/ext_ffi_api.html | 4 ++-- doc/ext_ffi_semantics.html | 4 ++-- doc/ext_ffi_tutorial.html | 4 ++-- doc/ext_jit.html | 4 ++-- doc/extensions.html | 4 ++-- doc/faq.html | 4 ++-- doc/install.html | 4 ++-- doc/luajit.html | 6 +++--- doc/running.html | 4 ++-- doc/status.html | 4 ++-- dynasm/dasm_arm.h | 2 +- dynasm/dasm_arm.lua | 2 +- dynasm/dasm_mips.h | 2 +- dynasm/dasm_mips.lua | 2 +- dynasm/dasm_ppc.h | 2 +- dynasm/dasm_ppc.lua | 2 +- dynasm/dasm_proto.h | 2 +- dynasm/dasm_x64.lua | 2 +- dynasm/dasm_x86.h | 2 +- dynasm/dasm_x86.lua | 2 +- dynasm/dynasm.lua | 4 ++-- etc/luajit.1 | 2 +- src/Makefile | 2 +- src/host/buildvm.c | 2 +- src/host/buildvm.h | 2 +- src/host/buildvm_asm.c | 2 +- src/host/buildvm_fold.c | 2 +- src/host/buildvm_lib.c | 2 +- src/host/buildvm_peobj.c | 2 +- src/host/genminilua.lua | 2 +- src/jit/bc.lua | 2 +- src/jit/bcsave.lua | 2 +- src/jit/dis_arm.lua | 2 +- src/jit/dis_mips.lua | 2 +- src/jit/dis_mipsel.lua | 2 +- src/jit/dis_ppc.lua | 2 +- src/jit/dis_x64.lua | 2 +- src/jit/dis_x86.lua | 2 +- src/jit/dump.lua | 2 +- src/jit/v.lua | 2 +- src/lib_aux.c | 2 +- src/lib_base.c | 2 +- src/lib_bit.c | 2 +- src/lib_debug.c | 2 +- src/lib_ffi.c | 2 +- src/lib_init.c | 2 +- src/lib_io.c | 2 +- src/lib_jit.c | 2 +- src/lib_math.c | 2 +- src/lib_os.c | 2 +- src/lib_package.c | 2 +- src/lib_string.c | 2 +- src/lib_table.c | 2 +- src/lj_api.c | 2 +- src/lj_arch.h | 2 +- src/lj_asm.c | 2 +- src/lj_asm.h | 2 +- src/lj_asm_arm.h | 2 +- src/lj_asm_mips.h | 2 +- src/lj_asm_ppc.h | 2 +- src/lj_asm_x86.h | 2 +- src/lj_bc.c | 2 +- src/lj_bc.h | 2 +- src/lj_bcdump.h | 2 +- src/lj_bcread.c | 2 +- src/lj_bcwrite.c | 2 +- src/lj_carith.c | 2 +- src/lj_carith.h | 2 +- src/lj_ccall.c | 2 +- src/lj_ccall.h | 2 +- src/lj_ccallback.c | 2 +- src/lj_ccallback.h | 2 +- src/lj_cconv.c | 2 +- src/lj_cconv.h | 2 +- src/lj_cdata.c | 
2 +- src/lj_cdata.h | 2 +- src/lj_clib.c | 2 +- src/lj_clib.h | 2 +- src/lj_cparse.c | 2 +- src/lj_cparse.h | 2 +- src/lj_crecord.c | 2 +- src/lj_crecord.h | 2 +- src/lj_ctype.c | 2 +- src/lj_ctype.h | 2 +- src/lj_debug.c | 2 +- src/lj_debug.h | 2 +- src/lj_def.h | 2 +- src/lj_dispatch.c | 2 +- src/lj_dispatch.h | 2 +- src/lj_emit_arm.h | 2 +- src/lj_emit_mips.h | 2 +- src/lj_emit_ppc.h | 2 +- src/lj_emit_x86.h | 2 +- src/lj_err.c | 2 +- src/lj_err.h | 2 +- src/lj_errmsg.h | 2 +- src/lj_ff.h | 2 +- src/lj_ffrecord.c | 2 +- src/lj_ffrecord.h | 2 +- src/lj_frame.h | 2 +- src/lj_func.c | 2 +- src/lj_func.h | 2 +- src/lj_gc.c | 2 +- src/lj_gc.h | 2 +- src/lj_gdbjit.c | 2 +- src/lj_gdbjit.h | 2 +- src/lj_ir.c | 2 +- src/lj_ir.h | 2 +- src/lj_ircall.h | 2 +- src/lj_iropt.h | 2 +- src/lj_jit.h | 2 +- src/lj_lex.c | 2 +- src/lj_lex.h | 2 +- src/lj_lib.c | 2 +- src/lj_lib.h | 2 +- src/lj_load.c | 2 +- src/lj_mcode.c | 2 +- src/lj_mcode.h | 2 +- src/lj_meta.c | 2 +- src/lj_meta.h | 2 +- src/lj_obj.c | 2 +- src/lj_obj.h | 2 +- src/lj_opt_dce.c | 2 +- src/lj_opt_fold.c | 2 +- src/lj_opt_loop.c | 2 +- src/lj_opt_mem.c | 2 +- src/lj_opt_narrow.c | 2 +- src/lj_opt_sink.c | 2 +- src/lj_opt_split.c | 2 +- src/lj_parse.c | 2 +- src/lj_parse.h | 2 +- src/lj_record.c | 2 +- src/lj_record.h | 2 +- src/lj_snap.c | 2 +- src/lj_snap.h | 2 +- src/lj_state.c | 2 +- src/lj_state.h | 2 +- src/lj_str.c | 2 +- src/lj_str.h | 2 +- src/lj_strscan.c | 2 +- src/lj_strscan.h | 2 +- src/lj_tab.c | 2 +- src/lj_tab.h | 2 +- src/lj_target.h | 2 +- src/lj_target_arm.h | 2 +- src/lj_target_mips.h | 2 +- src/lj_target_ppc.h | 2 +- src/lj_target_x86.h | 2 +- src/lj_trace.c | 2 +- src/lj_trace.h | 2 +- src/lj_traceerr.h | 2 +- src/lj_udata.c | 2 +- src/lj_udata.h | 2 +- src/lj_vm.h | 2 +- src/lj_vmevent.c | 2 +- src/lj_vmevent.h | 2 +- src/lj_vmmath.c | 2 +- src/ljamalg.c | 2 +- src/luaconf.h | 2 +- src/luajit.c | 2 +- src/luajit.h | 4 ++-- src/lualib.h | 2 +- src/msvcbuild.bat | 2 +- src/vm_arm.dasc | 2 +- src/vm_mips.dasc | 2 +- src/vm_ppc.dasc | 2 +- src/vm_ppcspe.dasc | 2 +- src/vm_x86.dasc | 2 +- 177 files changed, 195 insertions(+), 195 deletions(-) diff --git a/COPYRIGHT b/COPYRIGHT index b614d3eb..6ed40025 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -1,7 +1,7 @@ =============================================================================== LuaJIT -- a Just-In-Time Compiler for Lua. http://luajit.org/ -Copyright (C) 2005-2016 Mike Pall. All rights reserved. +Copyright (C) 2005-2017 Mike Pall. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index eb9572d8..b29f1031 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ # For MSVC, please follow the instructions given in src/msvcbuild.bat. # For MinGW and Cygwin, cd to src and run make with the Makefile there. # -# Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h +# Copyright (C) 2005-2017 Mike Pall. See Copyright Notice in luajit.h ############################################################################## MAJVER= 2 diff --git a/README b/README index e5bb1c62..1b0abed2 100644 --- a/README +++ b/README @@ -5,7 +5,7 @@ LuaJIT is a Just-In-Time (JIT) compiler for the Lua programming language. Project Homepage: http://luajit.org/ -LuaJIT is Copyright (C) 2005-2016 Mike Pall. +LuaJIT is Copyright (C) 2005-2017 Mike Pall. LuaJIT is free software, released under the MIT license. 
See full Copyright Notice in the COPYRIGHT file or in luajit.h. diff --git a/doc/bluequad-print.css b/doc/bluequad-print.css index 975a55bf..62e1c165 100644 --- a/doc/bluequad-print.css +++ b/doc/bluequad-print.css @@ -1,4 +1,4 @@ -/* Copyright (C) 2004-2016 Mike Pall. +/* Copyright (C) 2004-2017 Mike Pall. * * You are welcome to use the general ideas of this design for your own sites. * But please do not steal the stylesheet, the layout or the color scheme. diff --git a/doc/bluequad.css b/doc/bluequad.css index 5dca9064..be2c4bf2 100644 --- a/doc/bluequad.css +++ b/doc/bluequad.css @@ -1,4 +1,4 @@ -/* Copyright (C) 2004-2016 Mike Pall. +/* Copyright (C) 2004-2017 Mike Pall. * * You are welcome to use the general ideas of this design for your own sites. * But please do not steal the stylesheet, the layout or the color scheme. diff --git a/doc/changes.html b/doc/changes.html index 96eef660..8811efc5 100644 --- a/doc/changes.html +++ b/doc/changes.html @@ -4,7 +4,7 @@ LuaJIT Change History - + @@ -968,7 +968,7 @@ This is the initial non-public release of LuaJIT.

    Building LuaJIT

    The supplied Makefiles try to auto-detect the settings needed for your diff --git a/doc/running.html b/doc/running.html index 1232b84f..331c22df 100644 --- a/doc/running.html +++ b/doc/running.html @@ -186,7 +186,7 @@ itself. For a description of their options and output format, please read the comment block at the start of their source. They can be found in the lib directory of the source distribution or installed under the jit directory. By default -this is /usr/local/share/luajit-2.0.4/jit on POSIX +this is /usr/local/share/luajit-2.0.5/jit on POSIX systems.

    diff --git a/etc/luajit.pc b/etc/luajit.pc index a652b40d..36840ab8 100644 --- a/etc/luajit.pc +++ b/etc/luajit.pc @@ -1,7 +1,7 @@ # Package information for LuaJIT to be used by pkg-config. majver=2 minver=0 -relver=4 +relver=5 version=${majver}.${minver}.${relver} abiver=5.1 diff --git a/src/Makefile b/src/Makefile index bd172dbe..f7f81a4e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -12,7 +12,7 @@ MAJVER= 2 MINVER= 0 -RELVER= 4 +RELVER= 5 ABIVER= 5.1 NODOTABIVER= 51 diff --git a/src/jit/bc.lua b/src/jit/bc.lua index a4cd5f20..a2f84aaf 100644 --- a/src/jit/bc.lua +++ b/src/jit/bc.lua @@ -41,7 +41,7 @@ -- Cache some library functions and objects. local jit = require("jit") -assert(jit.version_num == 20004, "LuaJIT core/library version mismatch") +assert(jit.version_num == 20005, "LuaJIT core/library version mismatch") local jutil = require("jit.util") local vmdef = require("jit.vmdef") local bit = require("bit") diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua index d548b1a7..aa677dfc 100644 --- a/src/jit/bcsave.lua +++ b/src/jit/bcsave.lua @@ -11,7 +11,7 @@ ------------------------------------------------------------------------------ local jit = require("jit") -assert(jit.version_num == 20004, "LuaJIT core/library version mismatch") +assert(jit.version_num == 20005, "LuaJIT core/library version mismatch") local bit = require("bit") -- Symbol name prefix for LuaJIT bytecode. diff --git a/src/jit/dump.lua b/src/jit/dump.lua index 6c6abb66..666ba438 100644 --- a/src/jit/dump.lua +++ b/src/jit/dump.lua @@ -55,7 +55,7 @@ -- Cache some library functions and objects. local jit = require("jit") -assert(jit.version_num == 20004, "LuaJIT core/library version mismatch") +assert(jit.version_num == 20005, "LuaJIT core/library version mismatch") local jutil = require("jit.util") local vmdef = require("jit.vmdef") local funcinfo, funcbc = jutil.funcinfo, jutil.funcbc diff --git a/src/jit/v.lua b/src/jit/v.lua index 0f5407a1..47ee3941 100644 --- a/src/jit/v.lua +++ b/src/jit/v.lua @@ -59,7 +59,7 @@ -- Cache some library functions and objects. local jit = require("jit") -assert(jit.version_num == 20004, "LuaJIT core/library version mismatch") +assert(jit.version_num == 20005, "LuaJIT core/library version mismatch") local jutil = require("jit.util") local vmdef = require("jit.vmdef") local funcinfo, traceinfo = jutil.funcinfo, jutil.traceinfo diff --git a/src/luaconf.h b/src/luaconf.h index 73b80f11..b33e91b7 100644 --- a/src/luaconf.h +++ b/src/luaconf.h @@ -37,7 +37,7 @@ #endif #define LUA_LROOT "/usr/local" #define LUA_LUADIR "/lua/5.1/" -#define LUA_LJDIR "/luajit-2.0.4/" +#define LUA_LJDIR "/luajit-2.0.5/" #ifdef LUA_ROOT #define LUA_JROOT LUA_ROOT diff --git a/src/luajit.h b/src/luajit.h index 1709ca26..c5ff3acb 100644 --- a/src/luajit.h +++ b/src/luajit.h @@ -30,9 +30,9 @@ #include "lua.h" -#define LUAJIT_VERSION "LuaJIT 2.0.4" -#define LUAJIT_VERSION_NUM 20004 /* Version 2.0.4 = 02.00.04. */ -#define LUAJIT_VERSION_SYM luaJIT_version_2_0_4 +#define LUAJIT_VERSION "LuaJIT 2.0.5" +#define LUAJIT_VERSION_NUM 20005 /* Version 2.0.5 = 02.00.05. */ +#define LUAJIT_VERSION_SYM luaJIT_version_2_0_5 #define LUAJIT_COPYRIGHT "Copyright (C) 2005-2017 Mike Pall" #define LUAJIT_URL "http://luajit.org/" From e9f8abfbf5e565f26b10dbd5c66f73143ff2211b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 1 May 2017 21:02:34 +0200 Subject: [PATCH 93/94] Update changelog. 
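The version bumps above all feed the same handshake: each bundled jit.* module refuses to load against a mismatched core. A sketch of the pattern, with the assert taken verbatim from the bumped files (the number naturally depends on the build being run):

  local jit = require("jit")
  assert(jit.version_num == 20005, "LuaJIT core/library version mismatch")
  print(jit.version)  --> LuaJIT 2.0.5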
--- doc/changes.html | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/changes.html b/doc/changes.html index 6522fa16..a66a8d95 100644 --- a/doc/changes.html +++ b/doc/changes.html @@ -74,6 +74,30 @@ to see whether newer versions are available.
+
+LuaJIT 2.1.0-beta3 — 2017-05-01
+
+• Rewrite memory block allocator.
+• Add various extensions from Lua 5.2/5.3.
+• Remove old Lua 5.0 compatibility defines.
+• Set arg table before evaluating LUA_INIT and -e chunks.
+• Fix FOLD rules for math.abs() and FP negation.
+• Fix soft-float math.abs() and negation.
+• Fix formatting of some small denormals at low precision.
+• LJ_GC64: Add JIT compiler support.
+• x64/LJ_GC64: Add JIT compiler backend.
+• x86/x64: Generate BMI2 shifts and rotates, if available.
+• Windows/x86: Add full exception interoperability.
+• ARM64: Add big-endian support.
+• ARM64: Add JIT compiler backend.
+• MIPS: Fix TSETR barrier.
+• MIPS: Support MIPS16 interlinking.
+• MIPS soft-float: Fix code generation for HREF.
+• MIPS64: Add MIPS64 hard-float JIT compiler backend.
+• MIPS64: Add MIPS64 hard-float/soft-float support to interpreter.
+• FFI: Compile bitfield loads/stores.
+• Various fixes common with the 2.0 branch.
+

    LuaJIT 2.1.0-beta2 — 2016-03-03

    • Enable trace stitching.
@@ -149,7 +173,7 @@ Please take a look at the commit history for more details.
    • Remove internal __mode = "K" and replace with safe check.
    • Add "proto" field to jit.util.funcinfo().
    • Fix GC step size calculation.
-• Initialize uv->immutable for upvalues of loaded chunks.
+• Initialize uv->immutable for upvalues of loaded chunks.
    • Fix for cdata vs. non-cdata arithmetics/comparisons.
    • Drop leftover regs in 'for' iterator assignment, too.
    • Fix PHI remarking in SINK pass.
    • From 8271c643c21d1b2f344e339f559f2de6f3663191 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 1 May 2017 21:03:01 +0200 Subject: [PATCH 94/94] RELEASE LuaJIT-2.1.0-beta3 --- Makefile | 2 +- README | 2 +- etc/luajit.pc | 2 +- src/luaconf.h | 2 +- src/luajit.h | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index e6472e0b..0f933089 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ MAJVER= 2 MINVER= 1 RELVER= 0 -PREREL= -beta2 +PREREL= -beta3 VERSION= $(MAJVER).$(MINVER).$(RELVER)$(PREREL) ABIVER= 5.1 diff --git a/README b/README index 719e6118..2b9ae9d2 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -README for LuaJIT 2.1.0-beta2 +README for LuaJIT 2.1.0-beta3 ----------------------------- LuaJIT is a Just-In-Time (JIT) compiler for the Lua programming language. diff --git a/etc/luajit.pc b/etc/luajit.pc index 0fdd1efd..a78f1746 100644 --- a/etc/luajit.pc +++ b/etc/luajit.pc @@ -2,7 +2,7 @@ majver=2 minver=1 relver=0 -version=${majver}.${minver}.${relver}-beta2 +version=${majver}.${minver}.${relver}-beta3 abiver=5.1 prefix=/usr/local diff --git a/src/luaconf.h b/src/luaconf.h index 0c70b145..c2d29d94 100644 --- a/src/luaconf.h +++ b/src/luaconf.h @@ -37,7 +37,7 @@ #endif #define LUA_LROOT "/usr/local" #define LUA_LUADIR "/lua/5.1/" -#define LUA_LJDIR "/luajit-2.1.0-beta2/" +#define LUA_LJDIR "/luajit-2.1.0-beta3/" #ifdef LUA_ROOT #define LUA_JROOT LUA_ROOT diff --git a/src/luajit.h b/src/luajit.h index c1c801c9..708a5a11 100644 --- a/src/luajit.h +++ b/src/luajit.h @@ -30,9 +30,9 @@ #include "lua.h" -#define LUAJIT_VERSION "LuaJIT 2.1.0-beta2" +#define LUAJIT_VERSION "LuaJIT 2.1.0-beta3" #define LUAJIT_VERSION_NUM 20100 /* Version 2.1.0 = 02.01.00. */ -#define LUAJIT_VERSION_SYM luaJIT_version_2_1_0_beta2 +#define LUAJIT_VERSION_SYM luaJIT_version_2_1_0_beta3 #define LUAJIT_COPYRIGHT "Copyright (C) 2005-2017 Mike Pall" #define LUAJIT_URL "http://luajit.org/"
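The version number in the release diff packs major, minor and release as two decimal digits each; a sketch of the arithmetic behind the "Version 2.1.0 = 02.01.00" comment, with the helper name being purely illustrative:

  local function vernum(major, minor, rel)
    return major * 10000 + minor * 100 + rel
  end
  assert(vernum(2, 1, 0) == 20100)  -- matches LUAJIT_VERSION_NUM above
  print(vernum(2, 0, 5))            --> 20005, i.e. the 2.0.5 branch number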