From 7a58a8fb3d3d5808c54d096ab772113bf9024ae8 Mon Sep 17 00:00:00 2001
From: Mike Pall <mike>
Date: Sun, 13 Nov 2016 20:03:01 +0100
Subject: [PATCH 001/116] Report parent of stitched trace.

Thanks to Nick Zavaritsky.
---
 src/jit/dump.lua | 2 +-
 src/jit/v.lua    | 2 +-
 src/lj_trace.c   | 6 ++++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/jit/dump.lua b/src/jit/dump.lua
index a8bc2af2..1adf7095 100644
--- a/src/jit/dump.lua
+++ b/src/jit/dump.lua
@@ -556,7 +556,7 @@ local function dump_trace(what, tr, func, pc, otr, oex)
   if what == "start" then
     if dumpmode.H then out:write('<pre class="ljdump">\n') end
     out:write("---- TRACE ", tr, " ", what)
-    if otr then out:write(" ", otr, "/", oex) end
+    if otr then out:write(" ", otr, "/", oex == -1 and "stitch" or oex) end
     out:write(" ", fmtfunc(func, pc), "\n")
   elseif what == "stop" or what == "abort" then
     out:write("---- TRACE ", tr, " ", what)
diff --git a/src/jit/v.lua b/src/jit/v.lua
index 60c8b05a..b07ec7c0 100644
--- a/src/jit/v.lua
+++ b/src/jit/v.lua
@@ -99,7 +99,7 @@ end
 local function dump_trace(what, tr, func, pc, otr, oex)
   if what == "start" then
     startloc = fmtfunc(func, pc)
-    startex = otr and "("..otr.."/"..oex..") " or ""
+    startex = otr and "("..otr.."/"..(oex == -1 and "stitch" or oex)..") " or ""
   else
     if what == "abort" then
       local loc = fmtfunc(func, pc)
diff --git a/src/lj_trace.c b/src/lj_trace.c
index 87146832..11e54d97 100644
--- a/src/lj_trace.c
+++ b/src/lj_trace.c
@@ -446,6 +446,12 @@ static void trace_start(jit_State *J)
     if (J->parent) {
       setintV(L->top++, J->parent);
       setintV(L->top++, J->exitno);
+    } else {
+      BCOp op = bc_op(*J->pc);
+      if (op == BC_CALLM || op == BC_CALL || op == BC_ITERC) {
+	setintV(L->top++, J->exitno);  /* Parent of stitched trace. */
+	setintV(L->top++, -1);
+      }
     }
   );
   lj_record_setup(J);

From 5400c1e42469cdb3cb5df691baa877b762b27704 Mon Sep 17 00:00:00 2001
From: Mike Pall <mike>
Date: Wed, 16 Nov 2016 11:18:10 +0100
Subject: [PATCH 002/116] MIPS: Fix TSETR barrier.

Thanks to tongwell.
---
 src/vm_mips.dasc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/vm_mips.dasc b/src/vm_mips.dasc
index 6f5a83dd..7dc42905 100644
--- a/src/vm_mips.dasc
+++ b/src/vm_mips.dasc
@@ -4317,7 +4317,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  ins_next2
     |
     |7:  // Possible table write barrier for the value. Skip valiswhite check.
-    |  barrierback TAB:RB, TMP3, TMP0, <2
+    |  barrierback TAB:CARG2, TMP3, TMP0, <2
     break;
 
   case BC_TSETM:

From e577db52c543303543c9e30e8ebe0c244e1b85c8 Mon Sep 17 00:00:00 2001
From: Mike Pall <mike>
Date: Sat, 19 Nov 2016 19:53:46 +0100
Subject: [PATCH 003/116] Increase range of GG_State loads via IR_FLOAD with
 REF_NIL.

Require 32 bit alignment and store offset/4 instead.
Otherwise this can overflow the 10 bit limit for the FOLD op2 key.
---
 src/lj_asm_mips.h | 2 +-
 src/lj_asm_ppc.h  | 2 +-
 src/lj_asm_x86.h  | 4 ++--
 src/lj_ir.c       | 6 ++++--
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h
index cf446346..0ae5e287 100644
--- a/src/lj_asm_mips.h
+++ b/src/lj_asm_mips.h
@@ -901,7 +901,7 @@ static void asm_fload(ASMState *as, IRIns *ir)
   int32_t ofs;
   if (ir->op1 == REF_NIL) {
     idx = RID_JGL;
-    ofs = ir->op2 - 32768;
+    ofs = (ir->op2 << 2) - 32768;
   } else {
     idx = ra_alloc1(as, ir->op1, RSET_GPR);
     if (ir->op2 == IRFL_TAB_ARRAY) {
diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h
index 46821515..1ac882ca 100644
--- a/src/lj_asm_ppc.h
+++ b/src/lj_asm_ppc.h
@@ -809,7 +809,7 @@ static void asm_fload(ASMState *as, IRIns *ir)
   int32_t ofs;
   if (ir->op1 == REF_NIL) {
     idx = RID_JGL;
-    ofs = ir->op2 - 32768;
+    ofs = (ir->op2 << 2) - 32768;
   } else {
     idx = ra_alloc1(as, ir->op1, RSET_GPR);
     if (ir->op2 == IRFL_TAB_ARRAY) {
diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h
index 1b94371e..381eac9c 100644
--- a/src/lj_asm_x86.h
+++ b/src/lj_asm_x86.h
@@ -234,10 +234,10 @@ static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow)
   as->mrm.idx = RID_NONE;
   if (ir->op1 == REF_NIL) {
 #if LJ_GC64
-    as->mrm.ofs = (int32_t)ir->op2 - GG_OFS(dispatch);
+    as->mrm.ofs = (int32_t)(ir->op2 << 2) - GG_OFS(dispatch);
     as->mrm.base = RID_DISPATCH;
 #else
-    as->mrm.ofs = (int32_t)ir->op2 + ptr2addr(J2GG(as->J));
+    as->mrm.ofs = (int32_t)(ir->op2 << 2) + ptr2addr(J2GG(as->J));
     as->mrm.base = RID_NONE;
 #endif
     return;
diff --git a/src/lj_ir.c b/src/lj_ir.c
index 87fd0f4d..c5c521be 100644
--- a/src/lj_ir.c
+++ b/src/lj_ir.c
@@ -145,10 +145,12 @@ TRef lj_ir_call(jit_State *J, IRCallID id, ...)
   return emitir(CCI_OPTYPE(ci), tr, id);
 }
 
-/* Load field of type t from GG_State + offset. */
+/* Load field of type t from GG_State + offset. Must be 32 bit aligned. */
 LJ_FUNC TRef lj_ir_ggfload(jit_State *J, IRType t, uintptr_t ofs)
 {
-  lua_assert(ofs >= IRFL__MAX && ofs < REF_BIAS);
+  lua_assert((ofs & 3) == 0);
+  ofs >>= 2;
+  lua_assert(ofs >= IRFL__MAX && ofs <= 0x3ff);  /* 10 bit FOLD key limit. */
   lj_ir_set(J, IRT(IR_FLOAD, t), REF_NIL, ofs);
   return lj_opt_fold(J);
 }

From 202713a63808856b07136d7639324a6dd548d37e Mon Sep 17 00:00:00 2001
From: Mike Pall <mike>
Date: Sat, 19 Nov 2016 20:53:31 +0100
Subject: [PATCH 004/116] Fix amalgamated build.

---
 src/lj_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lj_state.h b/src/lj_state.h
index e128d321..1e5c8b34 100644
--- a/src/lj_state.h
+++ b/src/lj_state.h
@@ -28,7 +28,7 @@ static LJ_AINLINE void lj_state_checkstack(lua_State *L, MSize need)
 
 LJ_FUNC lua_State *lj_state_new(lua_State *L);
 LJ_FUNC void LJ_FASTCALL lj_state_free(global_State *g, lua_State *L);
-#if LJ_64
+#if LJ_64 && !LJ_GC64 && !(defined(LUAJIT_USE_VALGRIND) && defined(LUAJIT_USE_SYSMALLOC))
 LJ_FUNC lua_State *lj_state_newstate(lua_Alloc f, void *ud);
 #endif
 

From 13642b75ac37957d9e2a37b35ebec69d6d4b3bc1 Mon Sep 17 00:00:00 2001
From: Mike Pall <mike>
Date: Sun, 20 Nov 2016 22:14:09 +0100
Subject: [PATCH 005/116] Whitespace.

---
 src/lj_strfmt_num.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lj_strfmt_num.c b/src/lj_strfmt_num.c
index 04769258..7b33f930 100644
--- a/src/lj_strfmt_num.c
+++ b/src/lj_strfmt_num.c
@@ -138,7 +138,7 @@ static uint32_t nd_mul2k(uint32_t* nd, uint32_t ndhi, uint32_t k,
     }
     if (carry_in) {
       nd[++ndhi] = carry_in; carry_in = 0;
-      if(start++ == ndlo) ++ndlo;
+      if (start++ == ndlo) ++ndlo;
     }
     k -= ND_MUL2K_MAX_SHIFT;
   }

From 04b60707d7d117da22b40736a353e2a10179108a Mon Sep 17 00:00:00 2001
From: Mike Pall <mike>
Date: Sun, 20 Nov 2016 22:16:08 +0100
Subject: [PATCH 006/116] ARM64: Add JIT compiler backend.

Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
Sponsored by Cisco Systems, Inc.
---
 Makefile              |    4 +-
 src/jit/dis_arm64.lua | 1215 +++++++++++++++++++++++++++
 src/lj_arch.h         |    1 -
 src/lj_asm.c          |    4 +
 src/lj_asm_arm64.h    | 1823 +++++++++++++++++++++++++++++++++++++++++
 src/lj_ccall.c        |    2 +-
 src/lj_dispatch.h     |    1 +
 src/lj_emit_arm64.h   |  397 +++++++++
 src/lj_gdbjit.c       |   12 +
 src/lj_target.h       |    4 +-
 src/lj_target_arm64.h |  221 ++++-
 src/vm_arm64.dasc     |  227 ++++-
 12 files changed, 3887 insertions(+), 24 deletions(-)
 create mode 100644 src/jit/dis_arm64.lua
 create mode 100644 src/lj_asm_arm64.h
 create mode 100644 src/lj_emit_arm64.h

diff --git a/Makefile b/Makefile
index 6dfbbde4..5e640d94 100644
--- a/Makefile
+++ b/Makefile
@@ -86,8 +86,8 @@ FILE_MAN= luajit.1
 FILE_PC= luajit.pc
 FILES_INC= lua.h lualib.h lauxlib.h luaconf.h lua.hpp luajit.h
 FILES_JITLIB= bc.lua bcsave.lua dump.lua p.lua v.lua zone.lua \
-	      dis_x86.lua dis_x64.lua dis_arm.lua dis_ppc.lua \
-	      dis_mips.lua dis_mipsel.lua vmdef.lua
+	      dis_x86.lua dis_x64.lua dis_arm.lua dis_arm64.lua \
+	      dis_ppc.lua dis_mips.lua dis_mipsel.lua vmdef.lua
 
 ifeq (,$(findstring Windows,$(OS)))
   HOST_SYS:= $(shell uname -s)
diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua
new file mode 100644
index 00000000..909b33bc
--- /dev/null
+++ b/src/jit/dis_arm64.lua
@@ -0,0 +1,1215 @@
+----------------------------------------------------------------------------
+-- LuaJIT ARM64 disassembler module.
+--
+-- Copyright (C) 2005-2016 Mike Pall. All rights reserved.
+-- Released under the MIT license. See Copyright Notice in luajit.h
+--
+-- Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+-- Sponsored by Cisco Systems, Inc.
+----------------------------------------------------------------------------
+-- This is a helper module used by the LuaJIT machine code dumper module.
+--
+-- It disassembles most user-mode AArch64 instructions.
+-- NYI: Advanced SIMD and VFP instructions.
+------------------------------------------------------------------------------
+
+local type, tonumber = type, tonumber
+local sub, byte, format = string.sub, string.byte, string.format
+local match, gmatch, gsub = string.match, string.gmatch, string.gsub
+local rep = string.rep
+local concat = table.concat
+local bit = require("bit")
+local band, bor, bxor, tohex = bit.band, bit.bor, bit.bxor, bit.tohex
+local lshift, rshift, arshift = bit.lshift, bit.rshift, bit.arshift
+local ror = bit.ror
+
+------------------------------------------------------------------------------
+-- Opcode maps
+------------------------------------------------------------------------------
+
+local map_adr = { -- PC-relative addressing.
+  shift = 31, mask = 1,
+  [0] = "adrDBx", "adrpDBx"
+}
+
+local map_addsubi = { -- Add/subtract immediate.
+  shift = 29, mask = 3,
+  [0] = "add|movDNIg", "adds|cmnD0NIg", "subDNIg", "subs|cmpD0NIg",
+}
+
+local map_logi = { -- Logical immediate.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 22, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "andDNig", "orr|movDN0ig", "eorDNig", "ands|tstD0Nig"
+    },
+    false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = "andDNig", "orr|movDN0ig", "eorDNig", "ands|tstD0Nig"
+  }
+}
+
+local map_movwi = { -- Move wide immediate.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 22, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "movnDWRg", false, "movz|movDYRg", "movkDWRg"
+    }, false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = "movnDWRg", false, "movz|movDYRg", "movkDWRg"
+  },
+}
+
+local map_bitf = { -- Bitfield.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 22, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "sbfm|sbfiz|sbfx|asr|sxtw|sxth|sxtbDN12w",
+      "bfm|bfi|bfxilDN13w",
+      "ubfm|ubfiz|ubfx|lsr|lsl|uxth|uxtbDN12w"
+    }
+  },
+  {
+    shift = 22, mask = 1,
+    {
+      shift = 29, mask = 3,
+      [0] = "sbfm|sbfiz|sbfx|asr|sxtw|sxth|sxtbDN12x",
+      "bfm|bfi|bfxilDN13x",
+      "ubfm|ubfiz|ubfx|lsr|lsl|uxth|uxtbDN12x"
+    }
+  }
+}
+
+local map_datai = { -- Data processing - immediate.
+  shift = 23, mask = 7,
+  [0] = map_adr, map_adr, map_addsubi, false,
+  map_logi, map_movwi, map_bitf,
+  {
+    shift = 15, mask = 0x1c0c1,
+    [0] = "extr|rorDNM4w", [0x10080] = "extr|rorDNM4x",
+    [0x10081] = "extr|rorDNM4x"
+  }
+}
+
+local map_logsr = { -- Logical, shifted register.
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 15, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = {
+	shift = 21, mask = 7,
+	[0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
+	"andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
+      },
+      {
+	shift = 21, mask = 7,
+	[0] ="orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
+	     "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
+      },
+      {
+	shift = 21, mask = 7,
+	[0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
+	"eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
+      },
+      {
+	shift = 21, mask = 7,
+	[0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
+	"ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
+      }
+    },
+    false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = {
+      shift = 21, mask = 7,
+      [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg",
+      "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg",
+      "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg",
+      "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg",
+      "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg"
+    }
+  }
+}
+
+local map_assh = {
+  shift = 31, mask = 1,
+  [0] = {
+    shift = 15, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = {
+	shift = 22, mask = 3,
+	[0] = "addDNMSg", "addDNMSg", "addDNMSg", "addDNMg"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "adds|cmnD0NMSg", "adds|cmnD0NMSg",
+	      "adds|cmnD0NMSg", "adds|cmnD0NMg"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0Mg"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0MzSg",
+	      "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0Mzg"
+      },
+    },
+    false -- unallocated
+  },
+  {
+    shift = 29, mask = 3,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "addDNMSg", "addDNMSg", "addDNMSg", "addDNMg"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "adds|cmnD0NMSg", "adds|cmnD0NMSg", "adds|cmnD0NMSg",
+	    "adds|cmnD0NMg"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0Mg"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0MzSg",
+	    "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0Mzg"
+    }
+  }
+}
+
+local map_addsubsh = { -- Add/subtract, shifted register.
+  shift = 22, mask = 3,
+  [0] = map_assh, map_assh, map_assh
+}
+
+local map_addsubex = { -- Add/subtract, extended register.
+  shift = 22, mask = 3,
+  [0] = {
+    shift = 29, mask = 3,
+    [0] = "addDNMXg", "adds|cmnD0NMXg", "subDNMXg", "subs|cmpD0NMzXg",
+  }
+}
+
+local map_addsubc = { -- Add/subtract, with carry.
+  shift = 10, mask = 63,
+  [0] = {
+    shift = 29, mask = 3,
+    [0] = "adcDNMg", "adcsDNMg", "sbc|ngcDN0Mg", "sbcs|ngcsDN0Mg",
+  }
+}
+
+local map_ccomp = {
+  shift = 4, mask = 1,
+  [0] = {
+    shift = 10, mask = 3,
+    [0] = { -- Conditional compare register.
+      shift = 29, mask = 3,
+      "ccmnNMVCg", false, "ccmpNMVCg",
+    },
+    [2] = {  -- Conditional compare immediate.
+      shift = 29, mask = 3,
+      "ccmnN5VCg", false, "ccmpN5VCg",
+    }
+  }
+}
+
+local map_csel = { -- Conditional select.
+  shift = 11, mask = 1,
+  [0] = {
+    shift = 10, mask = 1,
+    [0] = {
+      shift = 29, mask = 3,
+      [0] = "cselDNMzCg", false, "csinv|cinv|csetmDNMcg", false,
+    },
+    {
+      shift = 29, mask = 3,
+      [0] = "csinc|cinc|csetDNMcg", false, "csneg|cnegDNMcg", false,
+    }
+  }
+}
+
+local map_data1s = { -- Data processing, 1 source.
+  shift = 29, mask = 1,
+  [0] = {
+    shift = 31, mask = 1,
+    [0] = {
+      shift = 10, mask = 0x7ff,
+      [0] = "rbitDNg", "rev16DNg", "revDNw", false, "clzDNg", "clsDNg"
+    },
+    {
+      shift = 10, mask = 0x7ff,
+      [0] = "rbitDNg", "rev16DNg", "rev32DNx", "revDNx", "clzDNg", "clsDNg"
+    }
+  }
+}
+
+local map_data2s = { -- Data processing, 2 sources.
+  shift = 29, mask = 1,
+  [0] = {
+    shift = 10, mask = 63,
+    false, "udivDNMg", "sdivDNMg", false, false, false, false, "lslDNMg",
+    "lsrDNMg", "asrDNMg", "rorDNMg"
+  }
+}
+
+local map_data3s = { -- Data processing, 3 sources.
+  shift = 29, mask = 7,
+  [0] = {
+    shift = 21, mask = 7,
+    [0] = {
+      shift = 15, mask = 1,
+      [0] = "madd|mulDNMA0g", "msub|mnegDNMA0g"
+    }
+  }, false, false, false,
+  {
+    shift = 15, mask = 1,
+    [0] = {
+      shift = 21, mask = 7,
+      [0] = "madd|mulDNMA0g", "smaddl|smullDxNMwA0x", "smulhDNMx", false,
+      false, "umaddl|umullDxNMwA0x", "umulhDNMx"
+    },
+    {
+      shift = 21, mask = 7,
+      [0] = "msub|mnegDNMA0g", "smsubl|smneglDxNMwA0x", false, false,
+      false, "umsubl|umneglDxNMwA0x"
+    }
+  }
+}
+
+local map_datar = { -- Data processing, register.
+  shift = 28, mask = 1,
+  [0] = {
+    shift = 24, mask = 1,
+    [0] = map_logsr,
+    {
+      shift = 21, mask = 1,
+      [0] = map_addsubsh, map_addsubex
+    }
+  },
+  {
+    shift = 21, mask = 15,
+    [0] = map_addsubc, false, map_ccomp, false, map_csel, false,
+    {
+      shift = 30, mask = 1,
+      [0] = map_data2s, map_data1s
+    },
+    false, map_data3s, map_data3s, map_data3s, map_data3s, map_data3s,
+    map_data3s, map_data3s, map_data3s
+  }
+}
+
+local map_lrl = { -- Load register, literal.
+  shift = 26, mask = 1,
+  [0] = {
+    shift = 30, mask = 3,
+    [0] = "ldrDwB", "ldrDxB", "ldrswDxB"
+  },
+  {
+    shift = 30, mask = 3,
+    [0] = "ldrDsB", "ldrDdB"
+  }
+}
+
+local map_lsriind = { -- Load/store register, immediate pre/post-indexed.
+  shift = 30, mask = 3,
+  [0] = {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strbDwzL", "ldrbDwzL", "ldrsbDxzL", "ldrsbDwzL"
+    }
+  },
+  {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strhDwzL", "ldrhDwzL", "ldrshDxzL", "ldrshDwzL"
+    }
+  },
+  {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strDwzL", "ldrDwzL", "ldrswDxzL"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "strDszL", "ldrDszL"
+    }
+  },
+  {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 22, mask = 3,
+      [0] = "strDxzL", "ldrDxzL"
+    },
+    {
+      shift = 22, mask = 3,
+      [0] = "strDdzL", "ldrDdzL"
+    }
+  }
+}
+
+local map_lsriro = {
+  shift = 21, mask = 1,
+  [0] = {  -- Load/store register immediate.
+    shift = 10, mask = 3,
+    [0] = { -- Unscaled immediate.
+      shift = 26, mask = 1,
+      [0] = {
+	shift = 30, mask = 3,
+	[0] = {
+	  shift = 22, mask = 3,
+	  [0] = "sturbDwK", "ldurbDwK"
+	},
+	{
+	  shift = 22, mask = 3,
+	  [0] = "sturhDwK", "ldurhDwK"
+	},
+	{
+	  shift = 22, mask = 3,
+	  [0] = "sturDwK", "ldurDwK"
+	},
+	{
+	  shift = 22, mask = 3,
+	  [0] = "sturDxK", "ldurDxK"
+	}
+      }
+    }, map_lsriind, false, map_lsriind
+  },
+  {  -- Load/store register, register offset.
+    shift = 10, mask = 3,
+    [2] = {
+      shift = 26, mask = 1,
+      [0] = {
+	shift = 30, mask = 3,
+	[1] = {
+	  shift = 22, mask = 3,
+	  [0] = "strhDwO", "ldrhDwO", "ldrshDwO", "ldrshDxO"
+	},
+	[2] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDwO", "ldrDwO", "ldrswDxO"
+	},
+	[3] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDxO", "ldrDxO"
+	}
+      },
+      {
+	shift = 30, mask = 3,
+	[2] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDsO", "ldrDsO"
+	},
+	[3] = {
+	  shift = 22, mask = 3,
+	  [0] = "strDdO", "ldrDdO"
+	}
+      }
+    }
+  }
+}
+
+local map_lsp = { -- Load/store register pair, offset.
+  shift = 22, mask = 1,
+  [0] = {
+    shift = 30, mask = 3,
+    [0] = {
+      shift = 26, mask = 1,
+      [0] = "stpDzAzwP", "stpDzAzsP",
+    },
+    {
+      shift = 26, mask = 1,
+      "stpDzAzdP"
+    },
+    {
+      shift = 26, mask = 1,
+      [0] = "stpDzAzxP"
+    }
+  },
+  {
+    shift = 30, mask = 3,
+    [0] = {
+      shift = 26, mask = 1,
+      [0] = "ldpDzAzwP", "ldpDzAzsP",
+    },
+    {
+      shift = 26, mask = 1,
+      [0] = "ldpswDAxP", "ldpDzAzdP"
+    },
+    {
+      shift = 26, mask = 1,
+      [0] = "ldpDzAzxP"
+    }
+  }
+}
+
+local map_ls = { -- Loads and stores.
+  shift = 24, mask = 0x31,
+  [0x10] = map_lrl, [0x30] = map_lsriro,
+  [0x20] = {
+    shift = 23, mask = 3,
+    map_lsp, map_lsp, map_lsp
+  },
+  [0x21] = {
+    shift = 23, mask = 3,
+    map_lsp, map_lsp, map_lsp
+  },
+  [0x31] = {
+    shift = 26, mask = 1,
+    [0] = {
+      shift = 30, mask = 3,
+      [0] = {
+	shift = 22, mask = 3,
+	[0] = "strbDwzU", "ldrbDwzU"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "strhDwzU", "ldrhDwzU"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "strDwzU", "ldrDwzU"
+      },
+      {
+	shift = 22, mask = 3,
+	[0] = "strDxzU", "ldrDxzU"
+      }
+    },
+    {
+      shift = 30, mask = 3,
+      [2] = {
+	shift = 22, mask = 3,
+	[0] = "strDszU", "ldrDszU"
+      },
+      [3] = {
+	shift = 22, mask = 3,
+	[0] = "strDdzU", "ldrDdzU"
+      }
+    }
+  },
+}
+
+local map_datafp = { -- Data processing, SIMD and FP.
+  shift = 28, mask = 7,
+  { -- 001
+    shift = 24, mask = 1,
+    [0] = {
+      shift = 21, mask = 1,
+      {
+	shift = 10, mask = 3,
+	[0] = {
+	  shift = 12, mask = 1,
+	  [0] = {
+	    shift = 13, mask = 1,
+	    [0] = {
+	      shift = 14, mask = 1,
+	      [0] = {
+		shift = 15, mask = 1,
+		[0] = { -- FP/int conversion.
+		  shift = 31, mask = 1,
+		  [0] = {
+		    shift = 16, mask = 0xff,
+		    [0x20] = "fcvtnsDwNs", [0x21] = "fcvtnuDwNs",
+		    [0x22] = "scvtfDsNw", [0x23] = "ucvtfDsNw",
+		    [0x24] = "fcvtasDwNs", [0x25] = "fcvtauDwNs",
+		    [0x26] = "fmovDwNs", [0x27] = "fmovDsNw",
+		    [0x28] = "fcvtpsDwNs", [0x29] = "fcvtpuDwNs",
+		    [0x30] = "fcvtmsDwNs", [0x31] = "fcvtmuDwNs",
+		    [0x38] = "fcvtzsDwNs", [0x39] = "fcvtzuDwNs",
+		    [0x60] = "fcvtnsDwNd", [0x61] = "fcvtnuDwNd",
+		    [0x62] = "scvtfDdNw", [0x63] = "ucvtfDdNw",
+		    [0x64] = "fcvtasDwNd", [0x65] = "fcvtauDwNd",
+		    [0x68] = "fcvtpsDwNd", [0x69] = "fcvtpuDwNd",
+		    [0x70] = "fcvtmsDwNd", [0x71] = "fcvtmuDwNd",
+		    [0x78] = "fcvtzsDwNd", [0x79] = "fcvtzuDwNd"
+		  },
+		  {
+		    shift = 16, mask = 0xff,
+		    [0x20] = "fcvtnsDxNs", [0x21] = "fcvtnuDxNs",
+		    [0x22] = "scvtfDsNx", [0x23] = "ucvtfDsNx",
+		    [0x24] = "fcvtasDxNs", [0x25] = "fcvtauDxNs",
+		    [0x28] = "fcvtpsDxNs", [0x29] = "fcvtpuDxNs",
+		    [0x30] = "fcvtmsDxNs", [0x31] = "fcvtmuDxNs",
+		    [0x38] = "fcvtzsDxNs", [0x39] = "fcvtzuDxNs",
+		    [0x60] = "fcvtnsDxNd", [0x61] = "fcvtnuDxNd",
+		    [0x62] = "scvtfDdNx", [0x63] = "ucvtfDdNx",
+		    [0x64] = "fcvtasDxNd", [0x65] = "fcvtauDxNd",
+		    [0x66] = "fmovDxNd", [0x67] = "fmovDdNx",
+		    [0x68] = "fcvtpsDxNd", [0x69] = "fcvtpuDxNd",
+		    [0x70] = "fcvtmsDxNd", [0x71] = "fcvtmuDxNd",
+		    [0x78] = "fcvtzsDxNd", [0x79] = "fcvtzuDxNd"
+		  }
+		}
+	      },
+	      { -- FP data-processing, 1 source.
+		shift = 31, mask = 1,
+		[0] = {
+		  shift = 22, mask = 3,
+		  [0] = {
+		    shift = 15, mask = 63,
+		    [0] = "fmovDNf", "fabsDNf", "fnegDNf",
+		    "fsqrtDNf", false, "fcvtDdNs", false, false,
+		    "frintnDNf", "frintpDNf", "frintmDNf", "frintzDNf",
+		    "frintaDNf", false, "frintxDNf", "frintiDNf",
+		  },
+		  {
+		    shift = 15, mask = 63,
+		    [0] = "fmovDNf", "fabsDNf", "fnegDNf",
+		    "fsqrtDNf", "fcvtDsNd", false, false, false,
+		    "frintnDNf", "frintpDNf", "frintmDNf", "frintzDNf",
+		    "frintaDNf", false, "frintxDNf", "frintiDNf",
+		  }
+		}
+	      }
+	    },
+	    { -- FP compare.
+	      shift = 31, mask = 1,
+	      [0] = {
+		shift = 14, mask = 3,
+		[0] = {
+		  shift = 23, mask = 1,
+		  [0] = {
+		    shift = 0, mask = 31,
+		    [0] = "fcmpNMf", [8] = "fcmpNZf",
+		    [16] = "fcmpeNMf", [24] = "fcmpeNZf",
+		  }
+		}
+	      }
+	    }
+	  },
+	  { -- FP immediate.
+	    shift = 31, mask = 1,
+	    [0] = {
+	      shift = 5, mask = 31,
+	      [0] = {
+		shift = 23, mask = 1,
+		[0] = "fmovDFf"
+	      }
+	    }
+	  }
+	},
+	{ -- FP conditional compare.
+	  shift = 31, mask = 1,
+	  [0] = {
+	    shift = 23, mask = 1,
+	    [0] = {
+	      shift = 4, mask = 1,
+	      [0] = "fccmpNMVCf", "fccmpeNMVCf"
+	    }
+	  }
+	},
+	{ -- FP data-processing, 2 sources.
+	  shift = 31, mask = 1,
+	  [0] = {
+	    shift = 23, mask = 1,
+	    [0] = {
+	      shift = 12, mask = 15,
+	      [0] = "fmulDNMf", "fdivDNMf", "faddDNMf", "fsubDNMf",
+	      "fmaxDNMf", "fminDNMf", "fmaxnmDNMf", "fminnmDNMf",
+	      "fnmulDNMf"
+	    }
+	  }
+	},
+	{ -- FP conditional select.
+	  shift = 31, mask = 1,
+	  [0] = {
+	    shift = 23, mask = 1,
+	    [0] = "fcselDNMCf"
+	  }
+	}
+      }
+    },
+    { -- FP data-processing, 3 sources.
+      shift = 31, mask = 1,
+      [0] = {
+	shift = 15, mask = 1,
+	[0] = {
+	  shift = 21, mask = 5,
+	  [0] = "fmaddDNMAf", "fnmaddDNMAf"
+	},
+	{
+	  shift = 21, mask = 5,
+	  [0] = "fmsubDNMAf", "fnmsubDNMAf"
+	}
+      }
+    }
+  }
+}
+
+local map_br = { -- Branches, exception generating and system instructions.
+  shift = 29, mask = 7,
+  [0] = "bB",
+  { -- Compare & branch, immediate.
+    shift = 24, mask = 3,
+    [0] = "cbzDBg", "cbnzDBg", "tbzDTBw", "tbnzDTBw"
+  },
+  { -- Conditional branch, immediate.
+    shift = 24, mask = 3,
+    [0] = {
+      shift = 4, mask = 1,
+      [0] = {
+	shift = 0, mask = 15,
+	[0] = "beqB", "bneB", "bhsB", "bloB", "bmiB", "bplB", "bvsB", "bvcB",
+	"bhiB", "blsB", "bgeB", "bltB", "bgtB", "bleB", "balB"
+      }
+    }
+  }, false, "blB",
+  { -- Compare & branch, immediate.
+    shift = 24, mask = 3,
+    [0] = "cbzDBg", "cbnzDBg", "tbzDTBx", "tbnzDTBx"
+  },
+  {
+    shift = 24, mask = 3,
+    [0] = { -- Exception generation.
+      shift = 0, mask = 0xe0001f,
+      [0x200000] = "brkW"
+    },
+    { -- System instructions.
+      shift = 0, mask = 0x3fffff,
+      [0x03201f] = "nop"
+    },
+    { -- Unconditional branch, register.
+      shift = 0, mask = 0xfffc1f,
+      [0x1f0000] = "brNx", [0x3f0000] = "blrNx",
+      [0x5f0000] = "retNx"
+    },
+  }
+}
+
+local map_init = {
+  shift = 25, mask = 15,
+  [0] = false, false, false, false, map_ls, map_datar, map_ls, map_datafp,
+  map_datai, map_datai, map_br, map_br, map_ls, map_datar, map_ls, map_datafp
+}
+
+------------------------------------------------------------------------------
+
+local map_regs = { x = {}, w = {}, d = {}, s = {} }
+
+for i=0,30 do
+  map_regs.x[i] = "x"..i
+  map_regs.w[i] = "w"..i
+  map_regs.d[i] = "d"..i
+  map_regs.s[i] = "s"..i
+end
+map_regs.x[31] = "sp"
+map_regs.w[31] = "wsp"
+map_regs.d[31] = "d31"
+map_regs.s[31] = "s31"
+
+local map_cond = {
+  [0] = "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
+  "hi", "ls", "ge", "lt", "gt", "le", "al",
+}
+
+local map_shift = { [0] = "lsl", "lsr", "asr", }
+
+local map_extend = {
+  [0] = "uxtb", "uxth", "uxtw", "uxtx", "sxtb", "sxth", "sxtw", "sxtx",
+}
+
+------------------------------------------------------------------------------
+
+-- Output a nicely formatted line with an opcode and operands.
+local function putop(ctx, text, operands)
+  local pos = ctx.pos
+  local extra = ""
+  if ctx.rel then
+    local sym = ctx.symtab[ctx.rel]
+    if sym then
+      extra = "\t->"..sym
+    end
+  end
+  if ctx.hexdump > 0 then
+    ctx.out(format("%08x  %s  %-5s %s%s\n",
+      ctx.addr+pos, tohex(ctx.op), text, concat(operands, ", "), extra))
+  else
+    ctx.out(format("%08x  %-5s %s%s\n",
+      ctx.addr+pos, text, concat(operands, ", "), extra))
+  end
+  ctx.pos = pos + 4
+end
+
+-- Fallback for unknown opcodes.
+local function unknown(ctx)
+  return putop(ctx, ".long", { "0x"..tohex(ctx.op) })
+end
+
+local function match_reg(p, pat, regnum)
+  return map_regs[match(pat, p.."%w-([xwds])")][regnum]
+end
+
+local function fmt_hex32(x)
+  if x < 0 then
+    return tohex(x)
+  else
+    return format("%x", x)
+  end
+end
+
+local imm13_rep = { 0x55555555, 0x11111111, 0x01010101, 0x00010001, 0x00000001 }
+
+local function decode_imm13(op)
+  local imms = band(rshift(op, 10), 63)
+  local immr = band(rshift(op, 16), 63)
+  if band(op, 0x00400000) == 0 then
+    local len = 5
+    if imms >= 56 then
+      if imms >= 60 then len = 1 else len = 2 end
+    elseif imms >= 48 then len = 3 elseif imms >= 32 then len = 4 end
+    local l = lshift(1, len)-1
+    local s = band(imms, l)
+    local r = band(immr, l)
+    local imm = ror(rshift(-1, 31-s), r)
+    if len ~= 5 then imm = band(imm, lshift(1, l)-1) + rshift(imm, 31-l) end
+    imm = imm * imm13_rep[len]
+    local ix = fmt_hex32(imm)
+    if rshift(op, 31) ~= 0 then
+      return ix..tohex(imm)
+    else
+      return ix
+    end
+  else
+    local lo, hi = -1, 0
+    if imms < 32 then lo = rshift(-1, 31-imms) else hi = rshift(-1, 63-imms) end
+    if immr ~= 0 then
+      lo, hi = ror(lo, immr), ror(hi, immr)
+      local x = immr == 32 and 0 or band(bxor(lo, hi), lshift(-1, 32-immr))
+      lo, hi = bxor(lo, x), bxor(hi, x)
+      if immr >= 32 then lo, hi = hi, lo end
+    end
+    if hi ~= 0 then
+      return fmt_hex32(hi)..tohex(lo)
+    else
+      return fmt_hex32(lo)
+    end
+  end
+end
+
+local function parse_immpc(op, name)
+  if name == "b" or name == "bl" then
+    return arshift(lshift(op, 6), 4)
+  elseif name == "adr" or name == "adrp" then
+    local immlo = band(rshift(op, 29), 3)
+    local immhi = lshift(arshift(lshift(op, 8), 13), 2)
+    return bor(immhi, immlo)
+  elseif name == "tbz" or name == "tbnz" then
+    return lshift(arshift(lshift(op, 13), 18), 2)
+  else
+    return lshift(arshift(lshift(op, 8), 13), 2)
+  end
+end
+
+local function parse_fpimm8(op)
+  local sign = band(op, 0x100000) == 0 and 1 or -1
+  local exp = bxor(rshift(arshift(lshift(op, 12), 5), 24), 0x80) - 131
+  local frac = 16+band(rshift(op, 13), 15)
+  return sign * frac * 2^exp
+end
+
+local function prefer_bfx(sf, uns, imms, immr)
+  if imms < immr or imms == 31 or imms == 63 then
+    return false
+  end
+  if immr == 0 then
+    if sf == 0 and (imms == 7 or imms == 15) then
+      return false
+    end
+    if sf ~= 0 and uns == 0 and (imms == 7 or imms == 15 or imms == 31) then
+      return false
+    end
+  end
+  return true
+end
+
+-- Disassemble a single instruction.
+local function disass_ins(ctx)
+  local pos = ctx.pos
+  local b0, b1, b2, b3 = byte(ctx.code, pos+1, pos+4)
+  local op = bor(lshift(b3, 24), lshift(b2, 16), lshift(b1, 8), b0)
+  local operands = {}
+  local suffix = ""
+  local last, name, pat
+  local vr
+  local map_reg
+  ctx.op = op
+  ctx.rel = nil
+  last = nil
+  local opat
+  opat = map_init[band(rshift(op, 25), 15)]
+  while type(opat) ~= "string" do
+    if not opat then return unknown(ctx) end
+    opat = opat[band(rshift(op, opat.shift), opat.mask)] or opat._
+  end
+  name, pat = match(opat, "^([a-z0-9]*)(.*)")
+  local altname, pat2 = match(pat, "|([a-z0-9_.|]*)(.*)")
+  if altname then pat = pat2 end
+  if sub(pat, 1, 1) == "." then
+    local s2, p2 = match(pat, "^([a-z0-9.]*)(.*)")
+    suffix = suffix..s2
+    pat = p2
+  end
+
+  local rt = match(pat, "[gf]")
+  if rt then
+    if rt == "g" then
+      map_reg = band(op, 0x80000000) ~= 0 and map_regs.x or map_regs.w
+    else
+      map_reg = band(op, 0x400000) ~= 0 and map_regs.d or map_regs.s
+    end
+  end
+
+  local second0, immr
+
+  for p in gmatch(pat, ".") do
+    local x = nil
+    if p == "D" then
+      local regnum = band(op, 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "N" then
+      local regnum = band(rshift(op, 5), 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "M" then
+      local regnum = band(rshift(op, 16), 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "A" then
+      local regnum = band(rshift(op, 10), 31)
+      x = rt and map_reg[regnum] or match_reg(p, pat, regnum)
+    elseif p == "B" then
+      local addr = ctx.addr + pos + parse_immpc(op, name)
+      ctx.rel = addr
+      x = "0x"..tohex(addr)
+    elseif p == "T" then
+      x = bor(band(rshift(op, 26), 32), band(rshift(op, 19), 31))
+    elseif p == "V" then
+      x = band(op, 15)
+    elseif p == "C" then
+      x = map_cond[band(rshift(op, 12), 15)]
+    elseif p == "c" then
+      local rn = band(rshift(op, 5), 31)
+      local rm = band(rshift(op, 16), 31)
+      local cond = band(rshift(op, 12), 15)
+      local invc = bxor(cond, 1)
+      x = map_cond[cond]
+      if altname and cond ~= 14 and cond ~= 15 then
+	local a1, a2 = match(altname, "([^|]*)|(.*)")
+	if rn == rm then
+	  local n = #operands
+	  operands[n] = nil
+	  x = map_cond[invc]
+	  if rn ~= 31 then
+	    if a1 then name = a1 else name = altname end
+	  else
+	    operands[n-1] = nil
+	    name = a2
+	  end
+	end
+      end
+    elseif p == "W" then
+      x = band(rshift(op, 5), 0xffff)
+    elseif p == "Y" then
+      x = band(rshift(op, 5), 0xffff)
+      local hw = band(rshift(op, 21), 3)
+      if altname and (hw == 0 or x ~= 0) then
+	name = altname
+      end
+    elseif p == "L" then
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local imm9 = arshift(lshift(op, 11), 23)
+      if band(op, 0x800) ~= 0 then
+	x = "["..rn..", #"..imm9.."]!"
+      else
+	x = "["..rn.."], #"..imm9
+      end
+    elseif p == "U" then
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local sz = band(rshift(op, 30), 3)
+      local imm12 = lshift(arshift(lshift(op, 10), 20), sz)
+      if imm12 ~= 0 then
+	x = "["..rn..", #"..imm12.."]"
+      else
+	x = "["..rn.."]"
+      end
+    elseif p == "K" then
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local imm9 = arshift(lshift(op, 11), 23)
+      if imm9 ~= 0 then
+	x = "["..rn..", #"..imm9.."]"
+      else
+	x = "["..rn.."]"
+      end
+    elseif p == "O" then
+      local rn, rm = map_regs.x[band(rshift(op, 5), 31)]
+      local m = band(rshift(op, 13), 1)
+      if m == 0 then
+	rm = map_regs.w[band(rshift(op, 16), 31)]
+      else
+	rm = map_regs.x[band(rshift(op, 16), 31)]
+      end
+      x = "["..rn..", "..rm
+      local opt = band(rshift(op, 13), 7)
+      local s = band(rshift(op, 12), 1)
+      local sz = band(rshift(op, 30), 3)
+      -- extension to be applied
+      if opt == 3 then
+       if s == 0 then x = nil
+       else x = x..", lsl #"..sz.."]" end
+      elseif opt == 2 or opt == 6 or opt == 7 then
+	if s == 0 then x = x..", "..map_extend[opt].."]"
+	else x = x..", "..map_extend[opt].." #"..sz.."]" end
+      else
+	x = x.."]"
+      end
+    elseif p == "P" then
+      local opcv, sh = rshift(op, 26), 2
+      if opcv >= 0x2a then sh = 4 elseif opcv >= 0x1b then sh = 3 end
+      local imm7 = lshift(arshift(lshift(op, 10), 25), sh)
+      local rn = map_regs.x[band(rshift(op, 5), 31)]
+      local ind = band(rshift(op, 23), 3)
+      if ind == 1 then
+	x = "["..rn.."], #"..imm7
+      elseif ind == 2 then
+	if imm7 == 0 then
+	  x = "["..rn.."]"
+	else
+	  x = "["..rn..", #"..imm7.."]"
+	end
+      elseif ind == 3 then
+	x = "["..rn..", #"..imm7.."]!"
+      end
+    elseif p == "I" then
+      local shf = band(rshift(op, 22), 3)
+      local imm12 = band(rshift(op, 10), 0x0fff)
+      local n = #operands
+      local rn, rd = band(rshift(op, 5), 31), band(op, 31)
+      if altname == "mov" and shf == 0 and imm12 == 0 and (rn == 31 or rd == 31) then
+	name = altname
+	x = nil
+      elseif shf == 0 then
+	x = imm12
+      elseif shf == 1 then
+	x = imm12..", lsl #12"
+      end
+    elseif p == "i" then
+      x = "#0x"..decode_imm13(op)
+    elseif p == "1" then
+      immr = band(rshift(op, 16), 63)
+      x = immr
+    elseif p == "2" then
+      x = band(rshift(op, 10), 63)
+      if altname then
+	local a1, a2, a3, a4, a5, a6 =
+	  match(altname, "([^|]*)|([^|]*)|([^|]*)|([^|]*)|([^|]*)|(.*)")
+	local sf = band(rshift(op, 26), 32)
+	local uns = band(rshift(op, 30), 1)
+	if prefer_bfx(sf, uns, x, immr) then
+	  name = a2
+	  x = x - immr + 1
+	elseif immr == 0 and x == 7 then
+	  local n = #operands
+	  operands[n] = nil
+	  if sf ~= 0 then
+	    operands[n-1] = gsub(operands[n-1], "x", "w")
+	  end
+	  last = operands[n-1]
+	  name = a6
+	  x = nil
+	elseif immr == 0 and x == 15 then
+	  local n = #operands
+	  operands[n] = nil
+	  if sf ~= 0 then
+	    operands[n-1] = gsub(operands[n-1], "x", "w")
+	  end
+	  last = operands[n-1]
+	  name = a5
+	  x = nil
+	elseif x == 31 or x == 63 then
+	  if x == 31 and immr == 0 and name == "sbfm" then
+	    name = a4
+	    local n = #operands
+	    operands[n] = nil
+	    if sf ~= 0 then
+	      operands[n-1] = gsub(operands[n-1], "x", "w")
+	    end
+	    last = operands[n-1]
+	  else
+	    name = a3
+	  end
+	  x = nil
+	elseif band(x, 31) ~= 31 and immr == x+1 and name == "ubfm" then
+	  name = a4
+	  last = "#"..(sf+32 - immr)
+	  operands[#operands] = last
+	  x = nil
+	elseif x < immr then
+	  name = a1
+	  last = "#"..(sf+32 - immr)
+	  operands[#operands] = last
+	  x = x + 1
+	end
+      end
+    elseif p == "3" then
+      x = band(rshift(op, 10), 63)
+      if altname then
+	local a1, a2 = match(altname, "([^|]*)|(.*)")
+	if x < immr then
+	  name = a1
+	  local sf = band(rshift(op, 26), 32)
+	  last = "#"..(sf+32 - immr)
+	  operands[#operands] = last
+	  x = x + 1
+	elseif x >= immr then
+	  name = a2
+	  x = x - immr + 1
+	end
+      end
+    elseif p == "4" then
+      x = band(rshift(op, 10), 63)
+      local rn = band(rshift(op, 5), 31)
+      local rm = band(rshift(op, 16), 31)
+      if altname and rn == rm then
+	local n = #operands
+	operands[n] = nil
+	last = operands[n-1]
+	name = altname
+      end
+    elseif p == "5" then
+      x = band(rshift(op, 16), 31)
+    elseif p == "S" then
+      x = band(rshift(op, 10), 63)
+      if x == 0 then x = nil
+      else x = map_shift[band(rshift(op, 22), 3)].." #"..x end
+    elseif p == "X" then
+      local opt = band(rshift(op, 13), 7)
+      -- Width specifier .
+      if opt ~= 3 and opt ~= 7 then
+	last = map_regs.w[band(rshift(op, 16), 31)]
+	operands[#operands] = last
+      end
+      x = band(rshift(op, 10), 7)
+      -- Extension.
+      if opt == 2 + band(rshift(op, 31), 1) and
+	 band(rshift(op, second0 and 5 or 0), 31) == 31 then
+	if x == 0 then x = nil
+	else x = "lsl #"..x end
+      else
+	if x == 0 then x = map_extend[band(rshift(op, 13), 7)]
+	else x = map_extend[band(rshift(op, 13), 7)].." #"..x end
+      end
+    elseif p == "R" then
+      x = band(rshift(op,21), 3)
+      if x == 0 then x = nil
+      else x = "lsl #"..x*16 end
+    elseif p == "z" then
+      local n = #operands
+      if operands[n] == "sp" then operands[n] = "xzr"
+      elseif operands[n] == "wsp" then operands[n] = "wzr"
+      end
+    elseif p == "Z" then
+      x = 0
+    elseif p == "F" then
+      x = parse_fpimm8(op)
+    elseif p == "g" or p == "f" or p == "x" or p == "w" or
+	   p == "d" or p == "s" then
+      -- These are handled in D/N/M/A.
+    elseif p == "0" then
+      if last == "sp" or last == "wsp" then
+	local n = #operands
+	operands[n] = nil
+	last = operands[n-1]
+	if altname then
+	  local a1, a2 = match(altname, "([^|]*)|(.*)")
+	  if not a1 then
+	    name = altname
+	  elseif second0 then
+	    name, altname = a2, a1
+	  else
+	    name, altname = a1, a2
+	  end
+	end
+      end
+      second0 = true
+    else
+      assert(false)
+    end
+    if x then
+      last = x
+      if type(x) == "number" then x = "#"..x end
+      operands[#operands+1] = x
+    end
+  end
+
+  return putop(ctx, name..suffix, operands)
+end
+
+------------------------------------------------------------------------------
+
+-- Disassemble a block of code.
+local function disass_block(ctx, ofs, len)
+  if not ofs then ofs = 0 end
+  local stop = len and ofs+len or #ctx.code
+  ctx.pos = ofs
+  ctx.rel = nil
+  while ctx.pos < stop do disass_ins(ctx) end
+end
+
+-- Extended API: create a disassembler context. Then call ctx:disass(ofs, len).
+local function create(code, addr, out)
+  local ctx = {}
+  ctx.code = code
+  ctx.addr = addr or 0
+  ctx.out = out or io.write
+  ctx.symtab = {}
+  ctx.disass = disass_block
+  ctx.hexdump = 8
+  return ctx
+end
+
+-- Simple API: disassemble code (a string) at address and output via out.
+local function disass(code, addr, out)
+  create(code, addr, out):disass()
+end
+
+-- Return register name for RID.
+local function regname(r)
+  if r < 32 then return map_regs.x[r] end
+  return map_regs.d[r-32]
+end
+
+-- Public module functions.
+return {
+  create = create,
+  disass = disass,
+  regname = regname
+}
+
diff --git a/src/lj_arch.h b/src/lj_arch.h
index cc5a0a66..3df602e3 100644
--- a/src/lj_arch.h
+++ b/src/lj_arch.h
@@ -226,7 +226,6 @@
 #define LJ_TARGET_UNIFYROT	2	/* Want only IR_BROR. */
 #define LJ_TARGET_GC64		1
 #define LJ_ARCH_NUMMODE		LJ_NUMMODE_DUAL
-#define LJ_ARCH_NOJIT		1	/* NYI */
 
 #define LJ_ARCH_VERSION		80
 
diff --git a/src/lj_asm.c b/src/lj_asm.c
index 7ce58924..2cb5abea 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -171,6 +171,8 @@ IRFLDEF(FLOFS)
 #include "lj_emit_x86.h"
 #elif LJ_TARGET_ARM
 #include "lj_emit_arm.h"
+#elif LJ_TARGET_ARM64
+#include "lj_emit_arm64.h"
 #elif LJ_TARGET_PPC
 #include "lj_emit_ppc.h"
 #elif LJ_TARGET_MIPS
@@ -1563,6 +1565,8 @@ static void asm_loop(ASMState *as)
 #include "lj_asm_x86.h"
 #elif LJ_TARGET_ARM
 #include "lj_asm_arm.h"
+#elif LJ_TARGET_ARM64
+#include "lj_asm_arm64.h"
 #elif LJ_TARGET_PPC
 #include "lj_asm_ppc.h"
 #elif LJ_TARGET_MIPS
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h
new file mode 100644
index 00000000..0a2f5306
--- /dev/null
+++ b/src/lj_asm_arm64.h
@@ -0,0 +1,1823 @@
+/*
+** ARM64 IR assembler (SSA IR -> machine code).
+** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
+**
+** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+** Sponsored by Cisco Systems, Inc.
+*/
+
+/* -- Register allocator extensions --------------------------------------- */
+
+/* Allocate a register with a hint. */
+static Reg ra_hintalloc(ASMState *as, IRRef ref, Reg hint, RegSet allow)
+{
+  Reg r = IR(ref)->r;
+  if (ra_noreg(r)) {
+    if (!ra_hashint(r) && !iscrossref(as, ref))
+      ra_sethint(IR(ref)->r, hint);  /* Propagate register hint. */
+    r = ra_allocref(as, ref, allow);
+  }
+  ra_noweak(as, r);
+  return r;
+}
+
+/* Allocate two source registers for three-operand instructions. */
+static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow)
+{
+  IRIns *irl = IR(ir->op1), *irr = IR(ir->op2);
+  Reg left = irl->r, right = irr->r;
+  if (ra_hasreg(left)) {
+    ra_noweak(as, left);
+    if (ra_noreg(right))
+      right = ra_allocref(as, ir->op2, rset_exclude(allow, left));
+    else
+      ra_noweak(as, right);
+  } else if (ra_hasreg(right)) {
+    ra_noweak(as, right);
+    left = ra_allocref(as, ir->op1, rset_exclude(allow, right));
+  } else if (ra_hashint(right)) {
+    right = ra_allocref(as, ir->op2, allow);
+    left = ra_alloc1(as, ir->op1, rset_exclude(allow, right));
+  } else {
+    left = ra_allocref(as, ir->op1, allow);
+    right = ra_alloc1(as, ir->op2, rset_exclude(allow, left));
+  }
+  return left | (right << 8);
+}
+
+/* -- Guard handling ------------------------------------------------------ */
+
+/* Generate an exit stub group at the bottom of the reserved MCode memory. */
+static MCode *asm_exitstub_gen(ASMState *as, ExitNo group)
+{
+  MCode *mxp = as->mcbot;
+  int i;
+  if (mxp + 3*4+4*EXITSTUBS_PER_GROUP >= as->mctop)
+    asm_mclimit(as);
+  /* str lr, [sp]; bl ->vm_exit_handler; .long group. */
+  *mxp++ = A64I_STRx | A64F_D(RID_LR) | A64F_N(RID_SP);
+  *mxp = A64I_BL | (((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu);
+  mxp++;
+  *mxp++ = group*EXITSTUBS_PER_GROUP;
+  for (i = 0; i < EXITSTUBS_PER_GROUP; i++)
+    *mxp++ = A64I_B | ((-3-i)&0x03ffffffu);
+  lj_mcode_sync(as->mcbot, mxp);
+  lj_mcode_commitbot(as->J, mxp);
+  as->mcbot = mxp;
+  as->mclim = as->mcbot + MCLIM_REDZONE;
+  return mxp - EXITSTUBS_PER_GROUP;
+}
+
+/* Setup all needed exit stubs. */
+static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
+{
+  ExitNo i;
+  if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR)
+    lj_trace_err(as->J, LJ_TRERR_SNAPOV);
+  for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++)
+    if (as->J->exitstubgroup[i] == NULL)
+      as->J->exitstubgroup[i] = asm_exitstub_gen(as, i);
+}
+
+/* Emit conditional branch to exit for guard. */
+static void asm_guardcc(ASMState *as, A64CC cc)
+{
+  MCode *target = exitstub_addr(as->J, as->snapno);
+  MCode *p = as->mcp;
+  if (LJ_UNLIKELY(p == as->invmcp)) {
+    as->loopinv = 1;
+    *p = A64I_BL | ((target-p) & 0x03ffffffu);
+    emit_cond_branch(as, cc^1, p-1);
+    return;
+  }
+  /* No conditional calls. Emit b.cc/bl instead. */
+  /* That's a bad idea. NYI: emit per-trace exit stubs instead, see PPC. */
+  emit_branch(as, A64I_BL, target);
+  emit_cond_branch(as, cc^1, p);
+}
+
+/* -- Operand fusion ------------------------------------------------------ */
+
+/* Limit linear search to this distance. Avoids O(n^2) behavior. */
+#define CONFLICT_SEARCH_LIM	31
+
+static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
+{
+  if (irref_isk(ref)) {
+    IRIns *ir = IR(ref);
+    if (ir->o == IR_KNULL || !irt_is64(ir->t)) {
+      *k = ir->i;
+      return 1;
+    } else if (checki32((int64_t)ir_k64(ir)->u64)) {
+      *k = (int32_t)ir_k64(ir)->u64;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/* Check if there's no conflicting instruction between curins and ref. */
+static int noconflict(ASMState *as, IRRef ref, IROp conflict)
+{
+  IRIns *ir = as->ir;
+  IRRef i = as->curins;
+  if (i > ref + CONFLICT_SEARCH_LIM)
+    return 0;  /* Give up, ref is too far away. */
+  while (--i > ref)
+    if (ir[i].o == conflict)
+      return 0;  /* Conflict found. */
+  return 1;  /* Ok, no conflict. */
+}
+
+/* Fuse the array base of colocated arrays. */
+static int32_t asm_fuseabase(ASMState *as, IRRef ref)
+{
+  IRIns *ir = IR(ref);
+  if (ir->o == IR_TNEW && ir->op1 <= LJ_MAX_COLOSIZE &&
+      !neverfuse(as) && noconflict(as, ref, IR_NEWREF))
+    return (int32_t)sizeof(GCtab);
+  return 0;
+}
+
+#define FUSE_REG	0x40000000
+
+/* Fuse array/hash/upvalue reference into register+offset operand. */
+static Reg asm_fuseahuref(ASMState *as, IRRef ref, int32_t *ofsp, RegSet allow,
+			  A64Ins ins)
+{
+  IRIns *ir = IR(ref);
+  if (ra_noreg(ir->r)) {
+    if (ir->o == IR_AREF) {
+      if (mayfuse(as, ref)) {
+	if (irref_isk(ir->op2)) {
+	  IRRef tab = IR(ir->op1)->op1;
+	  int32_t ofs = asm_fuseabase(as, tab);
+	  IRRef refa = ofs ? tab : ir->op1;
+	  ofs += 8*IR(ir->op2)->i;
+	  if (emit_checkofs(ins, ofs)) {
+	    *ofsp = ofs;
+	    return ra_alloc1(as, refa, allow);
+	  }
+	} else {
+	  Reg base = ra_alloc1(as, ir->op1, allow);
+	  *ofsp = FUSE_REG|ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base));
+	  return base;
+	}
+      }
+    } else if (ir->o == IR_HREFK) {
+      if (mayfuse(as, ref)) {
+	int32_t ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node));
+	if (emit_checkofs(ins, ofs)) {
+	  *ofsp = ofs;
+	  return ra_alloc1(as, ir->op1, allow);
+	}
+      }
+    } else if (ir->o == IR_UREFC) {
+      if (irref_isk(ir->op1)) {
+	GCfunc *fn = ir_kfunc(IR(ir->op1));
+	GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv;
+	int64_t ofs = glofs(as, &uv->tv);
+	if (emit_checkofs(ins, ofs)) {
+	  *ofsp = (int32_t)ofs;
+	  return RID_GL;
+	}
+      }
+    }
+  }
+  *ofsp = 0;
+  return ra_alloc1(as, ref, allow);
+}
+
+/* Fuse m operand into arithmetic/logic instructions. */
+static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow)
+{
+  IRIns *ir = IR(ref);
+  if (ra_hasreg(ir->r)) {
+    ra_noweak(as, ir->r);
+    return A64F_M(ir->r);
+  } else if (irref_isk(ref)) {
+    uint32_t m;
+    int64_t k = get_k64val(ir);
+    if ((ai & 0x1f000000) == 0x0a000000)
+      m = emit_isk13(k, irt_is64(ir->t));
+    else
+      m = emit_isk12(k);
+    if (m)
+      return m;
+  } else if (mayfuse(as, ref)) {
+    if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR && irref_isk(ir->op2)) ||
+	(ir->o == IR_ADD && ir->op1 == ir->op2)) {
+      A64Shift sh = ir->o == IR_BSHR ? A64SH_LSR :
+		    ir->o == IR_BSAR ? A64SH_ASR : A64SH_LSL;
+      int shift = ir->o == IR_ADD ? 1 :
+		    (IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31));
+      IRIns *irl = IR(ir->op1);
+      if (sh == A64SH_LSL &&
+	  irl->o == IR_CONV &&
+	  irl->op2 == ((IRT_I64<op1)) {
+	Reg m = ra_alloc1(as, irl->op1, allow);
+	return A64F_M(m) | A64F_EXSH(A64EX_SXTW, shift);
+      } else {
+	Reg m = ra_alloc1(as, ir->op1, allow);
+	return A64F_M(m) | A64F_SH(sh, shift);
+      }
+    } else if (ir->o == IR_CONV &&
+	       ir->op2 == ((IRT_I64<op1, allow);
+      return A64F_M(m) | A64F_EX(A64EX_SXTW);
+    }
+  }
+  return A64F_M(ra_allocref(as, ref, allow));
+}
+
+/* Fuse XLOAD/XSTORE reference into load/store operand. */
+static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref,
+			 RegSet allow)
+{
+  IRIns *ir = IR(ref);
+  Reg base;
+  int32_t ofs = 0;
+  if (ra_noreg(ir->r) && canfuse(as, ir)) {
+    if (ir->o == IR_ADD) {
+      if (asm_isk32(as, ir->op2, &ofs) && emit_checkofs(ai, ofs))
+	ref = ir->op1;
+      /* NYI: Fuse add with two registers. */
+    } else if (ir->o == IR_STRREF) {
+      if (asm_isk32(as, ir->op2, &ofs)) {
+	ref = ir->op1;
+      } else if (asm_isk32(as, ir->op1, &ofs)) {
+	ref = ir->op2;
+      } else {
+	/* NYI: Fuse ADD with constant. */
+	Reg rn = ra_alloc1(as, ir->op1, allow);
+	uint32_t m = asm_fuseopm(as, 0, ir->op2, rset_exclude(allow, rn));
+	emit_lso(as, ai, rd, rd, sizeof(GCstr));
+	emit_dn(as, A64I_ADDx^m, rd, rn);
+	return;
+      }
+      ofs += sizeof(GCstr);
+      if (!emit_checkofs(ai, ofs)) {
+	Reg rn = ra_alloc1(as, ref, allow);
+	Reg rm = ra_allock(as, ofs, rset_exclude(allow, rn));
+	emit_dnm(as, (ai ^ 0x01204800), rd, rn, rm);
+	return;
+      }
+    }
+  }
+  base = ra_alloc1(as, ref, allow);
+  emit_lso(as, ai, (rd & 31), base, ofs);
+}
+
+/* -- Calls --------------------------------------------------------------- */
+
+/* Generate a call to a C function. */
+static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
+{
+  uint32_t n, nargs = CCI_XNARGS(ci);
+  int32_t ofs = 0;
+  Reg gpr, fpr = REGARG_FIRSTFPR;
+  if ((void *)ci->func)
+    emit_call(as, (void *)ci->func);
+  for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++)
+    as->cost[gpr] = REGCOST(~0u, ASMREF_L);
+  gpr = REGARG_FIRSTGPR;
+  for (n = 0; n < nargs; n++) { /* Setup args. */
+    IRRef ref = args[n];
+    IRIns *ir = IR(ref);
+    if (ref) {
+      if (irt_isfp(ir->t)) {
+	if (fpr <= REGARG_LASTFPR) {
+	  lua_assert(rset_test(as->freeset, fpr)); /* Must have been evicted. */
+	  ra_leftov(as, fpr, ref);
+	  fpr++;
+	} else {
+	  Reg r = ra_alloc1(as, ref, RSET_FPR);
+	  emit_spstore(as, ir, r, ofs);
+	  ofs += 8;
+	}
+      } else {
+	if (gpr <= REGARG_LASTGPR) {
+	  lua_assert(rset_test(as->freeset, gpr)); /* Must have been evicted. */
+	  ra_leftov(as, gpr, ref);
+	  gpr++;
+	} else {
+	  Reg r = ra_alloc1(as, ref, RSET_GPR);
+	  emit_spstore(as, ir, r, ofs);
+	  ofs += 8;
+	}
+      }
+    }
+  }
+}
+
+/* Setup result reg/sp for call. Evict scratch regs. */
+static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
+{
+  RegSet drop = RSET_SCRATCH;
+  if (ra_hasreg(ir->r))
+    rset_clear(drop, ir->r); /* Dest reg handled below. */
+  ra_evictset(as, drop); /* Evictions must be performed first. */
+  if (ra_used(ir)) {
+    lua_assert(!irt_ispri(ir->t));
+    if (irt_isfp(ir->t)) {
+      if (ci->flags & CCI_CASTU64) {
+	Reg dest = ra_dest(as, ir, RSET_FPR) & 31;
+	emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D_R : A64I_FMOV_S_R,
+		dest, RID_RET);
+      } else {
+	ra_destreg(as, ir, RID_FPRET);
+      }
+    } else {
+      ra_destreg(as, ir, RID_RET);
+    }
+  }
+  UNUSED(ci);
+}
+
+static void asm_callx(ASMState *as, IRIns *ir)
+{
+  IRRef args[CCI_NARGS_MAX*2];
+  CCallInfo ci;
+  IRRef func;
+  IRIns *irf;
+  ci.flags = asm_callx_flags(as, ir);
+  asm_collectargs(as, ir, &ci, args);
+  asm_setupresult(as, ir, &ci);
+  func = ir->op2; irf = IR(func);
+  if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); }
+  if (irref_isk(func)) {  /* Call to constant address. */
+    ci.func = (ASMFunction)(ir_k64(irf)->u64);
+  } else {  /* Need a non-argument register for indirect calls. */
+    Reg freg = ra_alloc1(as, func, RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
+    emit_n(as, A64I_BLR, freg);
+    ci.func = (ASMFunction)(void *)0;
+  }
+  asm_gencall(as, &ci, args);
+}
+
+/* -- Returns ------------------------------------------------------------- */
+
+/* Return to lower frame. Guard that it goes to the right spot. */
+static void asm_retf(ASMState *as, IRIns *ir)
+{
+  Reg base = ra_alloc1(as, REF_BASE, RSET_GPR);
+  void *pc = ir_kptr(IR(ir->op2));
+  int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1));
+  as->topslot -= (BCReg)delta;
+  if ((int32_t)as->topslot < 0) as->topslot = 0;
+  irt_setmark(IR(REF_BASE)->t);  /* Children must not coalesce with BASE reg. */
+  /* Need to force a spill on REF_BASE now to update the stack slot. */
+  emit_lso(as, A64I_STRx, base, RID_SP, ra_spill(as, IR(REF_BASE)));
+  emit_setgl(as, base, jit_base);
+  emit_addptr(as, base, -8*delta);
+  asm_guardcc(as, CC_NE);
+  emit_nm(as, A64I_CMPx, RID_TMP,
+	  ra_allock(as, i64ptr(pc), rset_exclude(RSET_GPR, base)));
+  emit_lso(as, A64I_LDRx, RID_TMP, base, -8);
+}
+
+/* -- Type conversions ---------------------------------------------------- */
+
+static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
+{
+  Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left));
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  asm_guardcc(as, CC_NE);
+  emit_nm(as, A64I_FCMPd, (tmp & 31), (left & 31));
+  emit_dn(as, A64I_FCVT_F64_S32, (tmp & 31), dest);
+  emit_dn(as, A64I_FCVT_S32_F64, dest, (left & 31));
+}
+
+static void asm_tobit(ASMState *as, IRIns *ir)
+{
+  RegSet allow = RSET_FPR;
+  Reg left = ra_alloc1(as, ir->op1, allow);
+  Reg right = ra_alloc1(as, ir->op2, rset_clear(allow, left));
+  Reg tmp = ra_scratch(as, rset_clear(allow, right));
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  emit_dn(as, A64I_FMOV_R_S, dest, (tmp & 31));
+  emit_dnm(as, A64I_FADDd, (tmp & 31), (left & 31), (right & 31));
+}
+
+static void asm_conv(ASMState *as, IRIns *ir)
+{
+  IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK);
+  int st64 = (st == IRT_I64 || st == IRT_U64 || st == IRT_P64);
+  int stfp = (st == IRT_NUM || st == IRT_FLOAT);
+  IRRef lref = ir->op1;
+  lua_assert(irt_type(ir->t) != st);
+  if (irt_isfp(ir->t)) {
+    Reg dest = ra_dest(as, ir, RSET_FPR);
+    if (stfp) {  /* FP to FP conversion. */
+      emit_dn(as, st == IRT_NUM ? A64I_FCVT_F32_F64 : A64I_FCVT_F64_F32,
+	      (dest & 31), (ra_alloc1(as, lref, RSET_FPR) & 31));
+    } else {  /* Integer to FP conversion. */
+      Reg left = ra_alloc1(as, lref, RSET_GPR);
+      A64Ins ai = irt_isfloat(ir->t) ?
+	(((IRT_IS64 >> st) & 1) ?
+	 (st == IRT_I64 ? A64I_FCVT_F32_S64 : A64I_FCVT_F32_U64) :
+	 (st == IRT_INT ? A64I_FCVT_F32_S32 : A64I_FCVT_F32_U32)) :
+	(((IRT_IS64 >> st) & 1) ?
+	 (st == IRT_I64 ? A64I_FCVT_F64_S64 : A64I_FCVT_F64_U64) :
+	 (st == IRT_INT ? A64I_FCVT_F64_S32 : A64I_FCVT_F64_U32));
+      emit_dn(as, ai, (dest & 31), left);
+    }
+  } else if (stfp) {  /* FP to integer conversion. */
+    if (irt_isguard(ir->t)) {
+      /* Checked conversions are only supported from number to int. */
+      lua_assert(irt_isint(ir->t) && st == IRT_NUM);
+      asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR));
+    } else {
+      Reg left = ra_alloc1(as, lref, RSET_FPR);
+      Reg dest = ra_dest(as, ir, RSET_GPR);
+      A64Ins ai = irt_is64(ir->t) ?
+	(st == IRT_NUM ?
+	 (irt_isi64(ir->t) ? A64I_FCVT_S64_F64 : A64I_FCVT_U64_F64) :
+	 (irt_isi64(ir->t) ? A64I_FCVT_S64_F32 : A64I_FCVT_U64_F32)) :
+	(st == IRT_NUM ?
+	 (irt_isint(ir->t) ? A64I_FCVT_S32_F64 : A64I_FCVT_U32_F64) :
+	 (irt_isint(ir->t) ? A64I_FCVT_S32_F32 : A64I_FCVT_U32_F32));
+      emit_dn(as, ai, dest, (left & 31));
+    }
+  } else if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    Reg left = ra_alloc1(as, lref, RSET_GPR);
+    A64Ins ai = st == IRT_I8 ? A64I_SXTBw :
+		st == IRT_U8 ? A64I_UXTBw :
+		st == IRT_I16 ? A64I_SXTHw : A64I_UXTHw;
+    lua_assert(irt_isint(ir->t) || irt_isu32(ir->t));
+    emit_dn(as, ai, dest, left);
+  } else {
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    if (irt_is64(ir->t)) {
+      if (st64 || !(ir->op2 & IRCONV_SEXT)) {
+	/* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */
+	ra_leftov(as, dest, lref);  /* Do nothing, but may need to move regs. */
+      } else {  /* 32 to 64 bit sign extension. */
+	Reg left = ra_alloc1(as, lref, RSET_GPR);
+	emit_dn(as, A64I_SXTW, dest, left);
+      }
+    } else {
+      if (st64) {
+	/* This is either a 32 bit reg/reg mov which zeroes the hiword
+	** or a load of the loword from a 64 bit address.
+	*/
+	Reg left = ra_alloc1(as, lref, RSET_GPR);
+	emit_dm(as, A64I_MOVw, dest, left);
+      } else {  /* 32/32 bit no-op (cast). */
+	ra_leftov(as, dest, lref);  /* Do nothing, but may need to move regs. */
+      }
+    }
+  }
+}
+
+static void asm_strto(ASMState *as, IRIns *ir)
+{
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num];
+  IRRef args[2];
+  Reg dest = 0, tmp;
+  int destused = ra_used(ir);
+  int32_t ofs = 0;
+  ra_evictset(as, RSET_SCRATCH);
+  if (destused) {
+    if (ra_hasspill(ir->s)) {
+      ofs = sps_scale(ir->s);
+      destused = 0;
+      if (ra_hasreg(ir->r)) {
+	ra_free(as, ir->r);
+	ra_modified(as, ir->r);
+	emit_spload(as, ir, ir->r, ofs);
+      }
+    } else {
+      dest = ra_dest(as, ir, RSET_FPR);
+    }
+  }
+  asm_guardcc(as, CC_EQ);
+  if (destused)
+    emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0);
+  emit_n(as, (A64I_CMPw^A64I_K12)|A64F_U12(0), RID_RET);
+  args[0] = ir->op1; /* GCstr *str */
+  args[1] = ASMREF_TMP1; /* TValue *n  */
+  asm_gencall(as, ci, args);
+  tmp = ra_releasetmp(as, ASMREF_TMP1);
+  emit_opk(as, A64I_ADDx, tmp, RID_SP, ofs, RSET_GPR);
+}
+
+/* -- Memory references --------------------------------------------------- */
+
+/* Get pointer to TValue. */
+static void asm_tvptr(ASMState *as, Reg dest, IRRef ref)
+{
+  IRIns *ir = IR(ref);
+  if (irt_isnum(ir->t)) {
+    if (irref_isk(ref)) {
+      /* Use the number constant itself as a TValue. */
+      ra_allockreg(as, i64ptr(ir_knum(ir)), dest);
+    } else {
+      /* Otherwise force a spill and use the spill slot. */
+      emit_opk(as, A64I_ADDx, dest, RID_SP, ra_spill(as, ir), RSET_GPR);
+    }
+  } else {
+    /* Otherwise use g->tmptv to hold the TValue. */
+    RegSet allow = rset_exclude(RSET_GPR, dest);
+    Reg src;
+    if (irref_isk(ref)) {
+      TValue k;
+      lj_ir_kvalue(as->J->L, &k, ir);
+      src = ra_allock(as, k.u64, allow);
+      emit_lso(as, A64I_STRx, src, dest, 0);
+    } else {
+      Reg type;
+      if (irt_ispri(ir->t)) {
+	src = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
+	emit_lso(as, A64I_STRx, src, dest, 0);
+      } else if (irt_isint(ir->t)) {
+	src = ra_alloc1(as, ref, allow);
+	type = ra_allock(as, (int64_t)irt_toitype(ir->t) << 47, allow);
+	emit_lso(as, A64I_STRx, RID_TMP, dest, 0);
+	emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), RID_TMP, type, src);
+      } else {
+	src = ra_alloc1(as, ref, allow);
+	type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+	emit_lso(as, A64I_STRx, RID_TMP, dest, 0);
+	emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), RID_TMP, src, type);
+      }
+    }
+    ra_allockreg(as, i64ptr(&J2G(as->J)->tmptv), dest);
+  }
+}
+
+static void asm_aref(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg idx, base;
+  if (irref_isk(ir->op2)) {
+    IRRef tab = IR(ir->op1)->op1;
+    int32_t ofs = asm_fuseabase(as, tab);
+    IRRef refa = ofs ? tab : ir->op1;
+    uint32_t k = emit_isk12(ofs + 8*IR(ir->op2)->i);
+    if (k) {
+      base = ra_alloc1(as, refa, RSET_GPR);
+      emit_dn(as, A64I_ADDx^k, dest, base);
+      return;
+    }
+  }
+  base = ra_alloc1(as, ir->op1, RSET_GPR);
+  idx = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base));
+  emit_dnm(as, A64I_ADDx | A64F_EXSH(A64EX_UXTW, 3), dest, base, idx);
+}
+
+/* Inlined hash lookup. Specialized for key type and for const keys.
+** The equivalent C code is:
+**   Node *n = hashkey(t, key);
+**   do {
+**     if (lj_obj_equal(&n->key, key)) return &n->val;
+**   } while ((n = nextnode(n)));
+**   return niltv(L);
+*/
+static void asm_href(ASMState *as, IRIns *ir, IROp merge)
+{
+  RegSet allow = RSET_GPR;
+  int destused = ra_used(ir);
+  Reg dest = ra_dest(as, ir, allow);
+  Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest));
+  Reg key = 0, tmp = RID_TMP;
+  IRRef refkey = ir->op2;
+  IRIns *irkey = IR(refkey);
+  int isk = irref_isk(ir->op2);
+  IRType1 kt = irkey->t;
+  uint32_t k = 0;
+  uint32_t khash;
+  MCLabel l_end, l_loop, l_next;
+  rset_clear(allow, tab);
+
+  if (!isk) {
+    key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow);
+    rset_clear(allow, key);
+    if (!irt_isstr(kt)) {
+      tmp = ra_scratch(as, allow);
+      rset_clear(allow, tmp);
+    }
+  } else if (irt_isnum(kt)) {
+    int64_t val = (int64_t)ir_knum(irkey)->u64;
+    if (!(k = emit_isk12(val))) {
+      key = ra_allock(as, val, allow);
+      rset_clear(allow, key);
+    }
+  } else if (!irt_ispri(kt)) {
+    if (!(k = emit_isk12(irkey->i))) {
+      key = ra_alloc1(as, refkey, allow);
+      rset_clear(allow, key);
+    }
+  }
+
+  /* Key not found in chain: jump to exit (if merged) or load niltv. */
+  l_end = emit_label(as);
+  as->invmcp = NULL;
+  if (merge == IR_NE)
+    asm_guardcc(as, CC_AL);
+  else if (destused)
+    emit_loada(as, dest, niltvg(J2G(as->J)));
+
+  /* Follow hash chain until the end. */
+  l_loop = --as->mcp;
+  emit_n(as, A64I_CMPx^A64I_K12^0, dest);
+  emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next));
+  l_next = emit_label(as);
+
+  /* Type and value comparison. */
+  if (merge == IR_EQ)
+    asm_guardcc(as, CC_EQ);
+  else
+    emit_cond_branch(as, CC_EQ, l_end);
+
+  if (irt_isnum(kt)) {
+    if (isk) {
+      /* Assumes -0.0 is already canonicalized to +0.0. */
+      if (k)
+	emit_n(as, A64I_CMPx^k, tmp);
+      else
+	emit_nm(as, A64I_CMPx, key, tmp);
+      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
+    } else {
+      Reg tisnum = ra_allock(as, LJ_TISNUM << 15, allow);
+      Reg ftmp = ra_scratch(as, rset_exclude(RSET_FPR, key));
+      rset_clear(allow, tisnum);
+      emit_nm(as, A64I_FCMPd, key, ftmp);
+      emit_dn(as, A64I_FMOV_D_R, (ftmp & 31), (tmp & 31));
+      emit_cond_branch(as, CC_LO, l_next);
+      emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), tisnum, tmp);
+      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.n));
+    }
+  } else if (irt_isaddr(kt)) {
+    Reg scr;
+    if (isk) {
+      int64_t kk = ((int64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64;
+      scr = ra_allock(as, kk, allow);
+      emit_nm(as, A64I_CMPx, scr, tmp);
+      emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64));
+    } else {
+      scr = ra_scratch(as, allow);
+      emit_nm(as, A64I_CMPx, tmp, scr);
+      emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key.u64));
+    }
+    rset_clear(allow, scr);
+  } else {
+    Reg type, scr;
+    lua_assert(irt_ispri(kt) && !irt_isnil(kt));
+    type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
+    scr = ra_scratch(as, rset_clear(allow, type));
+    rset_clear(allow, scr);
+    emit_nm(as, A64I_CMPw, scr, type);
+    emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key));
+  }
+
+  *l_loop = A64I_BCC | A64F_S19((as->mcp-l_loop) & 0x0007ffffu) | CC_NE;
+  if (!isk && irt_isaddr(kt)) {
+    Reg type = ra_allock(as, (int32_t)irt_toitype(kt), allow);
+    emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, key, type);
+    rset_clear(allow, type);
+  }
+  /* Load main position relative to tab->node into dest. */
+  khash = isk ? ir_khash(irkey) : 1;
+  if (khash == 0) {
+    emit_lso(as, A64I_LDRx, dest, tab, offsetof(GCtab, node));
+  } else {
+    emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 3), dest, tmp, dest);
+    emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 1), dest, dest, dest);
+    emit_lso(as, A64I_LDRx, tmp, tab, offsetof(GCtab, node));
+    if (isk) {
+      Reg tmphash = ra_allock(as, khash, allow);
+      emit_dnm(as, A64I_ANDw, dest, dest, tmphash);
+      emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
+    } else if (irt_isstr(kt)) {
+      /* Fetch of str->hash is cheaper than ra_allock. */
+      emit_dnm(as, A64I_ANDw, dest, dest, tmp);
+      emit_lso(as, A64I_LDRw, tmp, key, offsetof(GCstr, hash));
+      emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask));
+    } else {  /* Must match with hash*() in lj_tab.c. */
+      emit_dnm(as, A64I_ANDw, dest, dest, tmp);
+      emit_lso(as, A64I_LDRw, tmp, tab, offsetof(GCtab, hmask));
+      emit_dnm(as, A64I_SUBw, dest, dest, tmp);
+      emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT3)), tmp, tmp, tmp);
+      emit_dnm(as, A64I_EORw, dest, dest, tmp);
+      emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT2)), dest, dest, dest);
+      emit_dnm(as, A64I_SUBw, tmp, tmp, dest);
+      emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT1)), dest, dest, dest);
+      emit_dnm(as, A64I_EORw, tmp, tmp, dest);
+      if (irt_isnum(kt)) {
+	emit_dnm(as, A64I_ADDw, dest, dest, dest);
+	emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
+	emit_dm(as, A64I_MOVw, tmp, dest);
+	emit_dn(as, A64I_FMOV_R_D, dest, (key & 31));
+      } else {
+	checkmclim(as);
+	emit_dm(as, A64I_MOVw, tmp, key);
+	emit_dnm(as, A64I_EORw, dest, dest,
+		 ra_allock(as, irt_toitype(kt) << 15, allow));
+	emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest);
+	emit_dm(as, A64I_MOVx, dest, key);
+      }
+    }
+  }
+}
+
+/* Hash-table reference with constant key and known slot (IR_HREFK). */
+static void asm_hrefk(ASMState *as, IRIns *ir)
+{
+  IRIns *kslot = IR(ir->op2);
+  IRIns *irkey = IR(kslot->op1);
+  int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node));
+  int32_t kofs = ofs + (int32_t)offsetof(Node, key);
+  int bigofs = !emit_checkofs(A64I_LDRx, ofs);  /* Offset too big for LDR? */
+  RegSet allow = RSET_GPR;
+  Reg dest = (ra_used(ir) || bigofs) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
+  Reg node = ra_alloc1(as, ir->op1, allow);
+  Reg key = ra_scratch(as, rset_clear(allow, node));
+  Reg idx = node;
+  uint64_t k;
+  lua_assert(ofs % sizeof(Node) == 0);
+  rset_clear(allow, key);
+  if (bigofs) {
+    /* Materialize node+ofs into dest, then address the key off dest. */
+    idx = dest;
+    rset_clear(allow, dest);
+    kofs = (int32_t)offsetof(Node, key);
+  } else if (ra_hasreg(dest)) {
+    emit_opk(as, A64I_ADDx, dest, node, ofs, allow);
+  }
+  asm_guardcc(as, CC_NE);  /* Exit the trace if the stored key differs. */
+  /* Build the expected 64 bit key image (tagged pri/GC value or raw num). */
+  if (irt_ispri(irkey->t)) {
+    k = ~((int64_t)~irt_toitype(irkey->t) << 47);
+  } else if (irt_isnum(irkey->t)) {
+    k = ir_knum(irkey)->u64;
+  } else {
+    k = ((uint64_t)irt_toitype(irkey->t) << 47) | (uint64_t)ir_kgc(irkey);
+  }
+  emit_nm(as, A64I_CMPx, key, ra_allock(as, k, allow));
+  emit_lso(as, A64I_LDRx, key, idx, kofs);
+  if (bigofs)
+    emit_opk(as, A64I_ADDx, dest, node, ofs, RSET_GPR);
+}
+
+/* Upvalue reference (IR_UREF*): compute pointer to the upvalue's value. */
+static void asm_uref(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  if (irref_isk(ir->op1)) {
+    /* Constant function: the upvalue's v pointer is known at asm time. */
+    GCfunc *fn = ir_kfunc(IR(ir->op1));
+    MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v;
+    emit_lsptr(as, A64I_LDRx, dest, v);
+  } else {
+    Reg uv = ra_scratch(as, RSET_GPR);
+    Reg func = ra_alloc1(as, ir->op1, RSET_GPR);
+    if (ir->o == IR_UREFC) {
+      /* Guard that the upvalue is closed, then point at uv->tv directly. */
+      asm_guardcc(as, CC_NE);
+      emit_n(as, (A64I_CMPx^A64I_K12) | A64F_U12(1), RID_TMP);
+      emit_opk(as, A64I_ADDx, dest, uv,
+	       (int32_t)offsetof(GCupval, tv), RSET_GPR);
+      emit_lso(as, A64I_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed));
+    } else {
+      emit_lso(as, A64I_LDRx, dest, uv, (int32_t)offsetof(GCupval, v));
+    }
+    emit_lso(as, A64I_LDRx, uv, func,
+	     (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8));
+  }
+}
+
+/* Object field reference (IR_FREF): emits no code of its own. */
+static void asm_fref(ASMState *as, IRIns *ir)
+{
+  /* The result must never be materialized in a register. */
+  lua_assert(!ra_used(ir));
+  UNUSED(as);
+  UNUSED(ir);
+}
+
+/* String data reference (IR_STRREF): str + sizeof(GCstr) + index. */
+static void asm_strref(ASMState *as, IRIns *ir)
+{
+  RegSet allow = RSET_GPR;
+  Reg dest = ra_dest(as, ir, allow);
+  Reg base = ra_alloc1(as, ir->op1, allow);
+  IRIns *irr = IR(ir->op2);
+  int32_t ofs = sizeof(GCstr);
+  uint32_t m;
+  rset_clear(allow, base);
+  if (irref_isk(ir->op2) && (m = emit_isk12(ofs + irr->i))) {
+    /* Constant index where header+index fits in a K12: single ADD. */
+    emit_dn(as, A64I_ADDx^m, dest, base);
+  } else {
+    /* Otherwise add base+index first, then the fixed header offset. */
+    emit_dn(as, (A64I_ADDx^A64I_K12) | A64F_U12(ofs), dest, dest);
+    emit_dnm(as, A64I_ADDx, dest, base, ra_alloc1(as, ir->op2, allow));
+  }
+}
+
+/* -- Loads and stores ---------------------------------------------------- */
+
+/* Select the A64 load instruction matching an FLOAD/XLOAD IR type. */
+static A64Ins asm_fxloadins(IRIns *ir)
+{
+  switch (irt_type(ir->t)) {
+  case IRT_NUM: return A64I_LDRd;              /* 64 bit FP load. */
+  case IRT_FLOAT: return A64I_LDRs;            /* 32 bit FP load. */
+  case IRT_I8: return A64I_LDRB ^ A64I_LS_S;   /* Sign-extending byte. */
+  case IRT_U8: return A64I_LDRB;
+  case IRT_I16: return A64I_LDRH ^ A64I_LS_S;  /* Sign-extending halfword. */
+  case IRT_U16: return A64I_LDRH;
+  default: return irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw;
+  }
+}
+
+/* Select the A64 store instruction matching an FSTORE/XSTORE IR type. */
+static A64Ins asm_fxstoreins(IRIns *ir)
+{
+  switch (irt_type(ir->t)) {
+  case IRT_NUM: return A64I_STRd;
+  case IRT_FLOAT: return A64I_STRs;
+  case IRT_I8: case IRT_U8: return A64I_STRB;   /* Stores ignore signedness. */
+  case IRT_I16: case IRT_U16: return A64I_STRH;
+  default: return irt_is64(ir->t) ? A64I_STRx : A64I_STRw;
+  }
+}
+
+/* FLOAD: load a field of a GC object or of the global state. */
+static void asm_fload(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg idx;
+  A64Ins ai = asm_fxloadins(ir);
+  int32_t ofs;
+  if (ir->op1 == REF_NIL) {
+    /* No object: load relative to global_State via RID_GL. */
+    idx = RID_GL;
+    ofs = (ir->op2 << 2) - GG_OFS(g);
+  } else {
+    idx = ra_alloc1(as, ir->op1, RSET_GPR);
+    if (ir->op2 == IRFL_TAB_ARRAY) {
+      ofs = asm_fuseabase(as, ir->op1);
+      if (ofs) {  /* Turn the t->array load into an add for colocated arrays. */
+	emit_dn(as, (A64I_ADDx^A64I_K12) | A64F_U12(ofs), dest, idx);
+	return;
+      }
+    }
+    ofs = field_ofs[ir->op2];
+  }
+  emit_lso(as, ai, (dest & 31), idx, ofs);
+}
+
+/* FSTORE: store to an object field. Emits nothing for sunk stores. */
+static void asm_fstore(ASMState *as, IRIns *ir)
+{
+  if (ir->r != RID_SINK) {
+    Reg src = ra_alloc1(as, ir->op2, RSET_GPR);
+    IRIns *irf = IR(ir->op1);  /* The FREF supplying object and field. */
+    Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src));
+    int32_t ofs = field_ofs[irf->op2];
+    emit_lso(as, asm_fxstoreins(ir), (src & 31), idx, ofs);
+  }
+}
+
+/* XLOAD: load from an arbitrary address, with address-mode fusion. */
+static void asm_xload(ASMState *as, IRIns *ir)
+{
+  RegSet rs = irt_isfp(ir->t) ? RSET_FPR : RSET_GPR;
+  Reg dest;
+  lua_assert(!(ir->op2 & IRXLOAD_UNALIGNED));  /* Unaligned not expected. */
+  dest = ra_dest(as, ir, rs);
+  asm_fusexref(as, asm_fxloadins(ir), dest, ir->op1, RSET_GPR);
+}
+
+/* XSTORE: store to an arbitrary address, with address-mode fusion. */
+static void asm_xstore(ASMState *as, IRIns *ir)
+{
+  Reg src;
+  if (ir->r == RID_SINK)  /* Sunk store: nothing to emit. */
+    return;
+  src = ra_alloc1(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR);
+  asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1,
+	       rset_exclude(RSET_GPR, src));
+}
+
+/* ALOAD/HLOAD/ULOAD/VLOAD: load a TValue and guard its type tag. */
+static void asm_ahuvload(ASMState *as, IRIns *ir)
+{
+  Reg idx, tmp, type;
+  int32_t ofs = 0;
+  RegSet gpr = RSET_GPR, allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR;
+  lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) ||
+	     irt_isint(ir->t));
+  if (ra_used(ir)) {
+    Reg dest = ra_dest(as, ir, allow);
+    tmp = irt_isnum(ir->t) ? ra_scratch(as, rset_clear(gpr, dest)) : dest;
+    if (irt_isaddr(ir->t)) {
+      /* Mask off the type tag (LJ_GCVMASK keeps the 47 bit pointer). */
+      emit_dn(as, A64I_ANDx^emit_isk13(LJ_GCVMASK, 1), dest, dest);
+    } else if (irt_isnum(ir->t)) {
+      emit_dn(as, A64I_FMOV_D_R, (dest & 31), tmp);
+    } else if (irt_isint(ir->t)) {
+      emit_dm(as, A64I_MOVw, dest, dest);  /* 32 bit move zero-extends. */
+    }
+  } else {
+    tmp = ra_scratch(as, gpr);
+  }
+  type = ra_scratch(as, rset_clear(gpr, tmp));
+  idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, type), A64I_LDRx);
+  /* Always do the type check, even if the load result is unused. */
+  asm_guardcc(as, irt_isnum(ir->t) ? CC_LS : CC_NE);
+  if (irt_type(ir->t) >= IRT_NUM) {
+    lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t));
+    emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
+	    ra_allock(as, LJ_TISNUM << 15, rset_exclude(gpr, idx)), tmp);
+  } else if (irt_isaddr(ir->t)) {
+    /* Compare the tag extracted by an arithmetic shift of the high bits. */
+    emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), type);
+    emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
+  } else if (irt_isnil(ir->t)) {
+    emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
+  } else {
+    emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
+	    ra_allock(as, (irt_toitype(ir->t) << 15) | 0x7fff, allow), tmp);
+  }
+  if (ofs & FUSE_REG)
+    emit_dnm(as, (A64I_LDRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31));
+  else
+    emit_lso(as, A64I_LDRx, tmp, idx, ofs);
+}
+
+/* ASTORE/HSTORE/USTORE: store a TValue, tagging non-number values. */
+static void asm_ahustore(ASMState *as, IRIns *ir)
+{
+  if (ir->r != RID_SINK) {
+    RegSet allow = RSET_GPR;
+    Reg idx, src = RID_NONE, tmp = RID_TMP, type = RID_NONE;
+    int32_t ofs = 0;
+    if (irt_isnum(ir->t)) {
+      /* Numbers are stored as-is. */
+      src = ra_alloc1(as, ir->op2, RSET_FPR);
+      idx = asm_fuseahuref(as, ir->op1, &ofs, allow, A64I_STRd);
+      if (ofs & FUSE_REG)
+	emit_dnm(as, (A64I_STRd^A64I_LS_R)|A64I_LS_UXTWx, (src & 31), idx, (ofs &31));
+      else
+	emit_lso(as, A64I_STRd, (src & 31), idx, ofs);
+    } else {
+      if (!irt_ispri(ir->t)) {
+	src = ra_alloc1(as, ir->op2, allow);
+	rset_clear(allow, src);
+	if (irt_isinteger(ir->t))
+	  type = ra_allock(as, (int64_t)LJ_TISNUM << 47, allow);
+	else
+	  type = ra_allock(as, irt_toitype(ir->t), allow);
+      } else {
+	/* Primitive: the entire tagged TValue is a constant. */
+	tmp = type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t)<<47), allow);
+      }
+      idx = asm_fuseahuref(as, ir->op1, &ofs, rset_exclude(allow, type),
+			   A64I_STRx);
+      if (ofs & FUSE_REG)
+	emit_dnm(as, (A64I_STRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31));
+      else
+	emit_lso(as, A64I_STRx, tmp, idx, ofs);
+      if (ra_hasreg(src)) {
+	if (irt_isinteger(ir->t)) {
+	  /* Combine the zero-extended 32 bit value with the type tag. */
+	  emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), tmp, type, src);
+	} else {
+	  /* Combine the GC pointer with the tag shifted into the top bits. */
+	  emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, src, type);
+	}
+      }
+    }
+  }
+}
+
+/* SLOAD: load a stack slot, with optional type check and conversion. */
+static void asm_sload(ASMState *as, IRIns *ir)
+{
+  int32_t ofs = 8*((int32_t)ir->op1-2);
+  IRType1 t = ir->t;
+  Reg dest = RID_NONE, base;
+  RegSet allow = RSET_GPR;
+  lua_assert(!(ir->op2 & IRSLOAD_PARENT));  /* Handled by asm_head_side(). */
+  lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK));
+  if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) {
+    /* Guarded num->int conversion: checked conversion does it all. */
+    dest = ra_scratch(as, RSET_FPR);
+    asm_tointg(as, ir, dest);
+    t.irt = IRT_NUM;  /* Continue with a regular number type check. */
+  } else if (ra_used(ir)) {
+    Reg tmp = RID_NONE;
+    if ((ir->op2 & IRSLOAD_CONVERT))
+      tmp = ra_scratch(as, irt_isint(t) ? RSET_FPR : RSET_GPR);
+    lua_assert((irt_isnum(t)) || irt_isint(t) || irt_isaddr(t));
+    dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : allow);
+    base = ra_alloc1(as, REF_BASE, rset_clear(allow, dest));
+    if (irt_isaddr(t)) {
+      /* Mask off the type tag to get the bare GC pointer. */
+      emit_dn(as, A64I_ANDx^emit_isk13(LJ_GCVMASK, 1), dest, dest);
+    } else if ((ir->op2 & IRSLOAD_CONVERT)) {
+      if (irt_isint(t)) {
+	emit_dn(as, A64I_FCVT_S32_F64, dest, (tmp & 31));
+	/* If value is already loaded for type check, move it to FPR. */
+	if ((ir->op2 & IRSLOAD_TYPECHECK))
+	  emit_dn(as, A64I_FMOV_D_R, (tmp & 31), dest);
+	else
+	  dest = tmp;
+	t.irt = IRT_NUM;  /* Check for original type. */
+      } else {
+	emit_dn(as, A64I_FCVT_F64_S32, (dest & 31), tmp);
+	dest = tmp;
+	t.irt = IRT_INT;  /* Check for original type. */
+      }
+    } else if (irt_isint(t) && (ir->op2 & IRSLOAD_TYPECHECK)) {
+      emit_dm(as, A64I_MOVw, dest, dest);  /* 32 bit move zero-extends. */
+    }
+    goto dotypecheck;
+  }
+  base = ra_alloc1(as, REF_BASE, allow);
+dotypecheck:
+  rset_clear(allow, base);
+  if ((ir->op2 & IRSLOAD_TYPECHECK)) {
+    Reg tmp;
+    if (ra_hasreg(dest) && rset_test(RSET_GPR, dest)) {
+      tmp = dest;
+    } else {
+      tmp = ra_scratch(as, allow);
+      rset_clear(allow, tmp);
+    }
+    if (irt_isnum(t) && !(ir->op2 & IRSLOAD_CONVERT))
+      emit_dn(as, A64I_FMOV_D_R, (dest & 31), tmp);
+    /* Need type check, even if the load result is unused. */
+    asm_guardcc(as, irt_isnum(t) ? CC_LS : CC_NE);
+    if (irt_type(t) >= IRT_NUM) {
+      lua_assert(irt_isinteger(t) || irt_isnum(t));
+      emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32),
+	      ra_allock(as, LJ_TISNUM << 15, allow), tmp);
+    } else if (irt_isnil(t)) {
+      emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp);
+    } else if (irt_ispri(t)) {
+      emit_nm(as, A64I_CMPx,
+	      ra_allock(as, ~((int64_t)~irt_toitype(t) << 47) , allow), tmp);
+    } else {
+      /* GC type: compare the tag extracted from the top 17 bits. */
+      Reg type = ra_scratch(as, allow);
+      emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), type);
+      emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp);
+    }
+    emit_lso(as, A64I_LDRx, tmp, base, ofs);
+    return;
+  }
+  if (ra_hasreg(dest)) {
+    emit_lso(as, irt_isnum(t) ? A64I_LDRd :
+	     (irt_isint(t) ? A64I_LDRw : A64I_LDRx), (dest & 31), base, ofs);
+  }
+}
+
+/* -- Allocations --------------------------------------------------------- */
+
+#if LJ_HASFFI
+/* Allocate and initialize a cdata object (IR_CNEW/IR_CNEWI). */
+static void asm_cnew(ASMState *as, IRIns *ir)
+{
+  CTState *cts = ctype_ctsG(J2G(as->J));
+  CTypeID id = (CTypeID)IR(ir->op1)->i;
+  CTSize sz;
+  CTInfo info = lj_ctype_info(cts, id, &sz);
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
+  IRRef args[4];
+  RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
+  lua_assert(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL));
+
+  as->gcsteps++;
+  asm_setupresult(as, ir, ci);  /* GCcdata * */
+  /* Initialize immutable cdata object. */
+  if (ir->o == IR_CNEWI) {
+    int32_t ofs = sizeof(GCcdata);
+    Reg r = ra_alloc1(as, ir->op2, allow);
+    lua_assert(sz == 4 || sz == 8);
+    emit_lso(as, sz == 8 ? A64I_STRx : A64I_STRw, r, RID_RET, ofs);
+  } else if (ir->op2 != REF_NIL) {  /* Create VLA/VLS/aligned cdata. */
+    ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv];
+    args[0] = ASMREF_L;     /* lua_State *L */
+    args[1] = ir->op1;      /* CTypeID id   */
+    args[2] = ir->op2;      /* CTSize sz    */
+    args[3] = ASMREF_TMP1;  /* CTSize align */
+    asm_gencall(as, ci, args);
+    emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info));
+    return;
+  }
+
+  /* Initialize gct and ctypeid. lj_mem_newgco() already sets marked. */
+  {
+    /* Small ids are loaded with MOVZ into X1; large ones need a constant. */
+    Reg r = (id < 65536) ? RID_X1 : ra_allock(as, id, allow);
+    emit_lso(as, A64I_STRB, RID_TMP, RID_RET, offsetof(GCcdata, gct));
+    emit_lso(as, A64I_STRH, r, RID_RET, offsetof(GCcdata, ctypeid));
+    emit_d(as, A64I_MOVZw | A64F_U16(~LJ_TCDATA), RID_TMP);
+    if (id < 65536) emit_d(as, A64I_MOVZw | A64F_U16(id), RID_X1);
+  }
+  args[0] = ASMREF_L;     /* lua_State *L */
+  args[1] = ASMREF_TMP1;  /* MSize size   */
+  asm_gencall(as, ci, args);
+  ra_allockreg(as, (int32_t)(sz+sizeof(GCcdata)),
+	       ra_releasetmp(as, ASMREF_TMP1));
+}
+#else
+#define asm_cnew(as, ir)	((void)0)
+#endif
+
+/* -- Write barriers ------------------------------------------------------ */
+
+/* Table write barrier (IR_TBAR): if the table is black, clear its black */
+/* bit and link it onto g->gc.grayagain. Code is emitted in reverse. */
+static void asm_tbar(ASMState *as, IRIns *ir)
+{
+  Reg tab = ra_alloc1(as, ir->op1, RSET_GPR);
+  Reg link = ra_scratch(as, rset_exclude(RSET_GPR, tab));
+  Reg gr = ra_allock(as, i64ptr(J2G(as->J)),
+		     rset_exclude(rset_exclude(RSET_GPR, tab), link));
+  Reg mark = RID_TMP;
+  MCLabel l_end = emit_label(as);
+  emit_lso(as, A64I_STRx, link, tab, (int32_t)offsetof(GCtab, gclist));
+  emit_lso(as, A64I_STRB, mark, tab, (int32_t)offsetof(GCtab, marked));
+  emit_lso(as, A64I_STRx, tab, gr,
+	   (int32_t)offsetof(global_State, gc.grayagain));
+  emit_dn(as, A64I_ANDw^emit_isk13(~LJ_GC_BLACK, 0), mark, mark);
+  emit_lso(as, A64I_LDRx, link, gr,
+	   (int32_t)offsetof(global_State, gc.grayagain));
+  emit_cond_branch(as, CC_EQ, l_end);  /* Skip everything if not black. */
+  emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), mark);
+  emit_lso(as, A64I_LDRB, mark, tab, (int32_t)offsetof(GCtab, marked));
+}
+
+/* Write barrier for stores into closed upvalues (IR_OBAR): call */
+/* lj_gc_barrieruv() only if the value is white and the upvalue is black. */
+static void asm_obar(ASMState *as, IRIns *ir)
+{
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
+  IRRef args[2];
+  MCLabel l_end;
+  RegSet allow = RSET_GPR;
+  Reg obj, val, tmp;
+  /* No need for other object barriers (yet). */
+  lua_assert(IR(ir->op1)->o == IR_UREFC);
+  ra_evictset(as, RSET_SCRATCH);
+  l_end = emit_label(as);
+  args[0] = ASMREF_TMP1;  /* global_State *g */
+  args[1] = ir->op1;      /* TValue *tv      */
+  asm_gencall(as, ci, args);
+  ra_allockreg(as, i64ptr(J2G(as->J)), ra_releasetmp(as, ASMREF_TMP1) );
+  obj = IR(ir->op1)->r;
+  tmp = ra_scratch(as, rset_exclude(allow, obj));
+  emit_cond_branch(as, CC_EQ, l_end);  /* Skip call if upvalue not black. */
+  emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), tmp);
+  emit_cond_branch(as, CC_EQ, l_end);  /* Skip call if value not white. */
+  emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_WHITES, 0), RID_TMP);
+  val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj));
+  emit_lso(as, A64I_LDRB, tmp, obj,
+     (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
+  emit_lso(as, A64I_LDRB, RID_TMP, val, (int32_t)offsetof(GChead, marked));
+}
+
+/* -- Arithmetic and logic operations ------------------------------------- */
+
+/* Generic two-operand FP arithmetic op. */
+static void asm_fparith(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  Reg dest = ra_dest(as, ir, RSET_FPR);
+  Reg pair = ra_alloc2(as, ir, RSET_FPR);  /* Both operands, packed. */
+  Reg op1 = (pair & 255), op2 = (pair >> 8);
+  emit_dnm(as, ai, (dest & 31), (op1 & 31), (op2 & 31));
+}
+
+/* Generic one-operand FP op. */
+static void asm_fpunary(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  Reg dest = ra_dest(as, ir, RSET_FPR);
+  Reg src = ra_hintalloc(as, ir->op1, dest, RSET_FPR);
+  emit_dn(as, ai, (dest & 31), (src & 31));
+}
+
+/* IR_FPMATH: dispatch to an FP instruction or a runtime call. */
+static void asm_fpmath(ASMState *as, IRIns *ir)
+{
+  IRFPMathOp fpm = (IRFPMathOp)ir->op2;
+  if (fpm == IRFPM_SQRT) {
+    asm_fpunary(as, ir, A64I_FSQRTd);
+  } else if (fpm <= IRFPM_TRUNC) {
+    /* floor/ceil/trunc map directly to FP rounding instructions. */
+    asm_fpunary(as, ir, fpm == IRFPM_FLOOR ? A64I_FRINTMd :
+			fpm == IRFPM_CEIL ? A64I_FRINTPd : A64I_FRINTZd);
+  } else if (fpm == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) {
+    return;  /* Handled by asm_fpjoin_pow(). */
+  } else {
+    asm_callid(as, ir, IRCALL_lj_vm_floor + fpm);
+  }
+}
+
+/* Decide whether the operands of a commutative op should be swapped. */
+/* Constants and fusable operands (shifts, ADD x,x, sign-extending CONV) */
+/* must end up on the right so asm_fuseopm() can fold them into the op. */
+static int asm_swapops(ASMState *as, IRRef lref, IRRef rref)
+{
+  IRIns *ir;
+  if (irref_isk(rref))
+    return 0;  /* Don't swap constants to the left. */
+  if (irref_isk(lref))
+    return 1;  /* But swap constants to the right. */
+  ir = IR(rref);
+  if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
+      (ir->o == IR_ADD && ir->op1 == ir->op2) ||
+      (ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
+    return 0;  /* Don't swap fusable operands to the left. */
+  ir = IR(lref);
+  if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) ||
+      (ir->o == IR_ADD && ir->op1 == ir->op2) ||
+      (ir->o == IR_CONV && ir->op2 == ((IRT_I64<<IRCONV_DSH)|IRT_INT|IRCONV_SEXT)))
+    return 1;  /* But swap fusable operands to the right. */
+  return 0;
+}
+
+/* Generic two-operand integer op with operand swapping and fusion. */
+static void asm_intop(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  IRRef lref = ir->op1, rref = ir->op2;
+  Reg left, dest = ra_dest(as, ir, RSET_GPR);
+  uint32_t m;
+  /* Never swap the operands of SUB: it is not commutative. */
+  if ((ai & ~A64I_S) != A64I_SUBw && asm_swapops(as, lref, rref)) {
+    IRRef tmp = lref; lref = rref; rref = tmp;
+  }
+  left = ra_hintalloc(as, lref, dest, RSET_GPR);
+  if (irt_is64(ir->t)) ai |= A64I_X;
+  m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left));
+  if (irt_isguard(ir->t)) {  /* For IR_ADDOV etc. */
+    asm_guardcc(as, CC_VS);  /* Exit the trace on signed overflow. */
+    ai |= A64I_S;
+  }
+  emit_dn(as, ai^m, dest, left);
+}
+
+/* Integer op variant that may absorb a following compare against zero */
+/* by switching to the flag-setting form of the instruction. */
+static void asm_intop_s(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  if (as->flagmcp == as->mcp) {  /* Drop cmp r, #0. */
+    as->flagmcp = NULL;
+    as->mcp++;
+    ai |= A64I_S;  /* Set the flags from this op instead. */
+  }
+  asm_intop(as, ir, ai);
+}
+
+/* Integer negation. */
+static void asm_intneg(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg src = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
+  emit_dm(as, irt_is64(ir->t) ? A64I_NEGx : A64I_NEGw, dest, src);
+}
+
+/* NYI: use add/shift for MUL(OV) with constants. FOLD only does 2^k. */
+static void asm_intmul(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg left = ra_alloc1(as, ir->op1, rset_exclude(RSET_GPR, dest));
+  Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
+  if (irt_isguard(ir->t)) {  /* IR_MULOV */
+    /* 32x32->64 SMULL, then exit unless the high word equals the */
+    /* sign-extension of the low word (i.e. the result fits in 32 bits). */
+    asm_guardcc(as, CC_NE);
+    emit_dm(as, A64I_MOVw, dest, dest);  /* Zero-extend. */
+    emit_nm(as, A64I_CMPw | A64F_SH(A64SH_ASR, 31), RID_TMP, dest);
+    emit_dn(as, A64I_ASRx | A64F_IMMR(32), RID_TMP, dest);
+    emit_dnm(as, A64I_SMULL, dest, right, left);
+  } else {
+    emit_dnm(as, irt_is64(ir->t) ? A64I_MULx : A64I_MULw, dest, left, right);
+  }
+}
+
+/* IR_ADD: FP add or (possibly overflow-checked) integer add. */
+static void asm_add(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t))
+    asm_fparith(as, ir, A64I_FADDd);
+  else
+    asm_intop_s(as, ir, A64I_ADDw);
+}
+
+/* IR_SUB: FP subtract or (possibly overflow-checked) integer subtract. */
+static void asm_sub(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t))
+    asm_fparith(as, ir, A64I_FSUBd);
+  else
+    asm_intop_s(as, ir, A64I_SUBw);
+}
+
+/* IR_MUL: FP multiply or (possibly overflow-checked) integer multiply. */
+static void asm_mul(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t))
+    asm_fparith(as, ir, A64I_FMULd);
+  else
+    asm_intmul(as, ir);
+}
+
+/* IR_DIV: FP divide; FFI 64 bit integer divides go through a call. */
+static void asm_div(ASMState *as, IRIns *ir)
+{
+#if LJ_HASFFI
+  if (!irt_isnum(ir->t))
+    asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 :
+					  IRCALL_lj_carith_divu64);
+  else
+#endif
+    asm_fparith(as, ir, A64I_FDIVd);
+}
+
+/* IR_POW: always a call (lj_vm_powi, or FFI 64 bit helpers). */
+static void asm_pow(ASMState *as, IRIns *ir)
+{
+#if LJ_HASFFI
+  if (!irt_isnum(ir->t))
+    asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 :
+					  IRCALL_lj_carith_powu64);
+  else
+#endif
+    asm_callid(as, ir, IRCALL_lj_vm_powi);
+}
+
+/* Overflow-checked variants share code with the plain ops; the guard */
+/* type on the IR instruction is what enables the overflow check. */
+#define asm_addov(as, ir)	asm_add(as, ir)
+#define asm_subov(as, ir)	asm_sub(as, ir)
+#define asm_mulov(as, ir)	asm_mul(as, ir)
+
+#define asm_abs(as, ir)		asm_fpunary(as, ir, A64I_FABS)
+#define asm_atan2(as, ir)	asm_callid(as, ir, IRCALL_atan2)
+#define asm_ldexp(as, ir)	asm_callid(as, ir, IRCALL_ldexp)
+
+/* IR_MOD: always a call (lj_vm_modi, or FFI 64 bit helpers). */
+static void asm_mod(ASMState *as, IRIns *ir)
+{
+#if LJ_HASFFI
+  if (!irt_isint(ir->t))
+    asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 :
+					  IRCALL_lj_carith_modu64);
+  else
+#endif
+    asm_callid(as, ir, IRCALL_lj_vm_modi);
+}
+
+/* IR_NEG: FP or integer negation. */
+static void asm_neg(ASMState *as, IRIns *ir)
+{
+  if (irt_isnum(ir->t))
+    asm_fpunary(as, ir, A64I_FNEGd);
+  else
+    asm_intneg(as, ir);
+}
+
+/* Bitwise ops. AND may absorb a following compare against zero (ANDS). */
+static void asm_bitop(ASMState *as, IRIns *ir, A64Ins ai)
+{
+  if (as->flagmcp == as->mcp && ai == A64I_ANDw) {
+    /* Try to drop cmp r, #0. */
+    as->flagmcp = NULL;
+    as->mcp++;
+    ai += A64I_ANDSw - A64I_ANDw;  /* Switch to the flag-setting ANDS. */
+  }
+  if (ir->op2 == 0) {
+    /* op2 == 0: unary bit op (e.g. BNOT via MVN). */
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    uint32_t m = asm_fuseopm(as, ai, ir->op1, RSET_GPR);
+    if (irt_is64(ir->t)) ai |= A64I_X;
+    emit_d(as, ai^m, dest);
+  } else {
+    asm_intop(as, ir, ai);
+  }
+}
+
+/* All bit ops funnel through asm_bitop() with the 32 bit base opcode. */
+#define asm_bnot(as, ir)	asm_bitop(as, ir, A64I_MVNw)
+#define asm_band(as, ir)	asm_bitop(as, ir, A64I_ANDw)
+#define asm_bor(as, ir)		asm_bitop(as, ir, A64I_ORRw)
+#define asm_bxor(as, ir)	asm_bitop(as, ir, A64I_EORw)
+
+/* Byte swap (IR_BSWAP) via the REV instruction. */
+static void asm_bswap(ASMState *as, IRIns *ir)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg src = ra_alloc1(as, ir->op1, RSET_GPR);
+  emit_dn(as, irt_is64(ir->t) ? A64I_REVx : A64I_REVw, dest, src);
+}
+
+/* Shifts and rotates. Constant shifts use bitfield-move (UBFM/SBFM) */
+/* or EXTR encodings; variable shifts use the register-shift forms. */
+static void asm_bitshift(ASMState *as, IRIns *ir, A64Ins ai, A64Shift sh)
+{
+  int shmask = irt_is64(ir->t) ? 63 : 31;
+  if (irref_isk(ir->op2)) {  /* Constant shifts. */
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
+    int32_t shift = (IR(ir->op2)->i & shmask);
+
+    if (shmask == 63) ai += A64I_UBFMx - A64I_UBFMw;  /* 64 bit variant. */
+    switch (sh) {
+    case A64SH_LSL:
+      emit_dn(as, ai | A64F_IMMS(shmask-shift) | A64F_IMMR(shmask-shift+1), dest, left);
+      break;
+    case A64SH_LSR: case A64SH_ASR:
+      emit_dn(as, ai | A64F_IMMS(shmask) | A64F_IMMR(shift), dest, left);
+      break;
+    case A64SH_ROR:
+      /* EXTR with both sources equal is a rotate right. */
+      emit_dnm(as, ai | A64F_IMMS(shift), dest, left, left);
+      break;
+    }
+  } else {  /* Variable-length shifts. */
+    Reg dest = ra_dest(as, ir, RSET_GPR);
+    Reg left = ra_alloc1(as, ir->op1, RSET_GPR);
+    Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
+    emit_dnm(as, (shmask == 63 ? A64I_SHRx : A64I_SHRw) | A64F_BSH(sh), dest, left, right);
+  }
+}
+
+/* Shift dispatchers; BROL is folded away earlier and must not occur. */
+#define asm_bshl(as, ir)	asm_bitshift(as, ir, A64I_UBFMw, A64SH_LSL)
+#define asm_bshr(as, ir)	asm_bitshift(as, ir, A64I_UBFMw, A64SH_LSR)
+#define asm_bsar(as, ir)	asm_bitshift(as, ir, A64I_SBFMw, A64SH_ASR)
+#define asm_bror(as, ir)	asm_bitshift(as, ir, A64I_EXTRw, A64SH_ROR)
+#define asm_brol(as, ir)	lua_assert(0)
+
+/* Integer MIN/MAX via compare + conditional select. */
+static void asm_intmin_max(ASMState *as, IRIns *ir, A64CC cc)
+{
+  Reg dest = ra_dest(as, ir, RSET_GPR);
+  Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR);
+  Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left));
+  /* Emitted in reverse: executes as CMP, then CSEL. */
+  emit_dnm(as, A64I_CSELw|A64F_CC(cc), dest, left, right);
+  emit_nm(as, A64I_CMPw, left, right);
+}
+
+/* FP MIN/MAX via compare + conditional select (FCMP/FCSEL). */
+static void asm_fpmin_max(ASMState *as, IRIns *ir, A64CC fcc)
+{
+  Reg dest = (ra_dest(as, ir, RSET_FPR) & 31);
+  Reg right, left = ra_alloc2(as, ir, RSET_FPR);
+  right = ((left >> 8) & 31); left &= 31;  /* Unpack the operand pair. */
+  emit_dnm(as, A64I_FCSELd | A64F_CC(fcc), dest, left, right);
+  emit_nm(as, A64I_FCMPd, left, right);
+}
+
+/* Dispatch MIN/MAX to the FP or the integer implementation. */
+static void asm_min_max(ASMState *as, IRIns *ir, A64CC cc, A64CC fcc)
+{
+  if (!irt_isnum(ir->t)) {
+    asm_intmin_max(as, ir, cc);
+  } else {
+    asm_fpmin_max(as, ir, fcc);
+  }
+}
+
+/* MAX selects on GT (int) / HI (FP); MIN selects on LT / LO. */
+#define asm_max(as, ir)		asm_min_max(as, ir, CC_GT, CC_HI)
+#define asm_min(as, ir)		asm_min_max(as, ir, CC_LT, CC_LO)
+
+/* -- Comparisons --------------------------------------------------------- */
+
+/* Map of comparisons to flags. ORDER IR. */
+/* Low nibble: guard (exit) condition for integer compares; high nibble: */
+/* guard condition for FP compares ("FP swp" marks swapped FP operands). */
+static const uint8_t asm_compmap[IR_ABC+1] = {
+  /* op  FP swp  int cc   FP cc */
+  /* LT       */ CC_GE + (CC_HS << 4),
+  /* GE    x  */ CC_LT + (CC_HI << 4),
+  /* LE       */ CC_GT + (CC_HI << 4),
+  /* GT    x  */ CC_LE + (CC_HS << 4),
+  /* ULT   x  */ CC_HS + (CC_LS << 4),
+  /* UGE      */ CC_LO + (CC_LO << 4),
+  /* ULE   x  */ CC_HI + (CC_LO << 4),
+  /* UGT      */ CC_LS + (CC_LS << 4),
+  /* EQ       */ CC_NE + (CC_NE << 4),
+  /* NE       */ CC_EQ + (CC_EQ << 4),
+  /* ABC      */ CC_LS + (CC_LS << 4)  /* Same as UGT. */
+};
+
+/* FP comparisons. */
+static void asm_fpcomp(ASMState *as, IRIns *ir)
+{
+  Reg left, right;
+  A64Ins ai;
+  int swp = ((ir->o ^ (ir->o >> 2)) & ~(ir->o >> 3) & 1);  /* Swap operands? */
+  if (!swp && irref_isk(ir->op2) && ir_knum(IR(ir->op2))->u64 == 0) {
+    /* Compare against +0.0 with the immediate-zero FCMP form. */
+    left = (ra_alloc1(as, ir->op1, RSET_FPR) & 31);
+    right = 0;
+    ai = A64I_FCMPZd;
+  } else {
+    left = ra_alloc2(as, ir, RSET_FPR);
+    if (swp) {
+      right = (left & 31); left = ((left >> 8) & 31);
+    } else {
+      right = ((left >> 8) & 31); left &= 31;
+    }
+    ai = A64I_FCMPd;
+  }
+  asm_guardcc(as, (asm_compmap[ir->o] >> 4));  /* High nibble: FP cond. */
+  emit_nm(as, ai, left, right);
+}
+
+/* Integer comparisons. */
+static void asm_intcomp(ASMState *as, IRIns *ir)
+{
+  A64CC oldcc, cc = (asm_compmap[ir->o] & 15);  /* Low nibble: int cond. */
+  A64Ins ai = irt_is64(ir->t) ? A64I_CMPx : A64I_CMPw;
+  IRRef lref = ir->op1, rref = ir->op2;
+  Reg left;
+  uint32_t m;
+  int cmpprev0 = 0;
+  lua_assert(irt_is64(ir->t) || irt_isint(ir->t) ||
+	     irt_isu32(ir->t) || irt_isaddr(ir->t) || irt_isu8(ir->t));
+  if (asm_swapops(as, lref, rref)) {
+    /* Swapping the operands requires mirroring the condition, too. */
+    IRRef tmp = lref; lref = rref; rref = tmp;
+    if (cc >= CC_GE) cc ^= 7;  /* LT <-> GT, LE <-> GE */
+    else if (cc > CC_NE) cc ^= 11;  /* LO <-> HI, LS <-> HS */
+  }
+  oldcc = cc;
+  if (irref_isk(rref) && IR(rref)->i == 0) {
+    IRIns *irl = IR(lref);
+    if (cc == CC_GE) cc = CC_PL;
+    else if (cc == CC_LT) cc = CC_MI;
+    else if (cc > CC_NE) goto notst;  /* Other conds don't work with tst. */
+    cmpprev0 = (irl+1 == ir);
+    /* Combine comp(BAND(left, right), 0) into tst left, right. */
+    if (cmpprev0 && irl->o == IR_BAND && !ra_used(irl)) {
+      IRRef blref = irl->op1, brref = irl->op2;
+      uint32_t m2 = 0;
+      Reg bleft;
+      if (asm_swapops(as, blref, brref)) {
+	Reg tmp = blref; blref = brref; brref = tmp;
+      }
+      if (irref_isk(brref)) {
+	/* NYI: use tbz/tbnz, if applicable. */
+	m2 = emit_isk13(IR(brref)->i, irt_is64(irl->t));
+	if (!m2)
+	  goto notst;  /* Not beneficial if we miss a constant operand. */
+      }
+      bleft = ra_alloc1(as, blref, RSET_GPR);
+      ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw);
+      if (!m2)
+	m2 = asm_fuseopm(as, ai, brref, rset_exclude(RSET_GPR, bleft));
+      asm_guardcc(as, cc);
+      emit_n(as, ai^m2, bleft);
+      return;
+    }
+    /* NYI: use cbz/cbnz for EQ/NE 0. */
+  }
+notst:
+  left = ra_alloc1(as, lref, RSET_GPR);
+  m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left));
+  asm_guardcc(as, cc);
+  emit_n(as, ai^m, left);
+  /* Signed comparison with zero and referencing previous ins? */
+  if (cmpprev0 && (oldcc <= CC_NE || oldcc >= CC_GE))
+    as->flagmcp = as->mcp;  /* Allow elimination of the compare. */
+}
+
+/* Dispatch a comparison to the FP or the integer handler. */
+static void asm_comp(ASMState *as, IRIns *ir)
+{
+  if (!irt_isnum(ir->t))
+    asm_intcomp(as, ir);
+  else
+    asm_fpcomp(as, ir);
+}
+
+/* Equality on 64 bit operands uses the generic comparison code. */
+#define asm_equal(as, ir)	asm_comp(as, ir)
+
+/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */
+
+/* Hiword op of a split 64 bit op. Previous op must be the loword op. */
+static void asm_hiop(ASMState *as, IRIns *ir)
+{
+  /* 64 bit ops are never split on a 64 bit target: must not be reached. */
+  UNUSED(as);
+  UNUSED(ir);
+  lua_assert(0);
+}
+
+/* -- Profiling ----------------------------------------------------------- */
+
+/* Profiler hook check (IR_PROF): exit the trace if HOOK_PROFILE is set. */
+static void asm_prof(ASMState *as, IRIns *ir)
+{
+  uint32_t k = emit_isk13(HOOK_PROFILE, 0);
+  lua_assert(k != 0);  /* HOOK_PROFILE must encode as a logical immediate. */
+  UNUSED(ir);
+  asm_guardcc(as, CC_NE);
+  emit_n(as, A64I_TSTw^k, RID_TMP);
+  emit_lsptr(as, A64I_LDRB, RID_TMP, (void *)&J2G(as->J)->hookmask);
+}
+
+/* -- Stack handling ------------------------------------------------------ */
+
+/* Check Lua stack size for overflow. Use exit handler as fallback. */
+static void asm_stack_check(ASMState *as, BCReg topslot,
+			    IRIns *irp, RegSet allow, ExitNo exitno)
+{
+  Reg pbase;
+  uint32_t k;
+  if (irp) {  /* Side trace: pick up the parent's BASE register. */
+    if (!ra_hasspill(irp->s)) {
+      pbase = irp->r;
+      lua_assert(ra_hasreg(pbase));
+    } else if (allow) {
+      pbase = rset_pickbot(allow);
+    } else {
+      /* No free register: borrow RID_RET, saved/restored via sp[0]. */
+      pbase = RID_RET;
+      emit_lso(as, A64I_LDRx, RID_RET, RID_SP, 0);  /* Restore temp register. */
+    }
+  } else {
+    pbase = RID_BASE;
+  }
+  emit_branch(as, A64I_BL, exitstub_addr(as->J, exitno));
+  /* Branch over the exit call when the stack check passes. */
+  emit_cond_branch(as, CC_LS^1, as->mcp+1);
+  k = emit_isk12((8*topslot));
+  lua_assert(k);
+  /* Runtime order: RID_TMP = L->maxstack - BASE, compared vs. 8*topslot. */
+  emit_n(as, A64I_CMPx^k, RID_TMP);
+  emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, pbase);
+  emit_lso(as, A64I_LDRx, RID_TMP, RID_TMP,
+	   (int32_t)offsetof(lua_State, maxstack));
+  if (irp) {  /* Must not spill arbitrary registers in head of side trace. */
+    if (ra_hasspill(irp->s))
+      emit_lso(as, A64I_LDRx, pbase, RID_SP, sps_scale(irp->s));
+    emit_lso(as, A64I_LDRx, RID_TMP, RID_GL, glofs(as, &J2G(as->J)->cur_L));
+    if (ra_hasspill(irp->s) && !allow)
+      emit_lso(as, A64I_STRx, RID_RET, RID_SP, 0);  /* Save temp register. */
+  } else {
+    emit_getgl(as, RID_TMP, cur_L);
+  }
+}
+
+/* Restore Lua stack from on-trace state. */
+static void asm_stack_restore(ASMState *as, SnapShot *snap)
+{
+  SnapEntry *map = &as->T->snapmap[snap->mapofs];
+#ifdef LUA_USE_ASSERT
+  SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2];
+#endif
+  MSize n, nent = snap->nent;
+  /* Store the value of all modified slots to the Lua stack. */
+  for (n = 0; n < nent; n++) {
+    SnapEntry sn = map[n];
+    BCReg s = snap_slot(sn);
+    int32_t ofs = 8*((int32_t)s-1-LJ_FR2);
+    IRRef ref = snap_ref(sn);
+    IRIns *ir = IR(ref);
+    if ((sn & SNAP_NORESTORE))
+      continue;
+    if (irt_isnum(ir->t)) {
+      /* Numbers are stored unboxed. */
+      Reg src = ra_alloc1(as, ref, RSET_FPR);
+      emit_lso(as, A64I_STRd, (src & 31), RID_BASE, ofs);
+    } else {
+      RegSet allow = rset_exclude(RSET_GPR, RID_BASE);
+      lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t) || irt_isinteger(ir->t));
+      if (!irref_isk(ref)) {
+	Reg type, src;
+	if (irt_is64(ir->t)) {
+	  /* GC value: merge pointer and shifted type tag in RID_TMP. */
+	  type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow);
+	  src = ra_alloc1(as, ref, rset_exclude(allow, type));
+	  emit_lso(as, A64I_STRx, RID_TMP, RID_BASE, ofs);
+	  emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), RID_TMP, src, type);
+	} else if (irt_isinteger(ir->t)) {
+	  /* Integer: zero-extended value plus LJ_TISNUM tag. */
+	  type = ra_allock(as, (int64_t)LJ_TISNUM << 47, allow);
+	  src = ra_alloc1(as, ref, rset_exclude(allow, type));
+	  emit_lso(as, A64I_STRx, RID_TMP, RID_BASE, ofs);
+	  emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), RID_TMP, type, src);
+	} else {
+	  /* Primitive: the whole tagged TValue is a constant. */
+	  type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow);
+	  emit_lso(as, A64I_STRx, type, RID_BASE, ofs);
+	}
+      } else {
+	TValue k;
+	lj_ir_kvalue(as->J->L, &k, ir);
+	emit_lso(as, A64I_STRx,
+		 ra_allock(as, tvisnil(&k) ? -1 : (int64_t)k.u64, allow),
+		 RID_BASE, ofs);
+      }
+    }
+    checkmclim(as);
+  }
+  lua_assert(map + nent == flinks);
+}
+
+/* -- GC handling --------------------------------------------------------- */
+
+/* Check GC threshold and do one or more GC steps. */
+static void asm_gc_check(ASMState *as)
+{
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
+  IRRef args[2];
+  MCLabel l_end;
+  Reg tmp1, tmp2;
+  ra_evictset(as, RSET_SCRATCH);
+  l_end = emit_label(as);
+  /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */
+  asm_guardcc(as, CC_NE);  /* Assumes asm_snap_prep() already done. */
+  emit_n(as, A64I_CMPx^A64I_K12, RID_RET);
+  args[0] = ASMREF_TMP1;  /* global_State *g */
+  args[1] = ASMREF_TMP2;  /* MSize steps     */
+  asm_gencall(as, ci, args);
+  tmp1 = ra_releasetmp(as, ASMREF_TMP1);
+  tmp2 = ra_releasetmp(as, ASMREF_TMP2);
+  emit_loadi(as, tmp2, as->gcsteps);
+  /* Jump around GC step if GC total < GC threshold. */
+  emit_cond_branch(as, CC_LS, l_end);
+  emit_nm(as, A64I_CMPx, RID_TMP, tmp2);
+  emit_lso(as, A64I_LDRx, tmp2, tmp1,
+	   (int32_t)offsetof(global_State, gc.threshold));
+  emit_lso(as, A64I_LDRx, RID_TMP, tmp1,
+	   (int32_t)offsetof(global_State, gc.total));
+  ra_allockreg(as, i64ptr(J2G(as->J)), tmp1);
+  as->gcsteps = 0;
+  checkmclim(as);
+}
+
+/* -- Loop handling ------------------------------------------------------- */
+
+/* Fixup the loop branch. */
+static void asm_loop_fixup(ASMState *as)
+{
+  MCode *p = as->mctop;
+  MCode *target = as->mcp;
+  if (as->loopinv) {  /* Inverted loop branch? */
+    ptrdiff_t delta = target - (p - 2);
+    lua_assert(((delta + 0x40000) >> 19) == 0);
+    /* asm_guardcc already inverted the b.cc and patched the final bl. */
+    p[-2] |= ((uint32_t)delta & 0x7ffff) << 5;
+  } else {
+    ptrdiff_t delta = target - (p - 1);
+    p[-1] = A64I_B | ((uint32_t)(delta) & 0x03ffffffu);
+  }
+}
+
+/* -- Head of trace ------------------------------------------------------- */
+
+/* Reload L register from g->cur_L. */
+static void asm_head_lreg(ASMState *as)
+{
+  IRIns *ir = IR(ASMREF_L);
+  if (ra_used(ir)) {
+    Reg r = ra_dest(as, ir, RSET_GPR);
+    emit_getgl(as, r, cur_L);
+    ra_evictk(as);
+  }
+}
+
+/* Coalesce BASE register for a root trace. */
+static void asm_head_root_base(ASMState *as)
+{
+  IRIns *ir;
+  asm_head_lreg(as);
+  ir = IR(REF_BASE);
+  if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
+    ra_spill(as, ir);
+  ra_destreg(as, ir, RID_BASE);
+}
+
+/* Coalesce BASE register for a side trace. */
+static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow)
+{
+  IRIns *ir;
+  asm_head_lreg(as);
+  ir = IR(REF_BASE);
+  if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t)))
+    ra_spill(as, ir);
+  if (ra_hasspill(irp->s)) {
+    rset_clear(allow, ra_dest(as, ir, allow));
+  } else {
+    Reg r = irp->r;
+    lua_assert(ra_hasreg(r));
+    rset_clear(allow, r);
+    if (r != ir->r && !rset_test(as->freeset, r))
+      ra_restore(as, regcost_ref(as->cost[r]));
+    ra_destreg(as, ir, r);
+  }
+  return allow;
+}
+
+/* -- Tail of trace ------------------------------------------------------- */
+
+/* Fixup the tail code. */
+static void asm_tail_fixup(ASMState *as, TraceNo lnk)
+{
+  MCode *p = as->mctop;
+  MCode *target;
+  /* Undo the sp adjustment in BC_JLOOP when exiting to the interpreter. */
+  int32_t spadj = as->T->spadjust + (lnk ? 0 : sps_scale(SPS_FIXED));
+  if (spadj == 0) {
+    as->mctop = --p;
+  } else {
+    /* Patch stack adjustment. */
+    uint32_t k = emit_isk12(spadj);
+    lua_assert(k);
+    p[-2] = (A64I_ADDx^k) | A64F_D(RID_SP) | A64F_N(RID_SP);
+  }
+  /* Patch exit branch. */
+  target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp;
+  p[-1] = A64I_B | (((target-p)+1)&0x03ffffffu);
+}
+
+/* Prepare tail of code. */
+static void asm_tail_prep(ASMState *as)
+{
+  MCode *p = as->mctop - 1;  /* Leave room for exit branch. */
+  if (as->loopref) {
+    as->invmcp = as->mcp = p;
+  } else {
+    as->mcp = p-1;  /* Leave room for stack pointer adjustment. */
+    as->invmcp = NULL;
+  }
+  *p = 0;  /* Prevent load/store merging. */
+}
+
+/* -- Trace setup --------------------------------------------------------- */
+
+/* Ensure there are enough stack slots for call arguments. */
+static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci)
+{
+  IRRef args[CCI_NARGS_MAX*2];
+  uint32_t i, nargs = CCI_XNARGS(ci);
+  int nslots = 0, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR;
+  asm_collectargs(as, ir, ci, args);
+  for (i = 0; i < nargs; i++) {
+    if (args[i] && irt_isfp(IR(args[i])->t)) {
+      if (nfpr > 0) nfpr--; else nslots += 2;
+    } else {
+      if (ngpr > 0) ngpr--; else nslots += 2;
+    }
+  }
+  if (nslots > as->evenspill)  /* Leave room for args in stack slots. */
+    as->evenspill = nslots;
+  return REGSP_HINT(RID_RET);
+}
+
+static void asm_setup_target(ASMState *as)
+{
+  /* May need extra exit for asm_stack_check on side traces. */
+  asm_exitstub_setup(as, as->T->nsnap + (as->parent ? 1 : 0));
+}
+
+/* -- Trace patching ------------------------------------------------------ */
+
+/* Patch exit jumps of existing machine code to a new target. */
+void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
+{
+  MCode *p = T->mcode;
+  MCode *pe = (MCode *)((char *)p + T->szmcode);
+  MCode *cstart = NULL, *cend = p;
+  MCode *mcarea = lj_mcode_patch(J, p, 0);
+  MCode *px = exitstub_addr(J, exitno);
+  for (; p < pe; p++) {
+    /* Look for bl exitstub, replace with b target. */
+    uint32_t ins = *p;
+    if ((ins & 0xfc000000u) == 0x94000000u &&
+	((ins ^ (px-p)) & 0x03ffffffu) == 0) {
+      *p = (ins & 0x7c000000u) | ((target-p) & 0x03ffffffu);
+      cend = p+1;
+      if (!cstart) cstart = p;
+    }
+  }
+  lua_assert(cstart != NULL);
+  lj_mcode_sync(cstart, cend);
+  lj_mcode_patch(J, mcarea, 1);
+}
+
diff --git a/src/lj_ccall.c b/src/lj_ccall.c
index b599be33..a3ae8b05 100644
--- a/src/lj_ccall.c
+++ b/src/lj_ccall.c
@@ -331,7 +331,7 @@
 
 #define CCALL_HANDLE_COMPLEXARG \
   /* Pass complex by value in separate (!) FPRs or on stack. */ \
-  isfp = ctr->size == 2*sizeof(float) ? 2 : 1;
+  isfp = sz == 2*sizeof(float) ? 2 : 1;
 
 #define CCALL_HANDLE_REGARG \
   if (LJ_TARGET_IOS && isva) { \
diff --git a/src/lj_dispatch.h b/src/lj_dispatch.h
index 82708077..362d6202 100644
--- a/src/lj_dispatch.h
+++ b/src/lj_dispatch.h
@@ -107,6 +107,7 @@ typedef struct GG_State {
 #define J2G(J)		(&J2GG(J)->g)
 #define G2J(gl)		(&G2GG(gl)->J)
 #define L2J(L)		(&L2GG(L)->J)
+#define GG_G2J		(GG_OFS(J) - GG_OFS(g))
 #define GG_G2DISP	(GG_OFS(dispatch) - GG_OFS(g))
 #define GG_DISP2G	(GG_OFS(g) - GG_OFS(dispatch))
 #define GG_DISP2J	(GG_OFS(J) - GG_OFS(dispatch))
diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h
new file mode 100644
index 00000000..eb8f7fc7
--- /dev/null
+++ b/src/lj_emit_arm64.h
@@ -0,0 +1,397 @@
+/*
+** ARM64 instruction emitter.
+** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
+**
+** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
+** Sponsored by Cisco Systems, Inc.
+*/
+
+/* -- Constant encoding --------------------------------------------------- */
+
+static uint64_t get_k64val(IRIns *ir)
+{
+  if (ir->o == IR_KINT64) {
+    return ir_kint64(ir)->u64;
+  } else if (ir->o == IR_KGC) {
+    return (uint64_t)ir_kgc(ir);
+  } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
+    return (uint64_t)ir_kptr(ir);
+  } else {
+    lua_assert(ir->o == IR_KINT || ir->o == IR_KNULL);
+    return ir->i;  /* Sign-extended. */
+  }
+}
+
+/* Encode constant in K12 format for data processing instructions. */
+static uint32_t emit_isk12(int64_t n)
+{
+  uint64_t k = (n < 0) ? -n : n;
+  uint32_t m = (n < 0) ? 0x40000000 : 0;
+  if (k < 0x1000) {
+    return A64I_K12|m|A64F_U12(k);
+  } else if ((k & 0xfff000) == k) {
+    return A64I_K12|m|0x400000|A64F_U12(k>>12);
+  }
+  return 0;
+}
+
+#define emit_clz64(n)	__builtin_clzll(n)
+#define emit_ctz64(n)	__builtin_ctzll(n)
+
+/* Encode constant in K13 format for logical data processing instructions. */
+static uint32_t emit_isk13(uint64_t n, int is64)
+{
+  int inv = 0, w = 128, lz, tz;
+  if (n & 1) { n = ~n; w = 64; inv = 1; }  /* Avoid wrap-around of ones. */
+  if (!n) return 0;  /* Neither all-zero nor all-ones are allowed. */
+  do {  /* Find the repeat width. */
+    if (is64 && (uint32_t)(n^(n>>32))) break;
+    n = (uint32_t)n; w = 32; if ((n^(n>>16)) & 0xffff) break;
+    n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break;
+    n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break;
+    n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break;
+    n = n & 0x3; w = 2;
+  } while (0);
+  lz = emit_clz64(n);
+  tz = emit_ctz64(n);
+  if ((int64_t)(n << lz) >> (lz+tz) != -1ll) return 0; /* Non-contiguous? */
+  if (inv)
+    return A64I_K13 | (((lz-w) & 127) << 16) | (((lz+tz-w-1) & 63) << 10);
+  else
+    return A64I_K13 | ((w-tz) << 16) | (((63-lz-tz-w-w) & 63) << 10);
+}
+
+static uint32_t emit_isfpk64(uint64_t n)
+{
+  uint64_t etop9 = ((n >> 54) & 0x1ff);
+  if ((n << 16) == 0 && (etop9 == 0x100 || etop9 == 0x0ff)) {
+    return (uint32_t)(((n >> 48) & 0x7f) | ((n >> 56) & 0x80));
+  }
+  return ~0u;
+}
+
+/* -- Emit basic instructions --------------------------------------------- */
+
+static void emit_dnm(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm)
+{
+  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm);
+}
+
+static void emit_dm(ASMState *as, A64Ins ai, Reg rd, Reg rm)
+{
+  *--as->mcp = ai | A64F_D(rd) | A64F_M(rm);
+}
+
+static void emit_dn(ASMState *as, A64Ins ai, Reg rd, Reg rn)
+{
+  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn);
+}
+
+static void emit_nm(ASMState *as, A64Ins ai, Reg rn, Reg rm)
+{
+  *--as->mcp = ai | A64F_N(rn) | A64F_M(rm);
+}
+
+static void emit_d(ASMState *as, A64Ins ai, Reg rd)
+{
+  *--as->mcp = ai | A64F_D(rd);
+}
+
+static void emit_n(ASMState *as, A64Ins ai, Reg rn)
+{
+  *--as->mcp = ai | A64F_N(rn);
+}
+
+static int emit_checkofs(A64Ins ai, int64_t ofs)
+{
+  int scale = (ai >> 30) & 3;
+  if (ofs < 0 || (ofs & ((1<<scale)-1))) {
+    return (ofs >= -256 && ofs <= 255) ? -1 : 0;
+  } else {
+    return (ofs < (4096<<scale)) ? 1 : 0;
+  }
+}
+
+static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
+{
+  int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3;
+  lua_assert(ot);
+  /* Combine LDR/STR pairs to LDP/STP. */
+  if ((sc == 2 || sc == 3) &&
+      (!(ai & 0x400000) || rd != rn) &&
+      as->mcp != as->mcloop) {
+    uint32_t prev = *as->mcp & ~A64F_D(31);
+    int ofsm = ofs - (1<<sc), ofsp = ofs + (1<<sc), aip;
+    if (prev == (ai | A64F_N(rn) | A64F_U12(ofsm>>sc)) ||
+	prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) {
+      aip = (A64F_A(rd) | A64F_D(*as->mcp & 31));
+    } else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) ||
+	       prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) {
+      aip = (A64F_D(rd) | A64F_A(*as->mcp & 31));
+      ofsm = ofs;
+    } else {
+      goto nopair;
+    }
+    if (ofsm >= (-64<<sc) && ofsm <= (63<<sc)) {
+      *as->mcp = aip | A64F_N(rn) | ((ofsm >> sc) << 15) |
+	(ai ^ ((ai == A64I_LDRx || ai == A64I_STRx) ? 0x50000000 : 0x90000000));
+      return;
+    }
+  }
+nopair:
+  if (ot == 1)
+    *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_U12(ofs >> sc);
+  else
+    *--as->mcp = (ai^A64I_LS_U) | A64F_D(rd) | A64F_N(rn) | A64F_S9(ofs & 0x1ff);
+}
+
+/* -- Emit loads/stores --------------------------------------------------- */
+
+/* Prefer rematerialization of BASE/L from global_State over spills. */
+#define emit_canremat(ref)	((ref) <= ASMREF_L)
+
+/* Try to find an N-step delta relative to other consts with N < lim. */
+static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim)
+{
+  RegSet work = ~as->freeset & RSET_GPR;
+  if (lim <= 1) return 0;  /* Can't beat that. */
+  while (work) {
+    Reg r = rset_picktop(work);
+    IRRef ref = regcost_ref(as->cost[r]);
+    lua_assert(r != rd);
+    if (ref < REF_TRUE) {
+      uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) :
+				     get_k64val(IR(ref));
+      int64_t delta = (int64_t)(k - kx);
+      if (delta == 0) {
+	emit_dm(as, A64I_MOVx, rd, r);
+	return 1;
+      } else {
+	uint32_t k12 = emit_isk12(delta < 0 ? -delta : delta);
+	if (k12) {
+	  emit_dn(as, (delta < 0 ? A64I_SUBx : A64I_ADDx)^k12, rd, r);
+	  return 1;
+	}
+	/* Do other ops or multi-step deltas pay off? Probably not.
+	** E.g. XOR rarely helps with pointer consts.
+	*/
+      }
+    }
+    rset_clear(work, r);
+  }
+  return 0;  /* Failed. */
+}
+
+static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64)
+{
+  uint32_t k13 = emit_isk13(u64, is64);
+  if (k13) {  /* Can the constant be represented as a bitmask immediate? */
+    emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO);
+  } else {
+    int i, zeros = 0, ones = 0, neg;
+    if (!is64) u64 = (int64_t)(int32_t)u64;  /* Sign-extend. */
+    /* Count homogeneous 16 bit fragments. */
+    for (i = 0; i < 4; i++) {
+      uint64_t frag = (u64 >> i*16) & 0xffff;
+      zeros += (frag == 0);
+      ones += (frag == 0xffff);
+    }
+    neg = ones > zeros;  /* Use MOVN if it pays off. */
+    if (!emit_kdelta(as, rd, u64, 4 - (neg ? ones : zeros))) {
+      int shift = 0, lshift = 0;
+      uint64_t n64 = neg ? ~u64 : u64;
+      if (n64 != 0) {
+	/* Find first/last fragment to be filled. */
+	shift = (63-emit_clz64(n64)) & ~15;
+	lshift = emit_ctz64(n64) & ~15;
+      }
+      /* MOVK requires the original value (u64). */
+      while (shift > lshift) {
+	uint32_t u16 = (u64 >> shift) & 0xffff;
+	/* Skip fragments that are correctly filled by MOVN/MOVZ. */
+	if (u16 != (neg ? 0xffff : 0))
+	  emit_d(as, is64 | A64I_MOVKw | A64F_U16(u16) | A64F_LSL16(shift), rd);
+	shift -= 16;
+      }
+      /* But MOVN needs an inverted value (n64). */
+      emit_d(as, (neg ? A64I_MOVNx : A64I_MOVZx) |
+		 A64F_U16((n64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd);
+    }
+  }
+}
+
+/* Load a 32 bit constant into a GPR. */
+#define emit_loadi(as, rd, i)	emit_loadk(as, rd, i, 0)
+
+/* Load a 64 bit constant into a GPR. */
+#define emit_loadu64(as, rd, i)	emit_loadk(as, rd, i, A64I_X)
+
+#define emit_loada(as, r, addr)	emit_loadu64(as, (r), (uintptr_t)(addr))
+
+#define glofs(as, k) \
+  ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
+#define mcpofs(as, k) \
+  ((intptr_t)((uintptr_t)(k) - (uintptr_t)as->mcp))
+#define checkmcpofs(as, k) \
+  ((((mcpofs(as, k)>>2) + 0x00040000) >> 19) == 0)
+
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
+
+/* Get/set from constant pointer. */
+static void emit_lsptr(ASMState *as, A64Ins ai, Reg r, void *p)
+{
+  /* First, check if ip + offset is in range. */
+  if ((ai & 0x00400000) && checkmcpofs(as, p)) {
+    emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, p)>>2), r);
+  } else {
+    Reg base = RID_GL;  /* Next, try GL + offset. */
+    int64_t ofs = glofs(as, p);
+    if (!emit_checkofs(ai, ofs)) {  /* Else split up into base reg + offset. */
+      int64_t i64 = i64ptr(p);
+      base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r));
+      ofs = i64 & 0x7fffull;
+    }
+    emit_lso(as, ai, r, base, ofs);
+  }
+}
+
+/* Load 64 bit IR constant into register. */
+static void emit_loadk64(ASMState *as, Reg r, IRIns *ir)
+{
+  const uint64_t *k = &ir_k64(ir)->u64;
+  int64_t ofs;
+  if (r >= RID_MAX_GPR) {
+    uint32_t fpk = emit_isfpk64(*k);
+    if (fpk != ~0u) {
+      emit_d(as, A64I_FMOV_DI | A64F_FP8(fpk), (r & 31));
+      return;
+    }
+  }
+  ofs = glofs(as, k);
+  if (emit_checkofs(A64I_LDRx, ofs)) {
+    emit_lso(as, r >= RID_MAX_GPR ? A64I_LDRd : A64I_LDRx,
+	     (r & 31), RID_GL, ofs);
+  } else {
+    if (r >= RID_MAX_GPR) {
+      emit_dn(as, A64I_FMOV_D_R, (r & 31), RID_TMP);
+      r = RID_TMP;
+    }
+    if (checkmcpofs(as, k))
+      emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, k)>>2), r);
+    else
+      emit_loadu64(as, r, *k);
+  }
+}
+
+/* Get/set global_State fields. */
+#define emit_getgl(as, r, field) \
+  emit_lsptr(as, A64I_LDRx, (r), (void *)&J2G(as->J)->field)
+#define emit_setgl(as, r, field) \
+  emit_lsptr(as, A64I_STRx, (r), (void *)&J2G(as->J)->field)
+
+/* Trace number is determined from pc of exit instruction. */
+#define emit_setvmstate(as, i)	UNUSED(i)
+
+/* -- Emit control-flow instructions -------------------------------------- */
+
+/* Label for internal jumps. */
+typedef MCode *MCLabel;
+
+/* Return label pointing to current PC. */
+#define emit_label(as)		((as)->mcp)
+
+static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target)
+{
+  MCode *p = as->mcp;
+  ptrdiff_t delta = target - (p - 1);
+  lua_assert(((delta + 0x40000) >> 19) == 0);
+  *--p = A64I_BCC | A64F_S19((uint32_t)delta & 0x7ffff) | cond;
+  as->mcp = p;
+}
+
+static void emit_branch(ASMState *as, A64Ins ai, MCode *target)
+{
+  MCode *p = as->mcp;
+  ptrdiff_t delta = target - (p - 1);
+  lua_assert(((delta + 0x02000000) >> 26) == 0);
+  *--p = ai | ((uint32_t)delta & 0x03ffffffu);
+  as->mcp = p;
+}
+
+#define emit_jmp(as, target)	emit_branch(as, A64I_B, (target))
+
+static void emit_call(ASMState *as, void *target)
+{
+  MCode *p = --as->mcp;
+  ptrdiff_t delta = (char *)target - (char *)p;
+  if ((((delta>>2) + 0x02000000) >> 26) == 0) {
+    *p = A64I_BL | ((uint32_t)(delta>>2) & 0x03ffffffu);
+  } else {  /* Target out of range: need indirect call. But don't use R0-R7. */
+    Reg r = ra_allock(as, i64ptr(target),
+		      RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
+    *p = A64I_BLR | A64F_N(r);
+  }
+}
+
+/* -- Emit generic operations --------------------------------------------- */
+
+/* Generic move between two regs. */
+static void emit_movrr(ASMState *as, IRIns *ir, Reg dst, Reg src)
+{
+  if (dst >= RID_MAX_GPR) {
+    emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D : A64I_FMOV_S,
+	    (dst & 31), (src & 31));
+    return;
+  }
+  if (as->mcp != as->mcloop) {  /* Swap early registers for loads/stores. */
+    MCode ins = *as->mcp, swp = (src^dst);
+    if ((ins & 0xbf800000) == 0xb9000000) {
+      if (!((ins ^ (dst << 5)) & 0x000003e0))
+	*as->mcp = ins ^ (swp << 5);  /* Swap N in load/store. */
+      if (!(ins & 0x00400000) && !((ins ^ dst) & 0x0000001f))
+	*as->mcp = ins ^ swp;  /* Swap D in store. */
+    }
+  }
+  emit_dm(as, A64I_MOVx, dst, src);
+}
+
+/* Generic load of register with base and (small) offset address. */
+static void emit_loadofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
+{
+  if (r >= RID_MAX_GPR)
+    emit_lso(as, irt_isnum(ir->t) ? A64I_LDRd : A64I_LDRs, (r & 31), base, ofs);
+  else
+    emit_lso(as, irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw, r, base, ofs);
+}
+
+/* Generic store of register with base and (small) offset address. */
+static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
+{
+  if (r >= RID_MAX_GPR)
+    emit_lso(as, irt_isnum(ir->t) ? A64I_STRd : A64I_STRs, (r & 31), base, ofs);
+  else
+    emit_lso(as, irt_is64(ir->t) ? A64I_STRx : A64I_STRw, r, base, ofs);
+}
+
+/* Emit an arithmetic operation with a constant operand. */
+static void emit_opk(ASMState *as, A64Ins ai, Reg dest, Reg src,
+		     int32_t i, RegSet allow)
+{
+  uint32_t k = emit_isk12(i);
+  if (k)
+    emit_dn(as, ai^k, dest, src);
+  else
+    emit_dnm(as, ai, dest, src, ra_allock(as, i, allow));
+}
+
+/* Add offset to pointer. */
+static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
+{
+  if (ofs)
+    emit_opk(as, ofs < 0 ? A64I_SUBx : A64I_ADDx, r, r,
+		 ofs < 0 ? -ofs : ofs, rset_exclude(RSET_GPR, r));
+}
+
+#define emit_spsub(as, ofs)	emit_addptr(as, RID_SP, -(ofs))
+
diff --git a/src/lj_gdbjit.c b/src/lj_gdbjit.c
index 8b72be7d..8bc2474c 100644
--- a/src/lj_gdbjit.c
+++ b/src/lj_gdbjit.c
@@ -296,6 +296,9 @@ enum {
 #elif LJ_TARGET_ARM
   DW_REG_SP = 13,
   DW_REG_RA = 14,
+#elif LJ_TARGET_ARM64
+  DW_REG_SP = 31,
+  DW_REG_RA = 30,
 #elif LJ_TARGET_PPC
   DW_REG_SP = 1,
   DW_REG_RA = 65,
@@ -374,6 +377,8 @@ static const ELFheader elfhdr_template = {
   .machine = 62,
 #elif LJ_TARGET_ARM
   .machine = 40,
+#elif LJ_TARGET_ARM64
+  .machine = 183,
 #elif LJ_TARGET_PPC
   .machine = 20,
 #elif LJ_TARGET_MIPS
@@ -563,6 +568,13 @@ static void LJ_FASTCALL gdbjit_ehframe(GDBJITctx *ctx)
       int i;
       for (i = 11; i >= 4; i--) { DB(DW_CFA_offset|i); DUV(2+(11-i)); }
     }
+#elif LJ_TARGET_ARM64
+    {
+      int i;
+      DB(DW_CFA_offset|31); DUV(2);
+      for (i = 28; i >= 19; i--) { DB(DW_CFA_offset|i); DUV(3+(28-i)); }
+      for (i = 15; i >= 8; i--) { DB(DW_CFA_offset|32|i); DUV(28-i); }
+    }
 #elif LJ_TARGET_PPC
     {
       int i;
diff --git a/src/lj_target.h b/src/lj_target.h
index abea8d5b..c069eb95 100644
--- a/src/lj_target.h
+++ b/src/lj_target.h
@@ -55,7 +55,7 @@ typedef uint32_t RegSP;
 /* Bitset for registers. 32 registers suffice for most architectures.
 ** Note that one set holds bits for both GPRs and FPRs.
 */
-#if LJ_TARGET_PPC || LJ_TARGET_MIPS
+#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
 typedef uint64_t RegSet;
 #else
 typedef uint32_t RegSet;
@@ -69,7 +69,7 @@ typedef uint32_t RegSet;
 #define rset_set(rs, r)		(rs |= RID2RSET(r))
 #define rset_clear(rs, r)	(rs &= ~RID2RSET(r))
 #define rset_exclude(rs, r)	(rs & ~RID2RSET(r))
-#if LJ_TARGET_PPC || LJ_TARGET_MIPS
+#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
 #define rset_picktop(rs)	((Reg)(__builtin_clzll(rs)^63))
 #define rset_pickbot(rs)	((Reg)__builtin_ctzll(rs))
 #else
diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h
index 57ab134f..0cef06d5 100644
--- a/src/lj_target_arm64.h
+++ b/src/lj_target_arm64.h
@@ -55,7 +55,8 @@ enum {
 
 /* Make use of all registers, except for x18, fp, lr and sp. */
 #define RSET_FIXED \
-  (RID2RSET(RID_X18)|RID2RSET(RID_FP)|RID2RSET(RID_LR)|RID2RSET(RID_SP))
+  (RID2RSET(RID_X18)|RID2RSET(RID_FP)|RID2RSET(RID_LR)|RID2RSET(RID_SP)|\
+   RID2RSET(RID_GL))
 #define RSET_GPR	(RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR) - RSET_FIXED)
 #define RSET_FPR	RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR)
 #define RSET_ALL	(RSET_GPR|RSET_FPR)
@@ -73,25 +74,235 @@ enum {
 #define REGARG_LASTFPR		RID_D7
 #define REGARG_NUMFPR		8
 
+/* -- Spill slots --------------------------------------------------------- */
+
+/* Spill slots are 32 bit wide. An even/odd pair is used for FPRs.
+**
+** SPS_FIXED: Available fixed spill slots in interpreter frame.
+** This definition must match with the vm_arm64.dasc file.
+** Pre-allocate some slots to avoid sp adjust in every root trace.
+**
+** SPS_FIRST: First spill slot for general use. Reserve min. two 32 bit slots.
+*/
+#define SPS_FIXED	4
+#define SPS_FIRST	2
+
+#define SPOFS_TMP	0
+
+#define sps_scale(slot)		(4 * (int32_t)(slot))
+#define sps_align(slot)		(((slot) - SPS_FIXED + 3) & ~3)
+
+/* -- Exit state ---------------------------------------------------------- */
+
+/* This definition must match with the *.dasc file(s). */
+typedef struct {
+  lua_Number fpr[RID_NUM_FPR];	/* Floating-point registers. */
+  intptr_t gpr[RID_NUM_GPR];	/* General-purpose registers. */
+  int32_t spill[256];		/* Spill slots. */
+} ExitState;
+
+/* PC after instruction that caused an exit. Used to find the trace number. */
+#define EXITSTATE_PCREG		RID_LR
+/* Highest exit + 1 indicates stack check. */
+#define EXITSTATE_CHECKEXIT	1
+
+#define EXITSTUB_SPACING	4
+#define EXITSTUBS_PER_GROUP	32
+
+
 /* -- Instructions -------------------------------------------------------- */
 
 /* Instruction fields. */
 #define A64F_D(r)	(r)
-#define A64F_N(r)       ((r) << 5)
-#define A64F_A(r)       ((r) << 10)
-#define A64F_M(r)       ((r) << 16)
+#define A64F_N(r)	((r) << 5)
+#define A64F_A(r)	((r) << 10)
+#define A64F_M(r)	((r) << 16)
+#define A64F_IMMS(x)	((x) << 10)
+#define A64F_IMMR(x)	((x) << 16)
 #define A64F_U16(x)	((x) << 5)
+#define A64F_U12(x)	((x) << 10)
 #define A64F_S26(x)	(x)
 #define A64F_S19(x)	((x) << 5)
+#define A64F_S9(x)	((x) << 12)
+#define A64F_SH(sh, x)	(((sh) << 22) | ((x) << 10))
+#define A64F_EX(ex)	(A64I_EX | ((ex) << 13))
+#define A64F_EXSH(ex,x)	(A64I_EX | ((ex) << 13) | ((x) << 10))
+#define A64F_FP8(x)	((x) << 13)
+#define A64F_CC(cc)	((cc) << 12)
+#define A64F_LSL16(x)	(((x) / 16) << 21)
+#define A64F_BSH(sh)	((sh) << 10)
 
 typedef enum A64Ins {
+  A64I_S = 0x20000000,
+  A64I_X = 0x80000000,
+  A64I_EX = 0x00200000,
+  A64I_K12 = 0x1a000000,
+  A64I_K13 = 0x18000000,
+  A64I_LS_U = 0x01000000,
+  A64I_LS_S = 0x00800000,
+  A64I_LS_R = 0x01200800,
+  A64I_LS_UXTWx = 0x00005000,
+  A64I_LS_LSLx = 0x00007000,
+
+  A64I_ADDw = 0x0b000000,
+  A64I_ADDx = 0x8b000000,
+  A64I_ADDSw = 0x2b000000,
+  A64I_ADDSx = 0xab000000,
+  A64I_NEGw = 0x4b0003e0,
+  A64I_NEGx = 0xcb0003e0,
+  A64I_SUBw = 0x4b000000,
+  A64I_SUBx = 0xcb000000,
+  A64I_SUBSw = 0x6b000000,
+  A64I_SUBSx = 0xeb000000,
+
+  A64I_MULw = 0x1b007c00,
+  A64I_MULx = 0x9b007c00,
+  A64I_SMULL = 0x9b207c00,
+
+  A64I_ANDw = 0x0a000000,
+  A64I_ANDx = 0x8a000000,
+  A64I_ANDSw = 0x6a000000,
+  A64I_ANDSx = 0xea000000,
+  A64I_EORw = 0x4a000000,
+  A64I_EORx = 0xca000000,
+  A64I_ORRw = 0x2a000000,
+  A64I_ORRx = 0xaa000000,
+  A64I_TSTw  = 0x6a00001f,
+  A64I_TSTx  = 0xea00001f,
+
+  A64I_CMPw = 0x6b00001f,
+  A64I_CMPx = 0xeb00001f,
+  A64I_CMNw = 0x2b00001f,
+  A64I_CMNx = 0xab00001f,
+  A64I_CCMPw = 0x7a400000,
+  A64I_CCMPx = 0xfa400000,
+  A64I_CSELw = 0x1a800000,
+  A64I_CSELx = 0x9a800000,
+
+  A64I_ASRw = 0x13007c00,
+  A64I_ASRx = 0x9340fc00,
+  A64I_LSLx = 0xd3400000,
+  A64I_LSRx = 0xd340fc00,
+  A64I_SHRw = 0x1ac02000,
+  A64I_SHRx = 0x9ac02000,	/* lsl/lsr/asr/ror x0, x0, x0 */
+  A64I_REVw = 0x5ac00800,
+  A64I_REVx = 0xdac00c00,
+
+  A64I_EXTRw = 0x13800000,
+  A64I_EXTRx = 0x93c00000,
+  A64I_SBFMw = 0x13000000,
+  A64I_SBFMx = 0x93400000,
+  A64I_SXTBw = 0x13001c00,
+  A64I_SXTHw = 0x13003c00,
+  A64I_SXTW = 0x93407c00,
+  A64I_UBFMw = 0x53000000,
+  A64I_UBFMx = 0xd3400000,
+  A64I_UXTBw = 0x53001c00,
+  A64I_UXTHw = 0x53003c00,
+
+  A64I_MOVw = 0x2a0003e0,
+  A64I_MOVx = 0xaa0003e0,
+  A64I_MVNw = 0x2a2003e0,
+  A64I_MVNx = 0xaa2003e0,
+  A64I_MOVKw = 0x72800000,
+  A64I_MOVKx = 0xf2800000,
   A64I_MOVZw = 0x52800000,
   A64I_MOVZx = 0xd2800000,
+  A64I_MOVNw = 0x12800000,
+  A64I_MOVNx = 0x92800000,
+
+  A64I_LDRB = 0x39400000,
+  A64I_LDRH = 0x79400000,
+  A64I_LDRw = 0xb9400000,
+  A64I_LDRx = 0xf9400000,
   A64I_LDRLw = 0x18000000,
   A64I_LDRLx = 0x58000000,
-  A64I_NOP = 0xd503201f,
+  A64I_STRB = 0x39000000,
+  A64I_STRH = 0x79000000,
+  A64I_STRw = 0xb9000000,
+  A64I_STRx = 0xf9000000,
+  A64I_STPw = 0x29000000,
+  A64I_STPx = 0xa9000000,
+  A64I_LDPw = 0x29400000,
+  A64I_LDPx = 0xa9400000,
+
   A64I_B = 0x14000000,
+  A64I_BCC = 0x54000000,
+  A64I_BL = 0x94000000,
   A64I_BR = 0xd61f0000,
+  A64I_BLR = 0xd63f0000,
+
+  A64I_NOP = 0xd503201f,
+
+  /* FP */
+  A64I_FADDd = 0x1e602800,
+  A64I_FSUBd = 0x1e603800,
+  A64I_FMADDd = 0x1f400000,
+  A64I_FMSUBd = 0x1f408000,
+  A64I_FNMADDd = 0x1f600000,
+  A64I_FNMSUBd = 0x1f608000,
+  A64I_FMULd = 0x1e600800,
+  A64I_FDIVd = 0x1e601800,
+  A64I_FNEGd = 0x1e614000,
+  A64I_FABS = 0x1e60c000,
+  A64I_FSQRTd = 0x1e61c000,
+  A64I_LDRs = 0xbd400000,
+  A64I_LDRd = 0xfd400000,
+  A64I_STRs = 0xbd000000,
+  A64I_STRd = 0xfd000000,
+  A64I_LDPs = 0x2d400000,
+  A64I_LDPd = 0x6d400000,
+  A64I_STPs = 0x2d000000,
+  A64I_STPd = 0x6d000000,
+  A64I_FCMPd = 0x1e602000,
+  A64I_FCMPZd = 0x1e602008,
+  A64I_FCSELd = 0x1e600c00,
+  A64I_FRINTMd = 0x1e654000,
+  A64I_FRINTPd = 0x1e64c000,
+  A64I_FRINTZd = 0x1e65c000,
+
+  A64I_FCVT_F32_F64 = 0x1e624000,
+  A64I_FCVT_F64_F32 = 0x1e22c000,
+  A64I_FCVT_F32_S32 = 0x1e220000,
+  A64I_FCVT_F64_S32 = 0x1e620000,
+  A64I_FCVT_F32_U32 = 0x1e230000,
+  A64I_FCVT_F64_U32 = 0x1e630000,
+  A64I_FCVT_F32_S64 = 0x9e220000,
+  A64I_FCVT_F64_S64 = 0x9e620000,
+  A64I_FCVT_F32_U64 = 0x9e230000,
+  A64I_FCVT_F64_U64 = 0x9e630000,
+  A64I_FCVT_S32_F64 = 0x1e780000,
+  A64I_FCVT_S32_F32 = 0x1e380000,
+  A64I_FCVT_U32_F64 = 0x1e790000,
+  A64I_FCVT_U32_F32 = 0x1e390000,
+  A64I_FCVT_S64_F64 = 0x9e780000,
+  A64I_FCVT_S64_F32 = 0x9e380000,
+  A64I_FCVT_U64_F64 = 0x9e790000,
+  A64I_FCVT_U64_F32 = 0x9e390000,
+
+  A64I_FMOV_S = 0x1e204000,
+  A64I_FMOV_D = 0x1e604000,
+  A64I_FMOV_R_S = 0x1e260000,
+  A64I_FMOV_S_R = 0x1e270000,
+  A64I_FMOV_R_D = 0x9e660000,
+  A64I_FMOV_D_R = 0x9e670000,
+  A64I_FMOV_DI = 0x1e601000,
 } A64Ins;
 
+typedef enum A64Shift {
+  A64SH_LSL, A64SH_LSR, A64SH_ASR, A64SH_ROR
+} A64Shift;
+
+typedef enum A64Extend {
+  A64EX_UXTB, A64EX_UXTH, A64EX_UXTW, A64EX_UXTX,
+  A64EX_SXTB, A64EX_SXTH, A64EX_SXTW, A64EX_SXTX,
+} A64Extend;
+
+/* ARM condition codes. */
+typedef enum A64CC {
+  CC_EQ, CC_NE, CC_CS, CC_CC, CC_MI, CC_PL, CC_VS, CC_VC,
+  CC_HI, CC_LS, CC_GE, CC_LT, CC_GT, CC_LE, CC_AL,
+  CC_HS = CC_CS, CC_LO = CC_CC
+} A64CC;
+
 #endif
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc
index 7a881bdd..a6227bf7 100644
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -236,12 +236,17 @@
 |.macro mov_false, reg; movn reg, #0x8000, lsl #32; .endmacro
 |.macro mov_true, reg; movn reg, #0x0001, lsl #48; .endmacro
 |
-#define GL_J(field)	(GG_OFS(J) + (int)offsetof(jit_State, field))
+#define GL_J(field)	(GG_G2J + (int)offsetof(jit_State, field))
 |
 #define PC2PROTO(field)  ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
 |
 |.macro hotcheck, delta
-|  NYI
+|  lsr CARG1, PC, #1
+|  and CARG1, CARG1, #126
+|  add CARG1, CARG1, #GG_G2DISP+GG_DISP2HOT
+|  ldrh CARG2w, [GL, CARG1]
+|  subs CARG2, CARG2, #delta
+|  strh CARG2w, [GL, CARG1]
 |.endmacro
 |
 |.macro hotloop
@@ -869,7 +874,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  bl extern lj_meta_for	// (lua_State *L, TValue *base)
   |  ldr INSw, [PC, #-4]
   |.if JIT
-  |   uxtb TMP0, INS
+  |   uxtb TMP0w, INSw
   |.endif
   |  decode_RA RA, INS
   |  decode_RD RC, INS
@@ -1732,7 +1737,20 @@ static void build_subroutines(BuildCtx *ctx)
   |//-----------------------------------------------------------------------
   |
   |->vm_record:				// Dispatch target for recording phase.
-  |  NYI
+  |.if JIT
+  |  ldrb CARG1w, GL->hookmask
+  |  tst CARG1, #HOOK_VMEVENT		// No recording while in vmevent.
+  |  bne >5
+  |  // Decrement the hookcount for consistency, but always do the call.
+  |   ldr CARG2w, GL->hookcount
+  |  tst CARG1, #HOOK_ACTIVE
+  |  bne >1
+  |   sub CARG2w, CARG2w, #1
+  |  tst CARG1, #LUA_MASKLINE|LUA_MASKCOUNT
+  |  beq >1
+  |   str CARG2w, GL->hookcount
+  |  b >1
+  |.endif
   |
   |->vm_rethook:			// Dispatch target for return hooks.
   |  ldrb TMP2w, GL->hookmask
@@ -1774,7 +1792,21 @@ static void build_subroutines(BuildCtx *ctx)
   |  b <4
   |
   |->vm_hotloop:			// Hot loop counter underflow.
-  |  NYI
+  |.if JIT
+  |  ldr LFUNC:CARG3, [BASE, FRAME_FUNC]  // Same as curr_topL(L).
+  |   add CARG1, GL, #GG_G2DISP+GG_DISP2J
+  |  and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+  |   str PC, SAVE_PC
+  |  ldr CARG3, LFUNC:CARG3->pc
+  |   mov CARG2, PC
+  |   str L, [GL, #GL_J(L)]
+  |  ldrb CARG3w, [CARG3, #PC2PROTO(framesize)]
+  |   str BASE, L->base
+  |  add CARG3, BASE, CARG3, lsl #3
+  |  str CARG3, L->top
+  |  bl extern lj_trace_hot		// (jit_State *J, const BCIns *pc)
+  |  b <3
+  |.endif
   |
   |->vm_callhook:			// Dispatch target for call hooks.
   |  mov CARG2, PC
@@ -1804,7 +1836,54 @@ static void build_subroutines(BuildCtx *ctx)
   |  br CRET1
   |
   |->cont_stitch:			// Trace stitching.
-  |  NYI
+  |.if JIT
+  |  // RA = resultptr, CARG4 = meta base
+  |   ldr RB, SAVE_MULTRES
+  |  ldr INSw, [PC, #-4]
+  |    ldr TRACE:CARG3, [CARG4, #-40]	// Save previous trace.
+  |   subs RB, RB, #8
+  |  decode_RA RC, INS			// Call base.
+  |    and CARG3, CARG3, #LJ_GCVMASK
+  |   beq >2
+  |1:  // Move results down.
+  |  ldr CARG1, [RA]
+  |    add RA, RA, #8
+  |   subs RB, RB, #8
+  |  str CARG1, [BASE, RC, lsl #3]
+  |    add RC, RC, #1
+  |   bne <1
+  |2:
+  |   decode_RA RA, INS
+  |   decode_RB RB, INS
+  |   add RA, RA, RB
+  |3:
+  |   cmp RA, RC
+  |   bhi >9				// More results wanted?
+  |
+  |  ldrh RAw, TRACE:CARG3->traceno
+  |  ldrh RCw, TRACE:CARG3->link
+  |  cmp RCw, RAw
+  |  beq ->cont_nop			// Blacklisted.
+  |  cmp RCw, #0
+  |  bne =>BC_JLOOP			// Jump to stitched trace.
+  |
+  |  // Stitch a new trace to the previous trace.
+  |  mov CARG1, #GL_J(exitno)
+  |  str RA, [GL, CARG1]
+  |  mov CARG1, #GL_J(L)
+  |  str L, [GL, CARG1]
+  |  str BASE, L->base
+  |  add CARG1, GL, #GG_G2J
+  |  mov CARG2, PC
+  |  bl extern lj_dispatch_stitch	// (jit_State *J, const BCIns *pc)
+  |  ldr BASE, L->base
+  |  b ->cont_nop
+  |
+  |9:  // Fill up results with nil.
+  |  str TISNIL, [BASE, RC, lsl #3]
+  |  add RC, RC, #1
+  |  b <3
+  |.endif
   |
   |->vm_profhook:			// Dispatch target for profiler hook.
 #if LJ_HASPROFILE
@@ -1822,10 +1901,120 @@ static void build_subroutines(BuildCtx *ctx)
   |//-- Trace exit handler -------------------------------------------------
   |//-----------------------------------------------------------------------
   |
+  |.macro savex_, a, b
+  |  stp d..a, d..b, [sp, #a*8]
+  |  stp x..a, x..b, [sp, #32*8+a*8]
+  |.endmacro
+  |
   |->vm_exit_handler:
-  |  NYI
+  |.if JIT
+  |  sub     sp, sp, #(64*8)
+  |  savex_, 0, 1
+  |  savex_, 2, 3
+  |  savex_, 4, 5
+  |  savex_, 6, 7
+  |  savex_, 8, 9
+  |  savex_, 10, 11
+  |  savex_, 12, 13
+  |  savex_, 14, 15
+  |  savex_, 16, 17
+  |  savex_, 18, 19
+  |  savex_, 20, 21
+  |  savex_, 22, 23
+  |  savex_, 24, 25
+  |  savex_, 26, 27
+  |  savex_, 28, 29
+  |  stp d30, d31, [sp, #30*8]
+  |  ldr CARG1, [sp, #64*8]	// Load original value of lr.
+  |   add CARG3, sp, #64*8	// Recompute original value of sp.
+  |   mv_vmstate CARG4, EXIT
+  |  ldr CARG2w, [CARG1, #-4]!	// Get exit instruction.
+  |   stp CARG1, CARG3, [sp, #62*8]	// Store exit pc/sp in RID_LR/RID_SP.
+  |  lsl CARG2, CARG2, #38
+  |  add CARG1, CARG1, CARG2, asr #36
+  |   ldr CARG2w, [lr]		// Load exit stub group offset.
+  |   sub CARG1, CARG1, lr
+  |   sub CARG1, CARG1, #4
+  |  ldr L, GL->cur_L
+  |   add CARG1, CARG2, CARG1, lsr #2	// Compute exit number.
+  |    ldr BASE, GL->jit_base
+  |   st_vmstate CARG4
+  |   str CARG1w, [GL, #GL_J(exitno)]
+  |    str BASE, L->base
+  |  str L, [GL, #GL_J(L)]
+  |   str xzr, GL->jit_base
+  |  add CARG1, GL, #GG_G2J
+  |  mov CARG2, sp
+  |  bl extern lj_trace_exit		// (jit_State *J, ExitState *ex)
+  |  // Returns MULTRES (unscaled) or negated error code.
+  |  ldr CARG2, L->cframe
+  |   ldr BASE, L->base
+  |  and sp, CARG2, #CFRAME_RAWMASK
+  |   ldr PC, SAVE_PC			// Get SAVE_PC.
+  |  str L, SAVE_L			// Set SAVE_L (on-trace resume/yield).
+  |  b >1
+  |.endif
+  |
   |->vm_exit_interp:
-  |  NYI
+  |  // CARG1 = MULTRES or negated error code, BASE, PC and GL set.
+  |.if JIT
+  |  ldr L, SAVE_L
+  |1:
+  |  cmp CARG1w, #0
+  |  blt >9				// Check for error from exit.
+  |   lsl RC, CARG1, #3
+  |  ldr LFUNC:CARG2, [BASE, FRAME_FUNC]
+  |    movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48
+  |    movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16
+  |    movn TISNIL, #0
+  |  and LFUNC:CARG2, CARG2, #LJ_GCVMASK
+  |   str RC, SAVE_MULTRES
+  |   str BASE, L->base
+  |  ldr CARG2, LFUNC:CARG2->pc
+  |   str xzr, GL->jit_base
+  |    mv_vmstate CARG4, INTERP
+  |  ldr KBASE, [CARG2, #PC2PROTO(k)]
+  |  // Modified copy of ins_next which handles function header dispatch, too.
+  |  ldrb RBw, [PC]
+  |   ldr INSw, [PC], #4
+  |    st_vmstate CARG4
+  |  cmp RBw, #BC_FUNCC+2		// Fast function?
+  |   add TMP1, GL, INS, uxtb #3
+  |  bhs >4
+  |2:
+  |  cmp RBw, #BC_FUNCF			// Function header?
+  |  add TMP0, GL, RB, uxtb #3
+  |  ldr RB, [TMP0, #GG_G2DISP]
+  |   decode_RA RA, INS
+  |   lsr TMP0, INS, #16
+  |   csel RC, TMP0, RC, lo
+  |   blo >5
+  |   ldr CARG3, [BASE, FRAME_FUNC]
+  |   sub RC, RC, #8
+  |   add RA, BASE, RA, lsl #3	// Yes: RA = BASE+framesize*8, RC = nargs*8
+  |   and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+  |5:
+  |  br RB
+  |
+  |4:  // Check frame below fast function.
+  |  ldr CARG1, [BASE, FRAME_PC]
+  |  ands CARG2, CARG1, #FRAME_TYPE
+  |  bne <2			// Trace stitching continuation?
+  |  // Otherwise set KBASE for Lua function below fast function.
+  |  ldr CARG3, [CARG1, #-4]
+  |  decode_RA CARG1, CARG3
+  |  sub CARG2, BASE, CARG1, lsl #3
+  |  ldr LFUNC:CARG3, [CARG2, #-32]
+  |  and LFUNC:CARG3, CARG3, #LJ_GCVMASK
+  |  ldr CARG3, LFUNC:CARG3->pc
+  |  ldr KBASE, [CARG3, #PC2PROTO(k)]
+  |  b <2
+  |
+  |9:  // Rethrow error from the right C frame.
+  |  neg CARG2, CARG1
+  |  mov CARG1, L
+  |  bl extern lj_err_throw		// (lua_State *L, int errcode)
+  |.endif
   |
   |//-----------------------------------------------------------------------
   |//-- Math helper functions ----------------------------------------------
@@ -3387,6 +3576,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     if (op == BC_FORI) {
       |  csel PC, RC, PC, gt
     } else if (op == BC_JFORI) {
+      |  mov PC, RC
       |  ldrh RCw, [RC, #-2]
     } else if (op == BC_IFORL) {
       |  csel PC, RC, PC, le
@@ -3488,7 +3678,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
 
   case BC_JLOOP:
     |.if JIT
-    |  NYI
+    |  // RA = base (ignored), RC = traceno
+    |  ldr CARG1, [GL, #GL_J(trace)]
+    |   mov CARG2, #0  // Traces on ARM64 don't store the trace #, so use 0.
+    |  ldr TRACE:RC, [CARG1, RC, lsl #3]
+    |   st_vmstate CARG2
+    |  ldr RA, TRACE:RC->mcode
+    |   str BASE, GL->jit_base
+    |   str L, GL->tmpbuf.L
+    |  sub sp, sp, #16	// See SPS_FIXED. Avoids sp adjust in every root trace.
+    |  br RA
     |.endif
     break;
 
@@ -3546,10 +3745,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
   case BC_IFUNCV:
     |  // BASE = new base, RA = BASE+framesize*8, CARG3 = LFUNC, RC = nargs*8
     |  ldr CARG1, L->maxstack
+    |   movn TMP0, #~LJ_TFUNC
     |   add TMP2, BASE, RC
+    |   add LFUNC:CARG3, CARG3, TMP0, lsl #47
     |  add RA, RA, RC
     |   add TMP0, RC, #16+FRAME_VARG
-    |   str LFUNC:CARG3, [TMP2], #8	// Store (untagged) copy of LFUNC.
+    |   str LFUNC:CARG3, [TMP2], #8	// Store (tagged) copy of LFUNC.
     |    ldr KBASE, [PC, #-4+PC2PROTO(k)]
     |  cmp RA, CARG1
     |   str TMP0, [TMP2], #8		// Store delta + FRAME_VARG.
@@ -3736,8 +3937,8 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.uleb128 0x1\n"
 	"\t.sleb128 -8\n"
 	"\t.byte 30\n"				/* Return address is in lr. */
-	"\t.uleb128 1\n"                        /* augmentation length */
-	"\t.byte 0x1b\n"                        /* pcrel|sdata4 */
+	"\t.uleb128 1\n"			/* augmentation length */
+	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
 	"\t.byte 0xc\n\t.uleb128 31\n\t.uleb128 0\n"	/* def_cfa sp */
 	"\t.align 3\n"
 	".LECIE2:\n\n");
@@ -3748,7 +3949,7 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.long .LASFDE3-.Lframe2\n"
 	"\t.long lj_vm_ffi_call-.\n"
 	"\t.long %d\n"
-	"\t.uleb128 0\n"                        /* augmentation length */
+	"\t.uleb128 0\n"			/* augmentation length */
 	"\t.byte 0xe\n\t.uleb128 32\n"		/* def_cfa_offset */
 	"\t.byte 0x9d\n\t.uleb128 4\n"		/* offset fp */
 	"\t.byte 0x9e\n\t.uleb128 3\n"		/* offset lr */

From 7a0c3a1127229788fa4cfc436e1b3cbee36b7eb4 Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Sun, 20 Nov 2016 23:17:45 +0100
Subject: [PATCH 007/116] ARM64: Allow full VA range for mcode allocation.

---
 src/lj_mcode.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/lj_mcode.c b/src/lj_mcode.c
index 3eaee054..de785a09 100644
--- a/src/lj_mcode.c
+++ b/src/lj_mcode.c
@@ -206,6 +206,9 @@ static void mcode_protect(jit_State *J, int prot)
 
 #if LJ_TARGET_X64
 #define mcode_validptr(p)	((p) && (uintptr_t)(p) < (uintptr_t)1<<47)
+#elif LJ_TARGET_ARM64
+/* We have no clue about the valid VA range. It could be 39 - 52 bits. */
+#define mcode_validptr(p)	(p)
 #else
 #define mcode_validptr(p)	((p) && (uintptr_t)(p) < 0xffff0000)
 #endif

From 2b77da35bc77e2d34062d9168884095d9145a993 Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Sun, 20 Nov 2016 23:32:17 +0100
Subject: [PATCH 008/116] ARM64: Reject special case in emit_isk13().

---
 src/lj_emit_arm64.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h
index eb8f7fc7..52e75559 100644
--- a/src/lj_emit_arm64.h
+++ b/src/lj_emit_arm64.h
@@ -46,7 +46,9 @@ static uint32_t emit_isk13(uint64_t n, int is64)
   if (!n) return 0;  /* Neither all-zero nor all-ones are allowed. */
   do {  /* Find the repeat width. */
     if (is64 && (uint32_t)(n^(n>>32))) break;
-    n = (uint32_t)n; w = 32; if ((n^(n>>16)) & 0xffff) break;
+    n = (uint32_t)n;
+    if (!n) return 0;  /* Ditto when passing n=0xffffffff and is64=0. */
+    w = 32; if ((n^(n>>16)) & 0xffff) break;
     n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break;
     n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break;
     n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break;

From a56654460d9ca636536c8204a358207a698e625b Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Mon, 21 Nov 2016 15:43:17 +0100
Subject: [PATCH 009/116] Generalize deferred constant handling in backend to
 64 bit.

---
 src/lj_asm.c       | 34 ++++++++++++++++++++++++++++++----
 src/lj_emit_arm.h  |  2 +-
 src/lj_emit_mips.h |  4 ++--
 src/lj_emit_ppc.h  |  2 +-
 4 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/src/lj_asm.c b/src/lj_asm.c
index 2cb5abea..f0a11ca8 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -91,7 +91,7 @@ typedef struct ASMState {
   MCode *realign;	/* Realign loop if not NULL. */
 
 #ifdef RID_NUM_KREF
-  int32_t krefk[RID_NUM_KREF];
+  intptr_t krefk[RID_NUM_KREF];
 #endif
   IRRef1 phireg[RID_MAX];  /* PHI register references. */
   uint16_t parentmap[LJ_MAX_JSLOTS];  /* Parent instruction to RegSP map. */
@@ -144,7 +144,7 @@ static LJ_AINLINE void checkmclim(ASMState *as)
 #define ra_krefreg(ref)		((Reg)(RID_MIN_KREF + (Reg)(ref)))
 #define ra_krefk(as, ref)	(as->krefk[(ref)])
 
-static LJ_AINLINE void ra_setkref(ASMState *as, Reg r, int32_t k)
+static LJ_AINLINE void ra_setkref(ASMState *as, Reg r, intptr_t k)
 {
   IRRef ref = (IRRef)(r - RID_MIN_KREF);
   as->krefk[ref] = k;
@@ -324,7 +324,11 @@ static Reg ra_rematk(ASMState *as, IRRef ref)
     lua_assert(!rset_test(as->freeset, r));
     ra_free(as, r);
     ra_modified(as, r);
+#if LJ_64
+    emit_loadu64(as, r, ra_krefk(as, ref));
+#else
     emit_loadi(as, r, ra_krefk(as, ref));
+#endif
     return r;
   }
   ir = IR(ref);
@@ -526,7 +530,7 @@ static void ra_evictk(ASMState *as)
 
 #ifdef RID_NUM_KREF
 /* Allocate a register for a constant. */
-static Reg ra_allock(ASMState *as, int32_t k, RegSet allow)
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow)
 {
   /* First try to find a register which already holds the same constant. */
   RegSet pick, work = ~as->freeset & RSET_GPR;
@@ -535,9 +539,31 @@ static Reg ra_allock(ASMState *as, int32_t k, RegSet allow)
     IRRef ref;
     r = rset_pickbot(work);
     ref = regcost_ref(as->cost[r]);
+#if LJ_64
+    if (ref < ASMREF_L) {
+      if (ra_iskref(ref)) {
+	if (k == ra_krefk(as, ref))
+	  return r;
+      } else {
+	IRIns *ir = IR(ref);
+	if ((ir->o == IR_KINT64 && k == (int64_t)ir_kint64(ir)->u64) ||
+#if LJ_GC64
+	    (ir->o == IR_KINT && k == ir->i) ||
+	    (ir->o == IR_KGC && k == (intptr_t)ir_kgc(ir)) ||
+	    ((ir->o == IR_KPTR || ir->o == IR_KKPTR) &&
+	     k == (intptr_t)ir_kptr(ir))
+#else
+	    (ir->o != IR_KINT64 && k == ir->i)
+#endif
+	   )
+	  return r;
+      }
+    }
+#else
     if (ref < ASMREF_L &&
 	k == (ra_iskref(ref) ? ra_krefk(as, ref) : IR(ref)->i))
       return r;
+#endif
     rset_clear(work, r);
   }
   pick = as->freeset & allow;
@@ -557,7 +583,7 @@ static Reg ra_allock(ASMState *as, int32_t k, RegSet allow)
 }
 
 /* Allocate a specific register for a constant. */
-static void ra_allockreg(ASMState *as, int32_t k, Reg r)
+static void ra_allockreg(ASMState *as, intptr_t k, Reg r)
 {
   Reg kr = ra_allock(as, k, RID2RSET(r));
   if (kr != r) {
diff --git a/src/lj_emit_arm.h b/src/lj_emit_arm.h
index 496b37a3..f7c93d84 100644
--- a/src/lj_emit_arm.h
+++ b/src/lj_emit_arm.h
@@ -207,7 +207,7 @@ static void emit_loadi(ASMState *as, Reg r, int32_t i)
 
 #define emit_loada(as, r, addr)		emit_loadi(as, (r), i32ptr((addr)))
 
-static Reg ra_allock(ASMState *as, int32_t k, RegSet allow);
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
 
 /* Get/set from constant pointer. */
 static void emit_lsptr(ASMState *as, ARMIns ai, Reg r, void *p)
diff --git a/src/lj_emit_mips.h b/src/lj_emit_mips.h
index d35f830b..93c35ea4 100644
--- a/src/lj_emit_mips.h
+++ b/src/lj_emit_mips.h
@@ -94,8 +94,8 @@ static void emit_loadi(ASMState *as, Reg r, int32_t i)
 
 #define emit_loada(as, r, addr)		emit_loadi(as, (r), i32ptr((addr)))
 
-static Reg ra_allock(ASMState *as, int32_t k, RegSet allow);
-static void ra_allockreg(ASMState *as, int32_t k, Reg r);
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
+static void ra_allockreg(ASMState *as, intptr_t k, Reg r);
 
 /* Get/set from constant pointer. */
 static void emit_lsptr(ASMState *as, MIPSIns mi, Reg r, void *p, RegSet allow)
diff --git a/src/lj_emit_ppc.h b/src/lj_emit_ppc.h
index 5163012a..34a6efb7 100644
--- a/src/lj_emit_ppc.h
+++ b/src/lj_emit_ppc.h
@@ -98,7 +98,7 @@ static void emit_loadi(ASMState *as, Reg r, int32_t i)
 
 #define emit_loada(as, r, addr)		emit_loadi(as, (r), i32ptr((addr)))
 
-static Reg ra_allock(ASMState *as, int32_t k, RegSet allow);
+static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
 
 /* Get/set from constant pointer. */
 static void emit_lsptr(ASMState *as, PPCIns pi, Reg r, void *p, RegSet allow)

From c3cae04153213d9779c5563056b017a19da2d283 Mon Sep 17 00:00:00 2001
From: Mike Pall 
Date: Mon, 21 Nov 2016 16:02:10 +0100
Subject: [PATCH 010/116] Update contact info.

---
 doc/contact.html | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/doc/contact.html b/doc/contact.html
index 9f358805..bb81cf06 100644
--- a/doc/contact.html
+++ b/doc/contact.html
@@ -59,8 +59,15 @@
 
 

+If you want to report bugs, propose fixes or suggest enhancements, +please use the +GitHub issue tracker. +

+

Please send general questions to the » LuaJIT mailing list. +

+

You can also send any questions you have directly to me:

From 81259898ea177bb7b4becebf3d7686603f6b373b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 24 Nov 2016 18:56:19 +0100 Subject: [PATCH 011/116] ARM64: Emit more efficient trace exits. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. --- src/lj_asm_arm64.h | 73 +++++++++++++++++++------------------------ src/lj_target_arm64.h | 14 ++++++--- src/vm_arm64.dasc | 31 +++++++++--------- 3 files changed, 57 insertions(+), 61 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 0a2f5306..19b3331d 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -47,53 +47,41 @@ static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow) /* -- Guard handling ------------------------------------------------------ */ -/* Generate an exit stub group at the bottom of the reserved MCode memory. */ -static MCode *asm_exitstub_gen(ASMState *as, ExitNo group) -{ - MCode *mxp = as->mcbot; - int i; - if (mxp + 3*4+4*EXITSTUBS_PER_GROUP >= as->mctop) - asm_mclimit(as); - /* str lr, [sp]; bl ->vm_exit_handler; .long group. */ - *mxp++ = A64I_STRx | A64F_D(RID_LR) | A64F_N(RID_SP); - *mxp = A64I_BL | (((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu); - mxp++; - *mxp++ = group*EXITSTUBS_PER_GROUP; - for (i = 0; i < EXITSTUBS_PER_GROUP; i++) - *mxp++ = A64I_B | ((-3-i)&0x03ffffffu); - lj_mcode_sync(as->mcbot, mxp); - lj_mcode_commitbot(as->J, mxp); - as->mcbot = mxp; - as->mclim = as->mcbot + MCLIM_REDZONE; - return mxp - EXITSTUBS_PER_GROUP; -} - /* Setup all needed exit stubs. 
*/ static void asm_exitstub_setup(ASMState *as, ExitNo nexits) { ExitNo i; - if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR) - lj_trace_err(as->J, LJ_TRERR_SNAPOV); - for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++) - if (as->J->exitstubgroup[i] == NULL) - as->J->exitstubgroup[i] = asm_exitstub_gen(as, i); + MCode *mxp = as->mctop; + if (mxp - (nexits + 3 + MCLIM_REDZONE) < as->mclim) + asm_mclimit(as); + /* 1: str lr,[sp]; bl ->vm_exit_handler; movz w0,traceno; bl <1; bl <1; ... */ + for (i = nexits-1; (int32_t)i >= 0; i--) + *--mxp = A64I_BL|((-3-i)&0x03ffffffu); + *--mxp = A64I_MOVZw|A64F_U16(as->T->traceno); + mxp--; + *mxp = A64I_BL|(((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu); + *--mxp = A64I_STRx|A64F_D(RID_LR)|A64F_N(RID_SP); + as->mctop = mxp; +} + +static MCode *asm_exitstub_addr(ASMState *as, ExitNo exitno) +{ + /* Keep this in-sync with exitstub_trace_addr(). */ + return as->mctop + exitno + 3; } /* Emit conditional branch to exit for guard. */ static void asm_guardcc(ASMState *as, A64CC cc) { - MCode *target = exitstub_addr(as->J, as->snapno); + MCode *target = asm_exitstub_addr(as, as->snapno); MCode *p = as->mcp; if (LJ_UNLIKELY(p == as->invmcp)) { as->loopinv = 1; - *p = A64I_BL | ((target-p) & 0x03ffffffu); + *p = A64I_B | ((target-p) & 0x03ffffffu); emit_cond_branch(as, cc^1, p-1); return; } - /* No conditional calls. Emit b.cc/bl instead. */ - /* That's a bad idea. NYI: emit per-trace exit stubs instead, see PPC. 
*/ - emit_branch(as, A64I_BL, target); - emit_cond_branch(as, cc^1, p); + emit_cond_branch(as, cc, target); } /* -- Operand fusion ------------------------------------------------------ */ @@ -1568,8 +1556,7 @@ static void asm_stack_check(ASMState *as, BCReg topslot, } else { pbase = RID_BASE; } - emit_branch(as, A64I_BL, exitstub_addr(as->J, exitno)); - emit_cond_branch(as, CC_LS^1, as->mcp+1); + emit_cond_branch(as, CC_LS, asm_exitstub_addr(as, exitno)); k = emit_isk12((8*topslot)); lua_assert(k); emit_n(as, A64I_CMPx^k, RID_TMP); @@ -1744,7 +1731,8 @@ static void asm_tail_fixup(ASMState *as, TraceNo lnk) /* Undo the sp adjustment in BC_JLOOP when exiting to the interpreter. */ int32_t spadj = as->T->spadjust + (lnk ? 0 : sps_scale(SPS_FIXED)); if (spadj == 0) { - as->mctop = --p; + *--p = A64I_NOP; + as->mctop = p; } else { /* Patch stack adjustment. */ uint32_t k = emit_isk12(spadj); @@ -1805,13 +1793,18 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) MCode *pe = (MCode *)((char *)p + T->szmcode); MCode *cstart = NULL, *cend = p; MCode *mcarea = lj_mcode_patch(J, p, 0); - MCode *px = exitstub_addr(J, exitno); + MCode *px = exitstub_trace_addr(T, exitno); for (; p < pe; p++) { - /* Look for bl exitstub, replace with b target. */ + /* Look for bcc/b exitstub, replace with bcc/b target. 
*/ uint32_t ins = *p; - if ((ins & 0xfc000000u) == 0x94000000u && - ((ins ^ (px-p)) & 0x03ffffffu) == 0) { - *p = (ins & 0x7c000000u) | ((target-p) & 0x03ffffffu); + if ((ins & 0xff000000u) == 0x54000000u && + ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) { + *p = (ins & 0xff00001fu) | (((target-p)<<5) & 0x00ffffe0u); + cend = p+1; + if (!cstart) cstart = p; + } else if ((ins & 0xfc000000u) == 0x14000000u && + ((ins ^ (px-p)) & 0x03ffffffu) == 0) { + *p = (ins & 0xfc000000u) | ((target-p) & 0x03ffffffu); cend = p+1; if (!cstart) cstart = p; } diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index 0cef06d5..1cd02fe8 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -101,14 +101,18 @@ typedef struct { int32_t spill[256]; /* Spill slots. */ } ExitState; -/* PC after instruction that caused an exit. Used to find the trace number. */ -#define EXITSTATE_PCREG RID_LR /* Highest exit + 1 indicates stack check. */ #define EXITSTATE_CHECKEXIT 1 -#define EXITSTUB_SPACING 4 -#define EXITSTUBS_PER_GROUP 32 - +/* Return the address of a per-trace exit stub. */ +static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno) +{ + while (*p == 0xd503201f) p++; /* Skip A64I_NOP. */ + return p + 3 + exitno; +} +/* Avoid dependence on lj_jit.h if only including lj_target.h. */ +#define exitstub_trace_addr(T, exitno) \ + exitstub_trace_addr_((MCode *)((char *)(T)->mcode + (T)->szmcode), (exitno)) /* -- Instructions -------------------------------------------------------- */ diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index a6227bf7..86c78fa5 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -1927,22 +1927,21 @@ static void build_subroutines(BuildCtx *ctx) | stp d30, d31, [sp, #30*8] | ldr CARG1, [sp, #64*8] // Load original value of lr. | add CARG3, sp, #64*8 // Recompute original value of sp. - | mv_vmstate CARG4, EXIT - | ldr CARG2w, [CARG1, #-4]! // Get exit instruction. 
- | stp CARG1, CARG3, [sp, #62*8] // Store exit pc/sp in RID_LR/RID_SP. - | lsl CARG2, CARG2, #38 - | add CARG1, CARG1, CARG2, asr #36 - | ldr CARG2w, [lr] // Load exit stub group offset. - | sub CARG1, CARG1, lr - | sub CARG1, CARG1, #4 - | ldr L, GL->cur_L - | add CARG1, CARG2, CARG1, lsr #2 // Compute exit number. - | ldr BASE, GL->jit_base - | st_vmstate CARG4 - | str CARG1w, [GL, #GL_J(exitno)] - | str BASE, L->base - | str L, [GL, #GL_J(L)] - | str xzr, GL->jit_base + | mv_vmstate CARG4, EXIT + | stp xzr, CARG3, [sp, #62*8] // Store 0/sp in RID_LR/RID_SP. + | sub CARG1, CARG1, lr + | ldr L, GL->cur_L + | lsr CARG1, CARG1, #2 + | ldr BASE, GL->jit_base + | sub CARG1, CARG1, #2 + | ldr CARG2w, [lr] // Load trace number. + | st_vmstate CARG4 + | str BASE, L->base + | ubfx CARG2w, CARG2w, #5, #16 + | str CARG1w, [GL, #GL_J(exitno)] + | str CARG2w, [GL, #GL_J(parent)] + | str L, [GL, #GL_J(L)] + | str xzr, GL->jit_base | add CARG1, GL, #GG_G2J | mov CARG2, sp | bl extern lj_trace_exit // (jit_State *J, ExitState *ex) From d7243e1de0cb86608956a9af107aff829ad99aeb Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 24 Nov 2016 19:14:17 +0100 Subject: [PATCH 012/116] Eliminate use of lightuserdata derived from static data pointers. Required for >47 bit VA, e.g. ARM64. 
--- src/lib_debug.c | 8 ++++---- src/lib_package.c | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/lib_debug.c b/src/lib_debug.c index cda7137e..6628d943 100644 --- a/src/lib_debug.c +++ b/src/lib_debug.c @@ -283,13 +283,13 @@ LJLIB_CF(debug_setuservalue) /* ------------------------------------------------------------------------ */ -static const char KEY_HOOK = 'h'; +#define KEY_HOOK ((void *)0x3004) static void hookf(lua_State *L, lua_Debug *ar) { static const char *const hooknames[] = {"call", "return", "line", "count", "tail return"}; - lua_pushlightuserdata(L, (void *)&KEY_HOOK); + lua_pushlightuserdata(L, KEY_HOOK); lua_rawget(L, LUA_REGISTRYINDEX); if (lua_isfunction(L, -1)) { lua_pushstring(L, hooknames[(int)ar->event]); @@ -334,7 +334,7 @@ LJLIB_CF(debug_sethook) count = luaL_optint(L, arg+3, 0); func = hookf; mask = makemask(smask, count); } - lua_pushlightuserdata(L, (void *)&KEY_HOOK); + lua_pushlightuserdata(L, KEY_HOOK); lua_pushvalue(L, arg+1); lua_rawset(L, LUA_REGISTRYINDEX); lua_sethook(L, func, mask, count); @@ -349,7 +349,7 @@ LJLIB_CF(debug_gethook) if (hook != NULL && hook != hookf) { /* external hook? 
*/ lua_pushliteral(L, "external hook"); } else { - lua_pushlightuserdata(L, (void *)&KEY_HOOK); + lua_pushlightuserdata(L, KEY_HOOK); lua_rawget(L, LUA_REGISTRYINDEX); /* get hook */ } lua_pushstring(L, unmakemask(mask, buff)); diff --git a/src/lib_package.c b/src/lib_package.c index 8c336b02..898897b1 100644 --- a/src/lib_package.c +++ b/src/lib_package.c @@ -399,8 +399,7 @@ static int lj_cf_package_loader_preload(lua_State *L) /* ------------------------------------------------------------------------ */ -static const int sentinel_ = 0; -#define sentinel ((void *)&sentinel_) +#define sentinel ((void *)0x4004) static int lj_cf_package_require(lua_State *L) { From 6538c8a18711a6eb009def36050acd5f02e42aec Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 25 Nov 2016 09:23:08 +0100 Subject: [PATCH 013/116] Document 47 bit limit for lightuserdata. --- doc/status.html | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/status.html b/doc/status.html index d10033b0..0acf78c9 100644 --- a/doc/status.html +++ b/doc/status.html @@ -97,6 +97,17 @@ handled correctly. The error may fall through an on-trace lua_atpanic on x64. This issue will be fixed with the new garbage collector. +
  • +LuaJIT on 64 bit systems provides a limited range of 47 bits for the +legacy lightuserdata data type. +This is only relevant on x64 systems which use the negative part of the +virtual address space in user mode, e.g. Solaris/x64, and on ARM64 systems +configured with a 48 bit or 52 bit VA. +Avoid using lightuserdata to hold pointers that may point outside +of that range, e.g. variables on the stack. In general, avoid this data +type for new code and replace it with (much more performant) FFI bindings. +FFI cdata pointers can address the full 64 bit range. +

  • From 3ad2bbf58600f8ba2918b56b0a7ab305df19cfe5 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 29 Nov 2016 19:30:40 +0100 Subject: [PATCH 014/116] ARM64: Make use of tbz/tbnz and cbz/cbnz. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. --- src/lj_asm_arm64.h | 83 ++++++++++++++++++++++++++++++++++--------- src/lj_emit_arm64.h | 19 ++++++++++ src/lj_target_arm64.h | 6 ++++ 3 files changed, 91 insertions(+), 17 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 19b3331d..eea957b5 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -84,6 +84,34 @@ static void asm_guardcc(ASMState *as, A64CC cc) emit_cond_branch(as, cc, target); } +/* Emit test and branch instruction to exit for guard. */ +static void asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit) +{ + MCode *target = asm_exitstub_addr(as, as->snapno); + MCode *p = as->mcp; + if (LJ_UNLIKELY(p == as->invmcp)) { + as->loopinv = 1; + *p = A64I_B | ((target-p) & 0x03ffffffu); + emit_tnb(as, ai^0x01000000u, r, bit, p-1); + return; + } + emit_tnb(as, ai, r, bit, target); +} + +/* Emit compare and branch instruction to exit for guard. */ +static void asm_guardcnb(ASMState *as, A64Ins ai, Reg r) +{ + MCode *target = asm_exitstub_addr(as, as->snapno); + MCode *p = as->mcp; + if (LJ_UNLIKELY(p == as->invmcp)) { + as->loopinv = 1; + *p = A64I_B | ((target-p) & 0x03ffffffu); + emit_cnb(as, ai^0x01000000u, r, p-1); + return; + } + emit_cnb(as, ai, r, target); +} + /* -- Operand fusion ------------------------------------------------------ */ /* Limit linear search to this distance. Avoids O(n^2) behavior. 
*/ @@ -482,10 +510,9 @@ static void asm_strto(ASMState *as, IRIns *ir) dest = ra_dest(as, ir, RSET_FPR); } } - asm_guardcc(as, CC_EQ); if (destused) emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0); - emit_n(as, (A64I_CMPw^A64I_K12)|A64F_U12(0), RID_RET); + asm_guardcnb(as, A64I_CBZ, RID_RET); args[0] = ir->op1; /* GCstr *str */ args[1] = ASMREF_TMP1; /* TValue *n */ asm_gencall(as, ci, args); @@ -1465,13 +1492,13 @@ static void asm_intcomp(ASMState *as, IRIns *ir) else if (cc > CC_NE) cc ^= 11; /* LO <-> HI, LS <-> HS */ } oldcc = cc; - if (irref_isk(rref) && IR(rref)->i == 0) { + if (irref_isk(rref) && get_k64val(IR(rref)) == 0) { IRIns *irl = IR(lref); if (cc == CC_GE) cc = CC_PL; else if (cc == CC_LT) cc = CC_MI; - else if (cc > CC_NE) goto notst; /* Other conds don't work with tst. */ + else if (cc > CC_NE) goto nocombine; /* Other conds don't work with tst. */ cmpprev0 = (irl+1 == ir); - /* Combine comp(BAND(left, right), 0) into tst left, right. */ + /* Combine and-cmp-bcc into tbz/tbnz or and-cmp into tst. */ if (cmpprev0 && irl->o == IR_BAND && !ra_used(irl)) { IRRef blref = irl->op1, brref = irl->op2; uint32_t m2 = 0; @@ -1480,10 +1507,13 @@ static void asm_intcomp(ASMState *as, IRIns *ir) Reg tmp = blref; blref = brref; brref = tmp; } if (irref_isk(brref)) { - /* NYI: use tbz/tbnz, if applicable. */ - m2 = emit_isk13(IR(brref)->i, irt_is64(irl->t)); - if (!m2) - goto notst; /* Not beneficial if we miss a constant operand. */ + uint64_t k = get_k64val(IR(brref)); + if (k && !(k & (k-1)) && (cc == CC_EQ || cc == CC_NE)) { + asm_guardtnb(as, cc == CC_EQ ? A64I_TBZ : A64I_TBNZ, + ra_alloc1(as, blref, RSET_GPR), emit_ctz64(k)); + return; + } + m2 = emit_isk13(k, irt_is64(irl->t)); } bleft = ra_alloc1(as, blref, RSET_GPR); ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw); @@ -1493,9 +1523,15 @@ static void asm_intcomp(ASMState *as, IRIns *ir) emit_n(as, ai^m2, bleft); return; } - /* NYI: use cbz/cbnz for EQ/NE 0. 
*/ + if (cc == CC_EQ || cc == CC_NE) { + /* Combine cmp-bcc into cbz/cbnz. */ + ai = cc == CC_EQ ? A64I_CBZ : A64I_CBNZ; + if (irt_is64(ir->t)) ai |= A64I_X; + asm_guardcnb(as, ai, ra_alloc1(as, lref, RSET_GPR)); + return; + } } -notst: +nocombine: left = ra_alloc1(as, lref, RSET_GPR); m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left)); asm_guardcc(as, cc); @@ -1638,8 +1674,7 @@ static void asm_gc_check(ASMState *as) ra_evictset(as, RSET_SCRATCH); l_end = emit_label(as); /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */ - asm_guardcc(as, CC_NE); /* Assumes asm_snap_prep() already done. */ - emit_n(as, A64I_CMPx^A64I_K12, RID_RET); + asm_guardcnb(as, A64I_CBNZ, RID_RET); /* Assumes asm_snap_prep() is done. */ args[0] = ASMREF_TMP1; /* global_State *g */ args[1] = ASMREF_TMP2; /* MSize steps */ asm_gencall(as, ci, args); @@ -1666,10 +1701,10 @@ static void asm_loop_fixup(ASMState *as) MCode *p = as->mctop; MCode *target = as->mcp; if (as->loopinv) { /* Inverted loop branch? */ + uint32_t mask = (p[-2] & 0x7e000000) == 0x36000000 ? 0x3fffu : 0x7ffffu; ptrdiff_t delta = target - (p - 2); - lua_assert(((delta + 0x40000) >> 19) == 0); - /* asm_guardcc already inverted the b.cc and patched the final bl. */ - p[-2] |= ((uint32_t)delta & 0x7ffff) << 5; + /* asm_guard* already inverted the bcc/tnb/cnb and patched the final b. */ + p[-2] |= ((uint32_t)delta & mask) << 5; } else { ptrdiff_t delta = target - (p - 1); p[-1] = A64I_B | ((uint32_t)(delta) & 0x03ffffffu); @@ -1795,18 +1830,32 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) MCode *mcarea = lj_mcode_patch(J, p, 0); MCode *px = exitstub_trace_addr(T, exitno); for (; p < pe; p++) { - /* Look for bcc/b exitstub, replace with bcc/b target. */ + /* Look for exitstub branch, replace with branch to target. */ uint32_t ins = *p; if ((ins & 0xff000000u) == 0x54000000u && ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) { + /* Patch bcc exitstub. 
*/ *p = (ins & 0xff00001fu) | (((target-p)<<5) & 0x00ffffe0u); cend = p+1; if (!cstart) cstart = p; } else if ((ins & 0xfc000000u) == 0x14000000u && ((ins ^ (px-p)) & 0x03ffffffu) == 0) { + /* Patch b exitstub. */ *p = (ins & 0xfc000000u) | ((target-p) & 0x03ffffffu); cend = p+1; if (!cstart) cstart = p; + } else if ((ins & 0x7e000000u) == 0x34000000u && + ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) { + /* Patch cbz/cbnz exitstub. */ + *p = (ins & 0xff00001f) | (((target-p)<<5) & 0x00ffffe0u); + cend = p+1; + if (!cstart) cstart = p; + } else if ((ins & 0x7e000000u) == 0x36000000u && + ((ins ^ ((px-p)<<5)) & 0x0007ffe0u) == 0) { + /* Patch tbz/tbnz exitstub. */ + *p = (ins & 0xfff8001fu) | (((target-p)<<5) & 0x0007ffe0u); + cend = p+1; + if (!cstart) cstart = p; } } lua_assert(cstart != NULL); diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 52e75559..1eb14204 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -321,6 +321,25 @@ static void emit_branch(ASMState *as, A64Ins ai, MCode *target) as->mcp = p; } +static void emit_tnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit, MCode *target) +{ + MCode *p = as->mcp; + ptrdiff_t delta = target - (p - 1); + lua_assert(bit < 63 && ((delta + 0x2000) >> 14) == 0); + if (bit > 31) ai |= A64I_X; + *--p = ai | A64F_BIT(bit & 31) | A64F_S14((uint32_t)delta & 0x3fffu) | r; + as->mcp = p; +} + +static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target) +{ + MCode *p = as->mcp; + ptrdiff_t delta = target - (p - 1); + lua_assert(((delta + 0x40000) >> 19) == 0); + *--p = ai | A64F_S19((uint32_t)delta & 0x7ffff) | r; + as->mcp = p; +} + #define emit_jmp(as, target) emit_branch(as, A64I_B, (target)) static void emit_call(ASMState *as, void *target) diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index 1cd02fe8..6c8771c6 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -127,7 +127,9 @@ static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno) #define 
A64F_U12(x) ((x) << 10) #define A64F_S26(x) (x) #define A64F_S19(x) ((x) << 5) +#define A64F_S14(x) ((x) << 5) #define A64F_S9(x) ((x) << 12) +#define A64F_BIT(x) ((x) << 19) #define A64F_SH(sh, x) (((sh) << 22) | ((x) << 10)) #define A64F_EX(ex) (A64I_EX | ((ex) << 13)) #define A64F_EXSH(ex,x) (A64I_EX | ((ex) << 13) | ((x) << 10)) @@ -235,6 +237,10 @@ typedef enum A64Ins { A64I_BL = 0x94000000, A64I_BR = 0xd61f0000, A64I_BLR = 0xd63f0000, + A64I_TBZ = 0x36000000, + A64I_TBNZ = 0x37000000, + A64I_CBZ = 0x34000000, + A64I_CBNZ = 0x35000000, A64I_NOP = 0xd503201f, From 22511fbe2b284d722f3a7ea901f1ae54992c9c5e Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Dec 2016 09:42:43 +0100 Subject: [PATCH 015/116] ARM64: Fix pc-relative loads of consts. Cleanup branch codegen. Thanks to Zhongwei Yao. --- src/lj_emit_arm64.h | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 1eb14204..6686802b 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -234,7 +234,7 @@ static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64) #define glofs(as, k) \ ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g)) #define mcpofs(as, k) \ - ((intptr_t)((uintptr_t)(k) - (uintptr_t)as->mcp)) + ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1))) #define checkmcpofs(as, k) \ ((((mcpofs(as, k)>>2) + 0x00040000) >> 19) == 0) @@ -305,39 +305,35 @@ typedef MCode *MCLabel; static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target) { - MCode *p = as->mcp; - ptrdiff_t delta = target - (p - 1); + MCode *p = --as->mcp; + ptrdiff_t delta = target - p; lua_assert(((delta + 0x40000) >> 19) == 0); - *--p = A64I_BCC | A64F_S19((uint32_t)delta & 0x7ffff) | cond; - as->mcp = p; + *p = A64I_BCC | A64F_S19((uint32_t)delta & 0x7ffff) | cond; } static void emit_branch(ASMState *as, A64Ins ai, MCode *target) { - MCode *p = as->mcp; - ptrdiff_t delta = target - (p - 1); + MCode *p = 
--as->mcp; + ptrdiff_t delta = target - p; lua_assert(((delta + 0x02000000) >> 26) == 0); - *--p = ai | ((uint32_t)delta & 0x03ffffffu); - as->mcp = p; + *p = ai | ((uint32_t)delta & 0x03ffffffu); } static void emit_tnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit, MCode *target) { - MCode *p = as->mcp; - ptrdiff_t delta = target - (p - 1); + MCode *p = --as->mcp; + ptrdiff_t delta = target - p; lua_assert(bit < 63 && ((delta + 0x2000) >> 14) == 0); if (bit > 31) ai |= A64I_X; - *--p = ai | A64F_BIT(bit & 31) | A64F_S14((uint32_t)delta & 0x3fffu) | r; - as->mcp = p; + *p = ai | A64F_BIT(bit & 31) | A64F_S14((uint32_t)delta & 0x3fffu) | r; } static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target) { - MCode *p = as->mcp; - ptrdiff_t delta = target - (p - 1); + MCode *p = --as->mcp; + ptrdiff_t delta = target - p; lua_assert(((delta + 0x40000) >> 19) == 0); - *--p = ai | A64F_S19((uint32_t)delta & 0x7ffff) | r; - as->mcp = p; + *p = ai | A64F_S19((uint32_t)delta & 0x7ffff) | r; } #define emit_jmp(as, target) emit_branch(as, A64I_B, (target)) From 48b00297b3b37fe77f59f2f4b6fa63358b442f15 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Dec 2016 18:34:10 +0100 Subject: [PATCH 016/116] ARM64: Add missing ldrb/strb instructions to disassembler. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. 
--- src/jit/dis_arm64.lua | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua index 909b33bc..8cb608e6 100644 --- a/src/jit/dis_arm64.lua +++ b/src/jit/dis_arm64.lua @@ -411,15 +411,19 @@ local map_lsriro = { shift = 26, mask = 1, [0] = { shift = 30, mask = 3, - [1] = { + [0] = { shift = 22, mask = 3, - [0] = "strhDwO", "ldrhDwO", "ldrshDwO", "ldrshDxO" + [0] = "strbDwO", "ldrbDwO", "ldrsbDxO", "ldrsbDwO" }, - [2] = { + { + shift = 22, mask = 3, + [0] = "strhDwO", "ldrhDwO", "ldrshDxO", "ldrshDwO" + }, + { shift = 22, mask = 3, [0] = "strDwO", "ldrDwO", "ldrswDxO" }, - [3] = { + { shift = 22, mask = 3, [0] = "strDxO", "ldrDxO" } @@ -982,7 +986,7 @@ local function disass_ins(ctx) local sz = band(rshift(op, 30), 3) -- extension to be applied if opt == 3 then - if s == 0 then x = nil + if s == 0 then x = x.."]" else x = x..", lsl #"..sz.."]" end elseif opt == 2 or opt == 6 or opt == 7 then if s == 0 then x = x..", "..map_extend[opt].."]" From 2ac2cd4699d2e3a2eaa55417eae901216204fb37 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Dec 2016 18:38:32 +0100 Subject: [PATCH 017/116] ARM64: Reorganize operand extension definitions. 
--- src/lj_asm_arm64.h | 8 ++++---- src/lj_target_arm64.h | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index eea957b5..fff0b3fd 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -277,7 +277,7 @@ static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref, if (!emit_checkofs(ai, ofs)) { Reg rn = ra_alloc1(as, ref, allow); Reg rm = ra_allock(as, ofs, rset_exclude(allow, rn)); - emit_dnm(as, (ai ^ 0x01204800), rd, rn, rm); + emit_dnm(as, (ai^A64I_LS_R)|A64I_LS_UXTWx, rd, rn, rm); return; } } @@ -936,7 +936,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) ra_allock(as, (irt_toitype(ir->t) << 15) | 0x7fff, allow), tmp); } if (ofs & FUSE_REG) - emit_dnm(as, (A64I_LDRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31)); + emit_dnm(as, (A64I_LDRx^A64I_LS_R)|A64I_LS_UXTWx|A64I_LS_SH, tmp, idx, (ofs & 31)); else emit_lso(as, A64I_LDRx, tmp, idx, ofs); } @@ -951,7 +951,7 @@ static void asm_ahustore(ASMState *as, IRIns *ir) src = ra_alloc1(as, ir->op2, RSET_FPR); idx = asm_fuseahuref(as, ir->op1, &ofs, allow, A64I_STRd); if (ofs & FUSE_REG) - emit_dnm(as, (A64I_STRd^A64I_LS_R)|A64I_LS_UXTWx, (src & 31), idx, (ofs &31)); + emit_dnm(as, (A64I_STRd^A64I_LS_R)|A64I_LS_UXTWx|A64I_LS_SH, (src & 31), idx, (ofs &31)); else emit_lso(as, A64I_STRd, (src & 31), idx, ofs); } else { @@ -968,7 +968,7 @@ static void asm_ahustore(ASMState *as, IRIns *ir) idx = asm_fuseahuref(as, ir->op1, &ofs, rset_exclude(allow, type), A64I_STRx); if (ofs & FUSE_REG) - emit_dnm(as, (A64I_STRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31)); + emit_dnm(as, (A64I_STRx^A64I_LS_R)|A64I_LS_UXTWx|A64I_LS_SH, tmp, idx, (ofs & 31)); else emit_lso(as, A64I_STRx, tmp, idx, ofs); if (ra_hasreg(src)) { diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index 6c8771c6..e1210045 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -147,8 +147,11 @@ typedef enum A64Ins { A64I_LS_U = 0x01000000, A64I_LS_S = 
0x00800000, A64I_LS_R = 0x01200800, - A64I_LS_UXTWx = 0x00005000, - A64I_LS_LSLx = 0x00007000, + A64I_LS_SH = 0x00001000, + A64I_LS_UXTWx = 0x00004000, + A64I_LS_SXTWx = 0x0000c000, + A64I_LS_SXTXx = 0x0000e000, + A64I_LS_LSLx = 0x00006000, A64I_ADDw = 0x0b000000, A64I_ADDx = 0x8b000000, From bfeb1167cd77194c1d49368e3c1468f134be337c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Dec 2016 18:40:31 +0100 Subject: [PATCH 018/116] ARM64: Fuse XLOAD/XSTORE with STRREF/ADD/BSHL/CONV. --- src/lj_asm_arm64.h | 53 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index fff0b3fd..c202bc82 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -232,7 +232,7 @@ static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow) irl->o == IR_CONV && irl->op2 == ((IRT_I64<op1)) { + !neverfuse(as)) { Reg m = ra_alloc1(as, irl->op1, allow); return A64F_M(m) | A64F_EXSH(A64EX_SXTW, shift); } else { @@ -257,19 +257,60 @@ static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref, int32_t ofs = 0; if (ra_noreg(ir->r) && canfuse(as, ir)) { if (ir->o == IR_ADD) { - if (asm_isk32(as, ir->op2, &ofs) && emit_checkofs(ai, ofs)) + if (asm_isk32(as, ir->op2, &ofs) && emit_checkofs(ai, ofs)) { ref = ir->op1; - /* NYI: Fuse add with two registers. 
*/ + } else { + Reg rn, rm; + IRRef lref = ir->op1, rref = ir->op2; + IRIns *irl = IR(lref); + if (mayfuse(as, irl->op1)) { + unsigned int shift = 4; + if (irl->o == IR_BSHL && irref_isk(irl->op2)) { + shift = (IR(irl->op2)->i & 63); + } else if (irl->o == IR_ADD && irl->op1 == irl->op2) { + shift = 1; + } + if ((ai >> 30) == shift) { + lref = irl->op1; + irl = IR(lref); + ai |= A64I_LS_SH; + } + } + if (irl->o == IR_CONV && + irl->op2 == ((IRT_I64<op1; + ai |= A64I_LS_SXTWx; + } else { + ai |= A64I_LS_LSLx; + } + rm = ra_alloc1(as, lref, allow); + rn = ra_alloc1(as, rref, rset_exclude(allow, rm)); + emit_dnm(as, (ai^A64I_LS_R), rd, rn, rm); + return; + } } else if (ir->o == IR_STRREF) { if (asm_isk32(as, ir->op2, &ofs)) { ref = ir->op1; } else if (asm_isk32(as, ir->op1, &ofs)) { ref = ir->op2; } else { - /* NYI: Fuse ADD with constant. */ Reg rn = ra_alloc1(as, ir->op1, allow); - uint32_t m = asm_fuseopm(as, 0, ir->op2, rset_exclude(allow, rn)); - emit_lso(as, ai, rd, rd, sizeof(GCstr)); + IRIns *irr = IR(ir->op2); + uint32_t m; + if (irr+1 == ir && !ra_used(irr) && + irr->o == IR_ADD && irref_isk(irr->op2)) { + ofs = sizeof(GCstr) + IR(irr->op2)->i; + if (emit_checkofs(ai, ofs)) { + Reg rm = ra_alloc1(as, irr->op1, rset_exclude(allow, rn)); + m = A64F_M(rm) | A64F_EX(A64EX_SXTW); + goto skipopm; + } + } + m = asm_fuseopm(as, 0, ir->op2, rset_exclude(allow, rn)); + ofs = sizeof(GCstr); + skipopm: + emit_lso(as, ai, rd, rd, ofs); emit_dn(as, A64I_ADDx^m, rd, rn); return; } From 2772cbc36e13200d5b028585abf506a5d19daaba Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 8 Dec 2016 01:38:09 +0100 Subject: [PATCH 019/116] ARM64: Fuse FP multiply-add/sub. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. 
--- src/lj_asm_arm64.h | 27 +++++++++++++++++++++++++-- src/lj_emit_arm64.h | 5 +++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index c202bc82..25016f4a 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -327,6 +327,27 @@ static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref, emit_lso(as, ai, (rd & 31), base, ofs); } +/* Fuse FP multiply-add/sub. */ +static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air) +{ + IRRef lref = ir->op1, rref = ir->op2; + IRIns *irm; + if (lref != rref && + ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) && + ra_noreg(irm->r)) || + (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) && + (rref = lref, ai = air, ra_noreg(irm->r))))) { + Reg dest = ra_dest(as, ir, RSET_FPR); + Reg add = ra_hintalloc(as, rref, dest, RSET_FPR); + Reg left = ra_alloc2(as, irm, + rset_exclude(rset_exclude(RSET_FPR, dest), add)); + Reg right = (left >> 8); left &= 255; + emit_dnma(as, ai, (dest & 31), (left & 31), (right & 31), (add & 31)); + return 1; + } + return 0; +} + /* -- Calls --------------------------------------------------------------- */ /* Generate a call to a C function. 
*/ @@ -1308,7 +1329,8 @@ static void asm_intmul(ASMState *as, IRIns *ir) static void asm_add(ASMState *as, IRIns *ir) { if (irt_isnum(ir->t)) { - asm_fparith(as, ir, A64I_FADDd); + if (!asm_fusemadd(as, ir, A64I_FMADDd, A64I_FMADDd)) + asm_fparith(as, ir, A64I_FADDd); return; } asm_intop_s(as, ir, A64I_ADDw); @@ -1317,7 +1339,8 @@ static void asm_add(ASMState *as, IRIns *ir) static void asm_sub(ASMState *as, IRIns *ir) { if (irt_isnum(ir->t)) { - asm_fparith(as, ir, A64I_FSUBd); + if (!asm_fusemadd(as, ir, A64I_FNMSUBd, A64I_FMSUBd)) + asm_fparith(as, ir, A64I_FSUBd); return; } asm_intop_s(as, ir, A64I_SUBw); diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 6686802b..e0f43689 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -74,6 +74,11 @@ static uint32_t emit_isfpk64(uint64_t n) /* -- Emit basic instructions --------------------------------------------- */ +static void emit_dnma(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm, Reg ra) +{ + *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm) | A64F_A(ra); +} + static void emit_dnm(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm) { *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm); From 3975b6c9f4c59e2913e36f62a99653754fd33fe1 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 8 Dec 2016 04:09:29 +0100 Subject: [PATCH 020/116] ARM64: Fuse various BAND/BSHL/BSHR/BSAR combinations. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. --- src/lj_asm_arm64.h | 60 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 25016f4a..d14f0224 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -348,6 +348,36 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air) return 0; } +/* Fuse BAND + BSHL/BSHR into UBFM. 
*/ +static int asm_fuseandshift(ASMState *as, IRIns *ir) +{ + lua_assert(ir->o == IR_BAND); + if (!neverfuse(as) && irref_isk(ir->op2)) { + uint64_t mask = get_k64val(IR(ir->op2)); + IRIns *irl = IR(ir->op1); + if (irref_isk(irl->op2) && (irl->o == IR_BSHR || irl->o == IR_BSHL)) { + int32_t shmask = irt_is64(irl->t) ? 63 : 31; + int32_t shift = (IR(irl->op2)->i & shmask); + int32_t imms = shift; + if (irl->o == IR_BSHL) { + mask >>= shift; + shift = (shmask-shift+1) & shmask; + imms = 0; + } + if (mask && !((mask+1) & mask)) { /* Contiguous 1-bits at the bottom. */ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_alloc1(as, irl->op1, RSET_GPR); + A64Ins ai = shmask == 63 ? A64I_UBFMx : A64I_UBFMw; + imms += 63 - emit_clz64(mask); + if (imms > shmask) imms = shmask; + emit_dn(as, ai | A64F_IMMS(imms) | A64F_IMMR(shift), dest, left); + return 1; + } + } + } + return 0; +} + /* -- Calls --------------------------------------------------------------- */ /* Generate a call to a C function. */ @@ -1423,8 +1453,14 @@ static void asm_bitop(ASMState *as, IRIns *ir, A64Ins ai) } } +static void asm_band(ASMState *as, IRIns *ir) +{ + if (asm_fuseandshift(as, ir)) + return; + asm_bitop(as, ir, A64I_ANDw); +} + #define asm_bnot(as, ir) asm_bitop(as, ir, A64I_MVNw) -#define asm_band(as, ir) asm_bitop(as, ir, A64I_ANDw) #define asm_bor(as, ir) asm_bitop(as, ir, A64I_ORRw) #define asm_bxor(as, ir) asm_bitop(as, ir, A64I_EORw) @@ -1437,16 +1473,28 @@ static void asm_bswap(ASMState *as, IRIns *ir) static void asm_bitshift(ASMState *as, IRIns *ir, A64Ins ai, A64Shift sh) { - int shmask = irt_is64(ir->t) ? 63 : 31; + int32_t shmask = irt_is64(ir->t) ? 63 : 31; if (irref_isk(ir->op2)) { /* Constant shifts. 
*/ - Reg dest = ra_dest(as, ir, RSET_GPR); - Reg left = ra_alloc1(as, ir->op1, RSET_GPR); + Reg left, dest = ra_dest(as, ir, RSET_GPR); int32_t shift = (IR(ir->op2)->i & shmask); - if (shmask == 63) ai += A64I_UBFMx - A64I_UBFMw; + + /* Fuse BSHL + BSHR/BSAR into UBFM/SBFM aka UBFX/SBFX/UBFIZ/SBFIZ. */ + if (!neverfuse(as) && (sh == A64SH_LSR || sh == A64SH_ASR)) { + IRIns *irl = IR(ir->op1); + if (irl->o == IR_BSHL && irref_isk(irl->op2)) { + int32_t shift2 = (IR(irl->op2)->i & shmask); + shift = ((shift - shift2) & shmask); + shmask -= shift2; + ir = irl; + } + } + + left = ra_alloc1(as, ir->op1, RSET_GPR); switch (sh) { case A64SH_LSL: - emit_dn(as, ai | A64F_IMMS(shmask-shift) | A64F_IMMR(shmask-shift+1), dest, left); + emit_dn(as, ai | A64F_IMMS(shmask-shift) | + A64F_IMMR((shmask-shift+1)&shmask), dest, left); break; case A64SH_LSR: case A64SH_ASR: emit_dn(as, ai | A64F_IMMS(shmask) | A64F_IMMR(shift), dest, left); From 986854cbb2fa08514e10d9d4d5ded2b7f5f60445 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 8 Dec 2016 05:53:36 +0100 Subject: [PATCH 021/116] ARM64: Fix code generation for S19 offsets. Contributed by Zhongwei Yao. 
--- src/lj_asm_arm64.h | 2 +- src/lj_emit_arm64.h | 4 ++-- src/lj_target_arm64.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index d14f0224..ab0de5cd 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -781,7 +781,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key)); } - *l_loop = A64I_BCC | A64F_S19((as->mcp-l_loop) & 0x0007ffffu) | CC_NE; + *l_loop = A64I_BCC | A64F_S19(as->mcp - l_loop) | CC_NE; if (!isk && irt_isaddr(kt)) { Reg type = ra_allock(as, (int32_t)irt_toitype(kt), allow); emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, key, type); diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index e0f43689..c7eb4d81 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -313,7 +313,7 @@ static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target) MCode *p = --as->mcp; ptrdiff_t delta = target - p; lua_assert(((delta + 0x40000) >> 19) == 0); - *p = A64I_BCC | A64F_S19((uint32_t)delta & 0x7ffff) | cond; + *p = A64I_BCC | A64F_S19(delta) | cond; } static void emit_branch(ASMState *as, A64Ins ai, MCode *target) @@ -338,7 +338,7 @@ static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target) MCode *p = --as->mcp; ptrdiff_t delta = target - p; lua_assert(((delta + 0x40000) >> 19) == 0); - *p = ai | A64F_S19((uint32_t)delta & 0x7ffff) | r; + *p = ai | A64F_S19(delta) | r; } #define emit_jmp(as, target) emit_branch(as, A64I_B, (target)) diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index e1210045..f77a58a0 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -126,7 +126,7 @@ static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno) #define A64F_U16(x) ((x) << 5) #define A64F_U12(x) ((x) << 10) #define A64F_S26(x) (x) -#define A64F_S19(x) ((x) << 5) +#define A64F_S19(x) (((uint32_t)(x) & 0x7ffffu) << 5) #define A64F_S14(x) ((x) << 5) #define A64F_S9(x) 
((x) << 12) #define A64F_BIT(x) ((x) << 19) From ec2756ba786cd68a0af37ec8ffe806f3ce392d7d Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 8 Dec 2016 22:38:35 +0100 Subject: [PATCH 022/116] Add missing FOLD rule for 64 bit shift+BAND simplification. --- src/lj_opt_fold.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 5f4b8810..a72aa440 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -347,6 +347,11 @@ static uint64_t kfold_int64arith(uint64_t k1, uint64_t k2, IROp op) case IR_BAND: k1 &= k2; break; case IR_BOR: k1 |= k2; break; case IR_BXOR: k1 ^= k2; break; + case IR_BSHL: k1 <<= (k2 & 63); break; + case IR_BSHR: k1 = (int32_t)((uint32_t)k1 >> (k2 & 63)); break; + case IR_BSAR: k1 >>= (k2 & 63); break; + case IR_BROL: k1 = (int32_t)lj_rol((uint32_t)k1, (k2 & 63)); break; + case IR_BROR: k1 = (int32_t)lj_ror((uint32_t)k1, (k2 & 63)); break; #endif default: UNUSED(k2); lua_assert(0); break; } @@ -1653,6 +1658,14 @@ LJFOLDF(simplify_shiftk_andk) fins->op2 = (IRRef1)lj_ir_kint(J, k); fins->ot = IRTI(IR_BAND); return RETRYFOLD; + } else if (irk->o == IR_KINT64) { + uint64_t k = kfold_int64arith(ir_k64(irk)->u64, fright->i, (IROp)fins->o); + IROpT ot = fleft->ot; + fins->op1 = fleft->op1; + fins->op1 = (IRRef1)lj_opt_fold(J); + fins->op2 = (IRRef1)lj_ir_kint64(J, k); + fins->ot = ot; + return RETRYFOLD; } return NEXTFOLD; } From 44b99ff14d0a543da54fce27793464edbbfacd16 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 9 Dec 2016 18:16:12 +0100 Subject: [PATCH 023/116] ARM64: Fuse BOR(BSHL, BSHR) into EXTR/ROR. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. 
--- src/lj_asm_arm64.h | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index ab0de5cd..b771d2f1 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -378,6 +378,34 @@ static int asm_fuseandshift(ASMState *as, IRIns *ir) return 0; } +/* Fuse BOR(BSHL, BSHR) into EXTR/ROR. */ +static int asm_fuseorshift(ASMState *as, IRIns *ir) +{ + IRIns *irl = IR(ir->op1), *irr = IR(ir->op2); + lua_assert(ir->o == IR_BOR); + if (!neverfuse(as) && ((irl->o == IR_BSHR && irr->o == IR_BSHL) || + (irl->o == IR_BSHL && irr->o == IR_BSHR))) { + if (irref_isk(irl->op2) && irref_isk(irr->op2)) { + IRRef lref = irl->op1, rref = irr->op1; + uint32_t lshift = IR(irl->op2)->i, rshift = IR(irr->op2)->i; + if (irl->o == IR_BSHR) { /* BSHR needs to be the right operand. */ + uint32_t tmp2; + IRRef tmp1 = lref; lref = rref; rref = tmp1; + tmp2 = lshift; lshift = rshift; rshift = tmp2; + } + if (rshift + lshift == (irt_is64(ir->t) ? 64 : 32)) { + A64Ins ai = irt_is64(ir->t) ? A64I_EXTRx : A64I_EXTRw; + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_alloc1(as, lref, RSET_GPR); + Reg right = ra_alloc1(as, rref, rset_exclude(RSET_GPR, left)); + emit_dnm(as, ai | A64F_IMMS(rshift), dest, left, right); + return 1; + } + } + } + return 0; +} + /* -- Calls --------------------------------------------------------------- */ /* Generate a call to a C function. 
*/ @@ -1460,8 +1488,14 @@ static void asm_band(ASMState *as, IRIns *ir) asm_bitop(as, ir, A64I_ANDw); } +static void asm_bor(ASMState *as, IRIns *ir) +{ + if (asm_fuseorshift(as, ir)) + return; + asm_bitop(as, ir, A64I_ORRw); +} + #define asm_bnot(as, ir) asm_bitop(as, ir, A64I_MVNw) -#define asm_bor(as, ir) asm_bitop(as, ir, A64I_ORRw) #define asm_bxor(as, ir) asm_bitop(as, ir, A64I_EORw) static void asm_bswap(ASMState *as, IRIns *ir) From 4ccd876a65f7ea4c52a7b44330bc1c279dd8afff Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 9 Dec 2016 18:24:48 +0100 Subject: [PATCH 024/116] ARM64: Use the correct FUSE check. Oops, my bad. --- src/lj_asm_arm64.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index b771d2f1..372429db 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -232,7 +232,7 @@ static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow) irl->o == IR_CONV && irl->op2 == ((IRT_I64<op1, allow); return A64F_M(m) | A64F_EXSH(A64EX_SXTW, shift); } else { @@ -278,7 +278,7 @@ static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref, } if (irl->o == IR_CONV && irl->op2 == ((IRT_I64<op1; ai |= A64I_LS_SXTWx; } else { @@ -351,10 +351,10 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air) /* Fuse BAND + BSHL/BSHR into UBFM. */ static int asm_fuseandshift(ASMState *as, IRIns *ir) { + IRIns *irl = IR(ir->op1); lua_assert(ir->o == IR_BAND); - if (!neverfuse(as) && irref_isk(ir->op2)) { + if (canfuse(as, irl) && irref_isk(ir->op2)) { uint64_t mask = get_k64val(IR(ir->op2)); - IRIns *irl = IR(ir->op1); if (irref_isk(irl->op2) && (irl->o == IR_BSHR || irl->o == IR_BSHL)) { int32_t shmask = irt_is64(irl->t) ? 
63 : 31; int32_t shift = (IR(irl->op2)->i & shmask); @@ -383,8 +383,9 @@ static int asm_fuseorshift(ASMState *as, IRIns *ir) { IRIns *irl = IR(ir->op1), *irr = IR(ir->op2); lua_assert(ir->o == IR_BOR); - if (!neverfuse(as) && ((irl->o == IR_BSHR && irr->o == IR_BSHL) || - (irl->o == IR_BSHL && irr->o == IR_BSHR))) { + if (canfuse(as, irl) && canfuse(as, irr) && + ((irl->o == IR_BSHR && irr->o == IR_BSHL) || + (irl->o == IR_BSHL && irr->o == IR_BSHR))) { if (irref_isk(irl->op2) && irref_isk(irr->op2)) { IRRef lref = irl->op1, rref = irr->op1; uint32_t lshift = IR(irl->op2)->i, rshift = IR(irr->op2)->i; @@ -1511,11 +1512,11 @@ static void asm_bitshift(ASMState *as, IRIns *ir, A64Ins ai, A64Shift sh) if (irref_isk(ir->op2)) { /* Constant shifts. */ Reg left, dest = ra_dest(as, ir, RSET_GPR); int32_t shift = (IR(ir->op2)->i & shmask); + IRIns *irl = IR(ir->op1); if (shmask == 63) ai += A64I_UBFMx - A64I_UBFMw; /* Fuse BSHL + BSHR/BSAR into UBFM/SBFM aka UBFX/SBFX/UBFIZ/SBFIZ. */ - if (!neverfuse(as) && (sh == A64SH_LSR || sh == A64SH_ASR)) { - IRIns *irl = IR(ir->op1); + if ((sh == A64SH_LSR || sh == A64SH_ASR) && canfuse(as, irl)) { if (irl->o == IR_BSHL && irref_isk(irl->op2)) { int32_t shift2 = (IR(irl->op2)->i & shmask); shift = ((shift - shift2) & shmask); From 197380748052fcc5781fb357d3ac77dcee353004 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 13 Dec 2016 21:30:13 +0100 Subject: [PATCH 025/116] Add "proto" field to jit.util.funcinfo(). 
--- src/lib_jit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib_jit.c b/src/lib_jit.c index 592538bd..176488e8 100644 --- a/src/lib_jit.c +++ b/src/lib_jit.c @@ -204,6 +204,7 @@ LJLIB_CF(jit_util_funcinfo) lua_setfield(L, -2, "source"); lj_debug_pushloc(L, pt, pc); lua_setfield(L, -2, "loc"); + setprotoV(L, lj_tab_setstr(L, t, lj_str_newlit(L, "proto")), pt); } else { GCfunc *fn = funcV(L->base); GCtab *t; From fb61f7cbe3ec983dfc9087bde04496aa4bbaa31b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 15 Dec 2016 22:45:28 +0100 Subject: [PATCH 026/116] Add "proto" field to jit.util.funcinfo(). Backport. --- src/lib_jit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib_jit.c b/src/lib_jit.c index 921b84c8..868f697c 100644 --- a/src/lib_jit.c +++ b/src/lib_jit.c @@ -199,6 +199,7 @@ LJLIB_CF(jit_util_funcinfo) lua_setfield(L, -2, "source"); lj_debug_pushloc(L, pt, pc); lua_setfield(L, -2, "loc"); + setprotoV(L, lj_tab_setstr(L, t, lj_str_newlit(L, "proto")), pt); } else { GCfunc *fn = funcV(L->base); GCtab *t; From ebec2530befabb8777f3e46d22980112c942dba8 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 15 Dec 2016 22:47:40 +0100 Subject: [PATCH 027/116] ARM64: Fuse BOR/BXOR and BNOT into ORN/EON. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. --- src/lj_asm_arm64.h | 52 +++++++++++++++++++++++++++++-------------- src/lj_target_arm64.h | 1 + 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 372429db..28050bf8 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -1464,40 +1464,58 @@ static void asm_neg(ASMState *as, IRIns *ir) asm_intneg(as, ir); } -static void asm_bitop(ASMState *as, IRIns *ir, A64Ins ai) +static void asm_band(ASMState *as, IRIns *ir) { - if (as->flagmcp == as->mcp && ai == A64I_ANDw) { + A64Ins ai = A64I_ANDw; + if (asm_fuseandshift(as, ir)) + return; + if (as->flagmcp == as->mcp) { /* Try to drop cmp r, #0. 
*/ as->flagmcp = NULL; as->mcp++; - ai += A64I_ANDSw - A64I_ANDw; + ai = A64I_ANDSw; } - if (ir->op2 == 0) { - Reg dest = ra_dest(as, ir, RSET_GPR); - uint32_t m = asm_fuseopm(as, ai, ir->op1, RSET_GPR); + asm_intop(as, ir, ai); +} + +static void asm_borbxor(ASMState *as, IRIns *ir, A64Ins ai) +{ + IRRef lref = ir->op1, rref = ir->op2; + IRIns *irl = IR(lref), *irr = IR(rref); + if ((canfuse(as, irl) && irl->o == IR_BNOT && !irref_isk(rref)) || + (canfuse(as, irr) && irr->o == IR_BNOT && !irref_isk(lref))) { + Reg left, dest = ra_dest(as, ir, RSET_GPR); + uint32_t m; + if (irl->o == IR_BNOT) { + IRRef tmp = lref; lref = rref; rref = tmp; + } + left = ra_alloc1(as, lref, RSET_GPR); + ai |= A64I_ON; if (irt_is64(ir->t)) ai |= A64I_X; - emit_d(as, ai^m, dest); + m = asm_fuseopm(as, ai, IR(rref)->op1, rset_exclude(RSET_GPR, left)); + emit_dn(as, ai^m, dest, left); } else { asm_intop(as, ir, ai); } } -static void asm_band(ASMState *as, IRIns *ir) -{ - if (asm_fuseandshift(as, ir)) - return; - asm_bitop(as, ir, A64I_ANDw); -} - static void asm_bor(ASMState *as, IRIns *ir) { if (asm_fuseorshift(as, ir)) return; - asm_bitop(as, ir, A64I_ORRw); + asm_borbxor(as, ir, A64I_ORRw); } -#define asm_bnot(as, ir) asm_bitop(as, ir, A64I_MVNw) -#define asm_bxor(as, ir) asm_bitop(as, ir, A64I_EORw) +#define asm_bxor(as, ir) asm_borbxor(as, ir, A64I_EORw) + +static void asm_bnot(ASMState *as, IRIns *ir) +{ + A64Ins ai = A64I_MVNw; + Reg dest = ra_dest(as, ir, RSET_GPR); + uint32_t m = asm_fuseopm(as, ai, ir->op1, RSET_GPR); + if (irt_is64(ir->t)) ai |= A64I_X; + emit_d(as, ai^m, dest); +} static void asm_bswap(ASMState *as, IRIns *ir) { diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index f77a58a0..9e9fbd01 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -142,6 +142,7 @@ typedef enum A64Ins { A64I_S = 0x20000000, A64I_X = 0x80000000, A64I_EX = 0x00200000, + A64I_ON = 0x00200000, A64I_K12 = 0x1a000000, A64I_K13 = 0x18000000, A64I_LS_U = 0x01000000, From 
8e5d7bec0d110aa4ccd7e8492f697ff2a88a55ed Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 30 Dec 2016 17:54:10 +0100 Subject: [PATCH 028/116] ARM64: Remove unused variables in disassembler. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thanks to François Perrad. --- src/jit/dis_arm64.lua | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua index 8cb608e6..6e3f9ff4 100644 --- a/src/jit/dis_arm64.lua +++ b/src/jit/dis_arm64.lua @@ -13,10 +13,9 @@ -- NYI: Advanced SIMD and VFP instructions. ------------------------------------------------------------------------------ -local type, tonumber = type, tonumber +local type = type local sub, byte, format = string.sub, string.byte, string.format local match, gmatch, gsub = string.match, string.gmatch, string.gsub -local rep = string.rep local concat = table.concat local bit = require("bit") local band, bor, bxor, tohex = bit.band, bit.bor, bit.bxor, bit.tohex @@ -864,7 +863,6 @@ local function disass_ins(ctx) local operands = {} local suffix = "" local last, name, pat - local vr local map_reg ctx.op = op ctx.rel = nil @@ -1014,7 +1012,6 @@ local function disass_ins(ctx) elseif p == "I" then local shf = band(rshift(op, 22), 3) local imm12 = band(rshift(op, 10), 0x0fff) - local n = #operands local rn, rd = band(rshift(op, 5), 31), band(op, 31) if altname == "mov" and shf == 0 and imm12 == 0 and (rn == 31 or rd == 31) then name = altname From a2013dd39abfc036fc02d277ae3f053209d2c4fd Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 17 Jan 2017 10:46:45 +0100 Subject: [PATCH 029/116] Fix cross-endian jit.bcsave for MIPS target. 
--- src/jit/bcsave.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua index 5c417c06..70b92aaf 100644 --- a/src/jit/bcsave.lua +++ b/src/jit/bcsave.lua @@ -239,7 +239,7 @@ typedef struct { hdr.type = f16(1) hdr.machine = f16(({ x86=3, x64=62, arm=40, ppc=20, ppcspe=20, mips=8, mipsel=8 })[ctx.arch]) if ctx.arch == "mips" or ctx.arch == "mipsel" then - hdr.flags = 0x50001006 + hdr.flags = f32(0x50001006) end hdr.version = f32(1) hdr.shofs = fofs(ffi.offsetof(o, "sect")) From a1e13fa6e4382bcae44f97bf5954f0e57cfad90c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 17 Jan 2017 10:55:31 +0100 Subject: [PATCH 030/116] Fix HTML formatting. --- doc/extensions.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/extensions.html b/doc/extensions.html index 1efaca9d..c361b5d5 100644 --- a/doc/extensions.html +++ b/doc/extensions.html @@ -251,8 +251,8 @@ enabled:
  • load(string|reader [, chunkname [,mode [,env]]]).
  • loadstring() is an alias for load().
  • loadfile(filename [,mode [,env]]).
  • -
  • math.log(x [,base]). -
  • string.rep(s, n [,sep]). +
  • math.log(x [,base]).
  • +
  • string.rep(s, n [,sep]).
  • string.format(): %q reversible. %s checks __tostring. %a and "%A added.
  • @@ -295,7 +295,7 @@ instead of true. exit status.
  • debug.setmetatable() returns object.
  • debug.getuservalue() and debug.setuservalue().
  • -
  • Remove math.mod(), string.gfind(). +
  • Remove math.mod(), string.gfind().
  • Note: this provides only partial compatibility with Lua 5.2 at the From c1981676907cedde9ffe2bbdfb28d2f786ff69d9 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 17 Jan 2017 11:37:28 +0100 Subject: [PATCH 031/116] Add some more extensions from Lua 5.2/5.3. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contributed by François Perrad. --- doc/extensions.html | 4 ++++ src/Makefile.dep | 4 ++-- src/host/buildvm_libbc.h | 17 ++++++++++++++--- src/lib_base.c | 7 +++++++ src/lib_io.c | 11 +++++------ src/lib_package.c | 4 ++++ src/lib_table.c | 20 ++++++++++++++++++++ 7 files changed, 56 insertions(+), 11 deletions(-) diff --git a/doc/extensions.html b/doc/extensions.html index 70dc6995..b048f137 100644 --- a/doc/extensions.html +++ b/doc/extensions.html @@ -303,6 +303,7 @@ enabled:

  • os.exit(status|true|false [,close]).
  • package.searchpath(name, path [, sep [, rep]]).
  • package.loadlib(name, "*").
  • +
  • package.searchers.
  • debug.getinfo() returns nparams and isvararg for option "u".
  • debug.getlocal() accepts function instead of level.
  • @@ -350,6 +351,9 @@ LuaJIT supports some extensions from Lua 5.3:
    • Unicode escape '\u{XX...}' embeds the UTF-8 encoding in string literals.
    • The argument table arg can be read (and modified) by LUA_INIT and -e chunks.
    • +
    • io.read() and file:read() accept formats with or without a leading *.
    • +
    • table.move(a1, f, e, t [,a2]).
    • +
    • coroutine.isyieldable().

    C++ Exception Interoperability

    diff --git a/src/Makefile.dep b/src/Makefile.dep index 4ef002e9..2b1cb5ef 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -3,8 +3,8 @@ lib_aux.o: lib_aux.c lua.h luaconf.h lauxlib.h lj_obj.h lj_def.h \ lj_dispatch.h lj_bc.h lj_traceerr.h lj_lib.h lj_alloc.h lib_base.o: lib_base.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \ lj_def.h lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_debug.h lj_str.h \ - lj_tab.h lj_meta.h lj_state.h lj_ctype.h lj_cconv.h lj_bc.h lj_ff.h \ - lj_ffdef.h lj_dispatch.h lj_jit.h lj_ir.h lj_char.h lj_strscan.h \ + lj_tab.h lj_meta.h lj_state.h lj_frame.h lj_bc.h lj_ctype.h lj_cconv.h \ + lj_ff.h lj_ffdef.h lj_dispatch.h lj_jit.h lj_ir.h lj_char.h lj_strscan.h \ lj_strfmt.h lj_lib.h lj_libdef.h lib_bit.o: lib_bit.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \ lj_arch.h lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h lj_strscan.h \ diff --git a/src/host/buildvm_libbc.h b/src/host/buildvm_libbc.h index 45f8f8cb..b2600bd5 100644 --- a/src/host/buildvm_libbc.h +++ b/src/host/buildvm_libbc.h @@ -15,7 +15,12 @@ static const uint8_t libbc_code[] = { 8,2,0,0,88,3,23,128,59,3,2,0,43,4,0,0,64,4,2,0,76,3,2,0,88,3,18,128,16,1,14, 0,41,3,1,0,3,3,1,0,88,3,14,128,3,1,2,0,88,3,12,128,59,3,1,0,22,4,1,1,18,5,2, 0,41,6,1,0,77,4,4,128,23,8,1,7,59,9,7,0,64,9,8,0,79,4,252,127,43,4,0,0,64,4, -2,0,76,3,2,0,75,0,1,0,0,2,0 +2,0,76,3,2,0,75,0,1,0,0,2,0,5,12,0,0,0,35,16,0,12,0,16,1,14,0,16,2,14,0,16, +3,14,0,11,4,0,0,88,5,1,128,18,4,0,0,16,4,12,0,3,1,2,0,88,5,24,128,33,5,1,3, +0,2,3,0,88,6,4,128,2,3,1,0,88,6,2,128,4,4,0,0,88,6,9,128,18,6,1,0,18,7,2,0, +41,8,1,0,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79,6,252,127,88,6,8,128, +18,6,2,0,18,7,1,0,41,8,255,255,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79, +6,252,127,76,4,2,0,0 #else 0,1,2,0,0,1,2,24,1,0,0,76,1,2,0,241,135,158,166,3,220,203,178,130,4,0,1,2,0, 0,1,2,24,1,0,0,76,1,2,0,243,244,148,165,20,198,190,199,252,3,0,1,2,0,0,0,3, @@ -28,7 +33,12 @@ static const uint8_t libbc_code[] = { 
8,2,0,0,88,3,23,128,59,3,2,0,43,4,0,0,64,4,2,0,76,3,2,0,88,3,18,128,16,1,14, 0,41,3,1,0,3,3,1,0,88,3,14,128,3,1,2,0,88,3,12,128,59,3,1,0,22,4,1,1,18,5,2, 0,41,6,1,0,77,4,4,128,23,8,1,7,59,9,7,0,64,9,8,0,79,4,252,127,43,4,0,0,64,4, -2,0,76,3,2,0,75,0,1,0,0,2,0 +2,0,76,3,2,0,75,0,1,0,0,2,0,5,12,0,0,0,35,16,0,12,0,16,1,14,0,16,2,14,0,16, +3,14,0,11,4,0,0,88,5,1,128,18,4,0,0,16,4,12,0,3,1,2,0,88,5,24,128,33,5,1,3, +0,2,3,0,88,6,4,128,2,3,1,0,88,6,2,128,4,4,0,0,88,6,9,128,18,6,1,0,18,7,2,0, +41,8,1,0,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79,6,252,127,88,6,8,128, +18,6,2,0,18,7,1,0,41,8,255,255,77,6,4,128,32,10,5,9,59,11,9,0,64,11,10,4,79, +6,252,127,76,4,2,0,0 #endif }; @@ -40,6 +50,7 @@ static const struct { const char *name; int ofs; } libbc_map[] = { {"table_foreach",136}, {"table_getn",207}, {"table_remove",226}, -{NULL,355} +{"table_move",355}, +{NULL,502} }; diff --git a/src/lib_base.c b/src/lib_base.c index 6107bde0..7c523241 100644 --- a/src/lib_base.c +++ b/src/lib_base.c @@ -23,6 +23,7 @@ #include "lj_tab.h" #include "lj_meta.h" #include "lj_state.h" +#include "lj_frame.h" #if LJ_HASFFI #include "lj_ctype.h" #include "lj_cconv.h" @@ -557,6 +558,12 @@ LJLIB_CF(coroutine_running) #endif } +LJLIB_CF(coroutine_isyieldable) +{ + setboolV(L->top++, cframe_canyield(L->cframe)); + return 1; +} + LJLIB_CF(coroutine_create) { lua_State *L1; diff --git a/src/lib_io.c b/src/lib_io.c index 31f0ea97..53c17d92 100644 --- a/src/lib_io.c +++ b/src/lib_io.c @@ -203,13 +203,12 @@ static int io_file_read(lua_State *L, FILE *fp, int start) for (n = start; nargs-- && ok; n++) { if (tvisstr(L->base+n)) { const char *p = strVdata(L->base+n); - if (p[0] != '*') - lj_err_arg(L, n+1, LJ_ERR_INVOPT); - if (p[1] == 'n') + if (p[0] == '*') p++; + if (p[0] == 'n') ok = io_file_readnum(L, fp); - else if ((p[1] & ~0x20) == 'L') - ok = io_file_readline(L, fp, (p[1] == 'l')); - else if (p[1] == 'a') + else if ((p[0] & ~0x20) == 'L') + ok = io_file_readline(L, fp, (p[0] == 'l')); + else if 
(p[0] == 'a') io_file_readall(L, fp); else lj_err_arg(L, n+1, LJ_ERR_INVFMT); diff --git a/src/lib_package.c b/src/lib_package.c index 898897b1..c0252b73 100644 --- a/src/lib_package.c +++ b/src/lib_package.c @@ -589,6 +589,10 @@ LUALIB_API int luaopen_package(lua_State *L) lj_lib_pushcf(L, package_loaders[i], 1); lua_rawseti(L, -2, i+1); } +#if LJ_52 + lua_pushvalue(L, -1); + lua_setfield(L, -3, "searchers"); +#endif lua_setfield(L, -2, "loaders"); lua_getfield(L, LUA_REGISTRYINDEX, "LUA_NOENV"); noenv = lua_toboolean(L, -1); diff --git a/src/lib_table.c b/src/lib_table.c index f9a3693d..0204f25d 100644 --- a/src/lib_table.c +++ b/src/lib_table.c @@ -129,6 +129,26 @@ LJLIB_LUA(table_remove) /* end */ +LJLIB_LUA(table_move) /* + function(a1, f, e, t, a2) + CHECK_tab(a1) + CHECK_int(f) + CHECK_int(e) + CHECK_int(t) + if a2 == nil then a2 = a1 end + CHECK_tab(a2) + if e >= f then + local d = t - f + if t > e or t <= f or a2 ~= a1 then + for i=f,e do a2[i+d] = a1[i] end + else + for i=e,f,-1 do a2[i+d] = a1[i] end + end + end + return a2 + end +*/ + LJLIB_CF(table_concat) LJLIB_REC(.) { GCtab *t = lj_lib_checktab(L, 1); From c94b921f924c1b37fea52e34f4e01ba8b37d77d0 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 17 Jan 2017 12:21:12 +0100 Subject: [PATCH 032/116] LJ_GC64: Add build options and install instructions. --- doc/extensions.html | 3 ++- doc/install.html | 15 ++++++++++----- src/Makefile | 3 +++ src/msvcbuild.bat | 8 +++++++- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/doc/extensions.html b/doc/extensions.html index b048f137..cb9be3f4 100644 --- a/doc/extensions.html +++ b/doc/extensions.html @@ -210,7 +210,8 @@ bytecode (e.g. from Lua 5.1) is incompatible and cannot be loaded.

    Note: LJ_GC64 mode requires a different frame layout, which implies a different, incompatible bytecode format for ports that use this mode (e.g. -ARM64). This may be rectified in the future. +ARM64 or MIPS64) or when explicitly enabled for x64. This may be rectified +in the future.

    table.new(narray, nhash) allocates a pre-sized table

    diff --git a/doc/install.html b/doc/install.html index efeda33c..230e9386 100644 --- a/doc/install.html +++ b/doc/install.html @@ -175,6 +175,14 @@ MSVC or WinSDK. Please read the instructions given in these files, before changing any settings.

    +

    +LuaJIT on x64 currently uses 32 bit GC objects by default. +LJ_GC64 mode may be explicitly enabled: +add XCFLAGS=-DLUAJIT_ENABLE_GC64 to the make command or run +msvcbuild gc64 for MSVC/WinSDK. Please check the note +about the bytecode format +differences, too. +

    POSIX Systems (Linux, OSX, *BSD etc.)

    Prerequisites

    @@ -584,14 +592,11 @@ intend to load Lua/C modules at runtime.
  • If you're building a 64 bit application on OSX which links directly or -indirectly against LuaJIT, you need to link your main executable -with these flags: +indirectly against LuaJIT which is not built for LJ_GC64 mode, +you need to link your main executable with these flags:
     -pagezero_size 10000 -image_base 100000000
     
    -Also, it's recommended to rebase all (self-compiled) shared libraries -which are loaded at runtime on OSX/x64 (e.g. C extension modules for Lua). -See: man rebase
  • Additional hints for initializing LuaJIT using the C API functions:

    diff --git a/src/Makefile b/src/Makefile index 4e479ae5..21e16a22 100644 --- a/src/Makefile +++ b/src/Makefile @@ -110,6 +110,9 @@ XCFLAGS= #XCFLAGS+= -DLUAJIT_NUMMODE=1 #XCFLAGS+= -DLUAJIT_NUMMODE=2 # +# Enable GC64 mode for x64. +#XCFLAGS+= -DLUAJIT_ENABLE_GC64 +# ############################################################################## ############################################################################## diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat index 4334bbde..f7a1addc 100644 --- a/src/msvcbuild.bat +++ b/src/msvcbuild.bat @@ -20,6 +20,7 @@ @set LJLIB=lib /nologo /nodefaultlib @set DASMDIR=..\dynasm @set DASM=%DASMDIR%\dynasm.lua +@set DASC=vm_x86.dasc @set LJDLLNAME=lua51.dll @set LJLIBNAME=lua51.lib @set ALL_LIB=lib_base.c lib_math.c lib_bit.c lib_string.c lib_table.c lib_io.c lib_os.c lib_package.c lib_debug.c lib_jit.c lib_ffi.c @@ -39,7 +40,12 @@ if exist minilua.exe.manifest^ @set LJARCH=x86 @set LJCOMPILE=%LJCOMPILE% /arch:SSE2 :X64 -minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h vm_x86.dasc +@if "%1" neq "gc64" goto :NOGC64 +@shift +@set DASC=vm_x64.dasc +@set LJCOMPILE=%LJCOMPILE% /DLUAJIT_ENABLE_GC64 +:NOGC64 +minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h %DASC% @if errorlevel 1 goto :BAD %LJCOMPILE% /I "." /I %DASMDIR% host\buildvm*.c From b93a1dd0c831cab22f98163d0dde792a493c0eef Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 17 Jan 2017 12:35:03 +0100 Subject: [PATCH 033/116] Bump copyright date to 2017. 
--- COPYRIGHT | 2 +- Makefile | 2 +- README | 2 +- doc/bluequad-print.css | 2 +- doc/bluequad.css | 2 +- doc/changes.html | 4 ++-- doc/contact.html | 6 +++--- doc/ext_c_api.html | 4 ++-- doc/ext_ffi.html | 4 ++-- doc/ext_ffi_api.html | 4 ++-- doc/ext_ffi_semantics.html | 4 ++-- doc/ext_ffi_tutorial.html | 4 ++-- doc/ext_jit.html | 4 ++-- doc/extensions.html | 4 ++-- doc/faq.html | 4 ++-- doc/install.html | 4 ++-- doc/luajit.html | 6 +++--- doc/running.html | 4 ++-- doc/status.html | 4 ++-- dynasm/dasm_arm.h | 2 +- dynasm/dasm_arm.lua | 2 +- dynasm/dasm_mips.h | 2 +- dynasm/dasm_mips.lua | 2 +- dynasm/dasm_ppc.h | 2 +- dynasm/dasm_ppc.lua | 2 +- dynasm/dasm_proto.h | 2 +- dynasm/dasm_x64.lua | 2 +- dynasm/dasm_x86.h | 2 +- dynasm/dasm_x86.lua | 2 +- dynasm/dynasm.lua | 4 ++-- etc/luajit.1 | 2 +- src/Makefile | 2 +- src/host/buildvm.c | 2 +- src/host/buildvm.h | 2 +- src/host/buildvm_asm.c | 2 +- src/host/buildvm_fold.c | 2 +- src/host/buildvm_lib.c | 2 +- src/host/buildvm_peobj.c | 2 +- src/host/genminilua.lua | 2 +- src/jit/bc.lua | 2 +- src/jit/bcsave.lua | 2 +- src/jit/dis_arm.lua | 2 +- src/jit/dis_mips.lua | 2 +- src/jit/dis_mipsel.lua | 2 +- src/jit/dis_ppc.lua | 2 +- src/jit/dis_x64.lua | 2 +- src/jit/dis_x86.lua | 2 +- src/jit/dump.lua | 2 +- src/jit/v.lua | 2 +- src/lib_aux.c | 2 +- src/lib_base.c | 2 +- src/lib_bit.c | 2 +- src/lib_debug.c | 2 +- src/lib_ffi.c | 2 +- src/lib_init.c | 2 +- src/lib_io.c | 2 +- src/lib_jit.c | 2 +- src/lib_math.c | 2 +- src/lib_os.c | 2 +- src/lib_package.c | 2 +- src/lib_string.c | 2 +- src/lib_table.c | 2 +- src/lj_api.c | 2 +- src/lj_arch.h | 2 +- src/lj_asm.c | 2 +- src/lj_asm.h | 2 +- src/lj_asm_arm.h | 2 +- src/lj_asm_mips.h | 2 +- src/lj_asm_ppc.h | 2 +- src/lj_asm_x86.h | 2 +- src/lj_bc.c | 2 +- src/lj_bc.h | 2 +- src/lj_bcdump.h | 2 +- src/lj_bcread.c | 2 +- src/lj_bcwrite.c | 2 +- src/lj_carith.c | 2 +- src/lj_carith.h | 2 +- src/lj_ccall.c | 2 +- src/lj_ccall.h | 2 +- src/lj_ccallback.c | 2 +- src/lj_ccallback.h | 
2 +- src/lj_cconv.c | 2 +- src/lj_cconv.h | 2 +- src/lj_cdata.c | 2 +- src/lj_cdata.h | 2 +- src/lj_clib.c | 2 +- src/lj_clib.h | 2 +- src/lj_cparse.c | 2 +- src/lj_cparse.h | 2 +- src/lj_crecord.c | 2 +- src/lj_crecord.h | 2 +- src/lj_ctype.c | 2 +- src/lj_ctype.h | 2 +- src/lj_debug.c | 2 +- src/lj_debug.h | 2 +- src/lj_def.h | 2 +- src/lj_dispatch.c | 2 +- src/lj_dispatch.h | 2 +- src/lj_emit_arm.h | 2 +- src/lj_emit_mips.h | 2 +- src/lj_emit_ppc.h | 2 +- src/lj_emit_x86.h | 2 +- src/lj_err.c | 2 +- src/lj_err.h | 2 +- src/lj_errmsg.h | 2 +- src/lj_ff.h | 2 +- src/lj_ffrecord.c | 2 +- src/lj_ffrecord.h | 2 +- src/lj_frame.h | 2 +- src/lj_func.c | 2 +- src/lj_func.h | 2 +- src/lj_gc.c | 2 +- src/lj_gc.h | 2 +- src/lj_gdbjit.c | 2 +- src/lj_gdbjit.h | 2 +- src/lj_ir.c | 2 +- src/lj_ir.h | 2 +- src/lj_ircall.h | 2 +- src/lj_iropt.h | 2 +- src/lj_jit.h | 2 +- src/lj_lex.c | 2 +- src/lj_lex.h | 2 +- src/lj_lib.c | 2 +- src/lj_lib.h | 2 +- src/lj_load.c | 2 +- src/lj_mcode.c | 2 +- src/lj_mcode.h | 2 +- src/lj_meta.c | 2 +- src/lj_meta.h | 2 +- src/lj_obj.c | 2 +- src/lj_obj.h | 2 +- src/lj_opt_dce.c | 2 +- src/lj_opt_fold.c | 2 +- src/lj_opt_loop.c | 2 +- src/lj_opt_mem.c | 2 +- src/lj_opt_narrow.c | 2 +- src/lj_opt_sink.c | 2 +- src/lj_opt_split.c | 2 +- src/lj_parse.c | 2 +- src/lj_parse.h | 2 +- src/lj_record.c | 2 +- src/lj_record.h | 2 +- src/lj_snap.c | 2 +- src/lj_snap.h | 2 +- src/lj_state.c | 2 +- src/lj_state.h | 2 +- src/lj_str.c | 2 +- src/lj_str.h | 2 +- src/lj_strscan.c | 2 +- src/lj_strscan.h | 2 +- src/lj_tab.c | 2 +- src/lj_tab.h | 2 +- src/lj_target.h | 2 +- src/lj_target_arm.h | 2 +- src/lj_target_mips.h | 2 +- src/lj_target_ppc.h | 2 +- src/lj_target_x86.h | 2 +- src/lj_trace.c | 2 +- src/lj_trace.h | 2 +- src/lj_traceerr.h | 2 +- src/lj_udata.c | 2 +- src/lj_udata.h | 2 +- src/lj_vm.h | 2 +- src/lj_vmevent.c | 2 +- src/lj_vmevent.h | 2 +- src/lj_vmmath.c | 2 +- src/ljamalg.c | 2 +- src/luaconf.h | 2 +- src/luajit.c | 2 +- src/luajit.h | 4 ++-- 
src/lualib.h | 2 +- src/msvcbuild.bat | 2 +- src/vm_arm.dasc | 2 +- src/vm_mips.dasc | 2 +- src/vm_ppc.dasc | 2 +- src/vm_ppcspe.dasc | 2 +- src/vm_x86.dasc | 2 +- 177 files changed, 195 insertions(+), 195 deletions(-) diff --git a/COPYRIGHT b/COPYRIGHT index b614d3eb..6ed40025 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -1,7 +1,7 @@ =============================================================================== LuaJIT -- a Just-In-Time Compiler for Lua. http://luajit.org/ -Copyright (C) 2005-2016 Mike Pall. All rights reserved. +Copyright (C) 2005-2017 Mike Pall. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index eb9572d8..b29f1031 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ # For MSVC, please follow the instructions given in src/msvcbuild.bat. # For MinGW and Cygwin, cd to src and run make with the Makefile there. # -# Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h +# Copyright (C) 2005-2017 Mike Pall. See Copyright Notice in luajit.h ############################################################################## MAJVER= 2 diff --git a/README b/README index e5bb1c62..1b0abed2 100644 --- a/README +++ b/README @@ -5,7 +5,7 @@ LuaJIT is a Just-In-Time (JIT) compiler for the Lua programming language. Project Homepage: http://luajit.org/ -LuaJIT is Copyright (C) 2005-2016 Mike Pall. +LuaJIT is Copyright (C) 2005-2017 Mike Pall. LuaJIT is free software, released under the MIT license. See full Copyright Notice in the COPYRIGHT file or in luajit.h. diff --git a/doc/bluequad-print.css b/doc/bluequad-print.css index 975a55bf..62e1c165 100644 --- a/doc/bluequad-print.css +++ b/doc/bluequad-print.css @@ -1,4 +1,4 @@ -/* Copyright (C) 2004-2016 Mike Pall. +/* Copyright (C) 2004-2017 Mike Pall. * * You are welcome to use the general ideas of this design for your own sites. 
* But please do not steal the stylesheet, the layout or the color scheme. diff --git a/doc/bluequad.css b/doc/bluequad.css index 5dca9064..be2c4bf2 100644 --- a/doc/bluequad.css +++ b/doc/bluequad.css @@ -1,4 +1,4 @@ -/* Copyright (C) 2004-2016 Mike Pall. +/* Copyright (C) 2004-2017 Mike Pall. * * You are welcome to use the general ideas of this design for your own sites. * But please do not steal the stylesheet, the layout or the color scheme. diff --git a/doc/changes.html b/doc/changes.html index 96eef660..8811efc5 100644 --- a/doc/changes.html +++ b/doc/changes.html @@ -4,7 +4,7 @@ LuaJIT Change History - + @@ -968,7 +968,7 @@ This is the initial non-public release of LuaJIT.

    Building LuaJIT

    The supplied Makefiles try to auto-detect the settings needed for your diff --git a/doc/running.html b/doc/running.html index 1232b84f..331c22df 100644 --- a/doc/running.html +++ b/doc/running.html @@ -186,7 +186,7 @@ itself. For a description of their options and output format, please read the comment block at the start of their source. They can be found in the lib directory of the source distribution or installed under the jit directory. By default -this is /usr/local/share/luajit-2.0.4/jit on POSIX +this is /usr/local/share/luajit-2.0.5/jit on POSIX systems.

    diff --git a/etc/luajit.pc b/etc/luajit.pc index a652b40d..36840ab8 100644 --- a/etc/luajit.pc +++ b/etc/luajit.pc @@ -1,7 +1,7 @@ # Package information for LuaJIT to be used by pkg-config. majver=2 minver=0 -relver=4 +relver=5 version=${majver}.${minver}.${relver} abiver=5.1 diff --git a/src/Makefile b/src/Makefile index bd172dbe..f7f81a4e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -12,7 +12,7 @@ MAJVER= 2 MINVER= 0 -RELVER= 4 +RELVER= 5 ABIVER= 5.1 NODOTABIVER= 51 diff --git a/src/jit/bc.lua b/src/jit/bc.lua index a4cd5f20..a2f84aaf 100644 --- a/src/jit/bc.lua +++ b/src/jit/bc.lua @@ -41,7 +41,7 @@ -- Cache some library functions and objects. local jit = require("jit") -assert(jit.version_num == 20004, "LuaJIT core/library version mismatch") +assert(jit.version_num == 20005, "LuaJIT core/library version mismatch") local jutil = require("jit.util") local vmdef = require("jit.vmdef") local bit = require("bit") diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua index d548b1a7..aa677dfc 100644 --- a/src/jit/bcsave.lua +++ b/src/jit/bcsave.lua @@ -11,7 +11,7 @@ ------------------------------------------------------------------------------ local jit = require("jit") -assert(jit.version_num == 20004, "LuaJIT core/library version mismatch") +assert(jit.version_num == 20005, "LuaJIT core/library version mismatch") local bit = require("bit") -- Symbol name prefix for LuaJIT bytecode. diff --git a/src/jit/dump.lua b/src/jit/dump.lua index 6c6abb66..666ba438 100644 --- a/src/jit/dump.lua +++ b/src/jit/dump.lua @@ -55,7 +55,7 @@ -- Cache some library functions and objects. 
local jit = require("jit") -assert(jit.version_num == 20004, "LuaJIT core/library version mismatch") +assert(jit.version_num == 20005, "LuaJIT core/library version mismatch") local jutil = require("jit.util") local vmdef = require("jit.vmdef") local funcinfo, funcbc = jutil.funcinfo, jutil.funcbc diff --git a/src/jit/v.lua b/src/jit/v.lua index 0f5407a1..47ee3941 100644 --- a/src/jit/v.lua +++ b/src/jit/v.lua @@ -59,7 +59,7 @@ -- Cache some library functions and objects. local jit = require("jit") -assert(jit.version_num == 20004, "LuaJIT core/library version mismatch") +assert(jit.version_num == 20005, "LuaJIT core/library version mismatch") local jutil = require("jit.util") local vmdef = require("jit.vmdef") local funcinfo, traceinfo = jutil.funcinfo, jutil.traceinfo diff --git a/src/luaconf.h b/src/luaconf.h index 73b80f11..b33e91b7 100644 --- a/src/luaconf.h +++ b/src/luaconf.h @@ -37,7 +37,7 @@ #endif #define LUA_LROOT "/usr/local" #define LUA_LUADIR "/lua/5.1/" -#define LUA_LJDIR "/luajit-2.0.4/" +#define LUA_LJDIR "/luajit-2.0.5/" #ifdef LUA_ROOT #define LUA_JROOT LUA_ROOT diff --git a/src/luajit.h b/src/luajit.h index 1709ca26..c5ff3acb 100644 --- a/src/luajit.h +++ b/src/luajit.h @@ -30,9 +30,9 @@ #include "lua.h" -#define LUAJIT_VERSION "LuaJIT 2.0.4" -#define LUAJIT_VERSION_NUM 20004 /* Version 2.0.4 = 02.00.04. */ -#define LUAJIT_VERSION_SYM luaJIT_version_2_0_4 +#define LUAJIT_VERSION "LuaJIT 2.0.5" +#define LUAJIT_VERSION_NUM 20005 /* Version 2.0.5 = 02.00.05. */ +#define LUAJIT_VERSION_SYM luaJIT_version_2_0_5 #define LUAJIT_COPYRIGHT "Copyright (C) 2005-2017 Mike Pall" #define LUAJIT_URL "http://luajit.org/" From e9f8abfbf5e565f26b10dbd5c66f73143ff2211b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 1 May 2017 21:02:34 +0200 Subject: [PATCH 082/116] Update changelog. 
--- doc/changes.html | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/changes.html b/doc/changes.html index 6522fa16..a66a8d95 100644 --- a/doc/changes.html +++ b/doc/changes.html @@ -74,6 +74,30 @@ to see whether newer versions are available.

    +

    LuaJIT 2.1.0-beta3 — 2017-05-01

    +
      +
    • Rewrite memory block allocator.
    • +
    • Add various extension from Lua 5.2/5.3.
    • +
    • Remove old Lua 5.0 compatibility defines.
    • +
    • Set arg table before evaluating LUA_INIT and -e chunks.
    • +
    • Fix FOLD rules for math.abs() and FP negation.
    • +
    • Fix soft-float math.abs() and negation.
    • +
    • Fix formatting of some small denormals at low precision.
    • +
    • LJ_GC64: Add JIT compiler support.
    • +
    • x64/LJ_GC64: Add JIT compiler backend.
    • +
    • x86/x64: Generate BMI2 shifts and rotates, if available.
    • +
    • Windows/x86: Add full exception interoperability.
    • +
    • ARM64: Add big-endian support.
    • +
    • ARM64: Add JIT compiler backend.
    • +
    • MIPS: Fix TSETR barrier.
    • +
    • MIPS: Support MIPS16 interlinking.
    • +
    • MIPS soft-float: Fix code generation for HREF.
    • +
    • MIPS64: Add MIPS64 hard-float JIT compiler backend.
    • +
    • MIPS64: Add MIPS64 hard-float/soft-float support to interpreter.
    • +
    • FFI: Compile bitfield loads/stores.
    • +
    • Various fixes common with the 2.0 branch.
    • +
    +

    LuaJIT 2.1.0-beta2 — 2016-03-03

    • Enable trace stitching.
    • @@ -149,7 +173,7 @@ Please take a look at the commit history for more details.
    • Remove internal __mode = "K" and replace with safe check.
    • Add "proto" field to jit.util.funcinfo().
    • Fix GC step size calculation.
    • -
    • Initialize uv->immutable for upvalues of loaded chunks.
    • +
    • Initialize uv->immutable for upvalues of loaded chunks.
    • Fix for cdata vs. non-cdata arithmetics/comparisons.
    • Drop leftover regs in 'for' iterator assignment, too.
    • Fix PHI remarking in SINK pass.
    • From 8271c643c21d1b2f344e339f559f2de6f3663191 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 1 May 2017 21:03:01 +0200 Subject: [PATCH 083/116] RELEASE LuaJIT-2.1.0-beta3 --- Makefile | 2 +- README | 2 +- etc/luajit.pc | 2 +- src/luaconf.h | 2 +- src/luajit.h | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index e6472e0b..0f933089 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ MAJVER= 2 MINVER= 1 RELVER= 0 -PREREL= -beta2 +PREREL= -beta3 VERSION= $(MAJVER).$(MINVER).$(RELVER)$(PREREL) ABIVER= 5.1 diff --git a/README b/README index 719e6118..2b9ae9d2 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -README for LuaJIT 2.1.0-beta2 +README for LuaJIT 2.1.0-beta3 ----------------------------- LuaJIT is a Just-In-Time (JIT) compiler for the Lua programming language. diff --git a/etc/luajit.pc b/etc/luajit.pc index 0fdd1efd..a78f1746 100644 --- a/etc/luajit.pc +++ b/etc/luajit.pc @@ -2,7 +2,7 @@ majver=2 minver=1 relver=0 -version=${majver}.${minver}.${relver}-beta2 +version=${majver}.${minver}.${relver}-beta3 abiver=5.1 prefix=/usr/local diff --git a/src/luaconf.h b/src/luaconf.h index 0c70b145..c2d29d94 100644 --- a/src/luaconf.h +++ b/src/luaconf.h @@ -37,7 +37,7 @@ #endif #define LUA_LROOT "/usr/local" #define LUA_LUADIR "/lua/5.1/" -#define LUA_LJDIR "/luajit-2.1.0-beta2/" +#define LUA_LJDIR "/luajit-2.1.0-beta3/" #ifdef LUA_ROOT #define LUA_JROOT LUA_ROOT diff --git a/src/luajit.h b/src/luajit.h index c1c801c9..708a5a11 100644 --- a/src/luajit.h +++ b/src/luajit.h @@ -30,9 +30,9 @@ #include "lua.h" -#define LUAJIT_VERSION "LuaJIT 2.1.0-beta2" +#define LUAJIT_VERSION "LuaJIT 2.1.0-beta3" #define LUAJIT_VERSION_NUM 20100 /* Version 2.1.0 = 02.01.00. 
*/ -#define LUAJIT_VERSION_SYM luaJIT_version_2_1_0_beta2 +#define LUAJIT_VERSION_SYM luaJIT_version_2_1_0_beta3 #define LUAJIT_COPYRIGHT "Copyright (C) 2005-2017 Mike Pall" #define LUAJIT_URL "http://luajit.org/" From 1c89933f129dde76944336c6bfd05297b8d67730 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 9 May 2017 20:59:37 +0200 Subject: [PATCH 084/116] Fix LJ_MAX_JSLOTS assertion in rec_check_slots(). Thanks to Yichun Zhang. --- src/lj_record.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lj_record.c b/src/lj_record.c index 7b572764..cecacd21 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -81,9 +81,9 @@ static void rec_check_slots(jit_State *J) BCReg s, nslots = J->baseslot + J->maxslot; int32_t depth = 0; cTValue *base = J->L->base - J->baseslot; - lua_assert(J->baseslot >= 1 && J->baseslot < LJ_MAX_JSLOTS); + lua_assert(J->baseslot >= 1); lua_assert(J->baseslot == 1 || (J->slot[J->baseslot-1] & TREF_FRAME)); - lua_assert(nslots < LJ_MAX_JSLOTS); + lua_assert(nslots <= LJ_MAX_JSLOTS); for (s = 0; s < nslots; s++) { TRef tr = J->slot[s]; if (tr) { From 630ff3196a06353c6a7ccd1e9ac3958f4a8ca13c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 17 May 2017 17:37:35 +0200 Subject: [PATCH 085/116] Add missing LJ_MAX_JSLOTS check. Thanks to Yichun Zhang. --- src/lj_record.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lj_record.c b/src/lj_record.c index cecacd21..bc4e8a6d 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -633,6 +633,8 @@ void lj_record_call(jit_State *J, BCReg func, ptrdiff_t nargs) J->framedepth++; J->base += func+1; J->baseslot += func+1; + if (J->baseslot + J->maxslot >= LJ_MAX_JSLOTS) + lj_trace_err(J, LJ_TRERR_STACKOV); } /* Record tail call. */ From 7381b620358c2561e8690149f1d25828fdad6675 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Jun 2017 19:16:22 +0200 Subject: [PATCH 086/116] MIPS: Use precise search for exit jump patching. Contributed by Djordje Kovacevic and Stefan Pejic. 
--- src/lj_asm_mips.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h index 03270cca..d0a1ca51 100644 --- a/src/lj_asm_mips.h +++ b/src/lj_asm_mips.h @@ -1933,7 +1933,11 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) MCode tjump = MIPSI_J|(((uintptr_t)target>>2)&0x03ffffffu); for (p++; p < pe; p++) { if (*p == exitload) { /* Look for load of exit number. */ - if (((p[-1] ^ (px-p)) & 0xffffu) == 0) { /* Look for exitstub branch. */ + /* Look for exitstub branch. Yes, this covers all used branch variants. */ + if (((p[-1] ^ (px-p)) & 0xffffu) == 0 && + ((p[-1] & 0xf0000000u) == MIPSI_BEQ || + (p[-1] & 0xfc1e0000u) == MIPSI_BLTZ || + (p[-1] & 0xffe00000u) == MIPSI_BC1F)) { ptrdiff_t delta = target - p; if (((delta + 0x8000) >> 16) == 0) { /* Patch in-range branch. */ patchbranch: From c7c3c4da432ddb543d4b0a9abbb245f11b26afd0 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Jun 2017 19:36:46 +0200 Subject: [PATCH 087/116] MIPS: Fix handling of spare long-range jump slots. Contributed by Djordje Kovacevic and Stefan Pejic. --- src/lj_asm_mips.h | 9 +++++---- src/lj_jit.h | 6 ++++++ src/lj_mcode.c | 6 ------ 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h index d0a1ca51..76311903 100644 --- a/src/lj_asm_mips.h +++ b/src/lj_asm_mips.h @@ -65,10 +65,9 @@ static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow) static void asm_sparejump_setup(ASMState *as) { MCode *mxp = as->mcbot; - /* Assumes sizeof(MCLink) == 8. 
*/ - if (((uintptr_t)mxp & (LJ_PAGESIZE-1)) == 8) { + if (((uintptr_t)mxp & (LJ_PAGESIZE-1)) == sizeof(MCLink)) { lua_assert(MIPSI_NOP == 0); - memset(mxp+2, 0, MIPS_SPAREJUMP*8); + memset(mxp, 0, MIPS_SPAREJUMP*2*sizeof(MCode)); mxp += MIPS_SPAREJUMP*2; lua_assert(mxp < as->mctop); lj_mcode_sync(as->mcbot, mxp); @@ -1947,7 +1946,9 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) if (!cstart) cstart = p-1; } else { /* Branch out of range. Use spare jump slot in mcarea. */ int i; - for (i = 2; i < 2+MIPS_SPAREJUMP*2; i += 2) { + for (i = (int)(sizeof(MCLink)/sizeof(MCode)); + i < (int)(sizeof(MCLink)/sizeof(MCode)+MIPS_SPAREJUMP*2); + i += 2) { if (mcarea[i] == tjump) { delta = mcarea+i - p; goto patchbranch; diff --git a/src/lj_jit.h b/src/lj_jit.h index a2e8fd92..3f38d289 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -155,6 +155,12 @@ typedef uint8_t MCode; typedef uint32_t MCode; #endif +/* Linked list of MCode areas. */ +typedef struct MCLink { + MCode *next; /* Next area. */ + size_t size; /* Size of current area. */ +} MCLink; + /* Stack snapshot header. */ typedef struct SnapShot { uint16_t mapofs; /* Offset into snapshot map. */ diff --git a/src/lj_mcode.c b/src/lj_mcode.c index f0a1f699..5ea89f66 100644 --- a/src/lj_mcode.c +++ b/src/lj_mcode.c @@ -272,12 +272,6 @@ static void *mcode_alloc(jit_State *J, size_t sz) /* -- MCode area management ----------------------------------------------- */ -/* Linked list of MCode areas. */ -typedef struct MCLink { - MCode *next; /* Next area. */ - size_t size; /* Size of current area. */ -} MCLink; - /* Allocate a new MCode area. */ static void mcode_allocarea(jit_State *J) { From a057a07ab702e225e21848d4f918886c5b0ac06b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Jun 2017 23:56:54 +0200 Subject: [PATCH 088/116] MIPS64: Add soft-float support to JIT compiler backend. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. Sponsored by Cisco Systems, Inc. 
--- src/lj_arch.h | 4 +- src/lj_asm.c | 8 +- src/lj_asm_mips.h | 217 +++++++++++++++++++++++++++++++++++++-------- src/lj_crecord.c | 4 +- src/lj_emit_mips.h | 2 + src/lj_ffrecord.c | 2 +- src/lj_ircall.h | 43 ++++++--- src/lj_iropt.h | 2 +- src/lj_jit.h | 4 +- src/lj_obj.h | 3 + src/lj_opt_split.c | 2 +- src/lj_snap.c | 21 +++-- src/vm_mips64.dasc | 49 ++++++++++ 13 files changed, 286 insertions(+), 75 deletions(-) diff --git a/src/lj_arch.h b/src/lj_arch.h index c8d7138e..b7705642 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -337,9 +337,6 @@ #define LJ_ARCH_BITS 32 #define LJ_TARGET_MIPS32 1 #else -#if LJ_ABI_SOFTFP || !LJ_ARCH_HASFPU -#define LJ_ARCH_NOJIT 1 /* NYI */ -#endif #define LJ_ARCH_BITS 64 #define LJ_TARGET_MIPS64 1 #define LJ_TARGET_GC64 1 @@ -512,6 +509,7 @@ #define LJ_ABI_SOFTFP 0 #endif #define LJ_SOFTFP (!LJ_ARCH_HASFPU) +#define LJ_SOFTFP32 (LJ_SOFTFP && LJ_32) #if LJ_ARCH_ENDIAN == LUAJIT_BE #define LJ_LE 0 diff --git a/src/lj_asm.c b/src/lj_asm.c index c2cf5a95..bed2268e 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -338,7 +338,7 @@ static Reg ra_rematk(ASMState *as, IRRef ref) ra_modified(as, r); ir->r = RID_INIT; /* Do not keep any hint. */ RA_DBGX((as, "remat $i $r", ir, r)); -#if !LJ_SOFTFP +#if !LJ_SOFTFP32 if (ir->o == IR_KNUM) { emit_loadk64(as, r, ir); } else @@ -1305,7 +1305,7 @@ static void asm_call(ASMState *as, IRIns *ir) asm_gencall(as, ci, args); } -#if !LJ_SOFTFP +#if !LJ_SOFTFP32 static void asm_fppow(ASMState *as, IRIns *ir, IRRef lref, IRRef rref) { const CCallInfo *ci = &lj_ir_callinfo[IRCALL_pow]; @@ -1652,10 +1652,10 @@ static void asm_ir(ASMState *as, IRIns *ir) case IR_MUL: asm_mul(as, ir); break; case IR_MOD: asm_mod(as, ir); break; case IR_NEG: asm_neg(as, ir); break; -#if LJ_SOFTFP +#if LJ_SOFTFP32 case IR_DIV: case IR_POW: case IR_ABS: case IR_ATAN2: case IR_LDEXP: case IR_FPMATH: case IR_TOBIT: - lua_assert(0); /* Unused for LJ_SOFTFP. */ + lua_assert(0); /* Unused for LJ_SOFTFP32. 
*/ break; #else case IR_DIV: asm_div(as, ir); break; diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h index 05af3d09..1406a873 100644 --- a/src/lj_asm_mips.h +++ b/src/lj_asm_mips.h @@ -290,7 +290,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) { ra_leftov(as, gpr, ref); gpr++; -#if LJ_64 +#if LJ_64 && !LJ_SOFTFP fpr++; #endif } @@ -301,7 +301,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) emit_spstore(as, ir, r, ofs); ofs += irt_isnum(ir->t) ? 8 : 4; #else - emit_spstore(as, ir, r, ofs + ((LJ_BE && (LJ_SOFTFP || r < RID_MAX_GPR) && !irt_is64(ir->t)) ? 4 : 0)); + emit_spstore(as, ir, r, ofs + ((LJ_BE && !irt_isfp(ir->t) && !irt_is64(ir->t)) ? 4 : 0)); ofs += 8; #endif } @@ -312,7 +312,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) #endif if (gpr <= REGARG_LASTGPR) { gpr++; -#if LJ_64 +#if LJ_64 && !LJ_SOFTFP fpr++; #endif } else { @@ -461,12 +461,36 @@ static void asm_tobit(ASMState *as, IRIns *ir) emit_tg(as, MIPSI_MFC1, dest, tmp); emit_fgh(as, MIPSI_ADD_D, tmp, left, right); } +#elif LJ_64 /* && LJ_SOFTFP */ +static void asm_tointg(ASMState *as, IRIns *ir, Reg r) +{ + /* The modified regs must match with the *.dasc implementation. */ + RegSet drop = RID2RSET(REGARG_FIRSTGPR)|RID2RSET(RID_RET)|RID2RSET(RID_RET+1)| + RID2RSET(RID_R1)|RID2RSET(RID_R12); + if (ra_hasreg(ir->r)) rset_clear(drop, ir->r); + ra_evictset(as, drop); + /* Return values are in RID_RET (converted value) and RID_RET+1 (status). 
*/ + ra_destreg(as, ir, RID_RET); + asm_guard(as, MIPSI_BNE, RID_RET+1, RID_ZERO); + emit_call(as, (void *)lj_ir_callinfo[IRCALL_lj_vm_tointg].func, 0); + if (r == RID_NONE) + ra_leftov(as, REGARG_FIRSTGPR, ir->op1); + else if (r != REGARG_FIRSTGPR) + emit_move(as, REGARG_FIRSTGPR, r); +} + +static void asm_tobit(ASMState *as, IRIns *ir) +{ + Reg dest = ra_dest(as, ir, RSET_GPR); + emit_dta(as, MIPSI_SLL, dest, dest, 0); + asm_callid(as, ir, IRCALL_lj_vm_tobit); +} #endif static void asm_conv(ASMState *as, IRIns *ir) { IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK); -#if !LJ_SOFTFP +#if !LJ_SOFTFP32 int stfp = (st == IRT_NUM || st == IRT_FLOAT); #endif #if LJ_64 @@ -477,12 +501,13 @@ static void asm_conv(ASMState *as, IRIns *ir) lua_assert(!(irt_isint64(ir->t) || (st == IRT_I64 || st == IRT_U64))); /* Handled by SPLIT. */ #endif -#if LJ_32 && LJ_SOFTFP +#if LJ_SOFTFP32 /* FP conversions are handled by SPLIT. */ lua_assert(!irt_isfp(ir->t) && !(st == IRT_NUM || st == IRT_FLOAT)); /* Can't check for same types: SPLIT uses CONV int.int + BXOR for sfp NEG. */ #else lua_assert(irt_type(ir->t) != st); +#if !LJ_SOFTFP if (irt_isfp(ir->t)) { Reg dest = ra_dest(as, ir, RSET_FPR); if (stfp) { /* FP to FP conversion. */ @@ -608,6 +633,42 @@ static void asm_conv(ASMState *as, IRIns *ir) } } } else +#else + if (irt_isfp(ir->t)) { +#if LJ_64 && LJ_HASFFI + if (stfp) { /* FP to FP conversion. */ + asm_callid(as, ir, irt_isnum(ir->t) ? IRCALL_softfp_f2d : + IRCALL_softfp_d2f); + } else { /* Integer to FP conversion. */ + IRCallID cid = ((IRT_IS64 >> st) & 1) ? + (irt_isnum(ir->t) ? + (st == IRT_I64 ? IRCALL_fp64_l2d : IRCALL_fp64_ul2d) : + (st == IRT_I64 ? IRCALL_fp64_l2f : IRCALL_fp64_ul2f)) : + (irt_isnum(ir->t) ? + (st == IRT_INT ? IRCALL_softfp_i2d : IRCALL_softfp_ui2d) : + (st == IRT_INT ? IRCALL_softfp_i2f : IRCALL_softfp_ui2f)); + asm_callid(as, ir, cid); + } +#else + asm_callid(as, ir, IRCALL_softfp_i2d); +#endif + } else if (stfp) { /* FP to integer conversion. 
*/ + if (irt_isguard(ir->t)) { + /* Checked conversions are only supported from number to int. */ + lua_assert(irt_isint(ir->t) && st == IRT_NUM); + asm_tointg(as, ir, RID_NONE); + } else { + IRCallID cid = irt_is64(ir->t) ? + ((st == IRT_NUM) ? + (irt_isi64(ir->t) ? IRCALL_fp64_d2l : IRCALL_fp64_d2ul) : + (irt_isi64(ir->t) ? IRCALL_fp64_f2l : IRCALL_fp64_f2ul)) : + ((st == IRT_NUM) ? + (irt_isint(ir->t) ? IRCALL_softfp_d2i : IRCALL_softfp_d2ui) : + (irt_isint(ir->t) ? IRCALL_softfp_f2i : IRCALL_softfp_f2ui)); + asm_callid(as, ir, cid); + } + } else +#endif #endif { Reg dest = ra_dest(as, ir, RSET_GPR); @@ -665,7 +726,7 @@ static void asm_strto(ASMState *as, IRIns *ir) const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num]; IRRef args[2]; int32_t ofs = 0; -#if LJ_SOFTFP +#if LJ_SOFTFP32 ra_evictset(as, RSET_SCRATCH); if (ra_used(ir)) { if (ra_hasspill(ir->s) && ra_hasspill((ir+1)->s) && @@ -806,7 +867,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) MCLabel l_end, l_loop, l_next; rset_clear(allow, tab); -#if LJ_32 && LJ_SOFTFP +#if LJ_SOFTFP32 if (!isk) { key = ra_alloc1(as, refkey, allow); rset_clear(allow, key); @@ -826,7 +887,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) } } #else - if (irt_isnum(kt)) { + if (!LJ_SOFTFP && irt_isnum(kt)) { key = ra_alloc1(as, refkey, RSET_FPR); tmpnum = ra_scratch(as, rset_exclude(RSET_FPR, key)); } else if (!irt_ispri(kt)) { @@ -882,6 +943,9 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_dta(as, MIPSI_DSRA32, tmp1, tmp1, 15); emit_tg(as, MIPSI_DMTC1, tmp1, tmpnum); emit_tsi(as, MIPSI_LD, tmp1, dest, (int32_t)offsetof(Node, key.u64)); + } else if (LJ_SOFTFP && irt_isnum(kt)) { + emit_branch(as, MIPSI_BEQ, tmp1, key, l_end); + emit_tsi(as, MIPSI_LD, tmp1, dest, (int32_t)offsetof(Node, key.u64)); } else if (irt_isaddr(kt)) { Reg refk = tmp2; if (isk) { @@ -960,7 +1024,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_dta(as, MIPSI_ROTR, dest, tmp1, 
(-HASH_ROT1)&31); if (irt_isnum(kt)) { emit_dst(as, MIPSI_ADDU, tmp1, tmp1, tmp1); - emit_dta(as, MIPSI_DSRA32, tmp1, tmp1, 0); + emit_dta(as, MIPSI_DSRA32, tmp1, LJ_SOFTFP ? key : tmp1, 0); emit_dta(as, MIPSI_SLL, tmp2, LJ_SOFTFP ? key : tmp1, 0); #if !LJ_SOFTFP emit_tg(as, MIPSI_DMFC1, tmp1, key); @@ -1123,7 +1187,7 @@ static MIPSIns asm_fxloadins(IRIns *ir) case IRT_U8: return MIPSI_LBU; case IRT_I16: return MIPSI_LH; case IRT_U16: return MIPSI_LHU; - case IRT_NUM: lua_assert(!LJ_SOFTFP); return MIPSI_LDC1; + case IRT_NUM: lua_assert(!LJ_SOFTFP32); if (!LJ_SOFTFP) return MIPSI_LDC1; case IRT_FLOAT: if (!LJ_SOFTFP) return MIPSI_LWC1; default: return (LJ_64 && irt_is64(ir->t)) ? MIPSI_LD : MIPSI_LW; } @@ -1134,7 +1198,7 @@ static MIPSIns asm_fxstoreins(IRIns *ir) switch (irt_type(ir->t)) { case IRT_I8: case IRT_U8: return MIPSI_SB; case IRT_I16: case IRT_U16: return MIPSI_SH; - case IRT_NUM: lua_assert(!LJ_SOFTFP); return MIPSI_SDC1; + case IRT_NUM: lua_assert(!LJ_SOFTFP32); if (!LJ_SOFTFP) return MIPSI_SDC1; case IRT_FLOAT: if (!LJ_SOFTFP) return MIPSI_SWC1; default: return (LJ_64 && irt_is64(ir->t)) ? MIPSI_SD : MIPSI_SW; } @@ -1199,7 +1263,7 @@ static void asm_xstore_(ASMState *as, IRIns *ir, int32_t ofs) static void asm_ahuvload(ASMState *as, IRIns *ir) { - int hiop = (LJ_32 && LJ_SOFTFP && (ir+1)->o == IR_HIOP); + int hiop = (LJ_SOFTFP32 && (ir+1)->o == IR_HIOP); Reg dest = RID_NONE, type = RID_TMP, idx; RegSet allow = RSET_GPR; int32_t ofs = 0; @@ -1212,7 +1276,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) } } if (ra_used(ir)) { - lua_assert((LJ_SOFTFP ? 0 : irt_isnum(ir->t)) || + lua_assert((LJ_SOFTFP32 ? 0 : irt_isnum(ir->t)) || irt_isint(ir->t) || irt_isaddr(ir->t)); dest = ra_dest(as, ir, (!LJ_SOFTFP && irt_isnum(t)) ? 
RSET_FPR : allow); rset_clear(allow, dest); @@ -1261,10 +1325,10 @@ static void asm_ahustore(ASMState *as, IRIns *ir) int32_t ofs = 0; if (ir->r == RID_SINK) return; - if (!LJ_SOFTFP && irt_isnum(ir->t)) { - src = ra_alloc1(as, ir->op2, RSET_FPR); + if (!LJ_SOFTFP32 && irt_isnum(ir->t)) { + src = ra_alloc1(as, ir->op2, LJ_SOFTFP ? RSET_GPR : RSET_FPR); idx = asm_fuseahuref(as, ir->op1, &ofs, allow); - emit_hsi(as, MIPSI_SDC1, src, idx, ofs); + emit_hsi(as, LJ_SOFTFP ? MIPSI_SD : MIPSI_SDC1, src, idx, ofs); } else { #if LJ_32 if (!irt_ispri(ir->t)) { @@ -1312,7 +1376,7 @@ static void asm_sload(ASMState *as, IRIns *ir) IRType1 t = ir->t; #if LJ_32 int32_t ofs = 8*((int32_t)ir->op1-1) + ((ir->op2 & IRSLOAD_FRAME) ? 4 : 0); - int hiop = (LJ_32 && LJ_SOFTFP && (ir+1)->o == IR_HIOP); + int hiop = (LJ_SOFTFP32 && (ir+1)->o == IR_HIOP); if (hiop) t.irt = IRT_NUM; #else @@ -1320,7 +1384,7 @@ static void asm_sload(ASMState *as, IRIns *ir) #endif lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */ lua_assert(irt_isguard(ir->t) || !(ir->op2 & IRSLOAD_TYPECHECK)); -#if LJ_32 && LJ_SOFTFP +#if LJ_SOFTFP32 lua_assert(!(ir->op2 & IRSLOAD_CONVERT)); /* Handled by LJ_SOFTFP SPLIT. */ if (hiop && ra_used(ir+1)) { type = ra_dest(as, ir+1, allow); @@ -1328,29 +1392,44 @@ static void asm_sload(ASMState *as, IRIns *ir) } #else if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) { - dest = ra_scratch(as, RSET_FPR); + dest = ra_scratch(as, LJ_SOFTFP ? allow : RSET_FPR); asm_tointg(as, ir, dest); t.irt = IRT_NUM; /* Continue with a regular number type check. */ } else #endif if (ra_used(ir)) { - lua_assert((LJ_SOFTFP ? 0 : irt_isnum(ir->t)) || + lua_assert((LJ_SOFTFP32 ? 0 : irt_isnum(ir->t)) || irt_isint(ir->t) || irt_isaddr(ir->t)); dest = ra_dest(as, ir, (!LJ_SOFTFP && irt_isnum(t)) ? 
RSET_FPR : allow); rset_clear(allow, dest); base = ra_alloc1(as, REF_BASE, allow); rset_clear(allow, base); - if (!LJ_SOFTFP && (ir->op2 & IRSLOAD_CONVERT)) { + if (!LJ_SOFTFP32 && (ir->op2 & IRSLOAD_CONVERT)) { if (irt_isint(t)) { - Reg tmp = ra_scratch(as, RSET_FPR); + Reg tmp = ra_scratch(as, LJ_SOFTFP ? RSET_GPR : RSET_FPR); +#if LJ_SOFTFP + ra_evictset(as, rset_exclude(RSET_SCRATCH, dest)); + ra_destreg(as, ir, RID_RET); + emit_call(as, (void *)lj_ir_callinfo[IRCALL_softfp_d2i].func, 0); + if (tmp != REGARG_FIRSTGPR) + emit_move(as, REGARG_FIRSTGPR, tmp); +#else emit_tg(as, MIPSI_MFC1, dest, tmp); emit_fg(as, MIPSI_TRUNC_W_D, tmp, tmp); +#endif dest = tmp; t.irt = IRT_NUM; /* Check for original type. */ } else { Reg tmp = ra_scratch(as, RSET_GPR); +#if LJ_SOFTFP + ra_evictset(as, rset_exclude(RSET_SCRATCH, dest)); + ra_destreg(as, ir, RID_RET); + emit_call(as, (void *)lj_ir_callinfo[IRCALL_softfp_i2d].func, 0); + emit_dta(as, MIPSI_SLL, REGARG_FIRSTGPR, tmp, 0); +#else emit_fg(as, MIPSI_CVT_D_W, dest, dest); emit_tg(as, MIPSI_MTC1, tmp, dest); +#endif dest = tmp; t.irt = IRT_INT; /* Check for original type. */ } @@ -1399,7 +1478,7 @@ dotypecheck: if (irt_isnum(t)) { asm_guard(as, MIPSI_BEQ, RID_TMP, RID_ZERO); emit_tsi(as, MIPSI_SLTIU, RID_TMP, RID_TMP, (int32_t)LJ_TISNUM); - if (ra_hasreg(dest)) + if (!LJ_SOFTFP && ra_hasreg(dest)) emit_hsi(as, MIPSI_LDC1, dest, base, ofs); } else { asm_guard(as, MIPSI_BNE, RID_TMP, @@ -1409,7 +1488,7 @@ dotypecheck: } emit_tsi(as, MIPSI_LD, type, base, ofs); } else if (ra_hasreg(dest)) { - if (irt_isnum(t)) + if (!LJ_SOFTFP && irt_isnum(t)) emit_hsi(as, MIPSI_LDC1, dest, base, ofs); else emit_tsi(as, irt_isint(t) ? 
MIPSI_LW : MIPSI_LD, dest, base, @@ -1548,26 +1627,40 @@ static void asm_fpunary(ASMState *as, IRIns *ir, MIPSIns mi) Reg left = ra_hintalloc(as, ir->op1, dest, RSET_FPR); emit_fg(as, mi, dest, left); } +#endif +#if !LJ_SOFTFP32 static void asm_fpmath(ASMState *as, IRIns *ir) { if (ir->op2 == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) return; +#if !LJ_SOFTFP if (ir->op2 <= IRFPM_TRUNC) asm_callround(as, ir, IRCALL_lj_vm_floor + ir->op2); else if (ir->op2 == IRFPM_SQRT) asm_fpunary(as, ir, MIPSI_SQRT_D); else +#endif asm_callid(as, ir, IRCALL_lj_vm_floor + ir->op2); } #endif +#if !LJ_SOFTFP +#define asm_fpadd(as, ir) asm_fparith(as, ir, MIPSI_ADD_D) +#define asm_fpsub(as, ir) asm_fparith(as, ir, MIPSI_SUB_D) +#define asm_fpmul(as, ir) asm_fparith(as, ir, MIPSI_MUL_D) +#elif LJ_64 /* && LJ_SOFTFP */ +#define asm_fpadd(as, ir) asm_callid(as, ir, IRCALL_softfp_add) +#define asm_fpsub(as, ir) asm_callid(as, ir, IRCALL_softfp_sub) +#define asm_fpmul(as, ir) asm_callid(as, ir, IRCALL_softfp_mul) +#endif + static void asm_add(ASMState *as, IRIns *ir) { IRType1 t = ir->t; -#if !LJ_SOFTFP +#if !LJ_SOFTFP32 if (irt_isnum(t)) { - asm_fparith(as, ir, MIPSI_ADD_D); + asm_fpadd(as, ir); } else #endif { @@ -1589,9 +1682,9 @@ static void asm_add(ASMState *as, IRIns *ir) static void asm_sub(ASMState *as, IRIns *ir) { -#if !LJ_SOFTFP +#if !LJ_SOFTFP32 if (irt_isnum(ir->t)) { - asm_fparith(as, ir, MIPSI_SUB_D); + asm_fpsub(as, ir); } else #endif { @@ -1605,9 +1698,9 @@ static void asm_sub(ASMState *as, IRIns *ir) static void asm_mul(ASMState *as, IRIns *ir) { -#if !LJ_SOFTFP +#if !LJ_SOFTFP32 if (irt_isnum(ir->t)) { - asm_fparith(as, ir, MIPSI_MUL_D); + asm_fpmul(as, ir); } else #endif { @@ -1634,7 +1727,7 @@ static void asm_mod(ASMState *as, IRIns *ir) asm_callid(as, ir, IRCALL_lj_vm_modi); } -#if !LJ_SOFTFP +#if !LJ_SOFTFP32 static void asm_pow(ASMState *as, IRIns *ir) { #if LJ_64 && LJ_HASFFI @@ -1654,7 +1747,11 @@ static void asm_div(ASMState *as, IRIns *ir) IRCALL_lj_carith_divu64); 
else #endif +#if !LJ_SOFTFP asm_fparith(as, ir, MIPSI_DIV_D); +#else + asm_callid(as, ir, IRCALL_softfp_div); +#endif } #endif @@ -1664,6 +1761,13 @@ static void asm_neg(ASMState *as, IRIns *ir) if (irt_isnum(ir->t)) { asm_fpunary(as, ir, MIPSI_NEG_D); } else +#elif LJ_64 /* && LJ_SOFTFP */ + if (irt_isnum(ir->t)) { + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR); + emit_dst(as, MIPSI_XOR, dest, left, + ra_allock(as, 0x8000000000000000ll, rset_exclude(RSET_GPR, dest))); + } else #endif { Reg dest = ra_dest(as, ir, RSET_GPR); @@ -1673,7 +1777,17 @@ static void asm_neg(ASMState *as, IRIns *ir) } } +#if !LJ_SOFTFP #define asm_abs(as, ir) asm_fpunary(as, ir, MIPSI_ABS_D) +#elif LJ_64 /* && LJ_SOFTFP */ +static void asm_abs(ASMState *as, IRIns *ir) +{ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_alloc1(as, ir->op1, RSET_GPR); + emit_tsml(as, MIPSI_DEXTM, dest, left, 30, 0); +} +#endif + #define asm_atan2(as, ir) asm_callid(as, ir, IRCALL_atan2) #define asm_ldexp(as, ir) asm_callid(as, ir, IRCALL_ldexp) @@ -1918,15 +2032,21 @@ static void asm_bror(ASMState *as, IRIns *ir) } } -#if LJ_32 && LJ_SOFTFP +#if LJ_SOFTFP static void asm_sfpmin_max(ASMState *as, IRIns *ir) { CCallInfo ci = lj_ir_callinfo[(IROp)ir->o == IR_MIN ? 
IRCALL_lj_vm_sfmin : IRCALL_lj_vm_sfmax]; +#if LJ_64 + IRRef args[2]; + args[0] = ir->op1; + args[1] = ir->op2; +#else IRRef args[4]; args[0^LJ_BE] = ir->op1; args[1^LJ_BE] = (ir+1)->op1; args[2^LJ_BE] = ir->op2; args[3^LJ_BE] = (ir+1)->op2; +#endif asm_setupresult(as, ir, &ci); emit_call(as, (void *)ci.func, 0); ci.func = NULL; @@ -1936,7 +2056,10 @@ static void asm_sfpmin_max(ASMState *as, IRIns *ir) static void asm_min_max(ASMState *as, IRIns *ir, int ismax) { - if (!LJ_SOFTFP && irt_isnum(ir->t)) { + if (!LJ_SOFTFP32 && irt_isnum(ir->t)) { +#if LJ_SOFTFP + asm_sfpmin_max(as, ir); +#else Reg dest = ra_dest(as, ir, RSET_FPR); Reg right, left = ra_alloc2(as, ir, RSET_FPR); right = (left >> 8); left &= 255; @@ -1947,6 +2070,7 @@ static void asm_min_max(ASMState *as, IRIns *ir, int ismax) if (dest != right) emit_fg(as, MIPSI_MOV_D, dest, right); } emit_fgh(as, MIPSI_C_OLT_D, 0, ismax ? left : right, ismax ? right : left); +#endif } else { Reg dest = ra_dest(as, ir, RSET_GPR); Reg right, left = ra_alloc2(as, ir, RSET_GPR); @@ -1967,18 +2091,24 @@ static void asm_min_max(ASMState *as, IRIns *ir, int ismax) /* -- Comparisons --------------------------------------------------------- */ -#if LJ_32 && LJ_SOFTFP +#if LJ_SOFTFP /* SFP comparisons. */ static void asm_sfpcomp(ASMState *as, IRIns *ir) { const CCallInfo *ci = &lj_ir_callinfo[IRCALL_softfp_cmp]; RegSet drop = RSET_SCRATCH; Reg r; +#if LJ_64 + IRRef args[2]; + args[0] = ir->op1; + args[1] = ir->op2; +#else IRRef args[4]; args[LJ_LE ? 0 : 1] = ir->op1; args[LJ_LE ? 1 : 0] = (ir+1)->op1; args[LJ_LE ? 2 : 3] = ir->op2; args[LJ_LE ? 3 : 2] = (ir+1)->op2; +#endif - for (r = REGARG_FIRSTGPR; r <= REGARG_FIRSTGPR+3; r++) { + for (r = REGARG_FIRSTGPR; r <= REGARG_FIRSTGPR+(LJ_64?1:3); r++) { if (!rset_test(as->freeset, r) && regcost_ref(as->cost[r]) == args[r-REGARG_FIRSTGPR]) rset_clear(drop, r); @@ -2032,11 +2162,15 @@ static void asm_comp(ASMState *as, IRIns *ir) { /* ORDER IR: LT GE LE GT ULT UGE ULE UGT. 
*/ IROp op = ir->o; - if (!LJ_SOFTFP && irt_isnum(ir->t)) { + if (!LJ_SOFTFP32 && irt_isnum(ir->t)) { +#if LJ_SOFTFP + asm_sfpcomp(as, ir); +#else Reg right, left = ra_alloc2(as, ir, RSET_FPR); right = (left >> 8); left &= 255; asm_guard(as, (op&1) ? MIPSI_BC1T : MIPSI_BC1F, 0, 0); emit_fgh(as, MIPSI_C_OLT_D + ((op&3) ^ ((op>>2)&1)), 0, left, right); +#endif } else { Reg right, left = ra_alloc1(as, ir->op1, RSET_GPR); if (op == IR_ABC) op = IR_UGT; @@ -2068,9 +2202,13 @@ static void asm_equal(ASMState *as, IRIns *ir) Reg right, left = ra_alloc2(as, ir, (!LJ_SOFTFP && irt_isnum(ir->t)) ? RSET_FPR : RSET_GPR); right = (left >> 8); left &= 255; - if (!LJ_SOFTFP && irt_isnum(ir->t)) { + if (!LJ_SOFTFP32 && irt_isnum(ir->t)) { +#if LJ_SOFTFP + asm_sfpcomp(as, ir); +#else asm_guard(as, (ir->o & 1) ? MIPSI_BC1T : MIPSI_BC1F, 0, 0); emit_fgh(as, MIPSI_C_EQ_D, 0, left, right); +#endif } else { asm_guard(as, (ir->o & 1) ? MIPSI_BEQ : MIPSI_BNE, left, right); } @@ -2263,7 +2401,7 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap) if ((sn & SNAP_NORESTORE)) continue; if (irt_isnum(ir->t)) { -#if LJ_SOFTFP +#if LJ_SOFTFP32 Reg tmp; RegSet allow = rset_exclude(RSET_GPR, RID_BASE); lua_assert(irref_isk(ref)); /* LJ_SOFTFP: must be a number constant. 
*/ @@ -2272,6 +2410,9 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap) if (rset_test(as->freeset, tmp+1)) allow = RID2RSET(tmp+1); tmp = ra_allock(as, (int32_t)ir_knum(ir)->u32.hi, allow); emit_tsi(as, MIPSI_SW, tmp, RID_BASE, ofs+(LJ_BE?0:4)); +#elif LJ_SOFTFP /* && LJ_64 */ + Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE)); + emit_tsi(as, MIPSI_SD, src, RID_BASE, ofs); #else Reg src = ra_alloc1(as, ref, RSET_FPR); emit_hsi(as, MIPSI_SDC1, src, RID_BASE, ofs); diff --git a/src/lj_crecord.c b/src/lj_crecord.c index e32ae23e..fd59e281 100644 --- a/src/lj_crecord.c +++ b/src/lj_crecord.c @@ -212,7 +212,7 @@ static void crec_copy_emit(jit_State *J, CRecMemList *ml, MSize mlp, ml[i].trval = emitir(IRT(IR_XLOAD, ml[i].tp), trsptr, 0); ml[i].trofs = trofs; i++; - rwin += (LJ_SOFTFP && ml[i].tp == IRT_NUM) ? 2 : 1; + rwin += (LJ_SOFTFP32 && ml[i].tp == IRT_NUM) ? 2 : 1; if (rwin >= CREC_COPY_REGWIN || i >= mlp) { /* Flush buffered stores. */ rwin = 0; for ( ; j < i; j++) { @@ -1130,7 +1130,7 @@ static TRef crec_call_args(jit_State *J, RecordFFData *rd, else tr = emitconv(tr, IRT_INT, d->size==1 ? IRT_I8 : IRT_I16,IRCONV_SEXT); } - } else if (LJ_SOFTFP && ctype_isfp(d->info) && d->size > 4) { + } else if (LJ_SOFTFP32 && ctype_isfp(d->info) && d->size > 4) { lj_needsplit(J); } #if LJ_TARGET_X86 diff --git a/src/lj_emit_mips.h b/src/lj_emit_mips.h index 8a9ee24d..bb6593ae 100644 --- a/src/lj_emit_mips.h +++ b/src/lj_emit_mips.h @@ -12,6 +12,8 @@ static intptr_t get_k64val(IRIns *ir) return (intptr_t)ir_kgc(ir); } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) { return (intptr_t)ir_kptr(ir); + } else if (LJ_SOFTFP && ir->o == IR_KNUM) { + return (intptr_t)ir_knum(ir)->u64; } else { lua_assert(ir->o == IR_KINT || ir->o == IR_KNULL); return ir->i; /* Sign-extended. 
*/ diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index dfdee2db..849d7a27 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -1012,7 +1012,7 @@ static void LJ_FASTCALL recff_string_format(jit_State *J, RecordFFData *rd) handle_num: tra = lj_ir_tonum(J, tra); tr = lj_ir_call(J, id, tr, trsf, tra); - if (LJ_SOFTFP) lj_needsplit(J); + if (LJ_SOFTFP32) lj_needsplit(J); break; case STRFMT_STR: if (!tref_isstr(tra)) { diff --git a/src/lj_ircall.h b/src/lj_ircall.h index 973c36e6..73120065 100644 --- a/src/lj_ircall.h +++ b/src/lj_ircall.h @@ -51,7 +51,7 @@ typedef struct CCallInfo { #define CCI_XARGS(ci) (((ci)->flags >> CCI_XARGS_SHIFT) & 3) #define CCI_XA (1u << CCI_XARGS_SHIFT) -#if LJ_SOFTFP || (LJ_32 && LJ_HASFFI) +#if LJ_SOFTFP32 || (LJ_32 && LJ_HASFFI) #define CCI_XNARGS(ci) (CCI_NARGS((ci)) + CCI_XARGS((ci))) #else #define CCI_XNARGS(ci) CCI_NARGS((ci)) @@ -78,13 +78,19 @@ typedef struct CCallInfo { #define IRCALLCOND_SOFTFP_FFI(x) NULL #endif -#if LJ_SOFTFP && LJ_TARGET_MIPS32 +#if LJ_SOFTFP && LJ_TARGET_MIPS #define IRCALLCOND_SOFTFP_MIPS(x) x #else #define IRCALLCOND_SOFTFP_MIPS(x) NULL #endif -#define LJ_NEED_FP64 (LJ_TARGET_ARM || LJ_TARGET_PPC || LJ_TARGET_MIPS32) +#if LJ_SOFTFP && LJ_TARGET_MIPS64 +#define IRCALLCOND_SOFTFP_MIPS64(x) x +#else +#define IRCALLCOND_SOFTFP_MIPS64(x) NULL +#endif + +#define LJ_NEED_FP64 (LJ_TARGET_ARM || LJ_TARGET_PPC || LJ_TARGET_MIPS) #if LJ_HASFFI && (LJ_SOFTFP || LJ_NEED_FP64) #define IRCALLCOND_FP64_FFI(x) x @@ -112,6 +118,14 @@ typedef struct CCallInfo { #define XA2_FP 0 #endif +#if LJ_SOFTFP32 +#define XA_FP32 CCI_XA +#define XA2_FP32 (CCI_XA+CCI_XA) +#else +#define XA_FP32 0 +#define XA2_FP32 0 +#endif + #if LJ_32 #define XA_64 CCI_XA #define XA2_64 (CCI_XA+CCI_XA) @@ -181,20 +195,21 @@ typedef struct CCallInfo { _(ANY, pow, 2, N, NUM, XA2_FP) \ _(ANY, atan2, 2, N, NUM, XA2_FP) \ _(ANY, ldexp, 2, N, NUM, XA_FP) \ - _(SOFTFP, lj_vm_tobit, 2, N, INT, 0) \ - _(SOFTFP, softfp_add, 4, N, NUM, 0) \ - _(SOFTFP, 
softfp_sub, 4, N, NUM, 0) \ - _(SOFTFP, softfp_mul, 4, N, NUM, 0) \ - _(SOFTFP, softfp_div, 4, N, NUM, 0) \ - _(SOFTFP, softfp_cmp, 4, N, NIL, 0) \ + _(SOFTFP, lj_vm_tobit, 1, N, INT, XA_FP32) \ + _(SOFTFP, softfp_add, 2, N, NUM, XA2_FP32) \ + _(SOFTFP, softfp_sub, 2, N, NUM, XA2_FP32) \ + _(SOFTFP, softfp_mul, 2, N, NUM, XA2_FP32) \ + _(SOFTFP, softfp_div, 2, N, NUM, XA2_FP32) \ + _(SOFTFP, softfp_cmp, 2, N, NIL, XA2_FP32) \ _(SOFTFP, softfp_i2d, 1, N, NUM, 0) \ - _(SOFTFP, softfp_d2i, 2, N, INT, 0) \ - _(SOFTFP_MIPS, lj_vm_sfmin, 4, N, NUM, 0) \ - _(SOFTFP_MIPS, lj_vm_sfmax, 4, N, NUM, 0) \ + _(SOFTFP, softfp_d2i, 1, N, INT, XA_FP32) \ + _(SOFTFP_MIPS, lj_vm_sfmin, 2, N, NUM, XA2_FP32) \ + _(SOFTFP_MIPS, lj_vm_sfmax, 2, N, NUM, XA2_FP32) \ + _(SOFTFP_MIPS64, lj_vm_tointg, 1, N, INT, 0) \ _(SOFTFP_FFI, softfp_ui2d, 1, N, NUM, 0) \ _(SOFTFP_FFI, softfp_f2d, 1, N, NUM, 0) \ - _(SOFTFP_FFI, softfp_d2ui, 2, N, INT, 0) \ - _(SOFTFP_FFI, softfp_d2f, 2, N, FLOAT, 0) \ + _(SOFTFP_FFI, softfp_d2ui, 1, N, INT, XA_FP32) \ + _(SOFTFP_FFI, softfp_d2f, 1, N, FLOAT, XA_FP32) \ _(SOFTFP_FFI, softfp_i2f, 1, N, FLOAT, 0) \ _(SOFTFP_FFI, softfp_ui2f, 1, N, FLOAT, 0) \ _(SOFTFP_FFI, softfp_f2i, 1, N, INT, 0) \ diff --git a/src/lj_iropt.h b/src/lj_iropt.h index 73aef0ef..a59ba3f4 100644 --- a/src/lj_iropt.h +++ b/src/lj_iropt.h @@ -150,7 +150,7 @@ LJ_FUNC IRType lj_opt_narrow_forl(jit_State *J, cTValue *forbase); /* Optimization passes. */ LJ_FUNC void lj_opt_dce(jit_State *J); LJ_FUNC int lj_opt_loop(jit_State *J); -#if LJ_SOFTFP || (LJ_32 && LJ_HASFFI) +#if LJ_SOFTFP32 || (LJ_32 && LJ_HASFFI) LJ_FUNC void lj_opt_split(jit_State *J); #else #define lj_opt_split(J) UNUSED(J) diff --git a/src/lj_jit.h b/src/lj_jit.h index 2fa8efc4..f37e7927 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -374,7 +374,7 @@ enum { ((TValue *)(((intptr_t)&J->ksimd[2*(n)] + 15) & ~(intptr_t)15)) /* Set/reset flag to activate the SPLIT pass for the current trace. 
*/ -#if LJ_SOFTFP || (LJ_32 && LJ_HASFFI) +#if LJ_SOFTFP32 || (LJ_32 && LJ_HASFFI) #define lj_needsplit(J) (J->needsplit = 1) #define lj_resetsplit(J) (J->needsplit = 0) #else @@ -437,7 +437,7 @@ typedef struct jit_State { MSize sizesnapmap; /* Size of temp. snapshot map buffer. */ PostProc postproc; /* Required post-processing after execution. */ -#if LJ_SOFTFP || (LJ_32 && LJ_HASFFI) +#if LJ_SOFTFP32 || (LJ_32 && LJ_HASFFI) uint8_t needsplit; /* Need SPLIT pass. */ #endif uint8_t retryrec; /* Retry recording. */ diff --git a/src/lj_obj.h b/src/lj_obj.h index 52372c3e..c7e47422 100644 --- a/src/lj_obj.h +++ b/src/lj_obj.h @@ -924,6 +924,9 @@ static LJ_AINLINE void copyTV(lua_State *L, TValue *o1, const TValue *o2) #if LJ_SOFTFP LJ_ASMF int32_t lj_vm_tobit(double x); +#if LJ_TARGET_MIPS64 +LJ_ASMF int32_t lj_vm_tointg(double x); +#endif #endif static LJ_AINLINE int32_t lj_num2bit(lua_Number n) diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c index fc935204..79ac3cce 100644 --- a/src/lj_opt_split.c +++ b/src/lj_opt_split.c @@ -8,7 +8,7 @@ #include "lj_obj.h" -#if LJ_HASJIT && (LJ_SOFTFP || (LJ_32 && LJ_HASFFI)) +#if LJ_HASJIT && (LJ_SOFTFP32 || (LJ_32 && LJ_HASFFI)) #include "lj_err.h" #include "lj_buf.h" diff --git a/src/lj_snap.c b/src/lj_snap.c index bb063c2b..44fa379f 100644 --- a/src/lj_snap.c +++ b/src/lj_snap.c @@ -93,7 +93,7 @@ static MSize snapshot_slots(jit_State *J, SnapEntry *map, BCReg nslots) (ir->op2 & (IRSLOAD_READONLY|IRSLOAD_PARENT)) != IRSLOAD_PARENT) sn |= SNAP_NORESTORE; } - if (LJ_SOFTFP && irt_isnum(ir->t)) + if (LJ_SOFTFP32 && irt_isnum(ir->t)) sn |= SNAP_SOFTFPNUM; map[n++] = sn; } @@ -374,7 +374,7 @@ IRIns *lj_snap_regspmap(GCtrace *T, SnapNo snapno, IRIns *ir) break; } } - } else if (LJ_SOFTFP && ir->o == IR_HIOP) { + } else if (LJ_SOFTFP32 && ir->o == IR_HIOP) { ref++; } else if (ir->o == IR_PVAL) { ref = ir->op1 + REF_BIAS; @@ -486,7 +486,7 @@ void lj_snap_replay(jit_State *J, GCtrace *T) } else { IRType t = irt_type(ir->t); uint32_t 
mode = IRSLOAD_INHERIT|IRSLOAD_PARENT; - if (LJ_SOFTFP && (sn & SNAP_SOFTFPNUM)) t = IRT_NUM; + if (LJ_SOFTFP32 && (sn & SNAP_SOFTFPNUM)) t = IRT_NUM; if (ir->o == IR_SLOAD) mode |= (ir->op2 & IRSLOAD_READONLY); tr = emitir_raw(IRT(IR_SLOAD, t), s, mode); } @@ -520,7 +520,7 @@ void lj_snap_replay(jit_State *J, GCtrace *T) if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) { if (snap_pref(J, T, map, nent, seen, irs->op2) == 0) snap_pref(J, T, map, nent, seen, T->ir[irs->op2].op1); - else if ((LJ_SOFTFP || (LJ_32 && LJ_HASFFI)) && + else if ((LJ_SOFTFP32 || (LJ_32 && LJ_HASFFI)) && irs+1 < irlast && (irs+1)->o == IR_HIOP) snap_pref(J, T, map, nent, seen, (irs+1)->op2); } @@ -579,10 +579,10 @@ void lj_snap_replay(jit_State *J, GCtrace *T) lua_assert(irc->o == IR_CONV && irc->op2 == IRCONV_NUM_INT); val = snap_pref(J, T, map, nent, seen, irc->op1); val = emitir(IRTN(IR_CONV), val, IRCONV_NUM_INT); - } else if ((LJ_SOFTFP || (LJ_32 && LJ_HASFFI)) && + } else if ((LJ_SOFTFP32 || (LJ_32 && LJ_HASFFI)) && irs+1 < irlast && (irs+1)->o == IR_HIOP) { IRType t = IRT_I64; - if (LJ_SOFTFP && irt_type((irs+1)->t) == IRT_SOFTFP) + if (LJ_SOFTFP32 && irt_type((irs+1)->t) == IRT_SOFTFP) t = IRT_NUM; lj_needsplit(J); if (irref_isk(irs->op2) && irref_isk((irs+1)->op2)) { @@ -635,7 +635,7 @@ static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex, int32_t *sps = &ex->spill[regsp_spill(rs)]; if (irt_isinteger(t)) { setintV(o, *sps); -#if !LJ_SOFTFP +#if !LJ_SOFTFP32 } else if (irt_isnum(t)) { o->u64 = *(uint64_t *)sps; #endif @@ -660,6 +660,9 @@ static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex, #if !LJ_SOFTFP } else if (irt_isnum(t)) { setnumV(o, ex->fpr[r-RID_MIN_FPR]); +#elif LJ_64 /* && LJ_SOFTFP */ + } else if (irt_isnum(t)) { + o->u64 = ex->gpr[r-RID_MIN_GPR]; #endif #if LJ_64 && !LJ_GC64 } else if (irt_is64(t)) { @@ -813,7 +816,7 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex, val = lj_tab_set(J->L, t, &tmp); /* NOBARRIER: The 
table is new (marked white). */ snap_restoreval(J, T, ex, snapno, rfilt, irs->op2, val); - if (LJ_SOFTFP && irs+1 < T->ir + T->nins && (irs+1)->o == IR_HIOP) { + if (LJ_SOFTFP32 && irs+1 < T->ir + T->nins && (irs+1)->o == IR_HIOP) { snap_restoreval(J, T, ex, snapno, rfilt, (irs+1)->op2, &tmp); val->u32.hi = tmp.u32.lo; } @@ -874,7 +877,7 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr) continue; } snap_restoreval(J, T, ex, snapno, rfilt, ref, o); - if (LJ_SOFTFP && (sn & SNAP_SOFTFPNUM) && tvisint(o)) { + if (LJ_SOFTFP32 && (sn & SNAP_SOFTFPNUM) && tvisint(o)) { TValue tmp; snap_restoreval(J, T, ex, snapno, rfilt, ref+1, &tmp); o->u32.hi = tmp.u32.lo; diff --git a/src/vm_mips64.dasc b/src/vm_mips64.dasc index c06270a0..75b38dee 100644 --- a/src/vm_mips64.dasc +++ b/src/vm_mips64.dasc @@ -1980,6 +1980,38 @@ static void build_subroutines(BuildCtx *ctx) |1: | jr ra |. move CRET1, r0 + | + |// FP number to int conversion with a check for soft-float. + |// Modifies CARG1, CRET1, CRET2, TMP0, AT. + |->vm_tointg: + |.if JIT + | dsll CRET2, CARG1, 1 + | beqz CRET2, >2 + |. li TMP0, 1076 + | dsrl AT, CRET2, 53 + | dsubu TMP0, TMP0, AT + | sltiu AT, TMP0, 54 + | beqz AT, >1 + |. dextm CRET2, CRET2, 0, 20 + | dinsu CRET2, AT, 21, 21 + | slt AT, CARG1, r0 + | dsrlv CRET1, CRET2, TMP0 + | dsubu CARG1, r0, CRET1 + | movn CRET1, CARG1, AT + | li CARG1, 64 + | subu TMP0, CARG1, TMP0 + | dsllv CRET2, CRET2, TMP0 // Integer check. + | sextw AT, CRET1 + | xor AT, CRET1, AT // Range check. + | jr ra + |. movz CRET2, AT, CRET2 + |1: + | jr ra + |. li CRET2, 1 + |2: + | jr ra + |. move CRET1, r0 + |.endif |.endif | |.macro .ffunc_bit, name @@ -2665,6 +2697,23 @@ static void build_subroutines(BuildCtx *ctx) |. li CRET1, 0 |.endif | + |.macro sfmin_max, name, intins + |->vm_sf .. name: + |.if JIT and not FPU + | move TMP2, ra + | bal ->vm_sfcmpolt + |. nop + | move ra, TMP2 + | move TMP0, CRET1 + | move CRET1, CARG1 + | jr ra + |. 
intins CRET1, CARG2, TMP0 + |.endif + |.endmacro + | + | sfmin_max min, movz + | sfmin_max max, movn + | |//----------------------------------------------------------------------- |//-- Miscellaneous functions -------------------------------------------- |//----------------------------------------------------------------------- From b0ecc6dd65a0b40e1868f20719c4f7c4880dc32d Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 8 Jun 2017 00:15:15 +0200 Subject: [PATCH 089/116] FreeBSD/x64: Avoid changing resource limits, if not needed. --- src/lj_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_alloc.c b/src/lj_alloc.c index 95d15d04..9fc761c7 100644 --- a/src/lj_alloc.c +++ b/src/lj_alloc.c @@ -343,7 +343,7 @@ static void *CALL_MMAP(size_t size) } #endif -#if (defined(__FreeBSD__) || defined(__FreeBSD_kernel__)) && !LJ_TARGET_PS4 +#if LJ_64 && !LJ_GC64 && ((defined(__FreeBSD__) && __FreeBSD__ < 10) || defined(__FreeBSD_kernel__)) && !LJ_TARGET_PS4 #include From 6a71e71c1430e5a8f794a52cb2da66e2693db796 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 11 Jun 2017 10:02:08 +0200 Subject: [PATCH 090/116] Remove unused define. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suggested by 罗泽轩. --- src/lj_def.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lj_def.h b/src/lj_def.h index 2d8fff66..e67bb24c 100644 --- a/src/lj_def.h +++ b/src/lj_def.h @@ -80,7 +80,6 @@ typedef unsigned int uintptr_t; #define LJ_MIN_SBUF 32 /* Min. string buffer length. */ #define LJ_MIN_VECSZ 8 /* Min. size for growable vectors. */ #define LJ_MIN_IRSZ 32 /* Min. size for growable IR. */ -#define LJ_MIN_K64SZ 16 /* Min. size for chained K64Array. */ /* JIT compiler limits. */ #define LJ_MAX_JSLOTS 250 /* Max. # of stack slots for a trace. 
*/ From 82151a4514e6538086f3f5e01cb8d4b22287b14f Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 12 Jun 2017 09:24:00 +0200 Subject: [PATCH 091/116] Modify fix for warning from 'ar'. --- src/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index f7f81a4e..24e8c0e0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -208,7 +208,7 @@ TARGET_CC= $(STATIC_CC) TARGET_STCC= $(STATIC_CC) TARGET_DYNCC= $(DYNAMIC_CC) TARGET_LD= $(CROSS)$(CC) -TARGET_AR= $(CROSS)ar rcus 2>/dev/null +TARGET_AR= $(CROSS)ar rcus TARGET_STRIP= $(CROSS)strip TARGET_LIBPATH= $(or $(PREFIX),/usr/local)/$(or $(MULTILIB),lib) @@ -293,6 +293,7 @@ ifeq (Windows,$(TARGET_SYS)) TARGET_XSHLDFLAGS= -shared TARGET_DYNXLDOPTS= else + TARGET_AR+= 2>/dev/null ifeq (,$(shell $(TARGET_CC) -o /dev/null -c -x c /dev/null -fno-stack-protector 2>/dev/null || echo 1)) TARGET_XCFLAGS+= -fno-stack-protector endif From 7e662e4f87134f1e84f7bea80933e033c5bf53a3 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 26 Jul 2017 09:52:53 +0200 Subject: [PATCH 092/116] x64/LJ_GC64: Fix emit_rma(). --- src/lj_emit_x86.h | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/lj_emit_x86.h b/src/lj_emit_x86.h index 5207f9da..5b139bd3 100644 --- a/src/lj_emit_x86.h +++ b/src/lj_emit_x86.h @@ -343,9 +343,27 @@ static void emit_rma(ASMState *as, x86Op xo, Reg rr, const void *addr) emit_rmro(as, xo, rr, RID_DISPATCH, (int32_t)dispofs(as, addr)); } else if (checki32(mcpofs(as, addr)) && checki32(mctopofs(as, addr))) { emit_rmro(as, xo, rr, RID_RIP, (int32_t)mcpofs(as, addr)); - } else if (!checki32((intptr_t)addr) && (xo == XO_MOV || xo == XO_MOVSD)) { - emit_rmro(as, xo, rr, rr, 0); - emit_loadu64(as, rr, (uintptr_t)addr); + } else if (!checki32((intptr_t)addr)) { + Reg ra = (rr & 15); + if (xo != XO_MOV) { + /* We can't allocate a register here. Use and restore DISPATCH. Ugly. 
*/ + uint64_t dispaddr = (uintptr_t)J2GG(as->J)->dispatch; + uint8_t i8 = xo == XO_GROUP3b ? *as->mcp++ : 0; + ra = RID_DISPATCH; + if (checku32(dispaddr)) { + emit_loadi(as, ra, (int32_t)dispaddr); + } else { /* Full-size 64 bit load. */ + MCode *p = as->mcp; + *(uint64_t *)(p-8) = dispaddr; + p[-9] = (MCode)(XI_MOVri+(ra&7)); + p[-10] = 0x48 + ((ra>>3)&1); + p -= 10; + as->mcp = p; + } + if (xo == XO_GROUP3b) emit_i8(as, i8); + } + emit_rmro(as, xo, rr, ra, 0); + emit_loadu64(as, ra, (uintptr_t)addr); } else #endif { From fd37da0d586c331b0008fbfd653a9659344fe76f Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 26 Jul 2017 09:52:19 +0200 Subject: [PATCH 093/116] PPC: Add soft-float support to interpreter. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. Sponsored by Cisco Systems, Inc. --- src/host/buildvm_asm.c | 2 +- src/lj_arch.h | 29 +- src/lj_ccall.c | 38 +- src/lj_ccall.h | 4 +- src/lj_ccallback.c | 30 +- src/lj_frame.h | 2 +- src/lj_ircall.h | 2 +- src/vm_ppc.dasc | 1249 +++++++++++++++++++++++++++++++++------- 8 files changed, 1101 insertions(+), 255 deletions(-) diff --git a/src/host/buildvm_asm.c b/src/host/buildvm_asm.c index ffd14903..43595b31 100644 --- a/src/host/buildvm_asm.c +++ b/src/host/buildvm_asm.c @@ -338,7 +338,7 @@ void emit_asm(BuildCtx *ctx) #if !(LJ_TARGET_PS3 || LJ_TARGET_PSVITA) fprintf(ctx->fp, "\t.section .note.GNU-stack,\"\"," ELFASM_PX "progbits\n"); #endif -#if LJ_TARGET_PPC && !LJ_TARGET_PS3 +#if LJ_TARGET_PPC && !LJ_TARGET_PS3 && !LJ_ABI_SOFTFP /* Hard-float ABI. 
*/ fprintf(ctx->fp, "\t.gnu_attribute 4, 1\n"); #endif diff --git a/src/lj_arch.h b/src/lj_arch.h index b7705642..0145a7c0 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -254,6 +254,29 @@ #else #define LJ_ARCH_BITS 32 #define LJ_ARCH_NAME "ppc" + +#if !defined(LJ_ARCH_HASFPU) +#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE) +#define LJ_ARCH_HASFPU 0 +#else +#define LJ_ARCH_HASFPU 1 +#endif +#endif + +#if !defined(LJ_ABI_SOFTFP) +#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE) +#define LJ_ABI_SOFTFP 1 +#else +#define LJ_ABI_SOFTFP 0 +#endif +#endif +#endif + +#if LJ_ABI_SOFTFP +#define LJ_ARCH_NOJIT 1 /* NYI */ +#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL +#else +#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL_SINGLE #endif #define LJ_TARGET_PPC 1 @@ -262,7 +285,6 @@ #define LJ_TARGET_MASKSHIFT 0 #define LJ_TARGET_MASKROT 1 #define LJ_TARGET_UNIFYROT 1 /* Want only IR_BROL. */ -#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL_SINGLE #if LJ_TARGET_CONSOLE #define LJ_ARCH_PPC32ON64 1 @@ -415,16 +437,13 @@ #error "No support for ILP32 model on ARM64" #endif #elif LJ_TARGET_PPC -#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE) -#error "No support for PowerPC CPUs without double-precision FPU" -#endif #if !LJ_ARCH_PPC64 && LJ_ARCH_ENDIAN == LUAJIT_LE #error "No support for little-endian PPC32" #endif #if LJ_ARCH_PPC64 #error "No support for PowerPC 64 bit mode (yet)" #endif -#ifdef __NO_FPRS__ +#if defined(__NO_FPRS__) && !defined(_SOFT_FLOAT) #error "No support for PPC/e500 anymore (use LuaJIT 2.0)" #endif #elif LJ_TARGET_MIPS32 diff --git a/src/lj_ccall.c b/src/lj_ccall.c index 5c252e5b..799be487 100644 --- a/src/lj_ccall.c +++ b/src/lj_ccall.c @@ -387,6 +387,24 @@ #define CCALL_HANDLE_COMPLEXARG \ /* Pass complex by value in 2 or 4 GPRs. */ +#define CCALL_HANDLE_GPR \ + /* Try to pass argument in GPRs. */ \ + if (n > 1) { \ + lua_assert(n == 2 || n == 4); /* int64_t or complex (float). 
*/ \ + if (ctype_isinteger(d->info) || ctype_isfp(d->info)) \ + ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ + else if (ngpr + n > maxgpr) \ + ngpr = maxgpr; /* Prevent reordering. */ \ + } \ + if (ngpr + n <= maxgpr) { \ + dp = &cc->gpr[ngpr]; \ + ngpr += n; \ + goto done; \ + } \ + +#if LJ_ABI_SOFTFP +#define CCALL_HANDLE_REGARG CCALL_HANDLE_GPR +#else #define CCALL_HANDLE_REGARG \ if (isfp) { /* Try to pass argument in FPRs. */ \ if (nfpr + 1 <= CCALL_NARG_FPR) { \ @@ -395,24 +413,16 @@ d = ctype_get(cts, CTID_DOUBLE); /* FPRs always hold doubles. */ \ goto done; \ } \ - } else { /* Try to pass argument in GPRs. */ \ - if (n > 1) { \ - lua_assert(n == 2 || n == 4); /* int64_t or complex (float). */ \ - if (ctype_isinteger(d->info)) \ - ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ - else if (ngpr + n > maxgpr) \ - ngpr = maxgpr; /* Prevent reordering. */ \ - } \ - if (ngpr + n <= maxgpr) { \ - dp = &cc->gpr[ngpr]; \ - ngpr += n; \ - goto done; \ - } \ + } else { \ + CCALL_HANDLE_GPR \ } +#endif +#if !LJ_ABI_SOFTFP #define CCALL_HANDLE_RET \ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ ctr = ctype_get(cts, CTID_DOUBLE); /* FPRs always hold doubles. */ +#endif #elif LJ_TARGET_MIPS32 /* -- MIPS o32 calling conventions ---------------------------------------- */ @@ -1080,7 +1090,7 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, } if (fid) lj_err_caller(L, LJ_ERR_FFI_NUMARG); /* Too few arguments. */ -#if LJ_TARGET_X64 || LJ_TARGET_PPC +#if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP) cc->nfpr = nfpr; /* Required for vararg functions. */ #endif cc->nsp = nsp; diff --git a/src/lj_ccall.h b/src/lj_ccall.h index 59f66481..6efa48c7 100644 --- a/src/lj_ccall.h +++ b/src/lj_ccall.h @@ -86,9 +86,9 @@ typedef union FPRArg { #elif LJ_TARGET_PPC #define CCALL_NARG_GPR 8 -#define CCALL_NARG_FPR 8 +#define CCALL_NARG_FPR (LJ_ABI_SOFTFP ? 0 : 8) #define CCALL_NRET_GPR 4 /* For complex double. 
*/ -#define CCALL_NRET_FPR 1 +#define CCALL_NRET_FPR (LJ_ABI_SOFTFP ? 0 : 1) #define CCALL_SPS_EXTRA 4 #define CCALL_SPS_FREE 0 diff --git a/src/lj_ccallback.c b/src/lj_ccallback.c index 846827b1..03494a7a 100644 --- a/src/lj_ccallback.c +++ b/src/lj_ccallback.c @@ -419,6 +419,23 @@ void lj_ccallback_mcode_free(CTState *cts) #elif LJ_TARGET_PPC +#define CALLBACK_HANDLE_GPR \ + if (n > 1) { \ + lua_assert(((LJ_ABI_SOFTFP && ctype_isnum(cta->info)) || /* double. */ \ + ctype_isinteger(cta->info)) && n == 2); /* int64_t. */ \ + ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ + } \ + if (ngpr + n <= maxgpr) { \ + sp = &cts->cb.gpr[ngpr]; \ + ngpr += n; \ + goto done; \ + } + +#if LJ_ABI_SOFTFP +#define CALLBACK_HANDLE_REGARG \ + CALLBACK_HANDLE_GPR \ + UNUSED(isfp); +#else #define CALLBACK_HANDLE_REGARG \ if (isfp) { \ if (nfpr + 1 <= CCALL_NARG_FPR) { \ @@ -427,20 +444,15 @@ void lj_ccallback_mcode_free(CTState *cts) goto done; \ } \ } else { /* Try to pass argument in GPRs. */ \ - if (n > 1) { \ - lua_assert(ctype_isinteger(cta->info) && n == 2); /* int64_t. */ \ - ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ - } \ - if (ngpr + n <= maxgpr) { \ - sp = &cts->cb.gpr[ngpr]; \ - ngpr += n; \ - goto done; \ - } \ + CALLBACK_HANDLE_GPR \ } +#endif +#if !LJ_ABI_SOFTFP #define CALLBACK_HANDLE_RET \ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ *(double *)dp = *(float *)dp; /* FPRs always hold doubles. */ +#endif #elif LJ_TARGET_MIPS32 diff --git a/src/lj_frame.h b/src/lj_frame.h index 19c49a4a..04cb5a35 100644 --- a/src/lj_frame.h +++ b/src/lj_frame.h @@ -226,7 +226,7 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK }; /* Special continuations. */ #define CFRAME_OFS_L 36 #define CFRAME_OFS_PC 32 #define CFRAME_OFS_MULTRES 28 -#define CFRAME_SIZE 272 +#define CFRAME_SIZE (LJ_ARCH_HASFPU ? 
272 : 128) #define CFRAME_SHIFT_MULTRES 3 #endif #elif LJ_TARGET_MIPS32 diff --git a/src/lj_ircall.h b/src/lj_ircall.h index 73120065..9b3883b7 100644 --- a/src/lj_ircall.h +++ b/src/lj_ircall.h @@ -287,7 +287,7 @@ LJ_DATA const CCallInfo lj_ir_callinfo[IRCALL__MAX+1]; #define fp64_f2l __aeabi_f2lz #define fp64_f2ul __aeabi_f2ulz #endif -#elif LJ_TARGET_MIPS +#elif LJ_TARGET_MIPS || LJ_TARGET_PPC #define softfp_add __adddf3 #define softfp_sub __subdf3 #define softfp_mul __muldf3 diff --git a/src/vm_ppc.dasc b/src/vm_ppc.dasc index b4260ebc..0839668c 100644 --- a/src/vm_ppc.dasc +++ b/src/vm_ppc.dasc @@ -103,6 +103,18 @@ |// Fixed register assignments for the interpreter. |// Don't use: r1 = sp, r2 and r13 = reserved (TOC, TLS or SDATA) | +|.macro .FPU, a, b +|.if FPU +| a, b +|.endif +|.endmacro +| +|.macro .FPU, a, b, c +|.if FPU +| a, b, c +|.endif +|.endmacro +| |// The following must be C callee-save (but BASE is often refetched). |.define BASE, r14 // Base of current Lua stack frame. |.define KBASE, r15 // Constants of current Lua function. @@ -116,8 +128,10 @@ |.define TISNUM, r22 |.define TISNIL, r23 |.define ZERO, r24 +|.if FPU |.define TOBIT, f30 // 2^52 + 2^51. |.define TONUM, f31 // 2^52 + 2^51 + 2^31. +|.endif | |// The following temporaries are not saved across C calls, except for RA. |.define RA, r20 // Callee-save. @@ -133,6 +147,7 @@ | |// Saved temporaries. |.define SAVE0, r21 +|.define SAVE1, r25 | |// Calling conventions. |.define CARG1, r3 @@ -141,8 +156,10 @@ |.define CARG4, r6 // Overlaps TMP3. |.define CARG5, r7 // Overlaps INS. | +|.if FPU |.define FARG1, f1 |.define FARG2, f2 +|.endif | |.define CRET1, r3 |.define CRET2, r4 @@ -213,10 +230,16 @@ |.endif |.else | +|.if FPU |.define SAVE_LR, 276(sp) |.define CFRAME_SPACE, 272 // Delta for sp. |// Back chain for sp: 272(sp) <-- sp entering interpreter |.define SAVE_FPR_, 128 // .. 128+18*8: 64 bit FPR saves. +|.else +|.define SAVE_LR, 132(sp) +|.define CFRAME_SPACE, 128 // Delta for sp. 
+|// Back chain for sp: 128(sp) <-- sp entering interpreter +|.endif |.define SAVE_GPR_, 56 // .. 56+18*4: 32 bit GPR saves. |.define SAVE_CR, 52(sp) // 32 bit CR save. |.define SAVE_ERRF, 48(sp) // 32 bit C frame info. @@ -226,16 +249,25 @@ |.define SAVE_PC, 32(sp) |.define SAVE_MULTRES, 28(sp) |.define UNUSED1, 24(sp) +|.if FPU |.define TMPD_LO, 20(sp) |.define TMPD_HI, 16(sp) |.define TONUM_LO, 12(sp) |.define TONUM_HI, 8(sp) +|.else +|.define SFSAVE_4, 20(sp) +|.define SFSAVE_3, 16(sp) +|.define SFSAVE_2, 12(sp) +|.define SFSAVE_1, 8(sp) +|.endif |// Next frame lr: 4(sp) |// Back chain for sp: 0(sp) <-- sp while in interpreter | +|.if FPU |.define TMPD_BLO, 23(sp) |.define TMPD, TMPD_HI |.define TONUM_D, TONUM_HI +|.endif | |.endif | @@ -245,7 +277,7 @@ |.else | stw r..reg, SAVE_GPR_+(reg-14)*4(sp) |.endif -| stfd f..reg, SAVE_FPR_+(reg-14)*8(sp) +| .FPU stfd f..reg, SAVE_FPR_+(reg-14)*8(sp) |.endmacro |.macro rest_, reg |.if GPR64 @@ -253,7 +285,7 @@ |.else | lwz r..reg, SAVE_GPR_+(reg-14)*4(sp) |.endif -| lfd f..reg, SAVE_FPR_+(reg-14)*8(sp) +| .FPU lfd f..reg, SAVE_FPR_+(reg-14)*8(sp) |.endmacro | |.macro saveregs @@ -323,6 +355,7 @@ |// Trap for not-yet-implemented parts. |.macro NYI; tw 4, sp, sp; .endmacro | +|.if FPU |// int/FP conversions. |.macro tonum_i, freg, reg | xoris reg, reg, 0x8000 @@ -346,6 +379,7 @@ |.macro toint, reg, freg | toint reg, freg, freg |.endmacro +|.endif | |//----------------------------------------------------------------------- | @@ -533,9 +567,19 @@ static void build_subroutines(BuildCtx *ctx) | beq >2 |1: | addic. TMP1, TMP1, -8 + |.if FPU | lfd f0, 0(RA) + |.else + | lwz CARG1, 0(RA) + | lwz CARG2, 4(RA) + |.endif | addi RA, RA, 8 + |.if FPU | stfd f0, 0(BASE) + |.else + | stw CARG1, 0(BASE) + | stw CARG2, 4(BASE) + |.endif | addi BASE, BASE, 8 | bney <1 | @@ -613,23 +657,23 @@ static void build_subroutines(BuildCtx *ctx) | .toc ld TOCREG, SAVE_TOC | li TISNUM, LJ_TISNUM // Setup type comparison constants. 
| lp BASE, L->base - | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | lwz DISPATCH, L->glref // Setup pointer to dispatch table. | li ZERO, 0 - | stw TMP3, TMPD + | .FPU stw TMP3, TMPD | li TMP1, LJ_TFALSE - | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). + | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). | li TISNIL, LJ_TNIL | li_vmstate INTERP - | lfs TOBIT, TMPD + | .FPU lfs TOBIT, TMPD | lwz PC, FRAME_PC(BASE) // Fetch PC of previous frame. | la RA, -8(BASE) // Results start at BASE-8. - | stw TMP3, TMPD + | .FPU stw TMP3, TMPD | addi DISPATCH, DISPATCH, GG_G2DISP | stw TMP1, 0(RA) // Prepend false to error message. | li RD, 16 // 2 results: false + error message. | st_vmstate - | lfs TONUM, TMPD + | .FPU lfs TONUM, TMPD | b ->vm_returnc | |//----------------------------------------------------------------------- @@ -690,22 +734,22 @@ static void build_subroutines(BuildCtx *ctx) | li TISNUM, LJ_TISNUM // Setup type comparison constants. | lp TMP1, L->top | lwz PC, FRAME_PC(BASE) - | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | stb CARG3, L->status - | stw TMP3, TMPD - | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). - | lfs TOBIT, TMPD + | .FPU stw TMP3, TMPD + | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). + | .FPU lfs TOBIT, TMPD | sub RD, TMP1, BASE - | stw TMP3, TMPD - | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | .FPU stw TMP3, TMPD + | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) | addi RD, RD, 8 - | stw TMP0, TONUM_HI + | .FPU stw TMP0, TONUM_HI | li_vmstate INTERP | li ZERO, 0 | st_vmstate | andix. 
TMP0, PC, FRAME_TYPE | mr MULTRES, RD - | lfs TONUM, TMPD + | .FPU lfs TONUM, TMPD | li TISNIL, LJ_TNIL | beq ->BC_RET_Z | b ->vm_return @@ -739,19 +783,19 @@ static void build_subroutines(BuildCtx *ctx) | lp TMP2, L->base // TMP2 = old base (used in vmeta_call). | li TISNUM, LJ_TISNUM // Setup type comparison constants. | lp TMP1, L->top - | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | add PC, PC, BASE - | stw TMP3, TMPD + | .FPU stw TMP3, TMPD | li ZERO, 0 - | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). - | lfs TOBIT, TMPD + | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). + | .FPU lfs TOBIT, TMPD | sub PC, PC, TMP2 // PC = frame delta + frame type - | stw TMP3, TMPD - | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | .FPU stw TMP3, TMPD + | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) | sub NARGS8:RC, TMP1, BASE - | stw TMP0, TONUM_HI + | .FPU stw TMP0, TONUM_HI | li_vmstate INTERP - | lfs TONUM, TMPD + | .FPU lfs TONUM, TMPD | li TISNIL, LJ_TNIL | st_vmstate | @@ -839,15 +883,30 @@ static void build_subroutines(BuildCtx *ctx) | lwz INS, -4(PC) | subi CARG2, RB, 16 | decode_RB8 SAVE0, INS + |.if FPU | lfd f0, 0(RA) + |.else + | lwz TMP2, 0(RA) + | lwz TMP3, 4(RA) + |.endif | add TMP1, BASE, SAVE0 | stp BASE, L->base | cmplw TMP1, CARG2 | sub CARG3, CARG2, TMP1 | decode_RA8 RA, INS + |.if FPU | stfd f0, 0(CARG2) + |.else + | stw TMP2, 0(CARG2) + | stw TMP3, 4(CARG2) + |.endif | bney ->BC_CAT_Z + |.if FPU | stfdx f0, BASE, RA + |.else + | stwux TMP2, RA, BASE + | stw TMP3, 4(RA) + |.endif | b ->cont_nop | |//-- Table indexing metamethods ----------------------------------------- @@ -900,9 +959,19 @@ static void build_subroutines(BuildCtx *ctx) | // Returns TValue * (finished) or NULL (metamethod). 
| cmplwi CRET1, 0 | beq >3 + |.if FPU | lfd f0, 0(CRET1) + |.else + | lwz TMP0, 0(CRET1) + | lwz TMP1, 4(CRET1) + |.endif | ins_next1 + |.if FPU | stfdx f0, BASE, RA + |.else + | stwux TMP0, RA, BASE + | stw TMP1, 4(RA) + |.endif | ins_next2 | |3: // Call __index metamethod. @@ -920,7 +989,12 @@ static void build_subroutines(BuildCtx *ctx) | // Returns cTValue * or NULL. | cmplwi CRET1, 0 | beq >1 + |.if FPU | lfd f14, 0(CRET1) + |.else + | lwz SAVE0, 0(CRET1) + | lwz SAVE1, 4(CRET1) + |.endif | b ->BC_TGETR_Z |1: | stwx TISNIL, BASE, RA @@ -975,11 +1049,21 @@ static void build_subroutines(BuildCtx *ctx) | bl extern lj_meta_tset // (lua_State *L, TValue *o, TValue *k) | // Returns TValue * (finished) or NULL (metamethod). | cmplwi CRET1, 0 + |.if FPU | lfdx f0, BASE, RA + |.else + | lwzux TMP2, RA, BASE + | lwz TMP3, 4(RA) + |.endif | beq >3 | // NOBARRIER: lj_meta_tset ensures the table is not black. | ins_next1 + |.if FPU | stfd f0, 0(CRET1) + |.else + | stw TMP2, 0(CRET1) + | stw TMP3, 4(CRET1) + |.endif | ins_next2 | |3: // Call __newindex metamethod. @@ -990,7 +1074,12 @@ static void build_subroutines(BuildCtx *ctx) | add PC, TMP1, BASE | lwz LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here. | li NARGS8:RC, 24 // 3 args for func(t, k, v) + |.if FPU | stfd f0, 16(BASE) // Copy value to third argument. + |.else + | stw TMP2, 16(BASE) + | stw TMP3, 20(BASE) + |.endif | b ->vm_call_dispatch_f | |->vmeta_tsetr: @@ -998,7 +1087,12 @@ static void build_subroutines(BuildCtx *ctx) | stw PC, SAVE_PC | bl extern lj_tab_setinth // (lua_State *L, GCtab *t, int32_t key) | // Returns TValue *. 
+ |.if FPU | stfd f14, 0(CRET1) + |.else + | stw SAVE0, 0(CRET1) + | stw SAVE1, 4(CRET1) + |.endif | b ->cont_nop | |//-- Comparison metamethods --------------------------------------------- @@ -1037,9 +1131,19 @@ static void build_subroutines(BuildCtx *ctx) | |->cont_ra: // RA = resultptr | lwz INS, -4(PC) + |.if FPU | lfd f0, 0(RA) + |.else + | lwz CARG1, 0(RA) + | lwz CARG2, 4(RA) + |.endif | decode_RA8 TMP1, INS + |.if FPU | stfdx f0, BASE, TMP1 + |.else + | stwux CARG1, TMP1, BASE + | stw CARG2, 4(TMP1) + |.endif | b ->cont_nop | |->cont_condt: // RA = resultptr @@ -1245,22 +1349,32 @@ static void build_subroutines(BuildCtx *ctx) |.macro .ffunc_n, name |->ff_ .. name: | cmplwi NARGS8:RC, 8 - | lwz CARG3, 0(BASE) + | lwz CARG1, 0(BASE) + |.if FPU | lfd FARG1, 0(BASE) + |.else + | lwz CARG2, 4(BASE) + |.endif | blt ->fff_fallback - | checknum CARG3; bge ->fff_fallback + | checknum CARG1; bge ->fff_fallback |.endmacro | |.macro .ffunc_nn, name |->ff_ .. name: | cmplwi NARGS8:RC, 16 - | lwz CARG3, 0(BASE) + | lwz CARG1, 0(BASE) + |.if FPU | lfd FARG1, 0(BASE) - | lwz CARG4, 8(BASE) + | lwz CARG3, 8(BASE) | lfd FARG2, 8(BASE) + |.else + | lwz CARG2, 4(BASE) + | lwz CARG3, 8(BASE) + | lwz CARG4, 12(BASE) + |.endif | blt ->fff_fallback + | checknum CARG1; bge ->fff_fallback | checknum CARG3; bge ->fff_fallback - | checknum CARG4; bge ->fff_fallback |.endmacro | |// Inlined GC threshold check. Caveat: uses TMP0 and TMP1. @@ -1281,14 +1395,21 @@ static void build_subroutines(BuildCtx *ctx) | bge cr1, ->fff_fallback | stw CARG3, 0(RA) | addi RD, NARGS8:RC, 8 // Compute (nresults+1)*8. + | addi TMP1, BASE, 8 + | add TMP2, RA, NARGS8:RC | stw CARG1, 4(RA) | beq ->fff_res // Done if exactly 1 argument. 
- | li TMP1, 8 - | subi RC, RC, 8 |1: - | cmplw TMP1, RC - | lfdx f0, BASE, TMP1 - | stfdx f0, RA, TMP1 + | cmplw TMP1, TMP2 + |.if FPU + | lfd f0, 0(TMP1) + | stfd f0, 0(TMP1) + |.else + | lwz CARG1, 0(TMP1) + | lwz CARG2, 4(TMP1) + | stw CARG1, -8(TMP1) + | stw CARG2, -4(TMP1) + |.endif | addi TMP1, TMP1, 8 | bney <1 | b ->fff_res @@ -1303,8 +1424,14 @@ static void build_subroutines(BuildCtx *ctx) | orc TMP1, TMP2, TMP0 | addi TMP1, TMP1, ~LJ_TISNUM+1 | slwi TMP1, TMP1, 3 + |.if FPU | la TMP2, CFUNC:RB->upvalue | lfdx FARG1, TMP2, TMP1 + |.else + | add TMP1, CFUNC:RB, TMP1 + | lwz CARG1, CFUNC:TMP1->upvalue[0].u32.hi + | lwz CARG2, CFUNC:TMP1->upvalue[0].u32.lo + |.endif | b ->fff_resn | |//-- Base library: getters and setters --------------------------------- @@ -1382,7 +1509,12 @@ static void build_subroutines(BuildCtx *ctx) | mr CARG1, L | bl extern lj_tab_get // (lua_State *L, GCtab *t, cTValue *key) | // Returns cTValue *. + |.if FPU | lfd FARG1, 0(CRET1) + |.else + | lwz CARG2, 4(CRET1) + | lwz CARG1, 0(CRET1) // Caveat: CARG1 == CRET1. + |.endif | b ->fff_resn | |//-- Base library: conversions ------------------------------------------ @@ -1391,7 +1523,11 @@ static void build_subroutines(BuildCtx *ctx) | // Only handles the number case inline (without a base argument). | cmplwi NARGS8:RC, 8 | lwz CARG1, 0(BASE) + |.if FPU | lfd FARG1, 0(BASE) + |.else + | lwz CARG2, 4(BASE) + |.endif | bne ->fff_fallback // Exactly one argument. | checknum CARG1; bgt ->fff_fallback | b ->fff_resn @@ -1442,12 +1578,23 @@ static void build_subroutines(BuildCtx *ctx) | cmplwi CRET1, 0 | li CARG3, LJ_TNIL | beq ->fff_restv // End of traversal: return nil. - | lfd f0, 8(BASE) // Copy key and value to results. | la RA, -8(BASE) + |.if FPU + | lfd f0, 8(BASE) // Copy key and value to results. 
| lfd f1, 16(BASE) | stfd f0, 0(RA) - | li RD, (2+1)*8 | stfd f1, 8(RA) + |.else + | lwz CARG1, 8(BASE) + | lwz CARG2, 12(BASE) + | lwz CARG3, 16(BASE) + | lwz CARG4, 20(BASE) + | stw CARG1, 0(RA) + | stw CARG2, 4(RA) + | stw CARG3, 8(RA) + | stw CARG4, 12(RA) + |.endif + | li RD, (2+1)*8 | b ->fff_res | |.ffunc_1 pairs @@ -1456,17 +1603,32 @@ static void build_subroutines(BuildCtx *ctx) | bne ->fff_fallback #if LJ_52 | lwz TAB:TMP2, TAB:CARG1->metatable + |.if FPU | lfd f0, CFUNC:RB->upvalue[0] + |.else + | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi + | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo + |.endif | cmplwi TAB:TMP2, 0 | la RA, -8(BASE) | bne ->fff_fallback #else + |.if FPU | lfd f0, CFUNC:RB->upvalue[0] + |.else + | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi + | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo + |.endif | la RA, -8(BASE) #endif | stw TISNIL, 8(BASE) | li RD, (3+1)*8 + |.if FPU | stfd f0, 0(RA) + |.else + | stw TMP0, 0(RA) + | stw TMP1, 4(RA) + |.endif | b ->fff_res | |.ffunc ipairs_aux @@ -1512,14 +1674,24 @@ static void build_subroutines(BuildCtx *ctx) | stfd FARG2, 0(RA) |.endif | ble >2 // Not in array part? + |.if FPU | lwzx TMP2, TMP1, TMP3 | lfdx f0, TMP1, TMP3 + |.else + | lwzux TMP2, TMP1, TMP3 + | lwz TMP3, 4(TMP1) + |.endif |1: | checknil TMP2 | li RD, (0+1)*8 | beq ->fff_res // End of iteration, return 0 results. | li RD, (2+1)*8 + |.if FPU | stfd f0, 8(RA) + |.else + | stw TMP2, 8(RA) + | stw TMP3, 12(RA) + |.endif | b ->fff_res |2: // Check for empty hash part first. Otherwise call C function. 
| lwz TMP0, TAB:CARG1->hmask @@ -1533,7 +1705,11 @@ static void build_subroutines(BuildCtx *ctx) | li RD, (0+1)*8 | beq ->fff_res | lwz TMP2, 0(CRET1) + |.if FPU | lfd f0, 0(CRET1) + |.else + | lwz TMP3, 4(CRET1) + |.endif | b <1 | |.ffunc_1 ipairs @@ -1542,12 +1718,22 @@ static void build_subroutines(BuildCtx *ctx) | bne ->fff_fallback #if LJ_52 | lwz TAB:TMP2, TAB:CARG1->metatable + |.if FPU | lfd f0, CFUNC:RB->upvalue[0] + |.else + | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi + | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo + |.endif | cmplwi TAB:TMP2, 0 | la RA, -8(BASE) | bne ->fff_fallback #else + |.if FPU | lfd f0, CFUNC:RB->upvalue[0] + |.else + | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi + | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo + |.endif | la RA, -8(BASE) #endif |.if DUALNUM @@ -1557,7 +1743,12 @@ static void build_subroutines(BuildCtx *ctx) |.endif | stw ZERO, 12(BASE) | li RD, (3+1)*8 + |.if FPU | stfd f0, 0(RA) + |.else + | stw TMP0, 0(RA) + | stw TMP1, 4(RA) + |.endif | b ->fff_res | |//-- Base library: catch errors ---------------------------------------- @@ -1576,19 +1767,32 @@ static void build_subroutines(BuildCtx *ctx) | |.ffunc xpcall | cmplwi NARGS8:RC, 16 - | lwz CARG4, 8(BASE) + | lwz CARG3, 8(BASE) + |.if FPU | lfd FARG2, 8(BASE) | lfd FARG1, 0(BASE) + |.else + | lwz CARG1, 0(BASE) + | lwz CARG2, 4(BASE) + | lwz CARG4, 12(BASE) + |.endif | blt ->fff_fallback | lbz TMP1, DISPATCH_GL(hookmask)(DISPATCH) | mr TMP2, BASE - | checkfunc CARG4; bne ->fff_fallback // Traceback must be a function. + | checkfunc CARG3; bne ->fff_fallback // Traceback must be a function. | la BASE, 16(BASE) | // Remember active hook before pcall. | rlwinm TMP1, TMP1, 32-HOOK_ACTIVE_SHIFT, 31, 31 + |.if FPU | stfd FARG2, 0(TMP2) // Swap function and traceback. 
- | subi NARGS8:RC, NARGS8:RC, 16 | stfd FARG1, 8(TMP2) + |.else + | stw CARG3, 0(TMP2) + | stw CARG4, 4(TMP2) + | stw CARG1, 8(TMP2) + | stw CARG2, 12(TMP2) + |.endif + | subi NARGS8:RC, NARGS8:RC, 16 | addi PC, TMP1, 16+FRAME_PCALL | b ->vm_call_dispatch | @@ -1631,9 +1835,21 @@ static void build_subroutines(BuildCtx *ctx) | stp BASE, L->top |2: // Move args to coroutine. | cmpw TMP1, NARGS8:RC + |.if FPU | lfdx f0, BASE, TMP1 + |.else + | add CARG3, BASE, TMP1 + | lwz TMP2, 0(CARG3) + | lwz TMP3, 4(CARG3) + |.endif | beq >3 + |.if FPU | stfdx f0, CARG2, TMP1 + |.else + | add CARG3, CARG2, TMP1 + | stw TMP2, 0(CARG3) + | stw TMP3, 4(CARG3) + |.endif | addi TMP1, TMP1, 8 | b <2 |3: @@ -1664,8 +1880,17 @@ static void build_subroutines(BuildCtx *ctx) | stp TMP2, L:SAVE0->top // Clear coroutine stack. |5: // Move results from coroutine. | cmplw TMP1, TMP3 + |.if FPU | lfdx f0, TMP2, TMP1 | stfdx f0, BASE, TMP1 + |.else + | add CARG3, TMP2, TMP1 + | lwz CARG1, 0(CARG3) + | lwz CARG2, 4(CARG3) + | add CARG3, BASE, TMP1 + | stw CARG1, 0(CARG3) + | stw CARG2, 4(CARG3) + |.endif | addi TMP1, TMP1, 8 | bne <5 |6: @@ -1690,12 +1915,22 @@ static void build_subroutines(BuildCtx *ctx) | andix. TMP0, PC, FRAME_TYPE | la TMP3, -8(TMP3) | li TMP1, LJ_TFALSE + |.if FPU | lfd f0, 0(TMP3) + |.else + | lwz CARG1, 0(TMP3) + | lwz CARG2, 4(TMP3) + |.endif | stp TMP3, L:SAVE0->top // Remove error from coroutine stack. | li RD, (2+1)*8 | stw TMP1, -8(BASE) // Prepend false to results. | la RA, -8(BASE) + |.if FPU | stfd f0, 0(BASE) // Copy error message. + |.else + | stw CARG1, 0(BASE) // Copy error message. + | stw CARG2, 4(BASE) + |.endif | b <7 |.else | mr CARG1, L @@ -1874,7 +2109,12 @@ static void build_subroutines(BuildCtx *ctx) | lus CARG1, 0x8000 // -(2^31). 
| beqy ->fff_resi |5: + |.if FPU | lfd FARG1, 0(BASE) + |.else + | lwz CARG1, 0(BASE) + | lwz CARG2, 4(BASE) + |.endif | blex func | b ->fff_resn |.endmacro @@ -1898,10 +2138,14 @@ static void build_subroutines(BuildCtx *ctx) | |.ffunc math_log | cmplwi NARGS8:RC, 8 - | lwz CARG3, 0(BASE) - | lfd FARG1, 0(BASE) + | lwz CARG1, 0(BASE) | bne ->fff_fallback // Need exactly 1 argument. - | checknum CARG3; bge ->fff_fallback + | checknum CARG1; bge ->fff_fallback + |.if FPU + | lfd FARG1, 0(BASE) + |.else + | lwz CARG2, 4(BASE) + |.endif | blex log | b ->fff_resn | @@ -1923,17 +2167,24 @@ static void build_subroutines(BuildCtx *ctx) |.if DUALNUM |.ffunc math_ldexp | cmplwi NARGS8:RC, 16 - | lwz CARG3, 0(BASE) + | lwz TMP0, 0(BASE) + |.if FPU | lfd FARG1, 0(BASE) - | lwz CARG4, 8(BASE) + |.else + | lwz CARG1, 0(BASE) + | lwz CARG2, 4(BASE) + |.endif + | lwz TMP1, 8(BASE) |.if GPR64 | lwz CARG2, 12(BASE) - |.else + |.elif FPU | lwz CARG1, 12(BASE) + |.else + | lwz CARG3, 12(BASE) |.endif | blt ->fff_fallback - | checknum CARG3; bge ->fff_fallback - | checknum CARG4; bne ->fff_fallback + | checknum TMP0; bge ->fff_fallback + | checknum TMP1; bne ->fff_fallback |.else |.ffunc_nn math_ldexp |.if GPR64 @@ -1948,8 +2199,10 @@ static void build_subroutines(BuildCtx *ctx) |.ffunc_n math_frexp |.if GPR64 | la CARG2, DISPATCH_GL(tmptv)(DISPATCH) - |.else + |.elif FPU | la CARG1, DISPATCH_GL(tmptv)(DISPATCH) + |.else + | la CARG3, DISPATCH_GL(tmptv)(DISPATCH) |.endif | lwz PC, FRAME_PC(BASE) | blex frexp @@ -1958,7 +2211,12 @@ static void build_subroutines(BuildCtx *ctx) |.if not DUALNUM | tonum_i FARG2, TMP1 |.endif + |.if FPU | stfd FARG1, 0(RA) + |.else + | stw CRET1, 0(RA) + | stw CRET2, 4(RA) + |.endif | li RD, (2+1)*8 |.if DUALNUM | stw TISNUM, 8(RA) @@ -1971,13 +2229,20 @@ static void build_subroutines(BuildCtx *ctx) |.ffunc_n math_modf |.if GPR64 | la CARG2, -8(BASE) - |.else + |.elif FPU | la CARG1, -8(BASE) + |.else + | la CARG3, -8(BASE) |.endif | lwz PC, FRAME_PC(BASE) 
| blex modf | la RA, -8(BASE) + |.if FPU | stfd FARG1, 0(BASE) + |.else + | stw CRET1, 0(BASE) + | stw CRET2, 4(BASE) + |.endif | li RD, (2+1)*8 | b ->fff_res | @@ -1985,13 +2250,13 @@ static void build_subroutines(BuildCtx *ctx) |.if DUALNUM | .ffunc_1 name | checknum CARG3 - | addi TMP1, BASE, 8 - | add TMP2, BASE, NARGS8:RC + | addi SAVE0, BASE, 8 + | add SAVE1, BASE, NARGS8:RC | bne >4 |1: // Handle integers. - | lwz CARG4, 0(TMP1) - | cmplw cr1, TMP1, TMP2 - | lwz CARG2, 4(TMP1) + | lwz CARG4, 0(SAVE0) + | cmplw cr1, SAVE0, SAVE1 + | lwz CARG2, 4(SAVE0) | bge cr1, ->fff_resi | checknum CARG4 | xoris TMP0, CARG1, 0x8000 @@ -2008,36 +2273,76 @@ static void build_subroutines(BuildCtx *ctx) |.if GPR64 | rldicl CARG1, CARG1, 0, 32 |.endif - | addi TMP1, TMP1, 8 + | addi SAVE0, SAVE0, 8 | b <1 |3: | bge ->fff_fallback | // Convert intermediate result to number and continue below. + |.if FPU | tonum_i FARG1, CARG1 - | lfd FARG2, 0(TMP1) + | lfd FARG2, 0(SAVE0) + |.else + | mr CARG2, CARG1 + | bl ->vm_sfi2d_1 + | lwz CARG3, 0(SAVE0) + | lwz CARG4, 4(SAVE0) + |.endif | b >6 |4: + |.if FPU | lfd FARG1, 0(BASE) + |.else + | lwz CARG1, 0(BASE) + | lwz CARG2, 4(BASE) + |.endif | bge ->fff_fallback |5: // Handle numbers. 
- | lwz CARG4, 0(TMP1) - | cmplw cr1, TMP1, TMP2 - | lfd FARG2, 0(TMP1) + | lwz CARG3, 0(SAVE0) + | cmplw cr1, SAVE0, SAVE1 + |.if FPU + | lfd FARG2, 0(SAVE0) + |.else + | lwz CARG4, 4(SAVE0) + |.endif | bge cr1, ->fff_resn - | checknum CARG4; bge >7 + | checknum CARG3; bge >7 |6: + | addi SAVE0, SAVE0, 8 + |.if FPU | fsub f0, FARG1, FARG2 - | addi TMP1, TMP1, 8 |.if ismax | fsel FARG1, f0, FARG1, FARG2 |.else | fsel FARG1, f0, FARG2, FARG1 |.endif + |.else + | stw CARG1, SFSAVE_1 + | stw CARG2, SFSAVE_2 + | stw CARG3, SFSAVE_3 + | stw CARG4, SFSAVE_4 + | blex __ledf2 + | cmpwi CRET1, 0 + |.if ismax + | blt >8 + |.else + | bge >8 + |.endif + | lwz CARG1, SFSAVE_1 + | lwz CARG2, SFSAVE_2 + | b <5 + |8: + | lwz CARG1, SFSAVE_3 + | lwz CARG2, SFSAVE_4 + |.endif | b <5 |7: // Convert integer to number and continue above. - | lwz CARG2, 4(TMP1) + | lwz CARG3, 4(SAVE0) | bne ->fff_fallback - | tonum_i FARG2, CARG2 + |.if FPU + | tonum_i FARG2, CARG3 + |.else + | bl ->vm_sfi2d_2 + |.endif | b <6 |.else | .ffunc_n name @@ -2237,28 +2542,37 @@ static void build_subroutines(BuildCtx *ctx) | |.macro .ffunc_bit_op, name, ins | .ffunc_bit name - | addi TMP1, BASE, 8 - | add TMP2, BASE, NARGS8:RC + | addi SAVE0, BASE, 8 + | add SAVE1, BASE, NARGS8:RC |1: - | lwz CARG4, 0(TMP1) - | cmplw cr1, TMP1, TMP2 + | lwz CARG4, 0(SAVE0) + | cmplw cr1, SAVE0, SAVE1 |.if DUALNUM - | lwz CARG2, 4(TMP1) + | lwz CARG2, 4(SAVE0) |.else - | lfd FARG1, 0(TMP1) + | lfd FARG1, 0(SAVE0) |.endif | bgey cr1, ->fff_resi | checknum CARG4 |.if DUALNUM + |.if FPU | bnel ->fff_bitop_fb |.else + | beq >3 + | stw CARG1, SFSAVE_1 + | bl ->fff_bitop_fb + | mr CARG2, CARG1 + | lwz CARG1, SFSAVE_1 + |3: + |.endif + |.else | fadd FARG1, FARG1, TOBIT | bge ->fff_fallback | stfd FARG1, TMPD | lwz CARG2, TMPD_LO |.endif | ins CARG1, CARG1, CARG2 - | addi TMP1, TMP1, 8 + | addi SAVE0, SAVE0, 8 | b <1 |.endmacro | @@ -2280,7 +2594,14 @@ static void build_subroutines(BuildCtx *ctx) |.macro .ffunc_bit_sh, name, ins, 
shmod |.if DUALNUM | .ffunc_2 bit_..name + |.if FPU | checknum CARG3; bnel ->fff_tobit_fb + |.else + | checknum CARG3; beq >1 + | bl ->fff_tobit_fb + | lwz CARG2, 12(BASE) // Conversion polluted CARG2. + |1: + |.endif | // Note: no inline conversion from number for 2nd argument! | checknum CARG4; bne ->fff_fallback |.else @@ -2317,27 +2638,77 @@ static void build_subroutines(BuildCtx *ctx) |->fff_resn: | lwz PC, FRAME_PC(BASE) | la RA, -8(BASE) + |.if FPU | stfd FARG1, -8(BASE) + |.else + | stw CARG1, -8(BASE) + | stw CARG2, -4(BASE) + |.endif | b ->fff_res1 | |// Fallback FP number to bit conversion. |->fff_tobit_fb: |.if DUALNUM + |.if FPU | lfd FARG1, 0(BASE) | bgt ->fff_fallback | fadd FARG1, FARG1, TOBIT | stfd FARG1, TMPD | lwz CARG1, TMPD_LO | blr + |.else + | bgt ->fff_fallback + | mr CARG2, CARG1 + | mr CARG1, CARG3 + |// Modifies: CARG1, CARG2, TMP0, TMP1, TMP2. + |->vm_tobit: + | slwi TMP2, CARG1, 1 + | addis TMP2, TMP2, 0x0020 + | cmpwi TMP2, 0 + | bge >2 + | li TMP1, 0x3e0 + | srawi TMP2, TMP2, 21 + | not TMP1, TMP1 + | sub. TMP2, TMP1, TMP2 + | cmpwi cr7, CARG1, 0 + | blt >1 + | slwi TMP1, CARG1, 11 + | srwi TMP0, CARG2, 21 + | oris TMP1, TMP1, 0x8000 + | or TMP1, TMP1, TMP0 + | srw CARG1, TMP1, TMP2 + | bclr 4, 28 // Return if cr7[lt] == 0, no hint. + | neg CARG1, CARG1 + | blr + |1: + | addi TMP2, TMP2, 21 + | srw TMP1, CARG2, TMP2 + | slwi CARG2, CARG1, 12 + | subfic TMP2, TMP2, 20 + | slw TMP0, CARG2, TMP2 + | or CARG1, TMP1, TMP0 + | bclr 4, 28 // Return if cr7[lt] == 0, no hint. 
+ | neg CARG1, CARG1 + | blr + |2: + | li CARG1, 0 + | blr + |.endif |.endif |->fff_bitop_fb: |.if DUALNUM - | lfd FARG1, 0(TMP1) + |.if FPU + | lfd FARG1, 0(SAVE0) | bgt ->fff_fallback | fadd FARG1, FARG1, TOBIT | stfd FARG1, TMPD | lwz CARG2, TMPD_LO | blr + |.else + | bgt ->fff_fallback + | mr CARG1, CARG4 + | b ->vm_tobit + |.endif |.endif | |//----------------------------------------------------------------------- @@ -2530,10 +2901,21 @@ static void build_subroutines(BuildCtx *ctx) | decode_RA8 RC, INS // Call base. | beq >2 |1: // Move results down. + |.if FPU | lfd f0, 0(RA) + |.else + | lwz CARG1, 0(RA) + | lwz CARG2, 4(RA) + |.endif | addic. TMP1, TMP1, -8 | addi RA, RA, 8 + |.if FPU | stfdx f0, BASE, RC + |.else + | add CARG3, BASE, RC + | stw CARG1, 0(CARG3) + | stw CARG2, 4(CARG3) + |.endif | addi RC, RC, 8 | bne <1 |2: @@ -2586,10 +2968,12 @@ static void build_subroutines(BuildCtx *ctx) |//----------------------------------------------------------------------- | |.macro savex_, a, b, c, d + |.if FPU | stfd f..a, 16+a*8(sp) | stfd f..b, 16+b*8(sp) | stfd f..c, 16+c*8(sp) | stfd f..d, 16+d*8(sp) + |.endif |.endmacro | |->vm_exit_handler: @@ -2661,16 +3045,16 @@ static void build_subroutines(BuildCtx *ctx) | lwz KBASE, PC2PROTO(k)(TMP1) | // Setup type comparison constants. | li TISNUM, LJ_TISNUM - | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). - | stw TMP3, TMPD + | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU stw TMP3, TMPD | li ZERO, 0 - | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). - | lfs TOBIT, TMPD - | stw TMP3, TMPD - | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). 
+ | .FPU lfs TOBIT, TMPD + | .FPU stw TMP3, TMPD + | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) | li TISNIL, LJ_TNIL - | stw TMP0, TONUM_HI - | lfs TONUM, TMPD + | .FPU stw TMP0, TONUM_HI + | .FPU lfs TONUM, TMPD | // Modified copy of ins_next which handles function header dispatch, too. | lwz INS, 0(PC) | addi PC, PC, 4 @@ -2715,7 +3099,35 @@ static void build_subroutines(BuildCtx *ctx) |//-- Math helper functions ---------------------------------------------- |//----------------------------------------------------------------------- | - |// NYI: Use internal implementations of floor, ceil, trunc. + |// NYI: Use internal implementations of floor, ceil, trunc, sfcmp. + | + |.macro sfi2d, AHI, ALO + |.if not FPU + | mr. AHI, ALO + | bclr 12, 2 // Handle zero first. + | srawi TMP0, ALO, 31 + | xor TMP1, ALO, TMP0 + | sub TMP1, TMP1, TMP0 // Absolute value in TMP1. + | cntlzw AHI, TMP1 + | andix. TMP0, TMP0, 0x800 // Mask sign bit. + | slw TMP1, TMP1, AHI // Align mantissa left with leading 1. + | subfic AHI, AHI, 0x3ff+31-1 // Exponent -1 in AHI. + | slwi ALO, TMP1, 21 + | or AHI, AHI, TMP0 // Sign | Exponent. + | srwi TMP1, TMP1, 11 + | slwi AHI, AHI, 20 // Align left. + | add AHI, AHI, TMP1 // Add mantissa, increment exponent. + | blr + |.endif + |.endmacro + | + |// Input: CARG2. Output: CARG1, CARG2. Temporaries: TMP0, TMP1. + |->vm_sfi2d_1: + | sfi2d CARG1, CARG2 + | + |// Input: CARG4. Output: CARG3, CARG4. Temporaries: TMP0, TMP1. + |->vm_sfi2d_2: + | sfi2d CARG3, CARG4 | |->vm_modi: | divwo. 
TMP0, CARG1, CARG2 @@ -2783,21 +3195,21 @@ static void build_subroutines(BuildCtx *ctx) | addi DISPATCH, r12, GG_G2DISP | stw r11, CTSTATE->cb.slot | stw r3, CTSTATE->cb.gpr[0] - | stfd f1, CTSTATE->cb.fpr[0] + | .FPU stfd f1, CTSTATE->cb.fpr[0] | stw r4, CTSTATE->cb.gpr[1] - | stfd f2, CTSTATE->cb.fpr[1] + | .FPU stfd f2, CTSTATE->cb.fpr[1] | stw r5, CTSTATE->cb.gpr[2] - | stfd f3, CTSTATE->cb.fpr[2] + | .FPU stfd f3, CTSTATE->cb.fpr[2] | stw r6, CTSTATE->cb.gpr[3] - | stfd f4, CTSTATE->cb.fpr[3] + | .FPU stfd f4, CTSTATE->cb.fpr[3] | stw r7, CTSTATE->cb.gpr[4] - | stfd f5, CTSTATE->cb.fpr[4] + | .FPU stfd f5, CTSTATE->cb.fpr[4] | stw r8, CTSTATE->cb.gpr[5] - | stfd f6, CTSTATE->cb.fpr[5] + | .FPU stfd f6, CTSTATE->cb.fpr[5] | stw r9, CTSTATE->cb.gpr[6] - | stfd f7, CTSTATE->cb.fpr[6] + | .FPU stfd f7, CTSTATE->cb.fpr[6] | stw r10, CTSTATE->cb.gpr[7] - | stfd f8, CTSTATE->cb.fpr[7] + | .FPU stfd f8, CTSTATE->cb.fpr[7] | addi TMP0, sp, CFRAME_SPACE+8 | stw TMP0, CTSTATE->cb.stack | mr CARG1, CTSTATE @@ -2808,21 +3220,21 @@ static void build_subroutines(BuildCtx *ctx) | lp BASE, L:CRET1->base | li TISNUM, LJ_TISNUM // Setup type comparison constants. | lp RC, L:CRET1->top - | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | li ZERO, 0 | mr L, CRET1 - | stw TMP3, TMPD - | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | .FPU stw TMP3, TMPD + | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) | lwz LFUNC:RB, FRAME_FUNC(BASE) - | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). - | stw TMP0, TONUM_HI + | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). 
+ | .FPU stw TMP0, TONUM_HI | li TISNIL, LJ_TNIL | li_vmstate INTERP - | lfs TOBIT, TMPD - | stw TMP3, TMPD + | .FPU lfs TOBIT, TMPD + | .FPU stw TMP3, TMPD | sub RC, RC, BASE | st_vmstate - | lfs TONUM, TMPD + | .FPU lfs TONUM, TMPD | ins_callt |.endif | @@ -2836,7 +3248,7 @@ static void build_subroutines(BuildCtx *ctx) | mr CARG2, RA | bl extern lj_ccallback_leave // (CTState *cts, TValue *o) | lwz CRET1, CTSTATE->cb.gpr[0] - | lfd FARG1, CTSTATE->cb.fpr[0] + | .FPU lfd FARG1, CTSTATE->cb.fpr[0] | lwz CRET2, CTSTATE->cb.gpr[1] | b ->vm_leave_unw |.endif @@ -2870,14 +3282,14 @@ static void build_subroutines(BuildCtx *ctx) | bge <1 |2: | bney cr1, >3 - | lfd f1, CCSTATE->fpr[0] - | lfd f2, CCSTATE->fpr[1] - | lfd f3, CCSTATE->fpr[2] - | lfd f4, CCSTATE->fpr[3] - | lfd f5, CCSTATE->fpr[4] - | lfd f6, CCSTATE->fpr[5] - | lfd f7, CCSTATE->fpr[6] - | lfd f8, CCSTATE->fpr[7] + | .FPU lfd f1, CCSTATE->fpr[0] + | .FPU lfd f2, CCSTATE->fpr[1] + | .FPU lfd f3, CCSTATE->fpr[2] + | .FPU lfd f4, CCSTATE->fpr[3] + | .FPU lfd f5, CCSTATE->fpr[4] + | .FPU lfd f6, CCSTATE->fpr[5] + | .FPU lfd f7, CCSTATE->fpr[6] + | .FPU lfd f8, CCSTATE->fpr[7] |3: | lp TMP0, CCSTATE->func | lwz CARG2, CCSTATE->gpr[1] @@ -2894,7 +3306,7 @@ static void build_subroutines(BuildCtx *ctx) | lwz TMP2, -4(r14) | lwz TMP0, 4(r14) | stw CARG1, CCSTATE:TMP1->gpr[0] - | stfd FARG1, CCSTATE:TMP1->fpr[0] + | .FPU stfd FARG1, CCSTATE:TMP1->fpr[0] | stw CARG2, CCSTATE:TMP1->gpr[1] | mtlr TMP0 | stw CARG3, CCSTATE:TMP1->gpr[2] @@ -2923,19 +3335,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT: | // RA = src1*8, RD = src2*8, JMP with RD = target |.if DUALNUM - | lwzux TMP0, RA, BASE + | lwzux CARG1, RA, BASE | addi PC, PC, 4 | lwz CARG2, 4(RA) - | lwzux TMP1, RD, BASE + | lwzux CARG3, RD, BASE | lwz TMP2, -4(PC) - | checknum cr0, TMP0 - | lwz CARG3, 4(RD) + | checknum cr0, CARG1 + | lwz CARG4, 4(RD) | decode_RD4 TMP2, TMP2 - | checknum cr1, TMP1 
- | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) + | checknum cr1, CARG3 + | addis SAVE0, TMP2, -(BCBIAS_J*4 >> 16) | bne cr0, >7 | bne cr1, >8 - | cmpw CARG2, CARG3 + | cmpw CARG2, CARG4 if (op == BC_ISLT) { | bge >2 } else if (op == BC_ISGE) { @@ -2946,28 +3358,41 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | ble >2 } |1: - | add PC, PC, TMP2 + | add PC, PC, SAVE0 |2: | ins_next | |7: // RA is not an integer. | bgt cr0, ->vmeta_comp | // RA is a number. - | lfd f0, 0(RA) + | .FPU lfd f0, 0(RA) | bgt cr1, ->vmeta_comp | blt cr1, >4 | // RA is a number, RD is an integer. - | tonum_i f1, CARG3 + |.if FPU + | tonum_i f1, CARG4 + |.else + | bl ->vm_sfi2d_2 + |.endif | b >5 | |8: // RA is an integer, RD is not an integer. | bgt cr1, ->vmeta_comp | // RA is an integer, RD is a number. + |.if FPU | tonum_i f0, CARG2 + |.else + | bl ->vm_sfi2d_1 + |.endif |4: - | lfd f1, 0(RD) + | .FPU lfd f1, 0(RD) |5: + |.if FPU | fcmpu cr0, f0, f1 + |.else + | blex __ledf2 + | cmpwi CRET1, 0 + |.endif if (op == BC_ISLT) { | bge <2 } else if (op == BC_ISGE) { @@ -3015,42 +3440,42 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) vk = op == BC_ISEQV; | // RA = src1*8, RD = src2*8, JMP with RD = target |.if DUALNUM - | lwzux TMP0, RA, BASE + | lwzux CARG1, RA, BASE | addi PC, PC, 4 | lwz CARG2, 4(RA) - | lwzux TMP1, RD, BASE - | checknum cr0, TMP0 - | lwz TMP2, -4(PC) - | checknum cr1, TMP1 - | decode_RD4 TMP2, TMP2 - | lwz CARG3, 4(RD) + | lwzux CARG3, RD, BASE + | checknum cr0, CARG1 + | lwz SAVE0, -4(PC) + | checknum cr1, CARG3 + | decode_RD4 SAVE0, SAVE0 + | lwz CARG4, 4(RD) | cror 4*cr7+gt, 4*cr0+gt, 4*cr1+gt - | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) + | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) if (vk) { | ble cr7, ->BC_ISEQN_Z } else { | ble cr7, ->BC_ISNEN_Z } |.else - | lwzux TMP0, RA, BASE - | lwz TMP2, 0(PC) + | lwzux CARG1, RA, BASE + | lwz SAVE0, 0(PC) | lfd f0, 0(RA) | addi PC, PC, 4 - | lwzux TMP1, RD, BASE - | checknum cr0, TMP0 - | decode_RD4 TMP2, TMP2 + 
| lwzux CARG3, RD, BASE + | checknum cr0, CARG1 + | decode_RD4 SAVE0, SAVE0 | lfd f1, 0(RD) - | checknum cr1, TMP1 - | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) + | checknum cr1, CARG3 + | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) | bge cr0, >5 | bge cr1, >5 | fcmpu cr0, f0, f1 if (vk) { | bne >1 - | add PC, PC, TMP2 + | add PC, PC, SAVE0 } else { | beq >1 - | add PC, PC, TMP2 + | add PC, PC, SAVE0 } |1: | ins_next @@ -3058,36 +3483,36 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |5: // Either or both types are not numbers. |.if not DUALNUM | lwz CARG2, 4(RA) - | lwz CARG3, 4(RD) + | lwz CARG4, 4(RD) |.endif |.if FFI - | cmpwi cr7, TMP0, LJ_TCDATA - | cmpwi cr5, TMP1, LJ_TCDATA + | cmpwi cr7, CARG1, LJ_TCDATA + | cmpwi cr5, CARG3, LJ_TCDATA |.endif - | not TMP3, TMP0 - | cmplw TMP0, TMP1 - | cmplwi cr1, TMP3, ~LJ_TISPRI // Primitive? + | not TMP2, CARG1 + | cmplw CARG1, CARG3 + | cmplwi cr1, TMP2, ~LJ_TISPRI // Primitive? |.if FFI | cror 4*cr7+eq, 4*cr7+eq, 4*cr5+eq |.endif - | cmplwi cr6, TMP3, ~LJ_TISTABUD // Table or userdata? + | cmplwi cr6, TMP2, ~LJ_TISTABUD // Table or userdata? |.if FFI | beq cr7, ->vmeta_equal_cd |.endif - | cmplw cr5, CARG2, CARG3 + | cmplw cr5, CARG2, CARG4 | crandc 4*cr0+gt, 4*cr0+eq, 4*cr1+gt // 2: Same type and primitive. | crorc 4*cr0+lt, 4*cr5+eq, 4*cr0+eq // 1: Same tv or different type. | crand 4*cr0+eq, 4*cr0+eq, 4*cr5+eq // 0: Same type and same tv. - | mr SAVE0, PC + | mr SAVE1, PC | cror 4*cr0+eq, 4*cr0+eq, 4*cr0+gt // 0 or 2. | cror 4*cr0+lt, 4*cr0+lt, 4*cr0+gt // 1 or 2. if (vk) { | bne cr0, >6 - | add PC, PC, TMP2 + | add PC, PC, SAVE0 |6: } else { | beq cr0, >6 - | add PC, PC, TMP2 + | add PC, PC, SAVE0 |6: } |.if DUALNUM @@ -3102,6 +3527,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | | // Different tables or userdatas. Need to check __eq metamethod. | // Field metatable must be at same offset for GCtab and GCudata! 
+ | mr CARG3, CARG4 | lwz TAB:TMP2, TAB:CARG2->metatable | li CARG4, 1-vk // ne = 0 or 1. | cmplwi TAB:TMP2, 0 @@ -3109,7 +3535,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lbz TMP2, TAB:TMP2->nomm | andix. TMP2, TMP2, 1<vmeta_equal // Handle __eq metamethod. break; @@ -3150,16 +3576,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) vk = op == BC_ISEQN; | // RA = src*8, RD = num_const*8, JMP with RD = target |.if DUALNUM - | lwzux TMP0, RA, BASE + | lwzux CARG1, RA, BASE | addi PC, PC, 4 | lwz CARG2, 4(RA) - | lwzux TMP1, RD, KBASE - | checknum cr0, TMP0 - | lwz TMP2, -4(PC) - | checknum cr1, TMP1 - | decode_RD4 TMP2, TMP2 - | lwz CARG3, 4(RD) - | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) + | lwzux CARG3, RD, KBASE + | checknum cr0, CARG1 + | lwz SAVE0, -4(PC) + | checknum cr1, CARG3 + | decode_RD4 SAVE0, SAVE0 + | lwz CARG4, 4(RD) + | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) if (vk) { |->BC_ISEQN_Z: } else { @@ -3167,7 +3593,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) } | bne cr0, >7 | bne cr1, >8 - | cmpw CARG2, CARG3 + | cmpw CARG2, CARG4 |4: |.else if (vk) { @@ -3175,20 +3601,20 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) } else { |->BC_ISNEN_Z: // Dummy label. 
} - | lwzx TMP0, BASE, RA + | lwzx CARG1, BASE, RA | addi PC, PC, 4 | lfdx f0, BASE, RA - | lwz TMP2, -4(PC) + | lwz SAVE0, -4(PC) | lfdx f1, KBASE, RD - | decode_RD4 TMP2, TMP2 - | checknum TMP0 - | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) + | decode_RD4 SAVE0, SAVE0 + | checknum CARG1 + | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) | bge >3 | fcmpu cr0, f0, f1 |.endif if (vk) { | bne >1 - | add PC, PC, TMP2 + | add PC, PC, SAVE0 |1: |.if not FFI |3: @@ -3199,13 +3625,13 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |.if not FFI |3: |.endif - | add PC, PC, TMP2 + | add PC, PC, SAVE0 |2: } | ins_next |.if FFI |3: - | cmpwi TMP0, LJ_TCDATA + | cmpwi CARG1, LJ_TCDATA | beq ->vmeta_equal_cd | b <1 |.endif @@ -3213,18 +3639,31 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |7: // RA is not an integer. | bge cr0, <3 | // RA is a number. - | lfd f0, 0(RA) + | .FPU lfd f0, 0(RA) | blt cr1, >1 | // RA is a number, RD is an integer. - | tonum_i f1, CARG3 + |.if FPU + | tonum_i f1, CARG4 + |.else + | bl ->vm_sfi2d_2 + |.endif | b >2 | |8: // RA is an integer, RD is a number. 
+ |.if FPU | tonum_i f0, CARG2 + |.else + | bl ->vm_sfi2d_1 + |.endif |1: - | lfd f1, 0(RD) + | .FPU lfd f1, 0(RD) |2: + |.if FPU | fcmpu cr0, f0, f1 + |.else + | blex __ledf2 + | cmpwi CRET1, 0 + |.endif | b <4 |.endif break; @@ -3279,7 +3718,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | add PC, PC, TMP2 } else { | li TMP1, LJ_TFALSE + |.if FPU | lfdx f0, BASE, RD + |.else + | lwzux CARG1, RD, BASE + | lwz CARG2, 4(RD) + |.endif | cmplw TMP0, TMP1 if (op == BC_ISTC) { | bge >1 @@ -3288,7 +3732,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) } | addis PC, PC, -(BCBIAS_J*4 >> 16) | decode_RD4 TMP2, INS + |.if FPU | stfdx f0, BASE, RA + |.else + | stwux CARG1, RA, BASE + | stw CARG2, 4(RA) + |.endif | add PC, PC, TMP2 |1: } @@ -3323,8 +3772,15 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_MOV: | // RA = dst*8, RD = src*8 | ins_next1 + |.if FPU | lfdx f0, BASE, RD | stfdx f0, BASE, RA + |.else + | lwzux TMP0, RD, BASE + | lwz TMP1, 4(RD) + | stwux TMP0, RA, BASE + | stw TMP1, 4(RA) + |.endif | ins_next2 break; case BC_NOT: @@ -3426,44 +3882,65 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); ||switch (vk) { ||case 0: - | lwzx TMP1, BASE, RB + | lwzx CARG1, BASE, RB | .if DUALNUM - | lwzx TMP2, KBASE, RC + | lwzx CARG3, KBASE, RC | .endif + | .if FPU | lfdx f14, BASE, RB | lfdx f15, KBASE, RC + | .else + | add TMP1, BASE, RB + | add TMP2, KBASE, RC + | lwz CARG2, 4(TMP1) + | lwz CARG4, 4(TMP2) + | .endif | .if DUALNUM - | checknum cr0, TMP1 - | checknum cr1, TMP2 + | checknum cr0, CARG1 + | checknum cr1, CARG3 | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | bge ->vmeta_arith_vn | .else - | checknum TMP1; bge ->vmeta_arith_vn + | checknum CARG1; bge ->vmeta_arith_vn | .endif || break; ||case 1: - | lwzx TMP1, BASE, RB + | lwzx CARG1, BASE, RB | .if DUALNUM - | lwzx TMP2, KBASE, RC + | lwzx CARG3, KBASE, RC | .endif + | .if FPU | lfdx f15, BASE, RB | lfdx f14, 
KBASE, RC + | .else + | add TMP1, BASE, RB + | add TMP2, KBASE, RC + | lwz CARG2, 4(TMP1) + | lwz CARG4, 4(TMP2) + | .endif | .if DUALNUM - | checknum cr0, TMP1 - | checknum cr1, TMP2 + | checknum cr0, CARG1 + | checknum cr1, CARG3 | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | bge ->vmeta_arith_nv | .else - | checknum TMP1; bge ->vmeta_arith_nv + | checknum CARG1; bge ->vmeta_arith_nv | .endif || break; ||default: - | lwzx TMP1, BASE, RB - | lwzx TMP2, BASE, RC + | lwzx CARG1, BASE, RB + | lwzx CARG3, BASE, RC + | .if FPU | lfdx f14, BASE, RB | lfdx f15, BASE, RC - | checknum cr0, TMP1 - | checknum cr1, TMP2 + | .else + | add TMP1, BASE, RB + | add TMP2, BASE, RC + | lwz CARG2, 4(TMP1) + | lwz CARG4, 4(TMP2) + | .endif + | checknum cr0, CARG1 + | checknum cr1, CARG3 | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | bge ->vmeta_arith_vv || break; @@ -3497,48 +3974,78 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | fsub a, b, a // b - floor(b/c)*c |.endmacro | + |.macro sfpmod + |->BC_MODVN_Z: + | stw CARG1, SFSAVE_1 + | stw CARG2, SFSAVE_2 + | mr SAVE0, CARG3 + | mr SAVE1, CARG4 + | blex __divdf3 + | blex floor + | mr CARG3, SAVE0 + | mr CARG4, SAVE1 + | blex __muldf3 + | mr CARG3, CRET1 + | mr CARG4, CRET2 + | lwz CARG1, SFSAVE_1 + | lwz CARG2, SFSAVE_2 + | blex __subdf3 + |.endmacro + | |.macro ins_arithfp, fpins | ins_arithpre |.if "fpins" == "fpmod_" | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. - |.else + |.elif FPU | fpins f0, f14, f15 | ins_next1 | stfdx f0, BASE, RA | ins_next2 + |.else + | blex __divdf3 // Only soft-float div uses this macro. 
+ | ins_next1 + | stwux CRET1, RA, BASE + | stw CRET2, 4(RA) + | ins_next2 |.endif |.endmacro | - |.macro ins_arithdn, intins, fpins + |.macro ins_arithdn, intins, fpins, fpcall | // RA = dst*8, RB = src1*8, RC = src2*8 | num_const*8 ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); ||switch (vk) { ||case 0: - | lwzux TMP1, RB, BASE - | lwzux TMP2, RC, KBASE - | lwz CARG1, 4(RB) - | checknum cr0, TMP1 - | lwz CARG2, 4(RC) + | lwzux CARG1, RB, BASE + | lwzux CARG3, RC, KBASE + | lwz CARG2, 4(RB) + | checknum cr0, CARG1 + | lwz CARG4, 4(RC) + | checknum cr1, CARG3 || break; ||case 1: - | lwzux TMP1, RB, BASE - | lwzux TMP2, RC, KBASE - | lwz CARG2, 4(RB) - | checknum cr0, TMP1 - | lwz CARG1, 4(RC) + | lwzux CARG3, RB, BASE + | lwzux CARG1, RC, KBASE + | lwz CARG4, 4(RB) + | checknum cr0, CARG3 + | lwz CARG2, 4(RC) + | checknum cr1, CARG1 || break; ||default: - | lwzux TMP1, RB, BASE - | lwzux TMP2, RC, BASE - | lwz CARG1, 4(RB) - | checknum cr0, TMP1 - | lwz CARG2, 4(RC) + | lwzux CARG1, RB, BASE + | lwzux CARG3, RC, BASE + | lwz CARG2, 4(RB) + | checknum cr0, CARG1 + | lwz CARG4, 4(RC) + | checknum cr1, CARG3 || break; ||} - | checknum cr1, TMP2 | bne >5 | bne cr1, >5 - | intins CARG1, CARG1, CARG2 + |.if "intins" == "intmod" + | mr CARG1, CARG2 + | mr CARG2, CARG4 + |.endif + | intins CARG1, CARG2, CARG4 | bso >4 |1: | ins_next1 @@ -3550,29 +4057,40 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | checkov TMP0, <1 // Ignore unrelated overflow. | ins_arithfallback b |5: // FP variant. + |.if FPU ||if (vk == 1) { | lfd f15, 0(RB) - | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | lfd f14, 0(RC) ||} else { | lfd f14, 0(RB) - | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | lfd f15, 0(RC) ||} + |.endif + | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | ins_arithfallback bge |.if "fpins" == "fpmod_" | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. 
|.else + |.if FPU | fpins f0, f14, f15 - | ins_next1 | stfdx f0, BASE, RA + |.else + |.if "fpcall" == "sfpmod" + | sfpmod + |.else + | blex fpcall + |.endif + | stwux CRET1, RA, BASE + | stw CRET2, 4(RA) + |.endif + | ins_next1 | b <2 |.endif |.endmacro | - |.macro ins_arith, intins, fpins + |.macro ins_arith, intins, fpins, fpcall |.if DUALNUM - | ins_arithdn intins, fpins + | ins_arithdn intins, fpins, fpcall |.else | ins_arithfp fpins |.endif @@ -3587,9 +4105,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addo. TMP0, TMP0, TMP3 | add y, a, b |.endmacro - | ins_arith addo32., fadd + | ins_arith addo32., fadd, __adddf3 |.else - | ins_arith addo., fadd + | ins_arith addo., fadd, __adddf3 |.endif break; case BC_SUBVN: case BC_SUBNV: case BC_SUBVV: @@ -3601,36 +4119,48 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | subo. TMP0, TMP0, TMP3 | sub y, a, b |.endmacro - | ins_arith subo32., fsub + | ins_arith subo32., fsub, __subdf3 |.else - | ins_arith subo., fsub + | ins_arith subo., fsub, __subdf3 |.endif break; case BC_MULVN: case BC_MULNV: case BC_MULVV: - | ins_arith mullwo., fmul + | ins_arith mullwo., fmul, __muldf3 break; case BC_DIVVN: case BC_DIVNV: case BC_DIVVV: | ins_arithfp fdiv break; case BC_MODVN: - | ins_arith intmod, fpmod + | ins_arith intmod, fpmod, sfpmod break; case BC_MODNV: case BC_MODVV: - | ins_arith intmod, fpmod_ + | ins_arith intmod, fpmod_, sfpmod break; case BC_POW: | // NYI: (partial) integer arithmetic. 
- | lwzx TMP1, BASE, RB + | lwzx CARG1, BASE, RB + | lwzx CARG3, BASE, RC + |.if FPU | lfdx FARG1, BASE, RB - | lwzx TMP2, BASE, RC | lfdx FARG2, BASE, RC - | checknum cr0, TMP1 - | checknum cr1, TMP2 + |.else + | add TMP1, BASE, RB + | add TMP2, BASE, RC + | lwz CARG2, 4(TMP1) + | lwz CARG4, 4(TMP2) + |.endif + | checknum cr0, CARG1 + | checknum cr1, CARG3 | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | bge ->vmeta_arith_vv | blex pow | ins_next1 + |.if FPU | stfdx FARG1, BASE, RA + |.else + | stwux CARG1, RA, BASE + | stw CARG2, 4(RA) + |.endif | ins_next2 break; @@ -3650,8 +4180,15 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lp BASE, L->base | bne ->vmeta_binop | ins_next1 + |.if FPU | lfdx f0, BASE, SAVE0 // Copy result from RB to RA. | stfdx f0, BASE, RA + |.else + | lwzux TMP0, SAVE0, BASE + | lwz TMP1, 4(SAVE0) + | stwux TMP0, RA, BASE + | stw TMP1, 4(RA) + |.endif | ins_next2 break; @@ -3714,8 +4251,15 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_KNUM: | // RA = dst*8, RD = num_const*8 | ins_next1 + |.if FPU | lfdx f0, KBASE, RD | stfdx f0, BASE, RA + |.else + | lwzux TMP0, RD, KBASE + | lwz TMP1, 4(RD) + | stwux TMP0, RA, BASE + | stw TMP1, 4(RA) + |.endif | ins_next2 break; case BC_KPRI: @@ -3748,8 +4292,15 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lwzx UPVAL:RB, LFUNC:RB, RD | ins_next1 | lwz TMP1, UPVAL:RB->v + |.if FPU | lfd f0, 0(TMP1) | stfdx f0, BASE, RA + |.else + | lwz TMP2, 0(TMP1) + | lwz TMP3, 4(TMP1) + | stwux TMP2, RA, BASE + | stw TMP3, 4(RA) + |.endif | ins_next2 break; case BC_USETV: @@ -3757,14 +4308,24 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lwz LFUNC:RB, FRAME_FUNC(BASE) | srwi RA, RA, 1 | addi RA, RA, offsetof(GCfuncL, uvptr) + |.if FPU | lfdux f0, RD, BASE + |.else + | lwzux CARG1, RD, BASE + | lwz CARG3, 4(RD) + |.endif | lwzx UPVAL:RB, LFUNC:RB, RA | lbz TMP3, UPVAL:RB->marked | lwz CARG2, UPVAL:RB->v | andix. 
TMP3, TMP3, LJ_GC_BLACK // isblack(uv) | lbz TMP0, UPVAL:RB->closed | lwz TMP2, 0(RD) + |.if FPU | stfd f0, 0(CARG2) + |.else + | stw CARG1, 0(CARG2) + | stw CARG3, 4(CARG2) + |.endif | cmplwi cr1, TMP0, 0 | lwz TMP1, 4(RD) | cror 4*cr0+eq, 4*cr0+eq, 4*cr1+eq @@ -3820,11 +4381,21 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lwz LFUNC:RB, FRAME_FUNC(BASE) | srwi RA, RA, 1 | addi RA, RA, offsetof(GCfuncL, uvptr) + |.if FPU | lfdx f0, KBASE, RD + |.else + | lwzux TMP2, RD, KBASE + | lwz TMP3, 4(RD) + |.endif | lwzx UPVAL:RB, LFUNC:RB, RA | ins_next1 | lwz TMP1, UPVAL:RB->v + |.if FPU | stfd f0, 0(TMP1) + |.else + | stw TMP2, 0(TMP1) + | stw TMP3, 4(TMP1) + |.endif | ins_next2 break; case BC_USETP: @@ -3972,11 +4543,21 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |.endif | ble ->vmeta_tgetv // Integer key and in array part? | lwzx TMP0, TMP1, TMP2 + |.if FPU | lfdx f14, TMP1, TMP2 + |.else + | lwzux SAVE0, TMP1, TMP2 + | lwz SAVE1, 4(TMP1) + |.endif | checknil TMP0; beq >2 |1: | ins_next1 + |.if FPU | stfdx f14, BASE, RA + |.else + | stwux SAVE0, RA, BASE + | stw SAVE1, 4(RA) + |.endif | ins_next2 | |2: // Check for __index if table value is nil. @@ -4052,12 +4633,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lwz TMP1, TAB:RB->asize | lwz TMP2, TAB:RB->array | cmplw TMP0, TMP1; bge ->vmeta_tgetb + |.if FPU | lwzx TMP1, TMP2, RC | lfdx f0, TMP2, RC + |.else + | lwzux TMP1, TMP2, RC + | lwz TMP3, 4(TMP2) + |.endif | checknil TMP1; beq >5 |1: | ins_next1 + |.if FPU | stfdx f0, BASE, RA + |.else + | stwux TMP1, RA, BASE + | stw TMP3, 4(RA) + |.endif | ins_next2 | |5: // Check for __index if table value is nil. @@ -4087,10 +4678,20 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | cmplw TMP0, CARG2 | slwi TMP2, CARG2, 3 | ble ->vmeta_tgetr // In array part? 
+ |.if FPU | lfdx f14, TMP1, TMP2 + |.else + | lwzux SAVE0, TMP2, TMP1 + | lwz SAVE1, 4(TMP2) + |.endif |->BC_TGETR_Z: | ins_next1 + |.if FPU | stfdx f14, BASE, RA + |.else + | stwux SAVE0, RA, BASE + | stw SAVE1, 4(RA) + |.endif | ins_next2 break; @@ -4131,11 +4732,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | ble ->vmeta_tsetv // Integer key and in array part? | lwzx TMP2, TMP1, TMP0 | lbz TMP3, TAB:RB->marked + |.if FPU | lfdx f14, BASE, RA + |.else + | add SAVE1, BASE, RA + | lwz SAVE0, 0(SAVE1) + | lwz SAVE1, 4(SAVE1) + |.endif | checknil TMP2; beq >3 |1: | andix. TMP2, TMP3, LJ_GC_BLACK // isblack(table) + |.if FPU | stfdx f14, TMP1, TMP0 + |.else + | stwux SAVE0, TMP1, TMP0 + | stw SAVE1, 4(TMP1) + |.endif | bne >7 |2: | ins_next @@ -4176,7 +4788,13 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lwz NODE:TMP2, TAB:RB->node | stb ZERO, TAB:RB->nomm // Clear metamethod cache. | and TMP1, TMP1, TMP0 // idx = str->hash & tab->hmask + |.if FPU | lfdx f14, BASE, RA + |.else + | add CARG2, BASE, RA + | lwz SAVE0, 0(CARG2) + | lwz SAVE1, 4(CARG2) + |.endif | slwi TMP0, TMP1, 5 | slwi TMP1, TMP1, 3 | sub TMP1, TMP0, TMP1 @@ -4192,7 +4810,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | checknil CARG2; beq >4 // Key found, but nil value? |2: | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table) + |.if FPU | stfd f14, NODE:TMP2->val + |.else + | stw SAVE0, NODE:TMP2->val.u32.hi + | stw SAVE1, NODE:TMP2->val.u32.lo + |.endif | bne >7 |3: | ins_next @@ -4231,7 +4854,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | bl extern lj_tab_newkey // (lua_State *L, GCtab *t, TValue *k) | // Returns TValue *. | lp BASE, L->base + |.if FPU | stfd f14, 0(CRET1) + |.else + | stw SAVE0, 0(CRET1) + | stw SAVE1, 4(CRET1) + |.endif | b <3 // No 2nd write barrier needed. | |7: // Possible table write barrier for the value. Skip valiswhite check. 
@@ -4248,13 +4876,24 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lwz TMP2, TAB:RB->array | lbz TMP3, TAB:RB->marked | cmplw TMP0, TMP1 + |.if FPU | lfdx f14, BASE, RA + |.else + | add CARG2, BASE, RA + | lwz SAVE0, 0(CARG2) + | lwz SAVE1, 4(CARG2) + |.endif | bge ->vmeta_tsetb | lwzx TMP1, TMP2, RC | checknil TMP1; beq >5 |1: | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table) + |.if FPU | stfdx f14, TMP2, RC + |.else + | stwux SAVE0, RC, TMP2 + | stw SAVE1, 4(RC) + |.endif | bne >7 |2: | ins_next @@ -4294,10 +4933,20 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |2: | cmplw TMP0, CARG3 | slwi TMP2, CARG3, 3 + |.if FPU | lfdx f14, BASE, RA + |.else + | lwzux SAVE0, RA, BASE + | lwz SAVE1, 4(RA) + |.endif | ble ->vmeta_tsetr // In array part? | ins_next1 + |.if FPU | stfdx f14, TMP1, TMP2 + |.else + | stwux SAVE0, TMP1, TMP2 + | stw SAVE1, 4(TMP1) + |.endif | ins_next2 | |7: // Possible table write barrier for the value. Skip valiswhite check. @@ -4327,10 +4976,20 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | add TMP1, TMP1, TMP0 | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table) |3: // Copy result slots to table. 
+ |.if FPU | lfd f0, 0(RA) + |.else + | lwz SAVE0, 0(RA) + | lwz SAVE1, 4(RA) + |.endif | addi RA, RA, 8 | cmpw cr1, RA, TMP2 + |.if FPU | stfd f0, 0(TMP1) + |.else + | stw SAVE0, 0(TMP1) + | stw SAVE1, 4(TMP1) + |.endif | addi TMP1, TMP1, 8 | blt cr1, <3 | bne >7 @@ -4397,9 +5056,20 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | beq cr1, >3 |2: | addi TMP3, TMP2, 8 + |.if FPU | lfdx f0, RA, TMP2 + |.else + | add CARG3, RA, TMP2 + | lwz CARG1, 0(CARG3) + | lwz CARG2, 4(CARG3) + |.endif | cmplw cr1, TMP3, NARGS8:RC + |.if FPU | stfdx f0, BASE, TMP2 + |.else + | stwux CARG1, TMP2, BASE + | stw CARG2, 4(TMP2) + |.endif | mr TMP2, TMP3 | bne cr1, <2 |3: @@ -4432,14 +5102,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | add BASE, BASE, RA | lwz TMP1, -24(BASE) | lwz LFUNC:RB, -20(BASE) + |.if FPU | lfd f1, -8(BASE) | lfd f0, -16(BASE) + |.else + | lwz CARG1, -8(BASE) + | lwz CARG2, -4(BASE) + | lwz CARG3, -16(BASE) + | lwz CARG4, -12(BASE) + |.endif | stw TMP1, 0(BASE) // Copy callable. | stw LFUNC:RB, 4(BASE) | checkfunc TMP1 - | stfd f1, 16(BASE) // Copy control var. | li NARGS8:RC, 16 // Iterators get 2 arguments. + |.if FPU + | stfd f1, 16(BASE) // Copy control var. | stfdu f0, 8(BASE) // Copy state. + |.else + | stw CARG1, 16(BASE) // Copy control var. + | stw CARG2, 20(BASE) + | stwu CARG3, 8(BASE) // Copy state. + | stw CARG4, 4(BASE) + |.endif | bne ->vmeta_call | ins_call break; @@ -4460,7 +5144,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | slwi TMP3, RC, 3 | bge >5 // Index points after array part? 
| lwzx TMP2, TMP1, TMP3 + |.if FPU | lfdx f0, TMP1, TMP3 + |.else + | lwzux CARG1, TMP3, TMP1 + | lwz CARG2, 4(TMP3) + |.endif | checknil TMP2 | lwz INS, -4(PC) | beq >4 @@ -4472,7 +5161,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |.endif | addi RC, RC, 1 | addis TMP3, PC, -(BCBIAS_J*4 >> 16) + |.if FPU | stfd f0, 8(RA) + |.else + | stw CARG1, 8(RA) + | stw CARG2, 12(RA) + |.endif | decode_RD4 TMP1, INS | stw RC, -4(RA) // Update control var. | add PC, TMP1, TMP3 @@ -4497,17 +5191,38 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | slwi RB, RC, 3 | sub TMP3, TMP3, RB | lwzx RB, TMP2, TMP3 + |.if FPU | lfdx f0, TMP2, TMP3 + |.else + | add CARG3, TMP2, TMP3 + | lwz CARG1, 0(CARG3) + | lwz CARG2, 4(CARG3) + |.endif | add NODE:TMP3, TMP2, TMP3 | checknil RB | lwz INS, -4(PC) | beq >7 + |.if FPU | lfd f1, NODE:TMP3->key + |.else + | lwz CARG3, NODE:TMP3->key.u32.hi + | lwz CARG4, NODE:TMP3->key.u32.lo + |.endif | addis TMP2, PC, -(BCBIAS_J*4 >> 16) + |.if FPU | stfd f0, 8(RA) + |.else + | stw CARG1, 8(RA) + | stw CARG2, 12(RA) + |.endif | add RC, RC, TMP0 | decode_RD4 TMP1, INS + |.if FPU | stfd f1, 0(RA) + |.else + | stw CARG3, 0(RA) + | stw CARG4, 4(RA) + |.endif | addi RC, RC, 1 | add PC, TMP1, TMP2 | stw RC, -4(RA) // Update control var. @@ -4573,9 +5288,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | subi TMP2, TMP2, 16 | ble >2 // No vararg slots? |1: // Copy vararg slots to destination slots. + |.if FPU | lfd f0, 0(RC) + |.else + | lwz CARG1, 0(RC) + | lwz CARG2, 4(RC) + |.endif | addi RC, RC, 8 + |.if FPU | stfd f0, 0(RA) + |.else + | stw CARG1, 0(RA) + | stw CARG2, 4(RA) + |.endif | cmplw RA, TMP2 | cmplw cr1, RC, TMP3 | bge >3 // All destination slots filled? 
@@ -4598,9 +5323,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addi MULTRES, TMP1, 8 | bgt >7 |6: + |.if FPU | lfd f0, 0(RC) + |.else + | lwz CARG1, 0(RC) + | lwz CARG2, 4(RC) + |.endif | addi RC, RC, 8 + |.if FPU | stfd f0, 0(RA) + |.else + | stw CARG1, 0(RA) + | stw CARG2, 4(RA) + |.endif | cmplw RC, TMP3 | addi RA, RA, 8 | blt <6 // More vararg slots? @@ -4651,14 +5386,38 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | li TMP1, 0 |2: | addi TMP3, TMP1, 8 + |.if FPU | lfdx f0, RA, TMP1 + |.else + | add CARG3, RA, TMP1 + | lwz CARG1, 0(CARG3) + | lwz CARG2, 4(CARG3) + |.endif | cmpw TMP3, RC + |.if FPU | stfdx f0, TMP2, TMP1 + |.else + | add CARG3, TMP2, TMP1 + | stw CARG1, 0(CARG3) + | stw CARG2, 4(CARG3) + |.endif | beq >3 | addi TMP1, TMP3, 8 + |.if FPU | lfdx f1, RA, TMP3 + |.else + | add CARG3, RA, TMP3 + | lwz CARG1, 0(CARG3) + | lwz CARG2, 4(CARG3) + |.endif | cmpw TMP1, RC + |.if FPU | stfdx f1, TMP2, TMP3 + |.else + | add CARG3, TMP2, TMP3 + | stw CARG1, 0(CARG3) + | stw CARG2, 4(CARG3) + |.endif | bne <2 |3: |5: @@ -4700,8 +5459,15 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | subi TMP2, BASE, 8 | decode_RB8 RB, INS if (op == BC_RET1) { + |.if FPU | lfd f0, 0(RA) | stfd f0, 0(TMP2) + |.else + | lwz CARG1, 0(RA) + | lwz CARG2, 4(RA) + | stw CARG1, 0(TMP2) + | stw CARG2, 4(TMP2) + |.endif } |5: | cmplw RB, RD @@ -4762,11 +5528,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |4: | stw CARG1, FORL_IDX*8+4(RA) } else { - | lwz TMP3, FORL_STEP*8(RA) + | lwz SAVE0, FORL_STEP*8(RA) | lwz CARG3, FORL_STEP*8+4(RA) | lwz TMP2, FORL_STOP*8(RA) | lwz CARG2, FORL_STOP*8+4(RA) - | cmplw cr7, TMP3, TISNUM + | cmplw cr7, SAVE0, TISNUM | cmplw cr1, TMP2, TISNUM | crand 4*cr0+eq, 4*cr0+eq, 4*cr7+eq | crand 4*cr0+eq, 4*cr0+eq, 4*cr1+eq @@ -4809,41 +5575,80 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) if (vk) { |.if DUALNUM |9: // FP loop. 
+ |.if FPU | lfd f1, FORL_IDX*8(RA) |.else + | lwz CARG1, FORL_IDX*8(RA) + | lwz CARG2, FORL_IDX*8+4(RA) + |.endif + |.else | lfdux f1, RA, BASE |.endif + |.if FPU | lfd f3, FORL_STEP*8(RA) | lfd f2, FORL_STOP*8(RA) - | lwz TMP3, FORL_STEP*8(RA) | fadd f1, f1, f3 | stfd f1, FORL_IDX*8(RA) + |.else + | lwz CARG3, FORL_STEP*8(RA) + | lwz CARG4, FORL_STEP*8+4(RA) + | mr SAVE1, RD + | blex __adddf3 + | mr RD, SAVE1 + | stw CRET1, FORL_IDX*8(RA) + | stw CRET2, FORL_IDX*8+4(RA) + | lwz CARG3, FORL_STOP*8(RA) + | lwz CARG4, FORL_STOP*8+4(RA) + |.endif + | lwz SAVE0, FORL_STEP*8(RA) } else { |.if DUALNUM |9: // FP loop. |.else | lwzux TMP1, RA, BASE - | lwz TMP3, FORL_STEP*8(RA) + | lwz SAVE0, FORL_STEP*8(RA) | lwz TMP2, FORL_STOP*8(RA) | cmplw cr0, TMP1, TISNUM - | cmplw cr7, TMP3, TISNUM + | cmplw cr7, SAVE0, TISNUM | cmplw cr1, TMP2, TISNUM |.endif + |.if FPU | lfd f1, FORL_IDX*8(RA) + |.else + | lwz CARG1, FORL_IDX*8(RA) + | lwz CARG2, FORL_IDX*8+4(RA) + |.endif | crand 4*cr0+lt, 4*cr0+lt, 4*cr7+lt | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt + |.if FPU | lfd f2, FORL_STOP*8(RA) + |.else + | lwz CARG3, FORL_STOP*8(RA) + | lwz CARG4, FORL_STOP*8+4(RA) + |.endif | bge ->vmeta_for } - | cmpwi cr6, TMP3, 0 + | cmpwi cr6, SAVE0, 0 if (op != BC_JFORL) { | srwi RD, RD, 1 } + |.if FPU | stfd f1, FORL_EXT*8(RA) + |.else + | stw CARG1, FORL_EXT*8(RA) + | stw CARG2, FORL_EXT*8+4(RA) + |.endif if (op != BC_JFORL) { | add RD, PC, RD } + |.if FPU | fcmpu cr0, f1, f2 + |.else + | mr SAVE1, RD + | blex __ledf2 + | cmpwi CRET1, 0 + | mr RD, SAVE1 + |.endif if (op == BC_JFORI) { | addis PC, RD, -(BCBIAS_J*4 >> 16) } From f3d75075ed91137699c6071abe49e2252e794a9c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 18 Aug 2017 12:52:14 +0200 Subject: [PATCH 094/116] Use https for freelists.org links. 
--- doc/ext_ffi_semantics.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/ext_ffi_semantics.html b/doc/ext_ffi_semantics.html index 899640ce..ae3c0379 100644 --- a/doc/ext_ffi_semantics.html +++ b/doc/ext_ffi_semantics.html @@ -844,7 +844,7 @@ place of a type, you'd need to use ffi.typeof("int") instead.

      The main use for parameterized types are libraries implementing abstract data types -(» example), +(example), similar to what can be achieved with C++ template metaprogramming. Another use case are derived types of anonymous structs, which avoids pollution of the global struct namespace. From 6b0824852677cc12570c20a3211fbfe0e4f0ce14 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 28 Aug 2017 10:43:37 +0200 Subject: [PATCH 095/116] x64/LJ_GC64: Fix fallback case of asm_fuseloadk64(). Contributed by Peter Cawley. --- src/lj_asm_x86.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 3e189b1d..55c02d24 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -387,6 +387,7 @@ static Reg asm_fuseloadk64(ASMState *as, IRIns *ir) ir->i = (int32_t)(as->mctop - as->mcbot); as->mcbot += 8; as->mclim = as->mcbot + MCLIM_REDZONE; + lj_mcode_commitbot(as->J, as->mcbot); } as->mrm.ofs = (int32_t)mcpofs(as, as->mctop - ir->i); as->mrm.base = RID_RIP; From 71b7bc88341945f13f3951e2bb5fd247b639ff7a Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 3 Sep 2017 23:20:53 +0200 Subject: [PATCH 096/116] PPC: Add soft-float support to JIT compiler backend. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. Sponsored by Cisco Systems, Inc. 
--- src/lj_arch.h | 1 - src/lj_asm_ppc.h | 321 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 278 insertions(+), 44 deletions(-) diff --git a/src/lj_arch.h b/src/lj_arch.h index 0145a7c0..5962f3af 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -273,7 +273,6 @@ #endif #if LJ_ABI_SOFTFP -#define LJ_ARCH_NOJIT 1 /* NYI */ #define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL #else #define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL_SINGLE diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h index 6daa861b..1955429f 100644 --- a/src/lj_asm_ppc.h +++ b/src/lj_asm_ppc.h @@ -226,6 +226,7 @@ static void asm_fusexrefx(ASMState *as, PPCIns pi, Reg rt, IRRef ref, emit_tab(as, pi, rt, left, right); } +#if !LJ_SOFTFP /* Fuse to multiply-add/sub instruction. */ static int asm_fusemadd(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pir) { @@ -245,6 +246,7 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pir) } return 0; } +#endif /* -- Calls --------------------------------------------------------------- */ @@ -253,13 +255,17 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) { uint32_t n, nargs = CCI_XNARGS(ci); int32_t ofs = 8; - Reg gpr = REGARG_FIRSTGPR, fpr = REGARG_FIRSTFPR; + Reg gpr = REGARG_FIRSTGPR; +#if !LJ_SOFTFP + Reg fpr = REGARG_FIRSTFPR; +#endif if ((void *)ci->func) emit_call(as, (void *)ci->func); for (n = 0; n < nargs; n++) { /* Setup args. */ IRRef ref = args[n]; if (ref) { IRIns *ir = IR(ref); +#if !LJ_SOFTFP if (irt_isfp(ir->t)) { if (fpr <= REGARG_LASTFPR) { lua_assert(rset_test(as->freeset, fpr)); /* Already evicted. */ @@ -271,7 +277,9 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) emit_spstore(as, ir, r, ofs); ofs += irt_isnum(ir->t) ? 8 : 4; } - } else { + } else +#endif + { if (gpr <= REGARG_LASTGPR) { lua_assert(rset_test(as->freeset, gpr)); /* Already evicted. 
*/ ra_leftov(as, gpr, ref); @@ -290,8 +298,10 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) } checkmclim(as); } +#if !LJ_SOFTFP if ((ci->flags & CCI_VARARG)) /* Vararg calls need to know about FPR use. */ emit_tab(as, fpr == REGARG_FIRSTFPR ? PPCI_CRXOR : PPCI_CREQV, 6, 6, 6); +#endif } /* Setup result reg/sp for call. Evict scratch regs. */ @@ -299,8 +309,10 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci) { RegSet drop = RSET_SCRATCH; int hiop = ((ir+1)->o == IR_HIOP && !irt_isnil((ir+1)->t)); +#if !LJ_SOFTFP if ((ci->flags & CCI_NOFPRCLOBBER)) drop &= ~RSET_FPR; +#endif if (ra_hasreg(ir->r)) rset_clear(drop, ir->r); /* Dest reg handled below. */ if (hiop && ra_hasreg((ir+1)->r)) @@ -308,7 +320,7 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci) ra_evictset(as, drop); /* Evictions must be performed first. */ if (ra_used(ir)) { lua_assert(!irt_ispri(ir->t)); - if (irt_isfp(ir->t)) { + if (!LJ_SOFTFP && irt_isfp(ir->t)) { if ((ci->flags & CCI_CASTU64)) { /* Use spill slot or temp slots. */ int32_t ofs = ir->s ? sps_scale(ir->s) : SPOFS_TMP; @@ -377,6 +389,7 @@ static void asm_retf(ASMState *as, IRIns *ir) /* -- Type conversions ---------------------------------------------------- */ +#if !LJ_SOFTFP static void asm_tointg(ASMState *as, IRIns *ir, Reg left) { RegSet allow = RSET_FPR; @@ -409,15 +422,23 @@ static void asm_tobit(ASMState *as, IRIns *ir) emit_fai(as, PPCI_STFD, tmp, RID_SP, SPOFS_TMP); emit_fab(as, PPCI_FADD, tmp, left, right); } +#endif static void asm_conv(ASMState *as, IRIns *ir) { IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK); +#if !LJ_SOFTFP int stfp = (st == IRT_NUM || st == IRT_FLOAT); +#endif IRRef lref = ir->op1; - lua_assert(irt_type(ir->t) != st); lua_assert(!(irt_isint64(ir->t) || (st == IRT_I64 || st == IRT_U64))); /* Handled by SPLIT. */ +#if LJ_SOFTFP + /* FP conversions are handled by SPLIT. 
*/ + lua_assert(!irt_isfp(ir->t) && !(st == IRT_NUM || st == IRT_FLOAT)); + /* Can't check for same types: SPLIT uses CONV int.int + BXOR for sfp NEG. */ +#else + lua_assert(irt_type(ir->t) != st); if (irt_isfp(ir->t)) { Reg dest = ra_dest(as, ir, RSET_FPR); if (stfp) { /* FP to FP conversion. */ @@ -476,7 +497,9 @@ static void asm_conv(ASMState *as, IRIns *ir) emit_fb(as, PPCI_FCTIWZ, tmp, left); } } - } else { + } else +#endif + { Reg dest = ra_dest(as, ir, RSET_GPR); if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */ Reg left = ra_alloc1(as, ir->op1, RSET_GPR); @@ -496,17 +519,41 @@ static void asm_strto(ASMState *as, IRIns *ir) { const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num]; IRRef args[2]; - int32_t ofs; + int32_t ofs = SPOFS_TMP; +#if LJ_SOFTFP + ra_evictset(as, RSET_SCRATCH); + if (ra_used(ir)) { + if (ra_hasspill(ir->s) && ra_hasspill((ir+1)->s) && + (ir->s & 1) == LJ_BE && (ir->s ^ 1) == (ir+1)->s) { + int i; + for (i = 0; i < 2; i++) { + Reg r = (ir+i)->r; + if (ra_hasreg(r)) { + ra_free(as, r); + ra_modified(as, r); + emit_spload(as, ir+i, r, sps_scale((ir+i)->s)); + } + } + ofs = sps_scale(ir->s & ~1); + } else { + Reg rhi = ra_dest(as, ir+1, RSET_GPR); + Reg rlo = ra_dest(as, ir, rset_exclude(RSET_GPR, rhi)); + emit_tai(as, PPCI_LWZ, rhi, RID_SP, ofs); + emit_tai(as, PPCI_LWZ, rlo, RID_SP, ofs+4); + } + } +#else RegSet drop = RSET_SCRATCH; if (ra_hasreg(ir->r)) rset_set(drop, ir->r); /* Spill dest reg (if any). */ ra_evictset(as, drop); + if (ir->s) ofs = sps_scale(ir->s); +#endif asm_guardcc(as, CC_EQ); emit_ai(as, PPCI_CMPWI, RID_RET, 0); /* Test return status. */ args[0] = ir->op1; /* GCstr *str */ args[1] = ASMREF_TMP1; /* TValue *n */ asm_gencall(as, ci, args); /* Store the result to the spill slot or temp slots. */ - ofs = ir->s ? 
sps_scale(ir->s) : SPOFS_TMP; emit_tai(as, PPCI_ADDI, ra_releasetmp(as, ASMREF_TMP1), RID_SP, ofs); } @@ -530,7 +577,10 @@ static void asm_tvptr(ASMState *as, Reg dest, IRRef ref) Reg src = ra_alloc1(as, ref, allow); emit_setgl(as, src, tmptv.gcr); } - type = ra_allock(as, irt_toitype(ir->t), allow); + if (LJ_SOFTFP && (ir+1)->o == IR_HIOP) + type = ra_alloc1(as, ref+1, allow); + else + type = ra_allock(as, irt_toitype(ir->t), allow); emit_setgl(as, type, tmptv.it); } } @@ -574,11 +624,27 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) Reg tisnum = RID_NONE, tmpnum = RID_NONE; IRRef refkey = ir->op2; IRIns *irkey = IR(refkey); + int isk = irref_isk(refkey); IRType1 kt = irkey->t; uint32_t khash; MCLabel l_end, l_loop, l_next; rset_clear(allow, tab); +#if LJ_SOFTFP + if (!isk) { + key = ra_alloc1(as, refkey, allow); + rset_clear(allow, key); + if (irkey[1].o == IR_HIOP) { + if (ra_hasreg((irkey+1)->r)) { + tmpnum = (irkey+1)->r; + ra_noweak(as, tmpnum); + } else { + tmpnum = ra_allocref(as, refkey+1, allow); + } + rset_clear(allow, tmpnum); + } + } +#else if (irt_isnum(kt)) { key = ra_alloc1(as, refkey, RSET_FPR); tmpnum = ra_scratch(as, rset_exclude(RSET_FPR, key)); @@ -588,6 +654,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) key = ra_alloc1(as, refkey, allow); rset_clear(allow, key); } +#endif tmp2 = ra_scratch(as, allow); rset_clear(allow, tmp2); @@ -610,7 +677,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) asm_guardcc(as, CC_EQ); else emit_condbranch(as, PPCI_BC|PPCF_Y, CC_EQ, l_end); - if (irt_isnum(kt)) { + if (!LJ_SOFTFP && irt_isnum(kt)) { emit_fab(as, PPCI_FCMPU, 0, tmpnum, key); emit_condbranch(as, PPCI_BC, CC_GE, l_next); emit_ab(as, PPCI_CMPLW, tmp1, tisnum); @@ -620,7 +687,10 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_ab(as, PPCI_CMPW, tmp2, key); emit_condbranch(as, PPCI_BC, CC_NE, l_next); } - emit_ai(as, PPCI_CMPWI, tmp1, irt_toitype(irkey->t)); + if (LJ_SOFTFP && ra_hasreg(tmpnum)) 
+ emit_ab(as, PPCI_CMPW, tmp1, tmpnum); + else + emit_ai(as, PPCI_CMPWI, tmp1, irt_toitype(irkey->t)); if (!irt_ispri(kt)) emit_tai(as, PPCI_LWZ, tmp2, dest, (int32_t)offsetof(Node, key.gcr)); } @@ -629,19 +699,19 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) (((char *)as->mcp-(char *)l_loop) & 0xffffu); /* Load main position relative to tab->node into dest. */ - khash = irref_isk(refkey) ? ir_khash(irkey) : 1; + khash = isk ? ir_khash(irkey) : 1; if (khash == 0) { emit_tai(as, PPCI_LWZ, dest, tab, (int32_t)offsetof(GCtab, node)); } else { Reg tmphash = tmp1; - if (irref_isk(refkey)) + if (isk) tmphash = ra_allock(as, khash, allow); emit_tab(as, PPCI_ADD, dest, dest, tmp1); emit_tai(as, PPCI_MULLI, tmp1, tmp1, sizeof(Node)); emit_asb(as, PPCI_AND, tmp1, tmp2, tmphash); emit_tai(as, PPCI_LWZ, dest, tab, (int32_t)offsetof(GCtab, node)); emit_tai(as, PPCI_LWZ, tmp2, tab, (int32_t)offsetof(GCtab, hmask)); - if (irref_isk(refkey)) { + if (isk) { /* Nothing to do. */ } else if (irt_isstr(kt)) { emit_tai(as, PPCI_LWZ, tmp1, key, (int32_t)offsetof(GCstr, hash)); @@ -651,13 +721,19 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_asb(as, PPCI_XOR, tmp1, tmp1, tmp2); emit_rotlwi(as, tmp1, tmp1, (HASH_ROT2+HASH_ROT1)&31); emit_tab(as, PPCI_SUBF, tmp2, dest, tmp2); - if (irt_isnum(kt)) { + if (LJ_SOFTFP ? 
(irkey[1].o == IR_HIOP) : irt_isnum(kt)) { +#if LJ_SOFTFP + emit_asb(as, PPCI_XOR, tmp2, key, tmp1); + emit_rotlwi(as, dest, tmp1, HASH_ROT1); + emit_tab(as, PPCI_ADD, tmp1, tmpnum, tmpnum); +#else int32_t ofs = ra_spill(as, irkey); emit_asb(as, PPCI_XOR, tmp2, tmp2, tmp1); emit_rotlwi(as, dest, tmp1, HASH_ROT1); emit_tab(as, PPCI_ADD, tmp1, tmp1, tmp1); emit_tai(as, PPCI_LWZ, tmp2, RID_SP, ofs+4); emit_tai(as, PPCI_LWZ, tmp1, RID_SP, ofs); +#endif } else { emit_asb(as, PPCI_XOR, tmp2, key, tmp1); emit_rotlwi(as, dest, tmp1, HASH_ROT1); @@ -784,8 +860,8 @@ static PPCIns asm_fxloadins(IRIns *ir) case IRT_U8: return PPCI_LBZ; case IRT_I16: return PPCI_LHA; case IRT_U16: return PPCI_LHZ; - case IRT_NUM: return PPCI_LFD; - case IRT_FLOAT: return PPCI_LFS; + case IRT_NUM: lua_assert(!LJ_SOFTFP); return PPCI_LFD; + case IRT_FLOAT: if (!LJ_SOFTFP) return PPCI_LFS; default: return PPCI_LWZ; } } @@ -795,8 +871,8 @@ static PPCIns asm_fxstoreins(IRIns *ir) switch (irt_type(ir->t)) { case IRT_I8: case IRT_U8: return PPCI_STB; case IRT_I16: case IRT_U16: return PPCI_STH; - case IRT_NUM: return PPCI_STFD; - case IRT_FLOAT: return PPCI_STFS; + case IRT_NUM: lua_assert(!LJ_SOFTFP); return PPCI_STFD; + case IRT_FLOAT: if (!LJ_SOFTFP) return PPCI_STFS; default: return PPCI_STW; } } @@ -839,7 +915,8 @@ static void asm_fstore(ASMState *as, IRIns *ir) static void asm_xload(ASMState *as, IRIns *ir) { - Reg dest = ra_dest(as, ir, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR); + Reg dest = ra_dest(as, ir, + (!LJ_SOFTFP && irt_isfp(ir->t)) ? RSET_FPR : RSET_GPR); lua_assert(!(ir->op2 & IRXLOAD_UNALIGNED)); if (irt_isi8(ir->t)) emit_as(as, PPCI_EXTSB, dest, dest); @@ -857,7 +934,8 @@ static void asm_xstore_(ASMState *as, IRIns *ir, int32_t ofs) Reg src = ra_alloc1(as, irb->op1, RSET_GPR); asm_fusexrefx(as, PPCI_STWBRX, src, ir->op1, rset_exclude(RSET_GPR, src)); } else { - Reg src = ra_alloc1(as, ir->op2, irt_isfp(ir->t) ? 
RSET_FPR : RSET_GPR); + Reg src = ra_alloc1(as, ir->op2, + (!LJ_SOFTFP && irt_isfp(ir->t)) ? RSET_FPR : RSET_GPR); asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1, rset_exclude(RSET_GPR, src), ofs); } @@ -871,10 +949,19 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) Reg dest = RID_NONE, type = RID_TMP, tmp = RID_TMP, idx; RegSet allow = RSET_GPR; int32_t ofs = AHUREF_LSX; + if (LJ_SOFTFP && (ir+1)->o == IR_HIOP) { + t.irt = IRT_NUM; + if (ra_used(ir+1)) { + type = ra_dest(as, ir+1, allow); + rset_clear(allow, type); + } + ofs = 0; + } if (ra_used(ir)) { - lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t)); - if (!irt_isnum(t)) ofs = 0; - dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : RSET_GPR); + lua_assert((LJ_SOFTFP ? 0 : irt_isnum(ir->t)) || + irt_isint(ir->t) || irt_isaddr(ir->t)); + if (LJ_SOFTFP || !irt_isnum(t)) ofs = 0; + dest = ra_dest(as, ir, (!LJ_SOFTFP && irt_isnum(t)) ? RSET_FPR : allow); rset_clear(allow, dest); } idx = asm_fuseahuref(as, ir->op1, &ofs, allow); @@ -883,12 +970,13 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) asm_guardcc(as, CC_GE); emit_ab(as, PPCI_CMPLW, type, tisnum); if (ra_hasreg(dest)) { - if (ofs == AHUREF_LSX) { + if (!LJ_SOFTFP && ofs == AHUREF_LSX) { tmp = ra_scratch(as, rset_exclude(rset_exclude(RSET_GPR, (idx&255)), (idx>>8))); emit_fab(as, PPCI_LFDX, dest, (idx&255), tmp); } else { - emit_fai(as, PPCI_LFD, dest, idx, ofs); + emit_fai(as, LJ_SOFTFP ? 
PPCI_LWZ : PPCI_LFD, dest, idx, + ofs+4*LJ_SOFTFP); } } } else { @@ -911,7 +999,7 @@ static void asm_ahustore(ASMState *as, IRIns *ir) int32_t ofs = AHUREF_LSX; if (ir->r == RID_SINK) return; - if (irt_isnum(ir->t)) { + if (!LJ_SOFTFP && irt_isnum(ir->t)) { src = ra_alloc1(as, ir->op2, RSET_FPR); } else { if (!irt_ispri(ir->t)) { @@ -919,11 +1007,14 @@ static void asm_ahustore(ASMState *as, IRIns *ir) rset_clear(allow, src); ofs = 0; } - type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow); + if (LJ_SOFTFP && (ir+1)->o == IR_HIOP) + type = ra_alloc1(as, (ir+1)->op2, allow); + else + type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow); rset_clear(allow, type); } idx = asm_fuseahuref(as, ir->op1, &ofs, allow); - if (irt_isnum(ir->t)) { + if (!LJ_SOFTFP && irt_isnum(ir->t)) { if (ofs == AHUREF_LSX) { emit_fab(as, PPCI_STFDX, src, (idx&255), RID_TMP); emit_slwi(as, RID_TMP, (idx>>8), 3); @@ -948,21 +1039,33 @@ static void asm_sload(ASMState *as, IRIns *ir) IRType1 t = ir->t; Reg dest = RID_NONE, type = RID_NONE, base; RegSet allow = RSET_GPR; + int hiop = (LJ_SOFTFP && (ir+1)->o == IR_HIOP); + if (hiop) + t.irt = IRT_NUM; lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */ - lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK)); + lua_assert(irt_isguard(ir->t) || !(ir->op2 & IRSLOAD_TYPECHECK)); lua_assert(LJ_DUALNUM || !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME))); +#if LJ_SOFTFP + lua_assert(!(ir->op2 & IRSLOAD_CONVERT)); /* Handled by LJ_SOFTFP SPLIT. */ + if (hiop && ra_used(ir+1)) { + type = ra_dest(as, ir+1, allow); + rset_clear(allow, type); + } +#else if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) { dest = ra_scratch(as, RSET_FPR); asm_tointg(as, ir, dest); t.irt = IRT_NUM; /* Continue with a regular number type check. */ - } else if (ra_used(ir)) { + } else +#endif + if (ra_used(ir)) { lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t)); - dest = ra_dest(as, ir, irt_isnum(t) ? 
RSET_FPR : RSET_GPR); + dest = ra_dest(as, ir, (!LJ_SOFTFP && irt_isnum(t)) ? RSET_FPR : allow); rset_clear(allow, dest); base = ra_alloc1(as, REF_BASE, allow); rset_clear(allow, base); - if ((ir->op2 & IRSLOAD_CONVERT)) { + if (!LJ_SOFTFP && (ir->op2 & IRSLOAD_CONVERT)) { if (irt_isint(t)) { emit_tai(as, PPCI_LWZ, dest, RID_SP, SPOFS_TMPLO); dest = ra_scratch(as, RSET_FPR); @@ -994,10 +1097,13 @@ dotypecheck: if ((ir->op2 & IRSLOAD_TYPECHECK)) { Reg tisnum = ra_allock(as, (int32_t)LJ_TISNUM, allow); asm_guardcc(as, CC_GE); - emit_ab(as, PPCI_CMPLW, RID_TMP, tisnum); +#if !LJ_SOFTFP type = RID_TMP; +#endif + emit_ab(as, PPCI_CMPLW, type, tisnum); } - if (ra_hasreg(dest)) emit_fai(as, PPCI_LFD, dest, base, ofs-4); + if (ra_hasreg(dest)) emit_fai(as, LJ_SOFTFP ? PPCI_LWZ : PPCI_LFD, dest, + base, ofs-(LJ_SOFTFP?0:4)); } else { if ((ir->op2 & IRSLOAD_TYPECHECK)) { asm_guardcc(as, CC_NE); @@ -1119,6 +1225,7 @@ static void asm_obar(ASMState *as, IRIns *ir) /* -- Arithmetic and logic operations ------------------------------------- */ +#if !LJ_SOFTFP static void asm_fparith(ASMState *as, IRIns *ir, PPCIns pi) { Reg dest = ra_dest(as, ir, RSET_FPR); @@ -1146,13 +1253,17 @@ static void asm_fpmath(ASMState *as, IRIns *ir) else asm_callid(as, ir, IRCALL_lj_vm_floor + ir->op2); } +#endif static void asm_add(ASMState *as, IRIns *ir) { +#if !LJ_SOFTFP if (irt_isnum(ir->t)) { if (!asm_fusemadd(as, ir, PPCI_FMADD, PPCI_FMADD)) asm_fparith(as, ir, PPCI_FADD); - } else { + } else +#endif + { Reg dest = ra_dest(as, ir, RSET_GPR); Reg right, left = ra_hintalloc(as, ir->op1, dest, RSET_GPR); PPCIns pi; @@ -1191,10 +1302,13 @@ static void asm_add(ASMState *as, IRIns *ir) static void asm_sub(ASMState *as, IRIns *ir) { +#if !LJ_SOFTFP if (irt_isnum(ir->t)) { if (!asm_fusemadd(as, ir, PPCI_FMSUB, PPCI_FNMSUB)) asm_fparith(as, ir, PPCI_FSUB); - } else { + } else +#endif + { PPCIns pi = PPCI_SUBF; Reg dest = ra_dest(as, ir, RSET_GPR); Reg left, right; @@ -1220,9 +1334,12 @@ static void 
asm_sub(ASMState *as, IRIns *ir) static void asm_mul(ASMState *as, IRIns *ir) { +#if !LJ_SOFTFP if (irt_isnum(ir->t)) { asm_fparith(as, ir, PPCI_FMUL); - } else { + } else +#endif + { PPCIns pi = PPCI_MULLW; Reg dest = ra_dest(as, ir, RSET_GPR); Reg right, left = ra_hintalloc(as, ir->op1, dest, RSET_GPR); @@ -1250,9 +1367,12 @@ static void asm_mul(ASMState *as, IRIns *ir) static void asm_neg(ASMState *as, IRIns *ir) { +#if !LJ_SOFTFP if (irt_isnum(ir->t)) { asm_fpunary(as, ir, PPCI_FNEG); - } else { + } else +#endif + { Reg dest, left; PPCIns pi = PPCI_NEG; if (as->flagmcp == as->mcp) { @@ -1563,9 +1683,40 @@ static void asm_bitshift(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pik) PPCI_RLWINM|PPCF_MB(0)|PPCF_ME(31)) #define asm_bror(as, ir) lua_assert(0) +#if LJ_SOFTFP +static void asm_sfpmin_max(ASMState *as, IRIns *ir) +{ + CCallInfo ci = lj_ir_callinfo[IRCALL_softfp_cmp]; + IRRef args[4]; + MCLabel l_right, l_end; + Reg desthi = ra_dest(as, ir, RSET_GPR), destlo = ra_dest(as, ir+1, RSET_GPR); + Reg righthi, lefthi = ra_alloc2(as, ir, RSET_GPR); + Reg rightlo, leftlo = ra_alloc2(as, ir+1, RSET_GPR); + PPCCC cond = (IROp)ir->o == IR_MIN ? 
CC_EQ : CC_NE; + righthi = (lefthi >> 8); lefthi &= 255; + rightlo = (leftlo >> 8); leftlo &= 255; + args[0^LJ_BE] = ir->op1; args[1^LJ_BE] = (ir+1)->op1; + args[2^LJ_BE] = ir->op2; args[3^LJ_BE] = (ir+1)->op2; + l_end = emit_label(as); + if (desthi != righthi) emit_mr(as, desthi, righthi); + if (destlo != rightlo) emit_mr(as, destlo, rightlo); + l_right = emit_label(as); + if (l_end != l_right) emit_jmp(as, l_end); + if (desthi != lefthi) emit_mr(as, desthi, lefthi); + if (destlo != leftlo) emit_mr(as, destlo, leftlo); + if (l_right == as->mcp+1) { + cond ^= 4; l_right = l_end; ++as->mcp; + } + emit_condbranch(as, PPCI_BC, cond, l_right); + ra_evictset(as, RSET_SCRATCH); + emit_cmpi(as, RID_RET, 1); + asm_gencall(as, &ci, args); +} +#endif + static void asm_min_max(ASMState *as, IRIns *ir, int ismax) { - if (irt_isnum(ir->t)) { + if (!LJ_SOFTFP && irt_isnum(ir->t)) { Reg dest = ra_dest(as, ir, RSET_FPR); Reg tmp = dest; Reg right, left = ra_alloc2(as, ir, RSET_FPR); @@ -1653,7 +1804,7 @@ static void asm_intcomp_(ASMState *as, IRRef lref, IRRef rref, Reg cr, PPCCC cc) static void asm_comp(ASMState *as, IRIns *ir) { PPCCC cc = asm_compmap[ir->o]; - if (irt_isnum(ir->t)) { + if (!LJ_SOFTFP && irt_isnum(ir->t)) { Reg right, left = ra_alloc2(as, ir, RSET_FPR); right = (left >> 8); left &= 255; asm_guardcc(as, (cc >> 4)); @@ -1674,6 +1825,44 @@ static void asm_comp(ASMState *as, IRIns *ir) #define asm_equal(as, ir) asm_comp(as, ir) +#if LJ_SOFTFP +/* SFP comparisons. 
*/ +static void asm_sfpcomp(ASMState *as, IRIns *ir) +{ + const CCallInfo *ci = &lj_ir_callinfo[IRCALL_softfp_cmp]; + RegSet drop = RSET_SCRATCH; + Reg r; + IRRef args[4]; + args[0^LJ_BE] = ir->op1; args[1^LJ_BE] = (ir+1)->op1; + args[2^LJ_BE] = ir->op2; args[3^LJ_BE] = (ir+1)->op2; + + for (r = REGARG_FIRSTGPR; r <= REGARG_FIRSTGPR+3; r++) { + if (!rset_test(as->freeset, r) && + regcost_ref(as->cost[r]) == args[r-REGARG_FIRSTGPR]) + rset_clear(drop, r); + } + ra_evictset(as, drop); + asm_setupresult(as, ir, ci); + switch ((IROp)ir->o) { + case IR_ULT: + asm_guardcc(as, CC_EQ); + emit_ai(as, PPCI_CMPWI, RID_RET, 0); + case IR_ULE: + asm_guardcc(as, CC_EQ); + emit_ai(as, PPCI_CMPWI, RID_RET, 1); + break; + case IR_GE: case IR_GT: + asm_guardcc(as, CC_EQ); + emit_ai(as, PPCI_CMPWI, RID_RET, 2); + default: + asm_guardcc(as, (asm_compmap[ir->o] & 0xf)); + emit_ai(as, PPCI_CMPWI, RID_RET, 0); + break; + } + asm_gencall(as, ci, args); +} +#endif + #if LJ_HASFFI /* 64 bit integer comparisons. */ static void asm_comp64(ASMState *as, IRIns *ir) @@ -1703,19 +1892,36 @@ static void asm_comp64(ASMState *as, IRIns *ir) /* Hiword op of a split 64 bit op. Previous op must be the loword op. */ static void asm_hiop(ASMState *as, IRIns *ir) { -#if LJ_HASFFI +#if LJ_HASFFI || LJ_SOFTFP /* HIOP is marked as a store because it needs its own DCE logic. */ int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */ if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1; if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */ as->curins--; /* Always skip the CONV. */ +#if LJ_HASFFI && !LJ_SOFTFP if (usehi || uselo) asm_conv64(as, ir); return; +#endif } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */ as->curins--; /* Always skip the loword comparison. 
*/ +#if LJ_SOFTFP + if (!irt_isint(ir->t)) { + asm_sfpcomp(as, ir-1); + return; + } +#endif +#if LJ_HASFFI asm_comp64(as, ir); +#endif return; +#if LJ_SOFTFP + } else if ((ir-1)->o == IR_MIN || (ir-1)->o == IR_MAX) { + as->curins--; /* Always skip the loword min/max. */ + if (uselo || usehi) + asm_sfpmin_max(as, ir-1); + return; +#endif } else if ((ir-1)->o == IR_XSTORE) { as->curins--; /* Handle both stores here. */ if ((ir-1)->r != RID_SINK) { @@ -1726,14 +1932,27 @@ static void asm_hiop(ASMState *as, IRIns *ir) } if (!usehi) return; /* Skip unused hiword op for all remaining ops. */ switch ((ir-1)->o) { +#if LJ_HASFFI case IR_ADD: as->curins--; asm_add64(as, ir); break; case IR_SUB: as->curins--; asm_sub64(as, ir); break; case IR_NEG: as->curins--; asm_neg64(as, ir); break; +#endif +#if LJ_SOFTFP + case IR_SLOAD: case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: case IR_VLOAD: + case IR_STRTO: + if (!uselo) + ra_allocref(as, ir->op1, RSET_GPR); /* Mark lo op as used. */ + break; +#endif case IR_CALLN: + case IR_CALLS: case IR_CALLXS: if (!uselo) ra_allocref(as, ir->op1, RID2RSET(RID_RETLO)); /* Mark lo op as used. */ break; +#if LJ_SOFTFP + case IR_ASTORE: case IR_HSTORE: case IR_USTORE: case IR_TOSTR: +#endif case IR_CNEWI: /* Nothing to do here. Handled by lo op itself. */ break; @@ -1797,8 +2016,19 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap) if ((sn & SNAP_NORESTORE)) continue; if (irt_isnum(ir->t)) { +#if LJ_SOFTFP + Reg tmp; + RegSet allow = rset_exclude(RSET_GPR, RID_BASE); + lua_assert(irref_isk(ref)); /* LJ_SOFTFP: must be a number constant. 
*/ + tmp = ra_allock(as, (int32_t)ir_knum(ir)->u32.lo, allow); + emit_tai(as, PPCI_STW, tmp, RID_BASE, ofs+(LJ_BE?4:0)); + if (rset_test(as->freeset, tmp+1)) allow = RID2RSET(tmp+1); + tmp = ra_allock(as, (int32_t)ir_knum(ir)->u32.hi, allow); + emit_tai(as, PPCI_STW, tmp, RID_BASE, ofs+(LJ_BE?0:4)); +#else Reg src = ra_alloc1(as, ref, RSET_FPR); emit_fai(as, PPCI_STFD, src, RID_BASE, ofs); +#endif } else { Reg type; RegSet allow = rset_exclude(RSET_GPR, RID_BASE); @@ -1811,6 +2041,10 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap) if ((sn & (SNAP_CONT|SNAP_FRAME))) { if (s == 0) continue; /* Do not overwrite link to previous frame. */ type = ra_allock(as, (int32_t)(*flinks--), allow); +#if LJ_SOFTFP + } else if ((sn & SNAP_SOFTFPNUM)) { + type = ra_alloc1(as, ref+1, rset_exclude(RSET_GPR, RID_BASE)); +#endif } else { type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow); } @@ -1947,14 +2181,15 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci) int nslots = 2, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR; asm_collectargs(as, ir, ci, args); for (i = 0; i < nargs; i++) - if (args[i] && irt_isfp(IR(args[i])->t)) { + if (!LJ_SOFTFP && args[i] && irt_isfp(IR(args[i])->t)) { if (nfpr > 0) nfpr--; else nslots = (nslots+3) & ~1; } else { if (ngpr > 0) ngpr--; else nslots++; } if (nslots > as->evenspill) /* Leave room for args in stack slots. */ as->evenspill = nslots; - return irt_isfp(ir->t) ? REGSP_HINT(RID_FPRET) : REGSP_HINT(RID_RET); + return (!LJ_SOFTFP && irt_isfp(ir->t)) ? REGSP_HINT(RID_FPRET) : + REGSP_HINT(RID_RET); } static void asm_setup_target(ASMState *as) From 05fbdf565c700365d22e38f11478101a0d92a23e Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 10 Sep 2017 14:05:30 +0200 Subject: [PATCH 097/116] x64/LJ_GC64: Fix type-check-only variant of SLOAD. Thanks to Peter Cawley. 
--- src/lj_asm_x86.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 55c02d24..af54dc7f 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -1759,7 +1759,7 @@ static void asm_sload(ASMState *as, IRIns *ir) emit_i8(as, irt_toitype(t)); emit_rr(as, XO_ARITHi8, XOg_CMP, tmp); emit_shifti(as, XOg_SAR|REX_64, tmp, 47); - emit_rmro(as, XO_MOV, tmp|REX_64, base, ofs+4); + emit_rmro(as, XO_MOV, tmp|REX_64, base, ofs); #else } else { emit_i8(as, irt_toitype(t)); From bf12f1dafb157008b963f829b57b2472b6993cc8 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 18 Sep 2017 09:50:22 +0200 Subject: [PATCH 098/116] MIPS64: Hide internal function. --- src/lj_ccall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lj_ccall.c b/src/lj_ccall.c index 799be487..25e938cb 100644 --- a/src/lj_ccall.c +++ b/src/lj_ccall.c @@ -848,7 +848,8 @@ noth: /* Not a homogeneous float/double aggregate. */ return 0; /* Struct is in GPRs. */ } -void ccall_copy_struct(CCallState *cc, CType *ctr, void *dp, void *sp, int ft) +static void ccall_copy_struct(CCallState *cc, CType *ctr, void *dp, void *sp, + int ft) { if (LJ_ABI_SOFTFP ? ft : ((ft & 3) == FTYPE_FLOAT || (ft >> 2) == FTYPE_FLOAT)) { From 0c0e7b168ea147866835954267c151ef789f64fb Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 20 Sep 2017 19:39:50 +0200 Subject: [PATCH 099/116] DynASM/x86: Fix potential REL_A overflow. Thanks to Joshua Haberman. 
--- dynasm/dasm_x86.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dynasm/dasm_x86.h b/dynasm/dasm_x86.h index 90dc5d15..f9260b0c 100644 --- a/dynasm/dasm_x86.h +++ b/dynasm/dasm_x86.h @@ -395,7 +395,8 @@ int dasm_encode(Dst_DECL, void *buffer) case DASM_VREG: { int t = *p++; if (t >= 2) n<<=3; cp[-1] |= n; break; } case DASM_REL_LG: p++; if (n >= 0) goto rel_pc; b++; n = (int)(ptrdiff_t)D->globals[-n]; - case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */ + case DASM_REL_A: rel_a: + n -= (unsigned int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */ case DASM_REL_PC: rel_pc: { int shrink = *b++; int *pb = DASM_POS2PTR(D, n); if (*pb < 0) { n = pb[1]; goto rel_a; } From b4ed3219a1a98dd9fe7d1e3eeea3b82f5a780948 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 2 Oct 2017 09:22:46 +0200 Subject: [PATCH 100/116] LJ_GC64: Fix ir_khash for non-string GCobj. Contributed by Peter Cawley. --- src/lj_asm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lj_asm.c b/src/lj_asm.c index bed2268e..d961927b 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -1017,7 +1017,11 @@ static uint32_t ir_khash(IRIns *ir) } else { lua_assert(irt_isgcv(ir->t)); lo = u32ptr(ir_kgc(ir)); +#if LJ_GC64 + hi = (uint32_t)(u64ptr(ir_kgc(ir)) >> 32) | (irt_toitype(ir->t) << 15); +#else hi = lo + HASH_BIAS; +#endif } return hashrot(lo, hi); } From 850f8c59d3d04a9847f21f32a6c36d8269b5b6b1 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 2 Oct 2017 23:10:56 +0200 Subject: [PATCH 101/116] LJ_GC64: Make ASMREF_L references 64 bit. Reported by Yichun Zhang. 
--- src/lj_asm.c | 1 + src/lj_ir.h | 4 +++- src/lj_opt_sink.c | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index d961927b..753fe6bd 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -2015,6 +2015,7 @@ static void asm_setup_regsp(ASMState *as) ir->prev = REGSP_INIT; if (irt_is64(ir->t) && ir->o != IR_KNULL) { #if LJ_GC64 + /* The false-positive of irt_is64() for ASMREF_L (REF_NIL) is OK here. */ ir->i = 0; /* Will become non-zero only for RIP-relative addresses. */ #else /* Make life easier for backends by putting address of constant in i. */ diff --git a/src/lj_ir.h b/src/lj_ir.h index 34c27853..8057a750 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -377,10 +377,12 @@ typedef struct IRType1 { uint8_t irt; } IRType1; #define irt_isint64(t) (irt_typerange((t), IRT_I64, IRT_U64)) #if LJ_GC64 +/* Include IRT_NIL, so IR(ASMREF_L) (aka REF_NIL) is considered 64 bit. */ #define IRT_IS64 \ ((1u<cur.nk); ir < irbase; ir++) { irt_clearmark(ir->t); ir->prev = REGSP_INIT; + /* The false-positive of irt_is64() for ASMREF_L (REF_NIL) is OK here. */ if (irt_is64(ir->t) && ir->o != IR_KNULL) ir++; } From 9f0caad0e43f97a4613850b3874b851cb1bc301d Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 8 Nov 2017 12:53:05 +0100 Subject: [PATCH 102/116] Fix FOLD rule for strength reduction of widening. Reported by Matthew Burk. --- src/lj_opt_fold.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 3d0e35a6..5dc7ae3d 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -1052,7 +1052,7 @@ LJFOLDF(simplify_conv_sext) if (ref == J->scev.idx) { IRRef lo = J->scev.dir ? J->scev.start : J->scev.stop; lua_assert(irt_isint(J->scev.t)); - if (lo && IR(lo)->i + ofs >= 0) { + if (lo && IR(lo)->o == IR_KINT && IR(lo)->i + ofs >= 0) { ok_reduce: #if LJ_TARGET_X64 /* Eliminate widening. All 32 bit ops do an implicit zero-extension. 
*/ From 06cd9fce7df440323647174f1ca4a01281ec8acd Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 8 Nov 2017 12:53:48 +0100 Subject: [PATCH 103/116] ARM64: Fix assembly of HREFK. Reported by Jason Teplitz. --- src/lj_asm_arm64.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 8fd92e76..cbb186d3 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -869,14 +869,12 @@ static void asm_hrefk(ASMState *as, IRIns *ir) int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node)); int32_t kofs = ofs + (int32_t)offsetof(Node, key); int bigofs = !emit_checkofs(A64I_LDRx, ofs); - RegSet allow = RSET_GPR; Reg dest = (ra_used(ir) || bigofs) ? ra_dest(as, ir, RSET_GPR) : RID_NONE; - Reg node = ra_alloc1(as, ir->op1, allow); - Reg key = ra_scratch(as, rset_clear(allow, node)); - Reg idx = node; + Reg node = ra_alloc1(as, ir->op1, RSET_GPR); + Reg key, idx = node; + RegSet allow = rset_exclude(RSET_GPR, node); uint64_t k; lua_assert(ofs % sizeof(Node) == 0); - rset_clear(allow, key); if (bigofs) { idx = dest; rset_clear(allow, dest); @@ -892,7 +890,8 @@ static void asm_hrefk(ASMState *as, IRIns *ir) } else { k = ((uint64_t)irt_toitype(irkey->t) << 47) | (uint64_t)ir_kgc(irkey); } - emit_nm(as, A64I_CMPx, key, ra_allock(as, k, allow)); + key = ra_scratch(as, allow); + emit_nm(as, A64I_CMPx, key, ra_allock(as, k, rset_exclude(allow, key))); emit_lso(as, A64I_LDRx, key, idx, kofs); if (bigofs) emit_opk(as, A64I_ADDx, dest, node, ofs, RSET_GPR); From 99cdfbf6a1e8856f64908072ef10443a7eab14f2 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 8 Nov 2017 12:54:03 +0100 Subject: [PATCH 104/116] MIPS64: Fix register allocation in assembly of HREF. Contributed by James Cowgill. 
--- src/lj_asm_mips.h | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h index 1406a873..3a4679b8 100644 --- a/src/lj_asm_mips.h +++ b/src/lj_asm_mips.h @@ -859,6 +859,9 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) Reg dest = ra_dest(as, ir, allow); Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest)); Reg key = RID_NONE, type = RID_NONE, tmpnum = RID_NONE, tmp1 = RID_TMP, tmp2; +#if LJ_64 + Reg cmp64 = RID_NONE; +#endif IRRef refkey = ir->op2; IRIns *irkey = IR(refkey); int isk = irref_isk(refkey); @@ -901,6 +904,26 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) #endif tmp2 = ra_scratch(as, allow); rset_clear(allow, tmp2); +#if LJ_64 + if (LJ_SOFTFP || !irt_isnum(kt)) { + /* Allocate cmp64 register used for 64-bit comparisons */ + if (LJ_SOFTFP && irt_isnum(kt)) { + cmp64 = key; + } else if (!isk && irt_isaddr(kt)) { + cmp64 = tmp2; + } else { + int64_t k; + if (isk && irt_isaddr(kt)) { + k = ((int64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64; + } else { + lua_assert(irt_ispri(kt) && !irt_isnil(kt)); + k = ~((int64_t)~irt_toitype(ir->t) << 47); + } + cmp64 = ra_allock(as, k, allow); + rset_clear(allow, cmp64); + } + } +#endif /* Key not found in chain: jump to exit (if merged) or load niltv. 
*/ l_end = emit_label(as); @@ -943,24 +966,9 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_dta(as, MIPSI_DSRA32, tmp1, tmp1, 15); emit_tg(as, MIPSI_DMTC1, tmp1, tmpnum); emit_tsi(as, MIPSI_LD, tmp1, dest, (int32_t)offsetof(Node, key.u64)); - } else if (LJ_SOFTFP && irt_isnum(kt)) { - emit_branch(as, MIPSI_BEQ, tmp1, key, l_end); - emit_tsi(as, MIPSI_LD, tmp1, dest, (int32_t)offsetof(Node, key.u64)); - } else if (irt_isaddr(kt)) { - Reg refk = tmp2; - if (isk) { - int64_t k = ((int64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64; - refk = ra_allock(as, k, allow); - rset_clear(allow, refk); - } - emit_branch(as, MIPSI_BEQ, tmp1, refk, l_end); - emit_tsi(as, MIPSI_LD, tmp1, dest, offsetof(Node, key)); } else { - Reg pri = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow); - rset_clear(allow, pri); - lua_assert(irt_ispri(kt) && !irt_isnil(kt)); - emit_branch(as, MIPSI_BEQ, tmp1, pri, l_end); - emit_tsi(as, MIPSI_LD, tmp1, dest, offsetof(Node, key)); + emit_branch(as, MIPSI_BEQ, tmp1, cmp64, l_end); + emit_tsi(as, MIPSI_LD, tmp1, dest, (int32_t)offsetof(Node, key.u64)); } *l_loop = MIPSI_BNE | MIPSF_S(tmp1) | ((as->mcp-l_loop-1) & 0xffffu); if (!isk && irt_isaddr(kt)) { From 33082a6f4778aa152f6a4a684a7fe79436f1ecb6 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 16 Nov 2017 12:53:34 +0100 Subject: [PATCH 105/116] ARM64: Fix xpcall() error case. Thanks to Stefan Pejic. --- src/vm_arm64.dasc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index 3eaf3763..241c58a1 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -1185,12 +1185,12 @@ static void build_subroutines(BuildCtx *ctx) | subs NARGS8:RC, NARGS8:RC, #16 | blo ->fff_fallback | mov RB, BASE - | add BASE, BASE, #24 | asr ITYPE, CARG2, #47 | ubfx TMP0w, TMP0w, #HOOK_ACTIVE_SHIFT, #1 | cmn ITYPE, #-LJ_TFUNC | add PC, TMP0, #24+FRAME_PCALL | bne ->fff_fallback // Traceback must be a function. 
+ | add BASE, BASE, #24 | stp CARG2, CARG1, [RB] // Swap function and traceback. | cbz NARGS8:RC, ->vm_call_dispatch | b <1 From 7dbf0b05f1228c1c719866db5e5f3d58f87f74c8 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 16 Nov 2017 12:58:12 +0100 Subject: [PATCH 106/116] Fix saved bytecode encapsulated in ELF objects. Thanks to Dimitry Andric. --- src/jit/bcsave.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua index aa677dfc..c94064e4 100644 --- a/src/jit/bcsave.lua +++ b/src/jit/bcsave.lua @@ -275,7 +275,7 @@ typedef struct { o.sect[2].size = fofs(ofs) o.sect[3].type = f32(3) -- .strtab o.sect[3].ofs = fofs(sofs + ofs) - o.sect[3].size = fofs(#symname+1) + o.sect[3].size = fofs(#symname+2) ffi.copy(o.space+ofs+1, symname) ofs = ofs + #symname + 2 o.sect[4].type = f32(1) -- .rodata From d417ded17945b4211608d497d50b509e0274f5e0 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 18 Nov 2017 12:23:57 +0100 Subject: [PATCH 107/116] ARM64: Fix xpcall() error case (really). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thanks to François Perrad and Stefan Pejic. --- src/vm_arm64.dasc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index 241c58a1..c55794a6 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -1182,7 +1182,7 @@ static void build_subroutines(BuildCtx *ctx) |.ffunc xpcall | ldp CARG1, CARG2, [BASE] | ldrb TMP0w, GL->hookmask - | subs NARGS8:RC, NARGS8:RC, #16 + | subs NARGS8:TMP1, NARGS8:RC, #16 | blo ->fff_fallback | mov RB, BASE | asr ITYPE, CARG2, #47 @@ -1190,6 +1190,7 @@ static void build_subroutines(BuildCtx *ctx) | cmn ITYPE, #-LJ_TFUNC | add PC, TMP0, #24+FRAME_PCALL | bne ->fff_fallback // Traceback must be a function. + | mov NARGS8:RC, NARGS8:TMP1 | add BASE, BASE, #24 | stp CARG2, CARG1, [RB] // Swap function and traceback. 
| cbz NARGS8:RC, ->vm_call_dispatch From ea7071d3c30b6432bfe6f8a9d263e0285cec25e3 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 18 Nov 2017 12:25:35 +0100 Subject: [PATCH 108/116] MIPS64: Fix xpcall() error case. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thanks to François Perrad and Stefan Pejic. --- src/vm_mips64.dasc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/vm_mips64.dasc b/src/vm_mips64.dasc index 75b38dee..a78cd251 100644 --- a/src/vm_mips64.dasc +++ b/src/vm_mips64.dasc @@ -1399,15 +1399,16 @@ static void build_subroutines(BuildCtx *ctx) |. nop | |.ffunc xpcall - | daddiu NARGS8:RC, NARGS8:RC, -16 + | daddiu NARGS8:TMP0, NARGS8:RC, -16 | ld CARG1, 0(BASE) | ld CARG2, 8(BASE) - | bltz NARGS8:RC, ->fff_fallback + | bltz NARGS8:TMP0, ->fff_fallback |. lbu TMP1, DISPATCH_GL(hookmask)(DISPATCH) | gettp AT, CARG2 | daddiu AT, AT, -LJ_TFUNC | bnez AT, ->fff_fallback // Traceback must be a function. |. move TMP2, BASE + | move NARGS8:RC, NARGS8:TMP0 | daddiu BASE, BASE, 24 | // Remember active hook before pcall. | srl TMP3, TMP3, HOOK_ACTIVE_SHIFT From 58d0dde0a2df49abc991decbabff15230010829a Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 14 Jan 2018 13:57:00 +0100 Subject: [PATCH 109/116] Fix IR_BUFPUT assembly. Thanks to Peter Cawley. 
--- src/lj_asm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index 753fe6bd..5f83779e 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -1119,7 +1119,7 @@ static void asm_bufput(ASMState *as, IRIns *ir) const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_buf_putstr]; IRRef args[3]; IRIns *irs; - int kchar = -1; + int kchar = -129; args[0] = ir->op1; /* SBuf * */ args[1] = ir->op2; /* GCstr * */ irs = IR(ir->op2); @@ -1127,7 +1127,7 @@ static void asm_bufput(ASMState *as, IRIns *ir) if (irs->o == IR_KGC) { GCstr *s = ir_kstr(irs); if (s->len == 1) { /* Optimize put of single-char string constant. */ - kchar = strdata(s)[0]; + kchar = (int8_t)strdata(s)[0]; /* Signed! */ args[1] = ASMREF_TMP1; /* int, truncated to char */ ci = &lj_ir_callinfo[IRCALL_lj_buf_putchar]; } @@ -1154,7 +1154,7 @@ static void asm_bufput(ASMState *as, IRIns *ir) asm_gencall(as, ci, args); if (args[1] == ASMREF_TMP1) { Reg tmp = ra_releasetmp(as, ASMREF_TMP1); - if (kchar == -1) + if (kchar == -129) asm_tvptr(as, tmp, irs->op1); else ra_allockreg(as, kchar, tmp); From 430d9f8f7ebb779948dbd43944b876b1a3f58551 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 14 Jan 2018 14:11:59 +0100 Subject: [PATCH 110/116] Fix string.format("%c", 0). 
--- src/lib_string.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/lib_string.c b/src/lib_string.c index d1a60b61..c1e595c9 100644 --- a/src/lib_string.c +++ b/src/lib_string.c @@ -850,20 +850,21 @@ LJLIB_CF(string_format) } else { /* format item */ char form[MAX_FMTSPEC]; /* to store the format (`%...') */ char buff[MAX_FMTITEM]; /* to store the formatted item */ + int n = 0; if (++arg > top) luaL_argerror(L, arg, lj_obj_typename[0]); strfrmt = scanformat(L, strfrmt, form); switch (*strfrmt++) { case 'c': - sprintf(buff, form, lj_lib_checkint(L, arg)); + n = sprintf(buff, form, lj_lib_checkint(L, arg)); break; case 'd': case 'i': addintlen(form); - sprintf(buff, form, num2intfrm(L, arg)); + n = sprintf(buff, form, num2intfrm(L, arg)); break; case 'o': case 'u': case 'x': case 'X': addintlen(form); - sprintf(buff, form, num2uintfrm(L, arg)); + n = sprintf(buff, form, num2uintfrm(L, arg)); break; case 'e': case 'E': case 'f': case 'g': case 'G': case 'a': case 'A': { TValue tv; @@ -880,10 +881,10 @@ LJLIB_CF(string_format) nbuf[len] = '\0'; for (p = form; *p < 'A' && *p != '.'; p++) ; *p++ = 's'; *p = '\0'; - sprintf(buff, form, nbuf); + n = sprintf(buff, form, nbuf); break; } - sprintf(buff, form, (double)tv.n); + n = sprintf(buff, form, (double)tv.n); break; } case 'q': @@ -902,14 +903,14 @@ LJLIB_CF(string_format) luaL_addvalue(&b); continue; } - sprintf(buff, form, strdata(str)); + n = sprintf(buff, form, strdata(str)); break; } default: lj_err_callerv(L, LJ_ERR_STRFMTO, *(strfrmt -1)); break; } - luaL_addlstring(&b, buff, strlen(buff)); + luaL_addlstring(&b, buff, n); } } luaL_pushresult(&b); From 9eaad8574f5b2271b981cd31966b1e832cd8de12 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 18 Jan 2018 12:24:36 +0100 Subject: [PATCH 111/116] Fix ARMv8 (32 bit subset) detection. Thanks to Markus Oberhumber. 
--- src/lj_arch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_arch.h b/src/lj_arch.h index 5962f3af..fcebd84b 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -201,7 +201,7 @@ #define LJ_TARGET_UNIFYROT 2 /* Want only IR_BROR. */ #define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL -#if __ARM_ARCH_8__ || __ARM_ARCH_8A__ +#if __ARM_ARCH == 8 || __ARM_ARCH_8__ || __ARM_ARCH_8A__ #define LJ_ARCH_VERSION 80 #elif __ARM_ARCH_7__ || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH_7S__ || __ARM_ARCH_7VE__ #define LJ_ARCH_VERSION 70 From c88602f080dcafea6ba222a2f7cc1ea0e41ef3cc Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 18 Jan 2018 12:29:39 +0100 Subject: [PATCH 112/116] Fix LuaJIT API docs for LUAJIT_MODE_*. Thanks to sunfishgao. --- doc/ext_c_api.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/ext_c_api.html b/doc/ext_c_api.html index 041a7220..4bb82514 100644 --- a/doc/ext_c_api.html +++ b/doc/ext_c_api.html @@ -89,8 +89,8 @@ other Lua/C API functions).

      The third argument specifies the mode, which is 'or'ed with a flag. -The flag can be LUAJIT_MODE_OFF to turn a feature on, -LUAJIT_MODE_ON to turn a feature off, or +The flag can be LUAJIT_MODE_OFF to turn a feature off, +LUAJIT_MODE_ON to turn a feature on, or LUAJIT_MODE_FLUSH to flush cached code.

      From 8071aa4ad65cf09e3b7adda4a7787d8897e5314c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 29 Jan 2018 12:12:29 +0100 Subject: [PATCH 113/116] MIPS64: Fix soft-float +-0.0 vs. +-0.0 comparison. Thanks to Stefan Pejic. --- src/vm_mips64.dasc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vm_mips64.dasc b/src/vm_mips64.dasc index a78cd251..0a3f8e52 100644 --- a/src/vm_mips64.dasc +++ b/src/vm_mips64.dasc @@ -2661,7 +2661,7 @@ static void build_subroutines(BuildCtx *ctx) |. slt CRET1, CARG2, CARG1 |8: | jr ra - |. nop + |. li CRET1, 0 |9: | jr ra |. move CRET1, CRET2 From b03a56f28ec360bbcf43091afd0607890a4a33c7 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 29 Jan 2018 12:47:08 +0100 Subject: [PATCH 114/116] FFI: Don't assert on #1LL (5.2 compatibility mode only). Reported by Denis Golovan. --- src/lib_ffi.c | 2 +- src/lj_carith.c | 9 +++++++++ src/lj_carith.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/lib_ffi.c b/src/lib_ffi.c index f2f2ede4..83483d95 100644 --- a/src/lib_ffi.c +++ b/src/lib_ffi.c @@ -193,7 +193,7 @@ LJLIB_CF(ffi_meta___eq) LJLIB_REC(cdata_arith MM_eq) LJLIB_CF(ffi_meta___len) LJLIB_REC(cdata_arith MM_len) { - return ffi_arith(L); + return lj_carith_len(L); } LJLIB_CF(ffi_meta___lt) LJLIB_REC(cdata_arith MM_lt) diff --git a/src/lj_carith.c b/src/lj_carith.c index 6224dee6..c34596ca 100644 --- a/src/lj_carith.c +++ b/src/lj_carith.c @@ -272,6 +272,15 @@ int lj_carith_op(lua_State *L, MMS mm) return lj_carith_meta(L, cts, &ca, mm); } +/* No built-in functionality for length of cdata. 
*/ +int lj_carith_len(lua_State *L) +{ + CTState *cts = ctype_cts(L); + CDArith ca; + carith_checkarg(L, cts, &ca); + return lj_carith_meta(L, cts, &ca, MM_len); +} + /* -- 64 bit integer arithmetic helpers ----------------------------------- */ #if LJ_32 && LJ_HASJIT diff --git a/src/lj_carith.h b/src/lj_carith.h index 3c155910..82fc8245 100644 --- a/src/lj_carith.h +++ b/src/lj_carith.h @@ -11,6 +11,7 @@ #if LJ_HASFFI LJ_FUNC int lj_carith_op(lua_State *L, MMS mm); +LJ_FUNC int lj_carith_len(lua_State *L); #if LJ_32 && LJ_HASJIT LJ_FUNC int64_t lj_carith_mul64(int64_t x, int64_t k); From d4ee80342770d1281e2ce877f8ae8ab1d99e6528 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 29 Jan 2018 13:06:13 +0100 Subject: [PATCH 115/116] Fix GCC 7 -Wimplicit-fallthrough warnings. --- dynasm/dasm_arm.h | 2 ++ dynasm/dasm_mips.h | 1 + dynasm/dasm_ppc.h | 1 + dynasm/dasm_x86.h | 14 ++++++++++++-- src/lj_asm.c | 3 ++- src/lj_cparse.c | 10 ++++++++++ src/lj_err.c | 1 + src/lj_opt_sink.c | 2 +- src/lj_parse.c | 3 ++- src/luajit.c | 1 + 10 files changed, 33 insertions(+), 5 deletions(-) diff --git a/dynasm/dasm_arm.h b/dynasm/dasm_arm.h index a43f7c66..1d404ccd 100644 --- a/dynasm/dasm_arm.h +++ b/dynasm/dasm_arm.h @@ -254,6 +254,7 @@ void dasm_put(Dst_DECL, int start, ...) case DASM_IMMV8: CK((n & 3) == 0, RANGE_I); n >>= 2; + /* fallthrough */ case DASM_IMML8: case DASM_IMML12: CK(n >= 0 ? 
((n>>((ins>>5)&31)) == 0) : @@ -371,6 +372,7 @@ int dasm_encode(Dst_DECL, void *buffer) break; case DASM_REL_LG: CK(n >= 0, UNDEF_LG); + /* fallthrough */ case DASM_REL_PC: CK(n >= 0, UNDEF_PC); n = *DASM_POS2PTR(D, n) - (int)((char *)cp - base) - 4; diff --git a/dynasm/dasm_mips.h b/dynasm/dasm_mips.h index 7eac6694..46af0349 100644 --- a/dynasm/dasm_mips.h +++ b/dynasm/dasm_mips.h @@ -350,6 +350,7 @@ int dasm_encode(Dst_DECL, void *buffer) break; case DASM_REL_LG: CK(n >= 0, UNDEF_LG); + /* fallthrough */ case DASM_REL_PC: CK(n >= 0, UNDEF_PC); n = *DASM_POS2PTR(D, n); diff --git a/dynasm/dasm_ppc.h b/dynasm/dasm_ppc.h index 61103612..81b9a76b 100644 --- a/dynasm/dasm_ppc.h +++ b/dynasm/dasm_ppc.h @@ -350,6 +350,7 @@ int dasm_encode(Dst_DECL, void *buffer) break; case DASM_REL_LG: CK(n >= 0, UNDEF_LG); + /* fallthrough */ case DASM_REL_PC: CK(n >= 0, UNDEF_PC); n = *DASM_POS2PTR(D, n) - (int)((char *)cp - base); diff --git a/dynasm/dasm_x86.h b/dynasm/dasm_x86.h index f9260b0c..8ae911df 100644 --- a/dynasm/dasm_x86.h +++ b/dynasm/dasm_x86.h @@ -194,12 +194,13 @@ void dasm_put(Dst_DECL, int start, ...) switch (action) { case DASM_DISP: if (n == 0) { if ((mrm&7) == 4) mrm = p[-2]; if ((mrm&7) != 5) break; } - case DASM_IMM_DB: if (((n+128)&-256) == 0) goto ob; + /* fallthrough */ + case DASM_IMM_DB: if (((n+128)&-256) == 0) goto ob; /* fallthrough */ case DASM_REL_A: /* Assumes ptrdiff_t is int. !x64 */ case DASM_IMM_D: ofs += 4; break; case DASM_IMM_S: CK(((n+128)&-256) == 0, RANGE_I); goto ob; case DASM_IMM_B: CK((n&-256) == 0, RANGE_I); ob: ofs++; break; - case DASM_IMM_WB: if (((n+128)&-256) == 0) goto ob; + case DASM_IMM_WB: if (((n+128)&-256) == 0) goto ob; /* fallthrough */ case DASM_IMM_W: CK((n&-65536) == 0, RANGE_I); ofs += 2; break; case DASM_SPACE: p++; ofs += n; break; case DASM_SETLABEL: b[pos-2] = -0x40000000; break; /* Neg. label ofs. 
*/ @@ -323,11 +324,14 @@ int dasm_link(Dst_DECL, size_t *szp) pos += 2; break; } + /* fallthrough */ case DASM_SPACE: case DASM_IMM_LG: case DASM_VREG: p++; + /* fallthrough */ case DASM_DISP: case DASM_IMM_S: case DASM_IMM_B: case DASM_IMM_W: case DASM_IMM_D: case DASM_IMM_WB: case DASM_IMM_DB: case DASM_SETLABEL: case DASM_REL_A: case DASM_IMM_PC: pos++; break; case DASM_LABEL_LG: p++; + /* fallthrough */ case DASM_LABEL_PC: b[pos++] += ofs; break; /* Fix label offset. */ case DASM_ALIGN: ofs -= (b[pos++]+ofs)&*p++; break; /* Adjust ofs. */ case DASM_EXTERN: p += 2; break; @@ -385,16 +389,20 @@ int dasm_encode(Dst_DECL, void *buffer) if (mrm != 5) { mm[-1] -= 0x80; break; } } if (((n+128) & -256) != 0) goto wd; else mm[-1] -= 0x40; } + /* fallthrough */ case DASM_IMM_S: case DASM_IMM_B: wb: dasmb(n); break; case DASM_IMM_DB: if (((n+128)&-256) == 0) { db: if (!mark) mark = cp; mark[-2] += 2; mark = NULL; goto wb; } else mark = NULL; + /* fallthrough */ case DASM_IMM_D: wd: dasmd(n); break; case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL; + /* fallthrough */ case DASM_IMM_W: dasmw(n); break; case DASM_VREG: { int t = *p++; if (t >= 2) n<<=3; cp[-1] |= n; break; } case DASM_REL_LG: p++; if (n >= 0) goto rel_pc; b++; n = (int)(ptrdiff_t)D->globals[-n]; + /* fallthrough */ case DASM_REL_A: rel_a: n -= (unsigned int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */ case DASM_REL_PC: rel_pc: { @@ -407,6 +415,7 @@ int dasm_encode(Dst_DECL, void *buffer) } case DASM_IMM_LG: p++; if (n < 0) { n = (int)(ptrdiff_t)D->globals[-n]; goto wd; } + /* fallthrough */ case DASM_IMM_PC: { int *pb = DASM_POS2PTR(D, n); n = *pb < 0 ? 
pb[1] : (*pb + (int)(ptrdiff_t)base); @@ -427,6 +436,7 @@ int dasm_encode(Dst_DECL, void *buffer) case DASM_EXTERN: n = DASM_EXTERN(Dst, cp, p[1], *p); p += 2; goto wd; case DASM_MARK: mark = cp; break; case DASM_ESC: action = *p++; + /* fallthrough */ default: *cp++ = action; break; case DASM_SECTION: case DASM_STOP: goto stop; } diff --git a/src/lj_asm.c b/src/lj_asm.c index 02714d4e..dd7186f6 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -1725,6 +1725,7 @@ static void asm_setup_regsp(ASMState *as) case IR_SNEW: case IR_XSNEW: case IR_NEWREF: if (REGARG_NUMGPR < 3 && as->evenspill < 3) as->evenspill = 3; /* lj_str_new and lj_tab_newkey need 3 args. */ + /* fallthrough */ case IR_TNEW: case IR_TDUP: case IR_CNEW: case IR_CNEWI: case IR_TOSTR: ir->prev = REGSP_HINT(RID_RET); if (inloop) @@ -1750,7 +1751,7 @@ static void asm_setup_regsp(ASMState *as) #endif continue; } - /* fallthrough for integer POW */ + /* fallthrough */ /* for integer POW */ case IR_DIV: case IR_MOD: if (!irt_isnum(ir->t)) { ir->prev = REGSP_HINT(RID_RET); diff --git a/src/lj_cparse.c b/src/lj_cparse.c index 2ba50a72..f111537d 100644 --- a/src/lj_cparse.c +++ b/src/lj_cparse.c @@ -590,28 +590,34 @@ static void cp_expr_infix(CPState *cp, CPValue *k, int pri) k->id = k2.id > k3.id ? 
k2.id : k3.id; continue; } + /* fallthrough */ case 1: if (cp_opt(cp, CTOK_OROR)) { cp_expr_sub(cp, &k2, 2); k->i32 = k->u32 || k2.u32; k->id = CTID_INT32; continue; } + /* fallthrough */ case 2: if (cp_opt(cp, CTOK_ANDAND)) { cp_expr_sub(cp, &k2, 3); k->i32 = k->u32 && k2.u32; k->id = CTID_INT32; continue; } + /* fallthrough */ case 3: if (cp_opt(cp, '|')) { cp_expr_sub(cp, &k2, 4); k->u32 = k->u32 | k2.u32; goto arith_result; } + /* fallthrough */ case 4: if (cp_opt(cp, '^')) { cp_expr_sub(cp, &k2, 5); k->u32 = k->u32 ^ k2.u32; goto arith_result; } + /* fallthrough */ case 5: if (cp_opt(cp, '&')) { cp_expr_sub(cp, &k2, 6); k->u32 = k->u32 & k2.u32; goto arith_result; } + /* fallthrough */ case 6: if (cp_opt(cp, CTOK_EQ)) { cp_expr_sub(cp, &k2, 7); k->i32 = k->u32 == k2.u32; k->id = CTID_INT32; @@ -620,6 +626,7 @@ static void cp_expr_infix(CPState *cp, CPValue *k, int pri) cp_expr_sub(cp, &k2, 7); k->i32 = k->u32 != k2.u32; k->id = CTID_INT32; continue; } + /* fallthrough */ case 7: if (cp_opt(cp, '<')) { cp_expr_sub(cp, &k2, 8); @@ -654,6 +661,7 @@ static void cp_expr_infix(CPState *cp, CPValue *k, int pri) k->id = CTID_INT32; continue; } + /* fallthrough */ case 8: if (cp_opt(cp, CTOK_SHL)) { cp_expr_sub(cp, &k2, 9); k->u32 = k->u32 << k2.u32; @@ -666,6 +674,7 @@ static void cp_expr_infix(CPState *cp, CPValue *k, int pri) k->u32 = k->u32 >> k2.u32; continue; } + /* fallthrough */ case 9: if (cp_opt(cp, '+')) { cp_expr_sub(cp, &k2, 10); k->u32 = k->u32 + k2.u32; @@ -675,6 +684,7 @@ static void cp_expr_infix(CPState *cp, CPValue *k, int pri) } else if (cp_opt(cp, '-')) { cp_expr_sub(cp, &k2, 10); k->u32 = k->u32 - k2.u32; goto arith_result; } + /* fallthrough */ case 10: if (cp_opt(cp, '*')) { cp_expr_unary(cp, &k2); k->u32 = k->u32 * k2.u32; goto arith_result; diff --git a/src/lj_err.c b/src/lj_err.c index 54f42c37..13a1ded7 100644 --- a/src/lj_err.c +++ b/src/lj_err.c @@ -153,6 +153,7 @@ static void *err_unwind(lua_State *L, void *stopcf, int errcode) if 
((frame-1)->u32.lo == LJ_CONT_FFI_CALLBACK) goto unwind_c; #endif + /* fallthrough */ case FRAME_VARG: /* Vararg frame. */ frame = frame_prevd(frame); break; diff --git a/src/lj_opt_sink.c b/src/lj_opt_sink.c index 6a00d04c..4efe395e 100644 --- a/src/lj_opt_sink.c +++ b/src/lj_opt_sink.c @@ -100,8 +100,8 @@ static void sink_mark_ins(jit_State *J) (LJ_32 && ir+1 < irlast && (ir+1)->o == IR_HIOP && !sink_checkphi(J, ir, (ir+1)->op2)))) irt_setmark(ir->t); /* Mark ineligible allocation. */ - /* fallthrough */ #endif + /* fallthrough */ case IR_USTORE: irt_setmark(IR(ir->op2)->t); /* Mark stored value. */ break; diff --git a/src/lj_parse.c b/src/lj_parse.c index 9e5976f7..67854951 100644 --- a/src/lj_parse.c +++ b/src/lj_parse.c @@ -2696,7 +2696,8 @@ static int parse_stmt(LexState *ls) lj_lex_next(ls); parse_goto(ls); break; - } /* else: fallthrough */ + } + /* fallthrough */ default: parse_call_assign(ls); break; diff --git a/src/luajit.c b/src/luajit.c index 9e15b26d..0e18dc5f 100644 --- a/src/luajit.c +++ b/src/luajit.c @@ -419,6 +419,7 @@ static int collectargs(char **argv, int *flags) break; case 'e': *flags |= FLAGS_EXEC; + /* fallthrough */ case 'j': /* LuaJIT extension */ case 'l': *flags |= FLAGS_OPTION; From 03cd5aa749c1bc3bb4b7d4289236b6096cb3dc85 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 29 Jan 2018 13:25:51 +0100 Subject: [PATCH 116/116] Clear stack after print_jit_status() in CLI. Suggested by Hydroque. --- src/luajit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/luajit.c b/src/luajit.c index 0e18dc5f..9ede59c1 100644 --- a/src/luajit.c +++ b/src/luajit.c @@ -151,6 +151,7 @@ static void print_jit_status(lua_State *L) fputs(s, stdout); } putc('\n', stdout); + lua_settop(L, 0); /* clear stack */ } static int getargs(lua_State *L, char **argv, int n)