From bfce3c1127fd57fe0c935c92bcf45b4737041edd Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 10 Mar 2011 01:57:24 +0100 Subject: [PATCH] DUALNUM: Handle integer type in JIT compiler. --- src/Makefile.dep | 9 +- src/lj_asm.c | 35 +++--- src/lj_crecord.c | 35 +++--- src/lj_ffrecord.c | 39 +++--- src/lj_ir.c | 26 ---- src/lj_ir.h | 30 +++-- src/lj_iropt.h | 12 +- src/lj_meta.c | 28 +++-- src/lj_meta.h | 2 +- src/lj_obj.h | 2 - src/lj_opt_fold.c | 5 +- src/lj_opt_loop.c | 9 +- src/lj_opt_narrow.c | 235 ++++++++++++++++++++++++++++++------ src/lj_record.c | 282 ++++++++++++++++++++++++-------------------- src/lj_snap.c | 3 +- src/lj_trace.c | 12 +- 16 files changed, 486 insertions(+), 278 deletions(-) diff --git a/src/Makefile.dep b/src/Makefile.dep index 1684ebd7..8458ec78 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -128,15 +128,16 @@ lj_opt_mem.o: lj_opt_mem.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_tab.h lj_ir.h lj_jit.h lj_iropt.h lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \ lj_arch.h lj_str.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \ - lj_dispatch.h lj_traceerr.h + lj_dispatch.h lj_traceerr.h lj_vm.h lj_opt_split.o: lj_opt_split.c lj_obj.h lua.h luaconf.h lj_def.h \ - lj_arch.h + lj_arch.h lj_err.h lj_errmsg.h lj_str.h lj_ir.h lj_jit.h lj_iropt.h \ + lj_vm.h lj_parse.o: lj_parse.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_state.h \ lj_bc.h lj_ctype.h lj_lex.h lj_parse.h lj_vm.h lj_vmevent.h lj_record.o: lj_record.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ - lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_frame.h lj_bc.h lj_ff.h \ - lj_ffdef.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h lj_dispatch.h \ + lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_meta.h lj_frame.h lj_bc.h \ + lj_ff.h lj_ffdef.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h lj_dispatch.h \ lj_traceerr.h lj_record.h lj_ffrecord.h lj_snap.h lj_vm.h lj_snap.o: lj_snap.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \ lj_state.h lj_frame.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \ diff --git a/src/lj_asm.c b/src/lj_asm.c index 5f3c5fab..d395010d 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -2059,7 +2059,7 @@ static void asm_href(ASMState *as, IRIns *ir) } else { emit_sjcc(as, CC_P, l_next); emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n)); - emit_sjcc(as, CC_A, l_next); + emit_sjcc(as, CC_AE, l_next); /* The type check avoids NaN penalties and complaints from Valgrind. */ #if LJ_64 emit_u32(as, LJ_TISNUM); @@ -2388,7 +2388,8 @@ static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck) static void asm_ahuvload(ASMState *as, IRIns *ir) { - lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t)); + lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) || + (LJ_DUALNUM && irt_isint(ir->t))); #if LJ_64 if (irt_islightud(ir->t)) { Reg dest = asm_load_lightud64(as, ir, 1); @@ -2409,8 +2410,9 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) } /* Always do the type check, even if the load result is unused. */ as->mrm.ofs += 4; - asm_guardcc(as, irt_isnum(ir->t) ? CC_A : CC_NE); - if (LJ_64 && irt_isnum(ir->t)) { + asm_guardcc(as, irt_isnum(ir->t) ? 
CC_AE : CC_NE); + if (LJ_64 && irt_type(ir->t) >= IRT_NUM) { + lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t)); emit_u32(as, LJ_TISNUM); emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM); } else { @@ -2443,7 +2445,7 @@ static void asm_ahustore(ASMState *as, IRIns *ir) if (ra_hasreg(src)) { emit_mrm(as, XO_MOVto, src, RID_MRM); } else if (!irt_ispri(irr->t)) { - lua_assert(irt_isaddr(ir->t)); + lua_assert(irt_isaddr(ir->t) || (LJ_DUALNUM && irt_isinteger(ir->t))); emit_i32(as, irr->i); emit_mrm(as, XO_MOVmi, 0, RID_MRM); } @@ -2460,8 +2462,9 @@ static void asm_sload(ASMState *as, IRIns *ir) Reg base; lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */ lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK)); - lua_assert(!irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME))); - if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t)) { + lua_assert(LJ_DUALNUM || + !irt_isint(t) || (ir->op2 & (IRSLOAD_CONVERT|IRSLOAD_FRAME))); + if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) { Reg left = ra_scratch(as, RSET_FPR); asm_tointg(as, ir, left); /* Frees dest reg. Do this before base alloc. */ base = ra_alloc1(as, REF_BASE, RSET_GPR); @@ -2481,12 +2484,14 @@ static void asm_sload(ASMState *as, IRIns *ir) Reg dest = ra_dest(as, ir, allow); base = ra_alloc1(as, REF_BASE, RSET_GPR); lua_assert(irt_isnum(t) || irt_isint(t) || irt_isaddr(t)); - if ((ir->op2 & IRSLOAD_CONVERT)) - emit_rmro(as, XO_CVTSD2SI, dest, base, ofs); - else if (irt_isnum(t)) + if ((ir->op2 & IRSLOAD_CONVERT)) { + t.irt = irt_isint(t) ? IRT_NUM : IRT_INT; /* Check for original type. */ + emit_rmro(as, irt_isint(t) ? XO_CVTSI2SD : XO_CVTSD2SI, dest, base, ofs); + } else if (irt_isnum(t)) { emit_rmro(as, XMM_MOVRM(as), dest, base, ofs); - else + } else { emit_rmro(as, XO_MOV, dest, base, ofs); + } } else { if (!(ir->op2 & IRSLOAD_TYPECHECK)) return; /* No type check: avoid base alloc. */ @@ -2494,8 +2499,9 @@ static void asm_sload(ASMState *as, IRIns *ir) } if ((ir->op2 & IRSLOAD_TYPECHECK)) { /* Need type check, even if the load result is unused. */ - asm_guardcc(as, irt_isnum(t) ? CC_A : CC_NE); - if (LJ_64 && irt_isnum(t)) { + asm_guardcc(as, irt_isnum(t) ? CC_AE : CC_NE); + if (LJ_64 && irt_type(t) >= IRT_NUM) { + lua_assert(irt_isinteger(t) || irt_isnum(t)); emit_u32(as, LJ_TISNUM); emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4); } else { @@ -3408,7 +3414,8 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap) Reg src = ra_alloc1(as, ref, RSET_FPR); emit_rmro(as, XO_MOVSDto, src, RID_BASE, ofs); } else { - lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t)); + lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t) || + (LJ_DUALNUM && irt_isinteger(ir->t))); if (!irref_isk(ref)) { Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE)); emit_movtomro(as, REX_64IR(ir, src), RID_BASE, ofs); diff --git a/src/lj_crecord.c b/src/lj_crecord.c index c93cece3..9482cc18 100644 --- a/src/lj_crecord.c +++ b/src/lj_crecord.c @@ -185,6 +185,8 @@ static TRef crec_ct_ct(jit_State *J, CType *d, CType *s, TRef dp, TRef sp, (sinfo & CTF_UNSIGNED) ? 0 : IRCONV_SEXT); else if (dsize < 8 && ssize == 8) /* Truncate from 64 bit integer. */ sp = emitconv(sp, dsize < 4 ? 
IRT_INT : dt, st, 0); + else if (ssize <= 4) + sp = lj_opt_narrow_toint(J, sp); xstore: if (dt == IRT_I64 || dt == IRT_U64) lj_needsplit(J); if (dp == 0) return sp; @@ -355,10 +357,10 @@ static TRef crec_ct_tv(jit_State *J, CType *d, TRef dp, TRef sp, TValue *sval) CType *s; if (LJ_LIKELY(tref_isinteger(sp))) { sid = CTID_INT32; - svisnz = (void *)(intptr_t)(numV(sval) != 0); + svisnz = (void *)(intptr_t)(tvisint(sval)?(intV(sval)!=0):!tviszero(sval)); } else if (tref_isnum(sp)) { sid = CTID_DOUBLE; - svisnz = (void *)(intptr_t)(numV(sval) != 0); + svisnz = (void *)(intptr_t)(tvisint(sval)?(intV(sval)!=0):!tviszero(sval)); } else if (tref_isbool(sp)) { sp = lj_ir_kint(J, tref_istrue(sp) ? 1 : 0); sid = CTID_BOOL; @@ -443,16 +445,16 @@ static CTypeID crec_constructor(jit_State *J, GCcdata *cd, TRef tr) static TRef crec_reassoc_ofs(jit_State *J, TRef tr, ptrdiff_t *ofsp, MSize sz) { IRIns *ir = IR(tref_ref(tr)); - if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && - ir->o == IR_ADD && irref_isk(ir->op2)) { + if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && irref_isk(ir->op2) && + (ir->o == IR_ADD || ir->o == IR_ADDOV || ir->o == IR_SUBOV)) { IRIns *irk = IR(ir->op2); - tr = ir->op1; -#if LJ_64 - if (irk->o == IR_KINT64) - *ofsp += (ptrdiff_t)ir_kint64(irk)->u64 * sz; + ptrdiff_t k; + if (LJ_64 && irk->o == IR_KINT64) + k = (ptrdiff_t)ir_kint64(irk)->u64 * sz; else -#endif - *ofsp += (ptrdiff_t)irk->i * sz; + k = (ptrdiff_t)irk->i * sz; + if (ir->o == IR_SUBOV) *ofsp -= k; else *ofsp += k; + tr = ir->op1; /* Not a TRef, but the caller doesn't care. */ } return tr; } @@ -477,16 +479,7 @@ void LJ_FASTCALL recff_cdata_index(jit_State *J, RecordFFData *rd) idx = J->base[1]; if (tref_isnumber(idx)) { - /* The size of a ptrdiff_t is target-specific. */ -#if LJ_64 - if (tref_isnum(idx)) - idx = emitconv(idx, IRT_I64, IRT_NUM, IRCONV_TRUNC|IRCONV_ANY); - else - idx = emitconv(idx, IRT_I64, IRT_INT, IRCONV_SEXT); -#else - if (tref_isnum(idx)) - idx = emitconv(idx, IRT_INT, IRT_NUM, IRCONV_TRUNC|IRCONV_ANY); -#endif + idx = lj_opt_narrow_cindex(J, idx); integer_key: if (ctype_ispointer(ct->info)) { CTSize sz; @@ -635,7 +628,7 @@ static void crec_alloc(jit_State *J, RecordFFData *rd, CTypeID id) TRef sp, dp; TValue tv; TValue *sval = &tv; - setnumV(&tv, 0); + setintV(&tv, 0); if (!gcref(df->name)) continue; /* Ignore unnamed fields. */ dc = ctype_rawchild(cts, df); /* Field type. */ if (!(ctype_isnum(dc->info) || ctype_isptr(dc->info))) diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index 631321d9..8077bf84 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -63,9 +63,9 @@ typedef void (LJ_FASTCALL *RecordFunc)(jit_State *J, RecordFFData *rd); /* Get runtime value of int argument. */ static int32_t argv2int(jit_State *J, TValue *o) { - if (!tvisnum(o) && !(tvisstr(o) && lj_str_tonum(strV(o), o))) + if (!tvisnumber(o) && !(tvisstr(o) && lj_str_tonumber(strV(o), o))) lj_trace_err(J, LJ_TRERR_BADTYPE); - return lj_num2bit(numV(o)); + return tvisint(o) ? intV(o) : lj_num2int(numV(o)); } /* Get runtime value of string argument. */ @@ -75,9 +75,12 @@ static GCstr *argv2str(jit_State *J, TValue *o) return strV(o); } else { GCstr *s; - if (!tvisnum(o)) + if (!tvisnumber(o)) lj_trace_err(J, LJ_TRERR_BADTYPE); - s = lj_str_fromnum(J->L, &o->n); + if (tvisint(o)) + s = lj_str_fromint(J->L, intV(o)); + else + s = lj_str_fromnum(J->L, &o->n); setstrV(J->L, o, s); return s; } @@ -128,7 +131,7 @@ static void LJ_FASTCALL recff_type(jit_State *J, RecordFFData *rd) { /* Arguments already specialized. 
Result is a constant string. Neat, huh? */ uint32_t t; - if (tvisnum(&rd->argv[0])) + if (tvisnumber(&rd->argv[0])) t = ~LJ_TNUMX; else if (LJ_64 && tvislightud(&rd->argv[0])) t = ~LJ_TLIGHTUD; @@ -255,7 +258,7 @@ static void LJ_FASTCALL recff_tonumber(jit_State *J, RecordFFData *rd) TRef tr = J->base[0]; TRef base = J->base[1]; if (tr && base) { - base = lj_ir_toint(J, base); + base = lj_opt_narrow_toint(J, base); if (!tref_isk(base) || IR(tref_ref(base))->i != 10) recff_nyiu(J); } @@ -332,12 +335,12 @@ static void LJ_FASTCALL recff_ipairs_aux(jit_State *J, RecordFFData *rd) RecordIndex ix; ix.tab = J->base[0]; if (tref_istab(ix.tab)) { - if (!tvisnum(&rd->argv[1])) /* No support for string coercion. */ + if (!tvisnumber(&rd->argv[1])) /* No support for string coercion. */ lj_trace_err(J, LJ_TRERR_BADTYPE); - setnumV(&ix.keyv, numV(&rd->argv[1])+(lua_Number)1); + setintV(&ix.keyv, numberVint(&rd->argv[1])+1); settabV(J->L, &ix.tabv, tabV(&rd->argv[0])); ix.val = 0; ix.idxchain = 0; - ix.key = lj_ir_toint(J, J->base[1]); + ix.key = lj_opt_narrow_toint(J, J->base[1]); J->base[0] = ix.key = emitir(IRTI(IR_ADD), ix.key, lj_ir_kint(J, 1)); J->base[1] = lj_record_idx(J, &ix); rd->nres = tref_isnil(J->base[1]) ? 0 : 2; @@ -525,26 +528,26 @@ static void LJ_FASTCALL recff_math_random(jit_State *J, RecordFFData *rd) /* Record unary bit.tobit, bit.bnot, bit.bswap. */ static void LJ_FASTCALL recff_bit_unary(jit_State *J, RecordFFData *rd) { - TRef tr = lj_ir_tobit(J, J->base[0]); + TRef tr = lj_opt_narrow_tobit(J, J->base[0]); J->base[0] = (rd->data == IR_TOBIT) ? tr : emitir(IRTI(rd->data), tr, 0); } /* Record N-ary bit.band, bit.bor, bit.bxor. */ static void LJ_FASTCALL recff_bit_nary(jit_State *J, RecordFFData *rd) { - TRef tr = lj_ir_tobit(J, J->base[0]); + TRef tr = lj_opt_narrow_tobit(J, J->base[0]); uint32_t op = rd->data; BCReg i; for (i = 1; J->base[i] != 0; i++) - tr = emitir(IRTI(op), tr, lj_ir_tobit(J, J->base[i])); + tr = emitir(IRTI(op), tr, lj_opt_narrow_tobit(J, J->base[i])); J->base[0] = tr; } /* Record bit shifts. */ static void LJ_FASTCALL recff_bit_shift(jit_State *J, RecordFFData *rd) { - TRef tr = lj_ir_tobit(J, J->base[0]); - TRef tsh = lj_ir_tobit(J, J->base[1]); + TRef tr = lj_opt_narrow_tobit(J, J->base[0]); + TRef tsh = lj_opt_narrow_tobit(J, J->base[1]); if (!(rd->data < IR_BROL ? 
LJ_TARGET_MASKSHIFT : LJ_TARGET_MASKROT) && !tref_isk(tsh)) tsh = emitir(IRTI(IR_BAND), tsh, lj_ir_kint(J, 31)); @@ -570,25 +573,25 @@ static void LJ_FASTCALL recff_string_range(jit_State *J, RecordFFData *rd) int32_t start, end; if (rd->data) { /* string.sub(str, start [,end]) */ start = argv2int(J, &rd->argv[1]); - trstart = lj_ir_toint(J, J->base[1]); + trstart = lj_opt_narrow_toint(J, J->base[1]); trend = J->base[2]; if (tref_isnil(trend)) { trend = lj_ir_kint(J, -1); end = -1; } else { - trend = lj_ir_toint(J, trend); + trend = lj_opt_narrow_toint(J, trend); end = argv2int(J, &rd->argv[2]); } } else { /* string.byte(str, [,start [,end]]) */ if (J->base[1]) { start = argv2int(J, &rd->argv[1]); - trstart = lj_ir_toint(J, J->base[1]); + trstart = lj_opt_narrow_toint(J, J->base[1]); trend = J->base[2]; if (tref_isnil(trend)) { trend = trstart; end = start; } else { - trend = lj_ir_toint(J, trend); + trend = lj_opt_narrow_toint(J, trend); end = argv2int(J, &rd->argv[2]); } } else { diff --git a/src/lj_ir.c b/src/lj_ir.c index 1d57938e..721cfd0f 100644 --- a/src/lj_ir.c +++ b/src/lj_ir.c @@ -426,32 +426,6 @@ TRef LJ_FASTCALL lj_ir_tostr(jit_State *J, TRef tr) return tr; } -/* Convert from number or string to bitop operand (overflow wrapped). */ -TRef LJ_FASTCALL lj_ir_tobit(jit_State *J, TRef tr) -{ - if (!tref_isinteger(tr)) { - if (tref_isstr(tr)) - tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0); - else if (!tref_isnum(tr)) - lj_trace_err(J, LJ_TRERR_BADTYPE); - tr = emitir(IRTI(IR_TOBIT), tr, lj_ir_knum_tobit(J)); - } - return tr; -} - -/* Convert from number or string to integer (overflow undefined). */ -TRef LJ_FASTCALL lj_ir_toint(jit_State *J, TRef tr) -{ - if (!tref_isinteger(tr)) { - if (tref_isstr(tr)) - tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0); - else if (!tref_isnum(tr)) - lj_trace_err(J, LJ_TRERR_BADTYPE); - tr = emitir(IRTI(IR_CONV), tr, IRCONV_INT_NUM|IRCONV_ANY); - } - return tr; -} - /* -- Miscellaneous IR ops ------------------------------------------------ */ /* Evaluate numeric comparison. */ diff --git a/src/lj_ir.h b/src/lj_ir.h index 060cf562..c46bbbe0 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -124,7 +124,7 @@ _(XBAR, S , ___, ___) \ \ /* Type conversions. 
*/ \ - _(CONV, N , ref, lit) \ + _(CONV, NW, ref, lit) \ _(TOBIT, N , ref, ref) \ _(TOSTR, N , ref, ___) \ _(STRTO, N , ref, ___) \ @@ -345,8 +345,8 @@ typedef enum { #define IRM_AW (IRM_A|IRM_W) #define IRM_LW (IRM_L|IRM_W) -#define irm_op1(m) (cast(IRMode, (m)&3)) -#define irm_op2(m) (cast(IRMode, ((m)>>2)&3)) +#define irm_op1(m) ((IRMode)((m)&3)) +#define irm_op2(m) ((IRMode)(((m)>>2)&3)) #define irm_iscomm(m) ((m) & IRM_C) #define irm_kind(m) ((m) & IRM_S) @@ -401,8 +401,8 @@ typedef struct IRType1 { uint8_t irt; } IRType1; #define IRTG(o, t) (IRT((o), IRT_GUARD|(t))) #define IRTGI(o) (IRT((o), IRT_GUARD|IRT_INT)) -#define irt_t(t) (cast(IRType, (t).irt)) -#define irt_type(t) (cast(IRType, (t).irt & IRT_TYPE)) +#define irt_t(t) ((IRType)(t).irt) +#define irt_type(t) ((IRType)((t).irt & IRT_TYPE)) #define irt_sametype(t1, t2) ((((t1).irt ^ (t2).irt) & IRT_TYPE) == 0) #define irt_typerange(t, first, last) \ ((uint32_t)((t).irt & IRT_TYPE) - (uint32_t)(first) <= (uint32_t)(last-first)) @@ -441,18 +441,30 @@ typedef struct IRType1 { uint8_t irt; } IRType1; static LJ_AINLINE IRType itype2irt(const TValue *tv) { - if (tvisnum(tv)) + if (tvisint(tv)) + return IRT_INT; + else if (tvisnum(tv)) return IRT_NUM; #if LJ_64 else if (tvislightud(tv)) return IRT_LIGHTUD; #endif else - return cast(IRType, ~itype(tv)); + return (IRType)~itype(tv); } -#define irt_toitype(t) \ - check_exp(!(LJ_64 && irt_islightud((t))), ~(uint32_t)irt_type((t))) +static LJ_AINLINE uint32_t irt_toitype_(IRType t) +{ + lua_assert(!LJ_64 || t != IRT_LIGHTUD); + if (LJ_DUALNUM && t > IRT_NUM) { + return LJ_TISNUM; + } else { + lua_assert(t <= IRT_NUM); + return ~(uint32_t)t; + } +} + +#define irt_toitype(t) irt_toitype_(irt_type((t))) #define irt_isguard(t) ((t).irt & IRT_GUARD) #define irt_ismarked(t) ((t).irt & IRT_MARK) diff --git a/src/lj_iropt.h b/src/lj_iropt.h index db99c118..1c94e91c 100644 --- a/src/lj_iropt.h +++ b/src/lj_iropt.h @@ -84,8 +84,6 @@ LJ_FUNC void lj_ir_kvalue(lua_State *L, TValue *tv, const IRIns *ir); /* Convert IR operand types. */ LJ_FUNC TRef LJ_FASTCALL lj_ir_tonum(jit_State *J, TRef tr); LJ_FUNC TRef LJ_FASTCALL lj_ir_tostr(jit_State *J, TRef tr); -LJ_FUNC TRef LJ_FASTCALL lj_ir_tobit(jit_State *J, TRef tr); -LJ_FUNC TRef LJ_FASTCALL lj_ir_toint(jit_State *J, TRef tr); /* Miscellaneous IR ops. */ LJ_FUNC int lj_ir_numcmp(lua_Number a, lua_Number b, IROp op); @@ -134,9 +132,17 @@ LJ_FUNC TRef LJ_FASTCALL lj_opt_dse_xstore(jit_State *J); /* Narrowing. */ LJ_FUNC TRef LJ_FASTCALL lj_opt_narrow_convert(jit_State *J); +LJ_FUNC TRef LJ_FASTCALL lj_opt_narrow_index(jit_State *J, TRef key); +LJ_FUNC TRef LJ_FASTCALL lj_opt_narrow_toint(jit_State *J, TRef tr); +LJ_FUNC TRef LJ_FASTCALL lj_opt_narrow_tobit(jit_State *J, TRef tr); +#if LJ_HASFFI +LJ_FUNC TRef LJ_FASTCALL lj_opt_narrow_cindex(jit_State *J, TRef key); +#endif +LJ_FUNC TRef lj_opt_narrow_arith(jit_State *J, TRef rb, TRef rc, + TValue *vb, TValue *vc, IROp op); LJ_FUNC TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc); LJ_FUNC TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vc); -LJ_FUNC IRType lj_opt_narrow_forl(cTValue *forbase); +LJ_FUNC IRType lj_opt_narrow_forl(jit_State *J, cTValue *forbase); /* Optimization passes. 
*/ LJ_FUNC void lj_opt_dce(jit_State *J); diff --git a/src/lj_meta.c b/src/lj_meta.c index 23f11f58..48cee510 100644 --- a/src/lj_meta.c +++ b/src/lj_meta.c @@ -393,13 +393,27 @@ void LJ_FASTCALL lj_meta_for(lua_State *L, TValue *o) lj_err_msg(L, LJ_ERR_FORLIM); if (!(tvisnumber(o+2) || (tvisstr(o+2) && lj_str_tonumber(strV(o+2), o+2)))) lj_err_msg(L, LJ_ERR_FORSTEP); -#if LJ_DUALNUM - /* Ensure all slots are integers or all slots are numbers. */ - if (!(tvisint(o) && tvisint(o+1) && tvisint(o+2))) { - if (tvisint(o)) setnumV(o, (lua_Number)intV(o)); - if (tvisint(o+1)) setnumV(o+1, (lua_Number)intV(o+1)); - if (tvisint(o+2)) setnumV(o+2, (lua_Number)intV(o+2)); + if (LJ_DUALNUM) { + /* Ensure all slots are integers or all slots are numbers. */ + int32_t k[3]; + int nint = 0; + ptrdiff_t i; + for (i = 0; i <= 2; i++) { + if (tvisint(o+i)) { + k[i] = intV(o+i); nint++; + } else { + k[i] = lj_num2int(numV(o+i)); nint += ((lua_Number)k[i] == numV(o+i)); + } + } + if (nint == 3) { /* Narrow to integers. */ + setintV(o, k[0]); + setintV(o+1, k[1]); + setintV(o+2, k[2]); + } else if (nint != 0) { /* Widen to numbers. */ + if (tvisint(o)) setnumV(o, (lua_Number)intV(o)); + if (tvisint(o+1)) setnumV(o+1, (lua_Number)intV(o+1)); + if (tvisint(o+2)) setnumV(o+2, (lua_Number)intV(o+2)); + } } -#endif } diff --git a/src/lj_meta.h b/src/lj_meta.h index 687e6c08..32b3dec3 100644 --- a/src/lj_meta.h +++ b/src/lj_meta.h @@ -29,6 +29,6 @@ LJ_FUNCA TValue *lj_meta_equal(lua_State *L, GCobj *o1, GCobj *o2, int ne); LJ_FUNCA TValue * LJ_FASTCALL lj_meta_equal_cd(lua_State *L, BCIns ins); LJ_FUNCA TValue *lj_meta_comp(lua_State *L, cTValue *o1, cTValue *o2, int op); LJ_FUNCA void lj_meta_call(lua_State *L, TValue *func, TValue *top); -LJ_FUNCA void LJ_FASTCALL lj_meta_for(lua_State *L, TValue *base); +LJ_FUNCA void LJ_FASTCALL lj_meta_for(lua_State *L, TValue *o); #endif diff --git a/src/lj_obj.h b/src/lj_obj.h index 88289f3e..19a2345f 100644 --- a/src/lj_obj.h +++ b/src/lj_obj.h @@ -325,8 +325,6 @@ typedef struct GCproto { #define proto_kgc(pt, idx) \ check_exp((uintptr_t)(intptr_t)(idx) >= (uintptr_t)-(intptr_t)(pt)->sizekgc, \ gcref(mref((pt)->k, GCRef)[(idx)])) -#define proto_knum(pt, idx) \ - check_exp((uintptr_t)(idx) < (pt)->sizekn, mref((pt)->k, lua_Number)[(idx)]) #define proto_knumtv(pt, idx) \ check_exp((uintptr_t)(idx) < (pt)->sizekn, &mref((pt)->k, TValue)[(idx)]) #define proto_bc(pt) ((BCIns *)((char *)(pt) + sizeof(GCproto))) diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 471a4b29..e2d5c517 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -558,7 +558,10 @@ LJFOLD(CONV KINT IRCONV_I64_INT) LJFOLD(CONV KINT IRCONV_U64_INT) LJFOLDF(kfold_conv_kint_i64) { - return INT64FOLD((uint64_t)(int64_t)fleft->i); + if ((fins->op2 & IRCONV_SEXT)) + return INT64FOLD((uint64_t)(int64_t)fleft->i); + else + return INT64FOLD((uint64_t)(int64_t)(uint32_t)fleft->i); } LJFOLD(CONV KINT64 IRCONV_NUM_I64) diff --git a/src/lj_opt_loop.c b/src/lj_opt_loop.c index 559e579e..6dd06636 100644 --- a/src/lj_opt_loop.c +++ b/src/lj_opt_loop.c @@ -300,8 +300,11 @@ static void loop_unroll(jit_State *J) } /* Check all loop-carried dependencies for type instability. */ if (!irt_sametype(t, irr->t)) { - if (irt_isnum(t) && irt_isinteger(irr->t)) /* Fix int->num case. */ + if (irt_isnum(t) && irt_isinteger(irr->t)) /* Fix int->num. */ subst[ins] = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT)); + else if (irt_isnum(irr->t) && irt_isinteger(t)) /* Fix num->int. 
*/ + subst[ins] = tref_ref(emitir(IRTGI(IR_CONV), ref, + IRCONV_INT_NUM|IRCONV_CHECK)); else if (!(irt_isinteger(t) && irt_isinteger(irr->t))) lj_trace_err(J, LJ_TRERR_TYPEINS); } @@ -355,8 +358,8 @@ int lj_opt_loop(jit_State *J) int errcode = lj_vm_cpcall(J->L, NULL, J, cploop_opt); if (LJ_UNLIKELY(errcode)) { lua_State *L = J->L; - if (errcode == LUA_ERRRUN && tvisnum(L->top-1)) { /* Trace error? */ - int32_t e = lj_num2int(numV(L->top-1)); + if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) { /* Trace error? */ + int32_t e = numberVint(L->top-1); switch ((TraceError)e) { case LJ_TRERR_TYPEINS: /* Type instability. */ case LJ_TRERR_GFAIL: /* Guard would always fail. */ diff --git a/src/lj_opt_narrow.c b/src/lj_opt_narrow.c index 0a2bb6cd..1727e9b5 100644 --- a/src/lj_opt_narrow.c +++ b/src/lj_opt_narrow.c @@ -1,5 +1,6 @@ /* ** NARROW: Narrowing of numbers to integers (double to int32_t). +** STRIPOV: Stripping of overflow checks. ** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h */ @@ -16,6 +17,7 @@ #include "lj_jit.h" #include "lj_iropt.h" #include "lj_trace.h" +#include "lj_vm.h" /* Rationale for narrowing optimizations: ** @@ -57,24 +59,34 @@ ** ** A better solution is to keep all numbers as FP values and only narrow ** when it's beneficial to do so. LuaJIT uses predictive narrowing for -** induction variables and demand-driven narrowing for index expressions -** and bit operations. Additionally it can eliminate or hoists most of the -** resulting overflow checks. Regular arithmetic computations are never -** narrowed to integers. +** induction variables and demand-driven narrowing for index expressions, +** integer arguments and bit operations. Additionally it can eliminate or +** hoist most of the resulting overflow checks. Regular arithmetic +** computations are never narrowed to integers. ** ** The integer type in the IR has convenient wrap-around semantics and ** ignores overflow. Extra operations have been added for ** overflow-checking arithmetic (ADDOV/SUBOV) instead of an extra type. ** Apart from reducing overall complexity of the compiler, this also ** nicely solves the problem where you want to apply algebraic -** simplifications to ADD, but not to ADDOV. And the assembler can use lea -** instead of an add for integer ADD, but not for ADDOV (lea does not -** affect the flags, but it helps to avoid register moves). +** simplifications to ADD, but not to ADDOV. And the x86/x64 assembler can +** use lea instead of an add for integer ADD, but not for ADDOV (lea does +** not affect the flags, but it helps to avoid register moves). ** -** Note that all of the above has to be reconsidered if LuaJIT is to be -** ported to architectures with slow FP operations or with no hardware FPU -** at all. In the latter case an integer-only port may be the best overall -** solution (if this still meets user demands). +** +** All of the above has to be reconsidered for architectures with slow FP +** operations or without a hardware FPU. The dual-number mode of LuaJIT +** addresses this issue. Arithmetic operations are performed on integers +** as far as possible and overflow checks are added as needed. +** +** This implies that narrowing for integer arguments and bit operations +** should also strip overflow checks, e.g. replace ADDOV with ADD. The +** original overflow guards are weak and can be eliminated by DCE, if +** there's no other use. 
+** +** A slight twist is that it's usually beneficial to use overflow-checked +** integer arithmetics if all inputs are already integers. This is the only +** change that affects the single-number mode, too. */ /* Some local macros to save typing. Undef'd at the end. */ @@ -94,10 +106,10 @@ ** already takes care of eliminating simple redundant conversions like ** CONV.int.num(CONV.num.int(x)) ==> x. ** -** But the surrounding code is FP-heavy and all arithmetic operations are -** performed on FP numbers. Consider a common example such as 'x=t[i+1]', -** with 'i' already an integer (due to induction variable narrowing). The -** index expression would be recorded as +** But the surrounding code is FP-heavy and arithmetic operations are +** performed on FP numbers (for the single-number mode). Consider a common +** example such as 'x=t[i+1]', with 'i' already an integer (due to induction +** variable narrowing). The index expression would be recorded as ** CONV.int.num(ADD(CONV.num.int(i), 1)) ** which is clearly suboptimal. ** @@ -113,6 +125,9 @@ ** FP ops remain in the IR and are eliminated by DCE since all references to ** them are gone. ** +** [In dual-number mode the trace recorder already emits ADDOV etc., but +** this can be further reduced. See below.] +** ** Special care has to be taken to avoid narrowing across an operation ** which is potentially operating on non-integral operands. One obvious ** case is when an expression contains a non-integral constant, but ends @@ -221,6 +236,26 @@ static void narrow_bpc_set(jit_State *J, IRRef1 key, IRRef1 val, IRRef mode) bp->mode = mode; } +/* Backpropagate overflow stripping. */ +static void narrow_stripov_backprop(NarrowConv *nc, IRRef ref, int depth) +{ + jit_State *J = nc->J; + IRIns *ir = IR(ref); + if (ir->o == IR_ADDOV || ir->o == IR_SUBOV || + (ir->o == IR_MULOV && (nc->mode & IRCONV_CONVMASK) == IRCONV_ANY)) { + BPropEntry *bp = narrow_bpc_get(nc->J, ref, IRCONV_TOBIT); + if (bp) { + ref = bp->val; + } else if (++depth < NARROW_MAX_BACKPROP && nc->sp < nc->maxsp) { + narrow_stripov_backprop(nc, ir->op1, depth); + narrow_stripov_backprop(nc, ir->op2, depth); + *nc->sp++ = NARROWINS(IRT(ir->o - IR_ADDOV + IR_ADD, IRT_INT), ref); + return; + } + } + *nc->sp++ = NARROWINS(NARROW_REF, ref); +} + /* Backpropagate narrowing conversion. Return number of needed conversions. */ static int narrow_conv_backprop(NarrowConv *nc, IRRef ref, int depth) { @@ -230,24 +265,26 @@ static int narrow_conv_backprop(NarrowConv *nc, IRRef ref, int depth) /* Check the easy cases first. */ if (ir->o == IR_CONV && (ir->op2 & IRCONV_SRCMASK) == IRT_INT) { - if (nc->t == IRT_I64) - *nc->sp++ = NARROWINS(NARROW_SEXT, ir->op1); /* Reduce to sign-ext. */ + if ((nc->mode & IRCONV_CONVMASK) <= IRCONV_ANY) + narrow_stripov_backprop(nc, ir->op1, depth+1); else *nc->sp++ = NARROWINS(NARROW_REF, ir->op1); /* Undo conversion. */ + if (nc->t == IRT_I64) + *nc->sp++ = NARROWINS(NARROW_SEXT, 0); /* Sign-extend integer. */ return 0; } else if (ir->o == IR_KNUM) { /* Narrow FP constant. */ lua_Number n = ir_knum(ir)->n; if ((nc->mode & IRCONV_CONVMASK) == IRCONV_TOBIT) { /* Allows a wider range of constants. */ int64_t k64 = (int64_t)n; - if (n == cast_num(k64)) { /* Only if constant doesn't lose precision. */ + if (n == (lua_Number)k64) { /* Only if const doesn't lose precision. */ *nc->sp++ = NARROWINS(NARROW_INT, 0); *nc->sp++ = (NarrowIns)k64; /* But always truncate to 32 bits. 
*/ return 0; } } else { int32_t k = lj_num2int(n); - if (n == cast_num(k)) { /* Only if constant is really an integer. */ + if (n == (lua_Number)k) { /* Only if constant is really an integer. */ *nc->sp++ = NARROWINS(NARROW_INT, 0); *nc->sp++ = (NarrowIns)k; return 0; @@ -287,7 +324,8 @@ static int narrow_conv_backprop(NarrowConv *nc, IRRef ref, int depth) mode = (IRT_INT<<5)|IRT_NUM|IRCONV_INDEX; bp = narrow_bpc_get(nc->J, (IRRef1)ref, mode); if (bp) { - *nc->sp++ = NARROWINS(NARROW_SEXT, bp->val); + *nc->sp++ = NARROWINS(NARROW_REF, bp->val); + *nc->sp++ = NARROWINS(NARROW_SEXT, 0); return 0; } } @@ -326,8 +364,9 @@ static IRRef narrow_conv_emit(jit_State *J, NarrowConv *nc) } else if (op == NARROW_CONV) { *sp++ = emitir_raw(convot, ref, convop2); /* Raw emit avoids a loop. */ } else if (op == NARROW_SEXT) { - *sp++ = emitir(IRT(IR_CONV, IRT_I64), ref, - (IRT_I64<<5)|IRT_INT|IRCONV_SEXT); + lua_assert(sp >= nc->stack+1); + sp[-1] = emitir(IRT(IR_CONV, IRT_I64), sp[-1], + (IRT_I64<<5)|IRT_INT|IRCONV_SEXT); } else if (op == NARROW_INT) { lua_assert(next < last); *sp++ = nc->t == IRT_I64 ? @@ -340,7 +379,7 @@ static IRRef narrow_conv_emit(jit_State *J, NarrowConv *nc) /* Omit some overflow checks for array indexing. See comments above. */ if ((mode & IRCONV_CONVMASK) == IRCONV_INDEX) { if (next == last && irref_isk(narrow_ref(sp[0])) && - (uint32_t)IR(narrow_ref(sp[0]))->i + 0x40000000 < 0x80000000) + (uint32_t)IR(narrow_ref(sp[0]))->i + 0x40000000u < 0x80000000u) guardot = 0; else /* Otherwise cache a stronger check. */ mode += IRCONV_CHECK-IRCONV_INDEX; @@ -377,12 +416,123 @@ TRef LJ_FASTCALL lj_opt_narrow_convert(jit_State *J) return NEXTFOLD; } +/* -- Narrowing of implicit conversions ----------------------------------- */ + +/* Recursively strip overflow checks. */ +static TRef narrow_stripov(jit_State *J, TRef tr, int lastop, IRRef mode) +{ + IRRef ref = tref_ref(tr); + IRIns *ir = IR(ref); + int op = ir->o; + if (op >= IR_ADDOV && op <= lastop) { + BPropEntry *bp = narrow_bpc_get(J, ref, mode); + if (bp) { + return TREF(bp->val, irt_t(IR(bp->val)->t)); + } else { + IRRef op1 = ir->op1, op2 = ir->op2; /* The IR may be reallocated. */ + op1 = narrow_stripov(J, op1, lastop, mode); + op2 = narrow_stripov(J, op2, lastop, mode); + tr = emitir(IRT(op - IR_ADDOV + IR_ADD, + ((mode & IRCONV_DSTMASK) >> IRCONV_DSH)), op1, op2); + narrow_bpc_set(J, ref, tref_ref(tr), mode); + } + } else if (LJ_64 && (mode & IRCONV_SEXT) && !irt_is64(ir->t)) { + tr = emitir(IRT(IR_CONV, IRT_INTP), tr, mode); + } + return tr; +} + +/* Narrow array index. */ +TRef LJ_FASTCALL lj_opt_narrow_index(jit_State *J, TRef tr) +{ + IRIns *ir; + lua_assert(tref_isnumber(tr)); + if (tref_isnum(tr)) /* Conversion may be narrowed, too. See above. */ + return emitir(IRTGI(IR_CONV), tr, IRCONV_INT_NUM|IRCONV_INDEX); + /* Omit some overflow checks for array indexing. See comments above. */ + ir = IR(tref_ref(tr)); + if ((ir->o == IR_ADDOV || ir->o == IR_SUBOV) && irref_isk(ir->op2) && + (uint32_t)IR(ir->op2)->i + 0x40000000u < 0x80000000u) + return emitir(IRTI(ir->o - IR_ADDOV + IR_ADD), ir->op1, ir->op2); + return tr; +} + +/* Narrow conversion to integer operand (overflow undefined). */ +TRef LJ_FASTCALL lj_opt_narrow_toint(jit_State *J, TRef tr) +{ + if (tref_isstr(tr)) + tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0); + if (tref_isnum(tr)) /* Conversion may be narrowed, too. See above. 
*/ + return emitir(IRTI(IR_CONV), tr, IRCONV_INT_NUM|IRCONV_ANY); + if (!tref_isinteger(tr)) + lj_trace_err(J, LJ_TRERR_BADTYPE); + /* + ** Undefined overflow semantics allow stripping of ADDOV, SUBOV and MULOV. + ** Use IRCONV_TOBIT for the cache entries, since the semantics are the same. + */ + return narrow_stripov(J, tr, IR_MULOV, (IRT_INT<<5)|IRT_INT|IRCONV_TOBIT); +} + +/* Narrow conversion to bitop operand (overflow wrapped). */ +TRef LJ_FASTCALL lj_opt_narrow_tobit(jit_State *J, TRef tr) +{ + if (tref_isstr(tr)) + tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0); + if (tref_isnum(tr)) /* Conversion may be narrowed, too. See above. */ + return emitir(IRTI(IR_TOBIT), tr, lj_ir_knum_tobit(J)); + if (!tref_isinteger(tr)) + lj_trace_err(J, LJ_TRERR_BADTYPE); + /* + ** Wrapped overflow semantics allow stripping of ADDOV and SUBOV. + ** MULOV cannot be stripped due to precision widening. + */ + return narrow_stripov(J, tr, IR_SUBOV, (IRT_INT<<5)|IRT_INT|IRCONV_TOBIT); +} + +#if LJ_HASFFI +/* Narrow C array index (overflow undefined). */ +TRef LJ_FASTCALL lj_opt_narrow_cindex(jit_State *J, TRef tr) +{ + lua_assert(tref_isnumber(tr)); + if (tref_isnum(tr)) + return emitir(IRTI(IR_CONV), tr, + (IRT_INTP<<5)|IRT_NUM|IRCONV_TRUNC|IRCONV_ANY); + /* Undefined overflow semantics allow stripping of ADDOV, SUBOV and MULOV. */ + return narrow_stripov(J, tr, IR_MULOV, + LJ_64 ? ((IRT_INTP<<5)|IRT_INT|IRCONV_SEXT) : + ((IRT_INTP<<5)|IRT_INT|IRCONV_TOBIT)); +} +#endif + /* -- Narrowing of arithmetic operators ----------------------------------- */ /* Check whether a number fits into an int32_t (-0 is ok, too). */ static int numisint(lua_Number n) { - return (n == cast_num(lj_num2int(n))); + return (n == (lua_Number)lj_num2int(n)); +} + +/* Narrowing of arithmetic operations. */ +TRef lj_opt_narrow_arith(jit_State *J, TRef rb, TRef rc, + TValue *vb, TValue *vc, IROp op) +{ + if (tref_isstr(rb)) { + rb = emitir(IRTG(IR_STRTO, IRT_NUM), rb, 0); + lj_str_tonum(strV(vb), vb); + } + if (tref_isstr(rc)) { + rc = emitir(IRTG(IR_STRTO, IRT_NUM), rc, 0); + lj_str_tonum(strV(vc), vc); + } + /* Must not narrow MUL in non-DUALNUM variant, because it loses -0. */ + if ((op >= IR_ADD && op <= (LJ_DUALNUM ? IR_MUL : IR_SUB)) && + tref_isinteger(rb) && tref_isinteger(rc) && + numisint(lj_vm_foldarith(numberVnum(vb), numberVnum(vc), + (int)op - (int)IR_ADD))) + return emitir(IRTGI((int)op - (int)IR_ADD + (int)IR_ADDOV), rb, rc); + if (!tref_isnum(rb)) rb = emitir(IRTN(IR_CONV), rb, IRCONV_NUM_INT); + if (!tref_isnum(rc)) rc = emitir(IRTN(IR_CONV), rc, IRCONV_NUM_INT); + return emitir(IRTN(op), rb, rc); } /* Narrowing of modulo operator. */ @@ -409,16 +559,15 @@ TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc) /* Narrowing of power operator or math.pow. */ TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vc) { - lua_Number n; if (tvisstr(vc) && !lj_str_tonum(strV(vc), vc)) lj_trace_err(J, LJ_TRERR_BADTYPE); - n = numV(vc); /* Narrowing must be unconditional to preserve (-x)^i semantics. */ - if (numisint(n)) { + if (tvisint(vc) || numisint(numV(vc))) { int checkrange = 0; /* Split pow is faster for bigger exponents. But do this only for (+k)^i. 
*/ if (tref_isk(rb) && (int32_t)ir_knum(IR(tref_ref(rb)))->u32.hi >= 0) { - if (!(n >= -65536.0 && n <= 65536.0)) goto split_pow; + int32_t k = numberVint(vc); + if (!(k >= -65536 && k <= 65536)) goto split_pow; checkrange = 1; } if (!tref_isinteger(rc)) { @@ -448,20 +597,28 @@ split_pow: /* -- Predictive narrowing of induction variables ------------------------- */ -/* Narrow the FORL index type by looking at the runtime values. */ -IRType lj_opt_narrow_forl(cTValue *forbase) +/* Narrow a single runtime value. */ +static int narrow_forl(jit_State *J, cTValue *o) { - lua_assert(tvisnum(&forbase[FORL_IDX]) && - tvisnum(&forbase[FORL_STOP]) && - tvisnum(&forbase[FORL_STEP])); + if (tvisint(o)) return 1; + if (LJ_DUALNUM || (J->flags & JIT_F_OPT_NARROW)) return numisint(numV(o)); + return 0; +} + +/* Narrow the FORL index type by looking at the runtime values. */ +IRType lj_opt_narrow_forl(jit_State *J, cTValue *tv) +{ + lua_assert(tvisnumber(&tv[FORL_IDX]) && + tvisnumber(&tv[FORL_STOP]) && + tvisnumber(&tv[FORL_STEP])); /* Narrow only if the runtime values of start/stop/step are all integers. */ - if (numisint(numV(&forbase[FORL_IDX])) && - numisint(numV(&forbase[FORL_STOP])) && - numisint(numV(&forbase[FORL_STEP]))) { + if (narrow_forl(J, &tv[FORL_IDX]) && + narrow_forl(J, &tv[FORL_STOP]) && + narrow_forl(J, &tv[FORL_STEP])) { /* And if the loop index can't possibly overflow. */ - lua_Number step = numV(&forbase[FORL_STEP]); - lua_Number sum = numV(&forbase[FORL_STOP]) + step; - if (0 <= step ? sum <= 2147483647.0 : sum >= -2147483648.0) + lua_Number step = numberVnum(&tv[FORL_STEP]); + lua_Number sum = numberVnum(&tv[FORL_STOP]) + step; + if (0 <= step ? (sum <= 2147483647.0) : (sum >= -2147483648.0)) return IRT_INT; } return IRT_NUM; diff --git a/src/lj_record.c b/src/lj_record.c index 2bfd2608..613e458e 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -13,6 +13,7 @@ #include "lj_err.h" #include "lj_str.h" #include "lj_tab.h" +#include "lj_meta.h" #include "lj_frame.h" #include "lj_bc.h" #include "lj_ff.h" @@ -102,7 +103,7 @@ static void rec_check_slots(jit_State *J) lua_assert((J->slot[s+1] & TREF_FRAME)); depth++; } else { - if (tvisnum(tv)) + if (tvisnumber(tv)) lua_assert(tref_isnumber(tr)); /* Could be IRT_INT etc., too. */ else lua_assert(itype2irt(tv) == tref_type(tr)); @@ -197,6 +198,7 @@ typedef enum { static void canonicalize_slots(jit_State *J) { BCReg s; + if (LJ_DUALNUM) return; for (s = J->baseslot+J->maxslot-1; s >= 1; s--) { TRef tr = J->slot[s]; if (tref_isinteger(tr)) { @@ -254,16 +256,16 @@ static TRef find_kinit(jit_State *J, const BCIns *endpc, BCReg slot, IRType t) } if (op == BC_KSHORT) { int32_t k = (int32_t)(int16_t)bc_d(ins); - return t == IRT_INT ? lj_ir_kint(J, k) : lj_ir_knum(J, cast_num(k)); + return t == IRT_INT ? lj_ir_kint(J, k) : lj_ir_knum(J, (lua_Number)k); } else { - lua_Number n = proto_knum(J->pt, bc_d(ins)); + cTValue *tv = proto_knumtv(J->pt, bc_d(ins)); if (t == IRT_INT) { - int32_t k = lj_num2int(n); - if (n == cast_num(k)) /* -0 is ok here. */ + int32_t k = numberVint(tv); + if (tvisint(tv) || numV(tv) == (lua_Number)k) /* -0 is ok here. */ return lj_ir_kint(J, k); return 0; /* Type mismatch. */ } else { - return lj_ir_knum(J, n); + return lj_ir_knum(J, numberVnum(tv)); } } } @@ -273,41 +275,47 @@ static TRef find_kinit(jit_State *J, const BCIns *endpc, BCReg slot, IRType t) return 0; /* No assignment to this slot found? */ } +/* Load and optionally convert a FORI argument from a slot. 
*/ +static TRef fori_load(jit_State *J, BCReg slot, IRType t, int mode) +{ + int conv = (tvisint(&J->L->base[slot]) != (t==IRT_INT)) ? IRSLOAD_CONVERT : 0; + return sloadt(J, (int32_t)slot, + t + (((mode & IRSLOAD_TYPECHECK) || + (conv && t == IRT_INT && !(mode >> 16))) ? + IRT_GUARD : 0), + mode + conv); +} + /* Peek before FORI to find a const initializer. Otherwise load from slot. */ -static TRef fori_arg(jit_State *J, const BCIns *fori, BCReg slot, IRType t) +static TRef fori_arg(jit_State *J, const BCIns *fori, BCReg slot, + IRType t, int mode) { TRef tr = J->base[slot]; if (!tr) { tr = find_kinit(J, fori, slot, t); if (!tr) - tr = sloadt(J, (int32_t)slot, - t == IRT_INT ? (IRT_INT|IRT_GUARD) : t, - t == IRT_INT ? (IRSLOAD_CONVERT|IRSLOAD_READONLY|IRSLOAD_INHERIT) : - (IRSLOAD_READONLY|IRSLOAD_INHERIT)); + tr = fori_load(J, slot, t, mode); } return tr; } -/* In-place coercion of FORI arguments. */ -static lua_Number for_coerce(jit_State *J, TValue *o) -{ - if (!tvisnum(o) && !(tvisstr(o) && lj_str_tonum(strV(o), o))) - lj_trace_err(J, LJ_TRERR_BADTYPE); - return numV(o); -} - -/* Simulate the runtime behavior of the FOR loop iterator. +/* Return the direction of the FOR loop iterator. ** It's important to exactly reproduce the semantics of the interpreter. */ -static LoopEvent for_iter(jit_State *J, IROp *op, BCReg ra, int isforl) +static int rec_for_direction(cTValue *o) { - TValue *forbase = &J->L->base[ra]; - lua_Number stopv = for_coerce(J, &forbase[FORL_STOP]); - lua_Number idxv = for_coerce(J, &forbase[FORL_IDX]); - lua_Number stepv = for_coerce(J, &forbase[FORL_STEP]); + return (tvisint(o) ? intV(o) : (int32_t)o->u32.hi) >= 0; +} + +/* Simulate the runtime behavior of the FOR loop iterator. */ +static LoopEvent rec_for_iter(IROp *op, cTValue *o, int isforl) +{ + lua_Number stopv = numberVnum(&o[FORL_STOP]); + lua_Number idxv = numberVnum(&o[FORL_IDX]); + lua_Number stepv = numberVnum(&o[FORL_STEP]); if (isforl) idxv += stepv; - if ((int32_t)forbase[FORL_STEP].u32.hi >= 0) { + if (rec_for_direction(&o[FORL_STEP])) { if (idxv <= stopv) { *op = IR_LE; return LOOPEV_ENTER; } *op = IR_GT; return LOOPEV_LEAVE; } else { @@ -316,44 +324,123 @@ static LoopEvent for_iter(jit_State *J, IROp *op, BCReg ra, int isforl) } } +/* Record checks for FOR loop overflow and step direction. */ +static void rec_for_check(jit_State *J, IRType t, int dir, TRef stop, TRef step) +{ + if (!tref_isk(step)) { + /* Non-constant step: need a guard for the direction. */ + TRef zero = (t == IRT_INT) ? lj_ir_kint(J, 0) : lj_ir_knum_zero(J); + emitir(IRTG(dir ? IR_GE : IR_LT, t), step, zero); + /* Add hoistable overflow checks for a narrowed FORL index. */ + if (t == IRT_INT) { + if (tref_isk(stop)) { + /* Constant stop: optimize check away or to a range check for step. */ + int32_t k = IR(tref_ref(stop))->i; + if (dir) { + if (k > 0) + emitir(IRTGI(IR_LE), step, lj_ir_kint(J, (int32_t)0x7fffffff-k)); + } else { + if (k < 0) + emitir(IRTGI(IR_GE), step, lj_ir_kint(J, (int32_t)0x80000000-k)); + } + } else { + /* Stop+step variable: need full overflow check. */ + TRef tr = emitir(IRTGI(IR_ADDOV), step, stop); + emitir(IRTI(IR_USE), tr, 0); /* ADDOV is weak. Avoid dead result. */ + } + } + } else if (t == IRT_INT && !tref_isk(stop)) { + /* Constant step: optimize overflow check to a range check for stop. */ + int32_t k = IR(tref_ref(step))->i; + k = (int32_t)(dir ? 0x7fffffff : 0x80000000) - k; + emitir(IRTGI(dir ? IR_LE : IR_GE), stop, lj_ir_kint(J, k)); + } +} + +/* Record a FORL instruction. 
*/ +static void rec_for_loop(jit_State *J, const BCIns *fori, ScEvEntry *scev, + int init) +{ + BCReg ra = bc_a(*fori); + cTValue *tv = &J->L->base[ra]; + TRef idx = J->base[ra+FORL_IDX]; + IRType t = idx ? tref_type(idx) : + (init || LJ_DUALNUM) ? lj_opt_narrow_forl(J, tv) : IRT_NUM; + int mode = IRSLOAD_INHERIT + + ((!LJ_DUALNUM || tvisint(tv) == (t == IRT_INT)) ? IRSLOAD_READONLY : 0); + TRef stop = fori_arg(J, fori, ra+FORL_STOP, t, mode); + TRef step = fori_arg(J, fori, ra+FORL_STEP, t, mode); + int tc, dir = rec_for_direction(&tv[FORL_STEP]); + lua_assert(bc_op(*fori) == BC_FORI || bc_op(*fori) == BC_JFORI); + scev->t.irt = t; + scev->dir = dir; + scev->stop = tref_ref(stop); + scev->step = tref_ref(step); + if (init) + rec_for_check(J, t, dir, stop, step); + scev->start = tref_ref(find_kinit(J, fori, ra+FORL_IDX, IRT_INT)); + tc = (LJ_DUALNUM && + !(scev->start && irref_isk(scev->stop) && irref_isk(scev->step))) ? + IRSLOAD_TYPECHECK : 0; + if (tc) { + J->base[ra+FORL_STOP] = stop; + J->base[ra+FORL_STEP] = step; + } + if (!idx) + idx = fori_load(J, ra+FORL_IDX, t, + IRSLOAD_INHERIT + tc + (J->scev.start << 16)); + if (!init) + J->base[ra+FORL_IDX] = idx = emitir(IRT(IR_ADD, t), idx, step); + J->base[ra+FORL_EXT] = idx; + scev->idx = tref_ref(idx); + J->maxslot = ra+FORL_EXT+1; +} + /* Record FORL/JFORL or FORI/JFORI. */ static LoopEvent rec_for(jit_State *J, const BCIns *fori, int isforl) { BCReg ra = bc_a(*fori); - IROp op; - LoopEvent ev = for_iter(J, &op, ra, isforl); + TValue *tv = &J->L->base[ra]; TRef *tr = &J->base[ra]; - TRef idx, stop; + IROp op; + LoopEvent ev; + TRef stop; IRType t; if (isforl) { /* Handle FORL/JFORL opcodes. */ - TRef step; - idx = tr[FORL_IDX]; + TRef idx = tr[FORL_IDX]; if (tref_ref(idx) == J->scev.idx) { t = J->scev.t.irt; stop = J->scev.stop; - step = J->scev.step; + idx = emitir(IRT(IR_ADD, t), idx, J->scev.step); + tr[FORL_EXT] = tr[FORL_IDX] = idx; } else { - if (!idx) idx = sloadt(J, (int32_t)(ra+FORL_IDX), IRT_NUM, 0); - t = tref_type(idx); - stop = fori_arg(J, fori, ra+FORL_STOP, t); - step = fori_arg(J, fori, ra+FORL_STEP, t); + ScEvEntry scev; + rec_for_loop(J, fori, &scev, 0); + t = scev.t.irt; + stop = scev.stop; } - tr[FORL_IDX] = idx = emitir(IRT(IR_ADD, t), idx, step); } else { /* Handle FORI/JFORI opcodes. */ BCReg i; - t = IRT_NUM; + lj_meta_for(J->L, tv); + t = lj_opt_narrow_forl(J, tv); for (i = FORL_IDX; i <= FORL_STEP; i++) { - lua_assert(J->base[ra+i] != 0); /* Assumes the slots are already set. */ - tr[i] = lj_ir_tonum(J, J->base[ra+i]); + lua_assert(tref_isnumber_str(tr[i])); + if (tref_isstr(tr[i])) + tr[i] = emitir(IRTG(IR_STRTO, IRT_NUM), tr[i], 0); + if (t == IRT_INT) { + if (!tref_isinteger(tr[i])) + tr[i] = emitir(IRTI(IR_CONV), tr[i], IRCONV_INT_NUM|IRCONV_CHECK); + } else { + if (!tref_isnum(tr[i])) + tr[i] = emitir(IRTN(IR_CONV), tr[i], IRCONV_NUM_INT); + } } - idx = tr[FORL_IDX]; + tr[FORL_EXT] = tr[FORL_IDX]; stop = tr[FORL_STOP]; - if (!tref_isk(tr[FORL_STEP])) /* Non-const step: need direction guard. 
*/ - emitir(IRTG(((op-IR_LT)>>1)+IR_LT, IRT_NUM), - tr[FORL_STEP], lj_ir_knum_zero(J)); + rec_for_check(J, t, rec_for_direction(&tv[FORL_STEP]), stop, tr[FORL_STEP]); } - tr[FORL_EXT] = idx; + ev = rec_for_iter(&op, tv, isforl); if (ev == LOOPEV_LEAVE) { J->maxslot = ra+FORL_EXT+1; J->pc = fori+1; @@ -363,7 +450,7 @@ static LoopEvent rec_for(jit_State *J, const BCIns *fori, int isforl) } lj_snap_add(J); - emitir(IRTG(op, t), idx, stop); + emitir(IRTG(op, t), tr[FORL_IDX], stop); if (ev == LOOPEV_LEAVE) { J->maxslot = ra; @@ -870,7 +957,7 @@ static void rec_idx_abc(jit_State *J, TRef asizeref, TRef ikey, uint32_t asize) if (ref == J->scev.idx) { int32_t stop; lua_assert(irt_isint(J->scev.t) && ir->o == IR_SLOAD); - stop = lj_num2int(numV(&(J->L->base - J->baseslot)[ir->op1 + FORL_STOP])); + stop = numberVint(&(J->L->base - J->baseslot)[ir->op1 + FORL_STOP]); /* Runtime value for stop of loop is within bounds? */ if ((int64_t)stop + ofs < (int64_t)asize) { /* Emit invariant bounds check for stop. */ @@ -897,15 +984,12 @@ static TRef rec_idx_key(jit_State *J, RecordIndex *ix) /* Integer keys are looked up in the array part first. */ key = ix->key; if (tref_isnumber(key)) { - lua_Number n = numV(&ix->keyv); - int32_t k = lj_num2int(n); - lua_assert(tvisnum(&ix->keyv)); - /* Potential array key? */ - if ((MSize)k < LJ_MAX_ASIZE && n == cast_num(k)) { - TRef asizeref, ikey = key; - if (!tref_isinteger(ikey)) - ikey = emitir(IRTGI(IR_CONV), ikey, IRCONV_INT_NUM|IRCONV_INDEX); - asizeref = emitir(IRTI(IR_FLOAD), ix->tab, IRFL_TAB_ASIZE); + int32_t k = numberVint(&ix->keyv); + if (!tvisint(&ix->keyv) && numV(&ix->keyv) != (lua_Number)k) + k = LJ_MAX_ASIZE; + if ((MSize)k < LJ_MAX_ASIZE) { /* Potential array key? */ + TRef ikey = lj_opt_narrow_index(J, key); + TRef asizeref = emitir(IRTI(IR_FLOAD), ix->tab, IRFL_TAB_ASIZE); if ((MSize)k < t->asize) { /* Currently an array key? */ TRef arrayref; rec_idx_abc(J, asizeref, ikey, t->asize); @@ -1081,7 +1165,8 @@ TRef lj_record_idx(jit_State *J, RecordIndex *ix) } else { keybarrier = 0; /* Previous non-nil value kept the key alive. */ } - if (tref_isinteger(ix->val)) /* Convert int to number before storing. */ + /* Convert int to number before storing. */ + if (!LJ_DUALNUM && tref_isinteger(ix->val)) ix->val = emitir(IRTN(IR_CONV), ix->val, IRCONV_NUM_INT); emitir(IRT(loadop+IRDELTA_L2S, tref_type(ix->val)), xref, ix->val); if (keybarrier || tref_isgcv(ix->val)) @@ -1135,7 +1220,8 @@ static TRef rec_upvalue(jit_State *J, uint32_t uv, TRef val) if (irtype_ispri(t)) res = TREF_PRI(t); /* Canonicalize primitive refs. */ return res; } else { /* Upvalue store. */ - if (tref_isinteger(val)) /* Convert int to number before storing. */ + /* Convert int to number before storing. */ + if (!LJ_DUALNUM && tref_isinteger(val)) val = emitir(IRTN(IR_CONV), val, IRCONV_NUM_INT); emitir(IRT(IR_USTORE, tref_type(val)), uref, val); if (needbarrier && tref_isgcv(val)) @@ -1455,16 +1541,15 @@ void lj_record_ins(jit_State *J) case BCMnone: rb = 0; rc = bc_d(ins); break; /* Upgrade rc to 'rd'. */ case BCMvar: copyTV(J->L, rbv, &lbase[rb]); ix.tab = rb = getslot(J, rb); break; - case BCMnum: { lua_Number n = proto_knum(J->pt, rb); - setnumV(rbv, n); ix.tab = rb = lj_ir_knumint(J, n); } break; default: break; /* Handled later. 
*/ } switch (bcmode_c(op)) { case BCMvar: copyTV(J->L, rcv, &lbase[rc]); ix.key = rc = getslot(J, rc); break; case BCMpri: setitype(rcv, ~rc); ix.key = rc = TREF_PRI(IRT_NIL+rc); break; - case BCMnum: { lua_Number n = proto_knum(J->pt, rc); - setnumV(rcv, n); ix.key = rc = lj_ir_knumint(J, n); } break; + case BCMnum: { cTValue *tv = proto_knumtv(J->pt, rc); + copyTV(J->L, rcv, tv); ix.key = rc = tvisint(tv) ? lj_ir_kint(J, intV(tv)) : + lj_ir_knumint(J, numV(tv)); } break; case BCMstr: { GCstr *s = gco2str(proto_kgc(J->pt, ~(ptrdiff_t)rc)); setstrV(J->L, rcv, s); ix.key = rc = lj_ir_kstr(J, s); } break; default: break; /* Handled later. */ @@ -1502,9 +1587,11 @@ void lj_record_ins(jit_State *J) irop = (int)op - (int)BC_ISLT + (int)IR_LT; if (ta == IRT_NUM) { if ((irop & 1)) irop ^= 4; /* ISGE/ISGT are unordered. */ - if (!lj_ir_numcmp(numV(rav), numV(rcv), (IROp)irop)) irop ^= 5; + if (!lj_ir_numcmp(numberVnum(rav), numberVnum(rcv), (IROp)irop)) + irop ^= 5; } else if (ta == IRT_INT) { - if (!lj_ir_numcmp(numV(rav), numV(rcv), (IROp)irop)) irop ^= 1; + if (!lj_ir_numcmp(numberVnum(rav), numberVnum(rcv), (IROp)irop)) + irop ^= 1; } else if (ta == IRT_STR) { if (!lj_ir_strcmp(strV(rav), strV(rcv), (IROp)irop)) irop ^= 1; ra = lj_ir_call(J, IRCALL_lj_str_cmp, ra, rc); @@ -1599,13 +1686,11 @@ void lj_record_ins(jit_State *J) case BC_ADDVN: case BC_SUBVN: case BC_MULVN: case BC_DIVVN: case BC_ADDVV: case BC_SUBVV: case BC_MULVV: case BC_DIVVV: { MMS mm = bcmode_mm(op); - if (tref_isnumber_str(rb) && tref_isnumber_str(rc)) { - rb = lj_ir_tonum(J, rb); - rc = lj_ir_tonum(J, rc); - rc = emitir(IRTN((int)mm - (int)MM_add + (int)IR_ADD), rb, rc); - } else { + if (tref_isnumber_str(rb) && tref_isnumber_str(rc)) + rc = lj_opt_narrow_arith(J, rb, rc, &ix.tabv, &ix.keyv, + (int)mm - (int)MM_add + (int)IR_ADD); + else rc = rec_mm_arith(J, &ix, mm); - } break; } @@ -1827,59 +1912,6 @@ void lj_record_ins(jit_State *J) /* -- Recording setup ----------------------------------------------------- */ -/* Setup recording for a FORL loop. */ -static void rec_setup_forl(jit_State *J, const BCIns *fori) -{ - BCReg ra = bc_a(*fori); - cTValue *forbase = &J->L->base[ra]; - IRType t = (J->flags & JIT_F_OPT_NARROW) ? lj_opt_narrow_forl(forbase) - : IRT_NUM; - TRef start; - TRef stop = fori_arg(J, fori, ra+FORL_STOP, t); - TRef step = fori_arg(J, fori, ra+FORL_STEP, t); - int dir = (0 <= numV(&forbase[FORL_STEP])); - lua_assert(bc_op(*fori) == BC_FORI || bc_op(*fori) == BC_JFORI); - J->scev.t.irt = t; - J->scev.dir = dir; - J->scev.stop = tref_ref(stop); - J->scev.step = tref_ref(step); - if (!tref_isk(step)) { - /* Non-constant step: need a guard for the direction. */ - TRef zero = (t == IRT_INT) ? lj_ir_kint(J, 0) : lj_ir_knum_zero(J); - emitir(IRTG(dir ? IR_GE : IR_LT, t), step, zero); - /* Add hoistable overflow checks for a narrowed FORL index. */ - if (t == IRT_INT) { - if (tref_isk(stop)) { - /* Constant stop: optimize check away or to a range check for step. */ - int32_t k = IR(tref_ref(stop))->i; - if (dir) { - if (k > 0) - emitir(IRTGI(IR_LE), step, lj_ir_kint(J, (int32_t)0x7fffffff-k)); - } else { - if (k < 0) - emitir(IRTGI(IR_GE), step, lj_ir_kint(J, (int32_t)0x80000000-k)); - } - } else { - /* Stop+step variable: need full overflow check. */ - TRef tr = emitir(IRTGI(IR_ADDOV), step, stop); - emitir(IRTI(IR_USE), tr, 0); /* ADDOV is weak. Avoid dead result. */ - } - } - } else if (t == IRT_INT && !tref_isk(stop)) { - /* Constant step: optimize overflow check to a range check for stop. 
*/ - int32_t k = IR(tref_ref(step))->i; - k = (int32_t)(dir ? 0x7fffffff : 0x80000000) - k; - emitir(IRTGI(dir ? IR_LE : IR_GE), stop, lj_ir_kint(J, k)); - } - J->scev.start = tref_ref(find_kinit(J, fori, ra+FORL_IDX, IRT_INT)); - start = sloadt(J, (int32_t)(ra+FORL_IDX), - (t == IRT_INT && !J->scev.start) ? (IRT_INT|IRT_GUARD) : t, - t == IRT_INT ? (IRSLOAD_CONVERT|IRSLOAD_INHERIT) : IRSLOAD_INHERIT); - J->base[ra+FORL_EXT] = start; - J->scev.idx = tref_ref(start); - J->maxslot = ra+FORL_EXT+1; -} - /* Setup recording for a root trace started by a hot loop. */ static const BCIns *rec_setup_root(jit_State *J) { @@ -2033,7 +2065,7 @@ void lj_record_setup(jit_State *J) if (J->pc > proto_bc(J->pt) && bc_op(J->pc[-1]) == BC_JFORI && bc_d(J->pc[bc_j(J->pc[-1])-1]) == root) { lj_snap_add(J); - rec_setup_forl(J, J->pc-1); + rec_for_loop(J, J->pc-1, &J->scev, 1); goto sidecheck; } } else { @@ -2054,7 +2086,7 @@ void lj_record_setup(jit_State *J) */ lj_snap_add(J); if (bc_op(J->cur.startins) == BC_FORL) - rec_setup_forl(J, J->pc-1); + rec_for_loop(J, J->pc-1, &J->scev, 1); if (1 + J->pt->framesize >= LJ_MAX_JSLOTS) lj_trace_err(J, LJ_TRERR_STACKOV); } diff --git a/src/lj_snap.c b/src/lj_snap.c index 59435b20..70628a0e 100644 --- a/src/lj_snap.c +++ b/src/lj_snap.c @@ -68,7 +68,8 @@ static MSize snapshot_slots(jit_State *J, SnapEntry *map, BCReg nslots) if (!(ir->op2 & IRSLOAD_INHERIT)) continue; /* No need to restore readonly slots and unmodified non-parent slots. */ - if ((ir->op2 & (IRSLOAD_READONLY|IRSLOAD_PARENT)) != IRSLOAD_PARENT) + if (!(LJ_DUALNUM && (ir->op2 & IRSLOAD_CONVERT)) && + (ir->op2 & (IRSLOAD_READONLY|IRSLOAD_PARENT)) != IRSLOAD_PARENT) sn |= SNAP_NORESTORE; } map[n++] = sn; diff --git a/src/lj_trace.c b/src/lj_trace.c index b67e8f75..69124542 100644 --- a/src/lj_trace.c +++ b/src/lj_trace.c @@ -495,8 +495,8 @@ static int trace_abort(jit_State *J) J->postproc = LJ_POST_NONE; lj_mcode_abort(J); - if (tvisnum(L->top-1)) - e = (TraceError)lj_num2int(numV(L->top-1)); + if (tvisnumber(L->top-1)) + e = (TraceError)numberVint(L->top-1); if (e == LJ_TRERR_MCODELM) { J->state = LJ_TRACE_ASM; return 1; /* Retry ASM with new MCode area. */ @@ -703,8 +703,12 @@ int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr) setintV(L->top++, J->exitno); setintV(L->top++, RID_NUM_GPR); setintV(L->top++, RID_NUM_FPR); - for (i = 0; i < RID_NUM_GPR; i++) - setnumV(L->top++, cast_num(ex->gpr[i])); + for (i = 0; i < RID_NUM_GPR; i++) { + if (sizeof(ex->gpr[i]) == sizeof(int32_t)) + setintV(L->top++, (int32_t)ex->gpr[i]); + else + setnumV(L->top++, (lua_Number)ex->gpr[i]); + } for (i = 0; i < RID_NUM_FPR; i++) { setnumV(L->top, ex->fpr[i]); if (LJ_UNLIKELY(tvisnan(L->top)))
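
A note on the narrowing machinery above: everything hinges on two predicates, the exact int32 round-trip test (numisint() and lj_opt_narrow_forl()) and overflow-checked integer arithmetic (the ADDOV/SUBOV guards that lj_opt_narrow_arith() emits and narrow_stripov() strips again when overflow semantics allow it). The following standalone C sketch is not part of the patch; the names numisint32 and addov32 are illustrative, and the explicit range check stands in for lj_num2int()'s well-defined conversion.

#include <stdint.h>
#include <stdio.h>

/* Exact int32 round-trip test, as in numisint() above. A bare C cast of
** an out-of-range double is undefined behavior, hence the range check. */
static int numisint32(double n)
{
  if (!(n >= -2147483648.0 && n < 2147483648.0)) return 0;
  return n == (double)(int32_t)n;  /* -0.0 passes: it narrows to 0. */
}

/* Overflow-checked 32-bit add with ADDOV-like semantics: report failure
** instead of wrapping, which is where a recorded guard would exit. */
static int addov32(int32_t a, int32_t b, int32_t *res)
{
  int64_t s = (int64_t)a + (int64_t)b;
  if (s < INT32_MIN || s > INT32_MAX) return 0;
  *res = (int32_t)s;
  return 1;
}

int main(void)
{
  int32_t r;
  printf("%d %d %d\n", numisint32(3.0), numisint32(3.5), numisint32(-0.0));
  if (addov32(2000000000, 1000000000, &r))
    printf("sum: %d\n", r);
  else
    printf("overflow: guard would leave the trace\n");
  return 0;
}

This prints "1 0 1" followed by the overflow message: 3.0 and -0.0 narrow exactly, 3.5 does not, and the sum exceeds INT32_MAX, which is exactly the condition under which an ADDOV guard fails at runtime.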
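The other half of the FOR-loop strategy is rec_for_check() above: rather than checking idx+step for overflow on every iteration, it emits one hoistable guard on the loop-invariant operands. With a constant stop k > 0 and a positive step, idx <= stop holds while the loop runs, so idx + step <= k + step, and the single guard step <= 0x7fffffff-k bounds every increment (symmetrically, step >= 0x80000000-k for k < 0 and a negative step). A small self-contained sketch of the positive-direction case, under the same int32 assumptions and with an illustrative function name:

#include <stdint.h>
#include <stdio.h>

/* One-shot replacement for a per-iteration overflow check: if this holds
** for the invariant step and constant stop, idx + step cannot exceed
** INT32_MAX for any idx <= stop reached by the loop. */
static int forl_pos_step_ok(int32_t step, int32_t stop)
{
  return stop <= 0 || step <= INT32_MAX - stop;
}

int main(void)
{
  /* With stop = 0x7ffffffe, a step of 1 is provably safe, while a step
  ** of 2 could push the index past INT32_MAX on the final increment. */
  printf("%d %d\n", forl_pos_step_ok(1, 0x7ffffffe),
         forl_pos_step_ok(2, 0x7ffffffe));
  return 0;
}

When stop and step are both constants the recorder emits no guard at all; the form above corresponds to a constant stop with a variable step, and the mirrored range check on stop handles the constant-step case.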