diff --git a/src/Makefile b/src/Makefile
index 278324a1..4ea8c85e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -42,13 +42,10 @@ CCOPT= -O2 -fomit-frame-pointer
 #
 # Target-specific compiler options:
 #
-# x86 only: it's recommended to compile at least for i686. Better yet,
-# compile for an architecture that has SSE2, too (-msse -msse2).
-#
 # x86/x64 only: For GCC 4.2 or higher and if you don't intend to distribute
 # the binaries to a different machine you could also use: -march=native
 #
-CCOPT_x86= -march=i686
+CCOPT_x86= -march=i686 -msse -msse2 -mfpmath=sse
 CCOPT_x64=
 CCOPT_arm=
 CCOPT_ppc=
@@ -394,11 +391,6 @@ DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subs
 ifeq (Windows,$(TARGET_SYS))
   DASM_AFLAGS+= -D WIN
 endif
-ifeq (x86,$(TARGET_LJARCH))
-  ifneq (,$(findstring __SSE2__ 1,$(TARGET_TESTARCH)))
-    DASM_AFLAGS+= -D SSE
-  endif
-else
 ifeq (x64,$(TARGET_LJARCH))
   DASM_ARCH= x86
 else
@@ -423,7 +415,6 @@ ifeq (ppc,$(TARGET_LJARCH))
 endif
 endif
 endif
-endif
 
 DASM_FLAGS= $(DASM_XFLAGS) $(DASM_AFLAGS)
 DASM_DASC= vm_$(DASM_ARCH).dasc
diff --git a/src/lib_jit.c b/src/lib_jit.c
index 82e68258..1b69caa5 100644
--- a/src/lib_jit.c
+++ b/src/lib_jit.c
@@ -538,18 +538,14 @@ static uint32_t jit_cpudetect(lua_State *L)
   uint32_t features[4];
   if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
 #if !LJ_HASJIT
-#define JIT_F_CMOV	1
 #define JIT_F_SSE2	2
 #endif
-    flags |= ((features[3] >> 15)&1) * JIT_F_CMOV;
     flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
 #if LJ_HASJIT
     flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
     flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
     if (vendor[2] == 0x6c65746e) {  /* Intel. */
-      if ((features[0] & 0x0ff00f00) == 0x00000f00)  /* P4. */
-	flags |= JIT_F_P4;  /* Currently unused. */
-      else if ((features[0] & 0x0fff0ff0) == 0x000106c0)  /* Atom. */
+      if ((features[0] & 0x0fff0ff0) == 0x000106c0)  /* Atom. */
 	flags |= JIT_F_LEA_AGU;
     } else if (vendor[2] == 0x444d4163) {  /* AMD. */
       uint32_t fam = (features[0] & 0x0ff00f00);
@@ -562,14 +558,8 @@ static uint32_t jit_cpudetect(lua_State *L)
   }
   /* Check for required instruction set support on x86 (unnecessary on x64). */
 #if LJ_TARGET_X86
-#if !defined(LUAJIT_CPU_NOCMOV)
-  if (!(flags & JIT_F_CMOV))
-    luaL_error(L, "CPU not supported");
-#endif
-#if defined(LUAJIT_CPU_SSE2)
   if (!(flags & JIT_F_SSE2))
-    luaL_error(L, "CPU does not support SSE2 (recompile without -DLUAJIT_CPU_SSE2)");
-#endif
+    luaL_error(L, "CPU with SSE2 required");
 #endif
 #elif LJ_TARGET_ARM
 #if LJ_HASJIT
@@ -631,11 +621,7 @@ static void jit_init(lua_State *L)
   uint32_t flags = jit_cpudetect(L);
 #if LJ_HASJIT
   jit_State *J = L2J(L);
-#if LJ_TARGET_X86
-  /* Silently turn off the JIT compiler on CPUs without SSE2. */
-  if ((flags & JIT_F_SSE2))
-#endif
-    J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
+  J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
   memcpy(J->param, jit_param_default, sizeof(J->param));
   lj_dispatch_update(G(L));
 #else
@@ -645,6 +631,7 @@ static void jit_init(lua_State *L)
 
 LUALIB_API int luaopen_jit(lua_State *L)
 {
+  jit_init(L);
   lua_pushliteral(L, LJ_OS_NAME);
   lua_pushliteral(L, LJ_ARCH_NAME);
   lua_pushinteger(L, LUAJIT_VERSION_NUM);
@@ -657,7 +644,6 @@ LUALIB_API int luaopen_jit(lua_State *L)
   LJ_LIB_REG(L, "jit.opt", jit_opt);
 #endif
   L->top -= 2;
-  jit_init(L);
   return 1;
 }
 
diff --git a/src/lj_asm.c b/src/lj_asm.c
index c7365404..a01b4e52 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -1730,7 +1730,7 @@ static void asm_setup_regsp(ASMState *as)
       break;
     case IR_FPMATH:
 #if LJ_TARGET_X86ORX64
-      if (ir->op2 == IRFPM_EXP2) {  /* May be joined to lj_vm_pow_sse. */
+      if (ir->op2 == IRFPM_EXP2) {  /* May be joined to pow. */
 	ir->prev = REGSP_HINT(RID_XMM0);
 #if !LJ_64
 	if (as->evenspill < 4)  /* Leave room for 16 byte scratch area. */
diff --git a/src/lj_jit.h b/src/lj_jit.h
index c0b1c41e..8b42dd4e 100644
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -14,18 +14,16 @@
 
 /* CPU-specific JIT engine flags. */
 #if LJ_TARGET_X86ORX64
-#define JIT_F_CMOV		0x00000010
-#define JIT_F_SSE2		0x00000020
-#define JIT_F_SSE3		0x00000040
-#define JIT_F_SSE4_1		0x00000080
-#define JIT_F_P4		0x00000100
-#define JIT_F_PREFER_IMUL	0x00000200
-#define JIT_F_SPLIT_XMM		0x00000400
-#define JIT_F_LEA_AGU		0x00000800
+#define JIT_F_SSE2		0x00000010
+#define JIT_F_SSE3		0x00000020
+#define JIT_F_SSE4_1		0x00000040
+#define JIT_F_PREFER_IMUL	0x00000080
+#define JIT_F_SPLIT_XMM		0x00000100
+#define JIT_F_LEA_AGU		0x00000200
 
 /* Names for the CPU-specific flags. Must match the order above. */
-#define JIT_F_CPU_FIRST		JIT_F_CMOV
-#define JIT_F_CPUSTRING		"\4CMOV\4SSE2\4SSE3\6SSE4.1\2P4\3AMD\2K8\4ATOM"
+#define JIT_F_CPU_FIRST		JIT_F_SSE2
+#define JIT_F_CPUSTRING		"\4SSE2\4SSE3\6SSE4.1\3AMD\2K8\4ATOM"
 #elif LJ_TARGET_ARM
 #define JIT_F_ARMV6_		0x00000010
 #define JIT_F_ARMV6T2_		0x00000020
diff --git a/src/lj_vm.h b/src/lj_vm.h
index c5d05de4..948d63c2 100644
--- a/src/lj_vm.h
+++ b/src/lj_vm.h
@@ -49,12 +49,14 @@ LJ_ASMF void lj_vm_exit_handler(void);
 LJ_ASMF void lj_vm_exit_interp(void);
 
 /* Internal math helper functions. */
-#if LJ_TARGET_X86ORX64 || LJ_TARGET_PPC
+#if LJ_TARGET_PPC
 #define lj_vm_floor	floor
 #define lj_vm_ceil	ceil
 #else
 LJ_ASMF double lj_vm_floor(double);
+#if !LJ_TARGET_X86ORX64
 LJ_ASMF double lj_vm_ceil(double);
+#endif
 #if LJ_TARGET_ARM
 LJ_ASMF double lj_vm_floor_sf(double);
 LJ_ASMF double lj_vm_ceil_sf(double);
diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat
index 745c93ff..1d5bd55a 100644
--- a/src/msvcbuild.bat
+++ b/src/msvcbuild.bat
@@ -35,6 +35,7 @@ if exist minilua.exe.manifest^
 @if errorlevel 8 goto :X64
 @set DASMFLAGS=-D WIN -D JIT -D FFI
 @set LJARCH=x86
+@set LJCOMPILE=%LJCOMPILE% /arch:SSE2
 :X64
 minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h vm_x86.dasc
 @if errorlevel 1 goto :BAD
diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc
index b4674e2b..7020eb27 100644
--- a/src/vm_x86.dasc
+++ b/src/vm_x86.dasc
@@ -18,7 +18,6 @@
 |
 |.if P64
 |.define X64, 1
-|.define SSE, 1
 |.if WIN
 |.define X64WIN, 1
 |.endif
@@ -856,13 +855,9 @@ static void build_subroutines(BuildCtx *ctx)
   |.if DUALNUM
   |  mov TMP2, LJ_TISNUM
   |  mov TMP1, RC
-  |.elif SSE
+  |.else
   |  cvtsi2sd xmm0, RC
   |  movsd TMPQ, xmm0
-  |.else
-  |  mov ARG4, RC
-  |  fild ARG4
-  |  fstp TMPQ
   |.endif
   |  lea RCa, TMPQ			// Store temp. TValue in TMPQ.
   |  jmp >1
@@ -935,13 +930,9 @@ static void build_subroutines(BuildCtx *ctx)
   |.if DUALNUM
   |  mov TMP2, LJ_TISNUM
   |  mov TMP1, RC
-  |.elif SSE
+  |.else
   |  cvtsi2sd xmm0, RC
   |  movsd TMPQ, xmm0
-  |.else
-  |  mov ARG4, RC
-  |  fild ARG4
-  |  fstp TMPQ
   |.endif
   |  lea RCa, TMPQ			// Store temp. TValue in TMPQ.
   |  jmp >1
@@ -1509,11 +1500,7 @@ static void build_subroutines(BuildCtx *ctx)
   |.else
   |  jae ->fff_fallback
   |.endif
-  |.if SSE
   |  movsd xmm0, qword [BASE]; jmp ->fff_resxmm0
-  |.else
-  |  fld qword [BASE]; jmp ->fff_resn
-  |.endif
   |
   |.ffunc_1 tostring
   |  // Only handles the string or number case inline.
@@ -1631,19 +1618,12 @@ static void build_subroutines(BuildCtx *ctx)
   |  add RD, 1
   |  mov dword [BASE-4], LJ_TISNUM
   |  mov dword [BASE-8], RD
-  |.elif SSE
+  |.else
   |  movsd xmm0, qword [BASE+8]
   |  sseconst_1 xmm1, RBa
   |  addsd xmm0, xmm1
   |  cvtsd2si RD, xmm0
   |  movsd qword [BASE-8], xmm0
-  |.else
-  |  fld qword [BASE+8]
-  |  fld1
-  |  faddp st1
-  |  fist ARG1
-  |  fstp qword [BASE-8]
-  |  mov RD, ARG1
   |.endif
   |  mov TAB:RB, [BASE]
   |  cmp RD, TAB:RB->asize;  jae >2	// Not in array part?
@@ -1690,12 +1670,9 @@ static void build_subroutines(BuildCtx *ctx)
   |.if DUALNUM
   |  mov dword [BASE+12], LJ_TISNUM
   |  mov dword [BASE+8], 0
-  |.elif SSE
+  |.else
   |  xorps xmm0, xmm0
   |  movsd qword [BASE+8], xmm0
-  |.else
-  |  fldz
-  |  fstp qword [BASE+8]
   |.endif
   |  mov RD, 1+3
   |  jmp ->fff_res
@@ -1925,12 +1902,10 @@ static void build_subroutines(BuildCtx *ctx)
   |->fff_resi:  // Dummy.
   |.endif
   |
-  |.if SSE
   |->fff_resn:
   |  mov PC, [BASE-4]
   |  fstp qword [BASE-8]
   |  jmp ->fff_res1
-  |.endif
   |
   |  .ffunc_1 math_abs
   |.if DUALNUM
@@ -1954,8 +1929,6 @@ static void build_subroutines(BuildCtx *ctx)
   |.else
   |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
   |.endif
-  |
-  |.if SSE
   |  movsd xmm0, qword [BASE]
   |  sseconst_abs xmm1, RDa
   |  andps xmm0, xmm1
@@ -1963,15 +1936,6 @@ static void build_subroutines(BuildCtx *ctx)
   |  mov PC, [BASE-4]
   |  movsd qword [BASE-8], xmm0
   |  // fallthrough
-  |.else
-  |  fld qword [BASE]
-  |  fabs
-  |  // fallthrough
-  |->fff_resxmm0:  // Dummy.
-  |->fff_resn:
-  |  mov PC, [BASE-4]
-  |  fstp qword [BASE-8]
-  |.endif
   |
   |->fff_res1:
   |  mov RD, 1+1
@@ -2008,48 +1972,24 @@ static void build_subroutines(BuildCtx *ctx)
   |.else
   |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
   |.endif
-  |.if SSE
   |  movsd xmm0, qword [BASE]
-  |  call ->vm_ .. func
-  |  .if DUALNUM
-  |    cvtsd2si RB, xmm0
-  |    cmp RB, 0x80000000
-  |    jne ->fff_resi
-  |    cvtsi2sd xmm1, RB
-  |    ucomisd xmm0, xmm1
-  |    jp ->fff_resxmm0
-  |    je ->fff_resi
-  |  .endif
-  |  jmp ->fff_resxmm0
-  |.else
-  |  fld qword [BASE]
-  |  call ->vm_ .. func
-  |  .if DUALNUM
-  |    fist ARG1
-  |    mov RB, ARG1
-  |    cmp RB, 0x80000000; jne >2
-  |    fdup
-  |    fild ARG1
-  |    fcomparepp
-  |    jp ->fff_resn
-  |    jne ->fff_resn
-  |2:
-  |    fpop
-  |    jmp ->fff_resi
-  | .else
-  |    jmp ->fff_resn
-  | .endif
+  |  call ->vm_ .. func .. _sse
+  |.if DUALNUM
+  |  cvtsd2si RB, xmm0
+  |  cmp RB, 0x80000000
+  |  jne ->fff_resi
+  |  cvtsi2sd xmm1, RB
+  |  ucomisd xmm0, xmm1
+  |  jp ->fff_resxmm0
+  |  je ->fff_resi
   |.endif
+  |  jmp ->fff_resxmm0
   |.endmacro
   |
   |  math_round floor
   |  math_round ceil
   |
-  |.if SSE
   |.ffunc_nsse math_sqrt, sqrtsd; jmp ->fff_resxmm0
-  |.else
-  |.ffunc_n math_sqrt; fsqrt; jmp ->fff_resn
-  |.endif
   |
   |.ffunc math_log
   |  cmp NARGS:RD, 1+1; jne ->fff_fallback	// Exactly one argument.
@@ -2072,23 +2012,18 @@ static void build_subroutines(BuildCtx *ctx)
   |.ffunc_n math_atan;	fld1; fpatan;		jmp ->fff_resn
   |
   |.macro math_extern, func
-  |.if SSE
   |  .ffunc_nsse math_ .. func
-  |  .if not X64
-  |    movsd FPARG1, xmm0
-  |  .endif
-  |.else
-  |  .ffunc_n math_ .. func
-  |  fstp FPARG1
+  |.if not X64
+  |  movsd FPARG1, xmm0
   |.endif
   |  mov RB, BASE
   |  call extern lj_vm_ .. func
   |  mov BASE, RB
-  |  .if X64
-  |    jmp ->fff_resxmm0
-  |  .else
-  |    jmp ->fff_resn
-  |  .endif
+  |.if X64
+  |  jmp ->fff_resxmm0
+  |.else
+  |  jmp ->fff_resn
+  |.endif
   |.endmacro
   |
   |  math_extern sinh
@@ -2096,17 +2031,10 @@ static void build_subroutines(BuildCtx *ctx)
   |  math_extern tanh
   |
   |->ff_math_deg:
-  |.if SSE
   |.ffunc_nsse math_rad
   |  mov CFUNC:RB, [BASE-8]
   |  mulsd xmm0, qword CFUNC:RB->upvalue[0]
   |  jmp ->fff_resxmm0
-  |.else
-  |.ffunc_n math_rad
-  |  mov CFUNC:RB, [BASE-8]
-  |  fmul qword CFUNC:RB->upvalue[0]
-  |  jmp ->fff_resn
-  |.endif
   |
   |.ffunc_nn math_atan2;	fpatan;		jmp ->fff_resn
   |.ffunc_nnr math_ldexp;	fscale; fpop1;	jmp ->fff_resn
@@ -2123,65 +2051,34 @@ static void build_subroutines(BuildCtx *ctx)
   |  cmp RB, 0x00200000; jb >4
   |1:
   |  shr RB, 21; sub RB, RC		// Extract and unbias exponent.
-  |.if SSE
   |  cvtsi2sd xmm0, RB
-  |.else
-  |  mov TMP1, RB; fild TMP1
-  |.endif
   |  mov RB, [BASE-4]
   |  and RB, 0x800fffff			// Mask off exponent.
   |  or RB, 0x3fe00000			// Put mantissa in range [0.5,1) or 0.
   |  mov [BASE-4], RB
   |2:
-  |.if SSE
   |  movsd qword [BASE], xmm0
-  |.else
-  |  fstp qword [BASE]
-  |.endif
   |  mov RD, 1+2
   |  jmp ->fff_res
   |3:  // Return +-0, +-Inf, NaN unmodified and an exponent of 0.
-  |.if SSE
   |  xorps xmm0, xmm0; jmp <2
-  |.else
-  |  fldz; jmp <2
-  |.endif
   |4:  // Handle denormals by multiplying with 2^54 and adjusting the bias.
-  |.if SSE
   |  movsd xmm0, qword [BASE]
   |  sseconst_hi xmm1, RBa, 43500000  // 2^54.
   |  mulsd xmm0, xmm1
   |  movsd qword [BASE-8], xmm0
-  |.else
-  |  fld qword [BASE]
-  |  mov TMP1, 0x5a800000; fmul TMP1	// x = x*2^54
-  |  fstp qword [BASE-8]
-  |.endif
   |  mov RB, [BASE-4]; mov RC, 1076; shl RB, 1; jmp <1
   |
-  |.if SSE
   |.ffunc_nsse math_modf
-  |.else
-  |.ffunc_n math_modf
-  |.endif
   |  mov RB, [BASE+4]
   |  mov PC, [BASE-4]
   |  shl RB, 1; cmp RB, 0xffe00000; je >4	// +-Inf?
-  |.if SSE
   |  movaps xmm4, xmm0
-  |  call ->vm_trunc
+  |  call ->vm_trunc_sse
   |  subsd xmm4, xmm0
   |1:
   |  movsd qword [BASE-8], xmm0
   |  movsd qword [BASE], xmm4
-  |.else
-  |  fdup
-  |  call ->vm_trunc
-  |  fsub st1, st0
-  |1:
-  |  fstp qword [BASE-8]
-  |  fstp qword [BASE]
-  |.endif
   |  mov RC, [BASE-4]; mov RB, [BASE+4]
   |  xor RC, RB; js >3				// Need to adjust sign?
   |2:
@@ -2191,24 +2088,16 @@ static void build_subroutines(BuildCtx *ctx)
   |  xor RB, 0x80000000; mov [BASE+4], RB	// Flip sign of fraction.
   |  jmp <2
   |4:
-  |.if SSE
   |  xorps xmm4, xmm4; jmp <1			// Return +-Inf and +-0.
-  |.else
-  |  fldz; fxch; jmp <1				// Return +-Inf and +-0.
-  |.endif
   |
   |.ffunc_nnr math_fmod
   |1: ; fprem; fnstsw ax; sahf; jp <1
   |  fpop1
   |  jmp ->fff_resn
   |
-  |.if SSE
-  |.ffunc_nnsse math_pow;	call ->vm_pow;	jmp ->fff_resxmm0
-  |.else
-  |.ffunc_nn math_pow;		call ->vm_pow;	jmp ->fff_resn
-  |.endif
+  |.ffunc_nnsse math_pow;	call ->vm_pow_sse;	jmp ->fff_resxmm0
   |
-  |.macro math_minmax, name, cmovop, fcmovop, sseop
+  |.macro math_minmax, name, cmovop, sseop
   |  .ffunc name
   |  mov RA, 2
   |  cmp dword [BASE+4], LJ_TISNUM
@@ -2225,12 +2114,7 @@ static void build_subroutines(BuildCtx *ctx)
   |3:
   |  ja ->fff_fallback
   |  // Convert intermediate result to number and continue below.
-  |.if SSE
   |  cvtsi2sd xmm0, RB
-  |.else
-  |  mov TMP1, RB
-  |  fild TMP1
-  |.endif
   |  jmp >6
   |4:
   |  ja ->fff_fallback
@@ -2238,7 +2122,6 @@ static void build_subroutines(BuildCtx *ctx)
   |  jae ->fff_fallback
   |.endif
   |
-  |.if SSE
   |  movsd xmm0, qword [BASE]
   |5:  // Handle numbers or integers.
   |  cmp RA, RD; jae ->fff_resxmm0
@@ -2257,34 +2140,10 @@ static void build_subroutines(BuildCtx *ctx)
   |  sseop xmm0, xmm1
   |  add RA, 1
   |  jmp <5
-  |.else
-  |  fld qword [BASE]
-  |5:  // Handle numbers or integers.
-  |  cmp RA, RD; jae ->fff_resn
-  |  cmp dword [BASE+RA*8-4], LJ_TISNUM
-  |.if DUALNUM
-  |  jb >6
-  |  ja >9
-  |  fild dword [BASE+RA*8-8]
-  |  jmp >7
-  |.else
-  |  jae >9
-  |.endif
-  |6:
-  |  fld qword [BASE+RA*8-8]
-  |7:
-  |  fucomi st1; fcmovop st1; fpop1
-  |  add RA, 1
-  |  jmp <5
-  |.endif
   |.endmacro
   |
-  |  math_minmax math_min, cmovg, fcmovnbe, minsd
-  |  math_minmax math_max, cmovl, fcmovbe, maxsd
-  |.if not SSE
-  |9:
-  |  fpop; jmp ->fff_fallback
-  |.endif
+  |  math_minmax math_min, cmovg, minsd
+  |  math_minmax math_max, cmovl, maxsd
   |
   |//-- String library -----------------------------------------------------
   |
@@ -2293,10 +2152,8 @@ static void build_subroutines(BuildCtx *ctx)
   |  mov STR:RB, [BASE]
   |.if DUALNUM
   |  mov RB, dword STR:RB->len; jmp ->fff_resi
-  |.elif SSE
-  |  cvtsi2sd xmm0, dword STR:RB->len; jmp ->fff_resxmm0
   |.else
-  |  fild dword STR:RB->len; jmp ->fff_resn
+  |  cvtsi2sd xmm0, dword STR:RB->len; jmp ->fff_resxmm0
   |.endif
   |
   |.ffunc string_byte			// Only handle the 1-arg case here.
@@ -2309,10 +2166,8 @@ static void build_subroutines(BuildCtx *ctx)
   |  movzx RB, byte STR:RB[1]
   |.if DUALNUM
   |  jmp ->fff_resi
-  |.elif SSE
-  |  cvtsi2sd xmm0, RB; jmp ->fff_resxmm0
   |.else
-  |  mov TMP1, RB; fild TMP1; jmp ->fff_resn
+  |  cvtsi2sd xmm0, RB; jmp ->fff_resxmm0
   |.endif
   |
   |.ffunc string_char			// Only handle the 1-arg case here.
@@ -2324,16 +2179,11 @@ static void build_subroutines(BuildCtx *ctx)
   |  mov RB, dword [BASE]
   |  cmp RB, 255;  ja ->fff_fallback
   |  mov TMP2, RB
-  |.elif SSE
+  |.else
   |  jae ->fff_fallback
   |  cvttsd2si RB, qword [BASE]
   |  cmp RB, 255;  ja ->fff_fallback
   |  mov TMP2, RB
-  |.else
-  |  jae ->fff_fallback
-  |  fld qword [BASE]
-  |  fistp TMP2
-  |  cmp TMP2, 255;  ja ->fff_fallback
   |.endif
   |.if X64
   |  mov TMP3, 1
@@ -2371,14 +2221,10 @@ static void build_subroutines(BuildCtx *ctx)
   |  jne ->fff_fallback
   |  mov RB, dword [BASE+16]
   |  mov TMP2, RB
-  |.elif SSE
+  |.else
   |  jae ->fff_fallback
   |  cvttsd2si RB, qword [BASE+16]
   |  mov TMP2, RB
-  |.else
-  |  jae ->fff_fallback
-  |  fld qword [BASE+16]
-  |  fistp TMP2
   |.endif
   |1:
   |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
@@ -2393,12 +2239,8 @@ static void build_subroutines(BuildCtx *ctx)
   |  mov RB, STR:RB->len
   |.if DUALNUM
   |  mov RA, dword [BASE+8]
-  |.elif SSE
-  |  cvttsd2si RA, qword [BASE+8]
   |.else
-  |  fld qword [BASE+8]
-  |  fistp ARG3
-  |  mov RA, ARG3
+  |  cvttsd2si RA, qword [BASE+8]
   |.endif
   |  mov RC, TMP2
   |  cmp RB, RC				// len < end? (unsigned compare)
@@ -2451,14 +2293,9 @@ static void build_subroutines(BuildCtx *ctx)
   |.if DUALNUM
   |  jne ->fff_fallback
   |  mov RC, dword [BASE+8]
-  |.elif SSE
-  |  jae ->fff_fallback
-  |  cvttsd2si RC, qword [BASE+8]
   |.else
   |  jae ->fff_fallback
-  |  fld qword [BASE+8]
-  |  fistp TMP2
-  |  mov RC, TMP2
+  |  cvttsd2si RC, qword [BASE+8]
   |.endif
   |  test RC, RC
   |  jle ->fff_emptystr			// Count <= 0? (or non-int)
@@ -2554,10 +2391,8 @@ static void build_subroutines(BuildCtx *ctx)
   |  mov BASE, RB			// Restore BASE.
   |.if DUALNUM
   |  mov RB, RD; jmp ->fff_resi
-  |.elif SSE
-  |  cvtsi2sd xmm0, RD; jmp ->fff_resxmm0
   |.else
-  |  mov ARG1, RD; fild ARG1; jmp ->fff_resn
+  |  cvtsi2sd xmm0, RD; jmp ->fff_resxmm0
   |.endif
   |
   |//-- Bit library --------------------------------------------------------
@@ -2567,11 +2402,7 @@ static void build_subroutines(BuildCtx *ctx)
   |.macro .ffunc_bit, name, kind
   |  .ffunc_1 name
   |.if kind == 2
-  |.if SSE
   |  sseconst_tobit xmm1, RBa
-  |.else
-  |  mov TMP1, TOBIT_BIAS
-  |.endif
   |.endif
   |  cmp dword [BASE+4], LJ_TISNUM
   |.if DUALNUM
@@ -2587,37 +2418,17 @@ static void build_subroutines(BuildCtx *ctx)
   |.else
   |  jae ->fff_fallback
   |.endif
-  |.if SSE
   |  movsd xmm0, qword [BASE]
   |.if kind < 2
   |  sseconst_tobit xmm1, RBa
   |.endif
   |  addsd xmm0, xmm1
   |  movd RB, xmm0
-  |.else
-  |  fld qword [BASE]
-  |.if kind < 2
-  |  mov TMP1, TOBIT_BIAS
-  |.endif
-  |  fadd TMP1
-  |  fstp FPARG1
-  |.if kind > 0
-  |  mov RB, ARG1
-  |.endif
-  |.endif
   |2:
   |.endmacro
   |
   |.ffunc_bit bit_tobit, 0
-  |.if DUALNUM or SSE
-  |.if not SSE
-  |  mov RB, ARG1
-  |.endif
   |  jmp ->fff_resbit
-  |.else
-  |  fild ARG1
-  |  jmp ->fff_resn
-  |.endif
   |
   |.macro .ffunc_bit_op, name, ins
   |  .ffunc_bit name, 2
@@ -2637,17 +2448,10 @@ static void build_subroutines(BuildCtx *ctx)
   |.else
   |  jae ->fff_fallback_bit_op
   |.endif
-  |.if SSE
   |  movsd xmm0, qword [RD]
   |  addsd xmm0, xmm1
   |  movd RA, xmm0
   |  ins RB, RA
-  |.else
-  |  fld qword [RD]
-  |  fadd TMP1
-  |  fstp FPARG1
-  |  ins RB, ARG1
-  |.endif
   |  sub RD, 8
   |  jmp <1
   |.endmacro
@@ -2664,15 +2468,10 @@ static void build_subroutines(BuildCtx *ctx)
   |  not RB
   |.if DUALNUM
   |  jmp ->fff_resbit
-  |.elif SSE
+  |.else
   |->fff_resbit:
   |  cvtsi2sd xmm0, RB
   |  jmp ->fff_resxmm0
-  |.else
-  |->fff_resbit:
-  |  mov ARG1, RB
-  |  fild ARG1
-  |  jmp ->fff_resn
   |.endif
   |
   |->fff_fallback_bit_op:
@@ -2685,22 +2484,13 @@ static void build_subroutines(BuildCtx *ctx)
   |  // Note: no inline conversion from number for 2nd argument!
   |  cmp dword [BASE+12], LJ_TISNUM; jne ->fff_fallback
   |  mov RA, dword [BASE+8]
-  |.elif SSE
+  |.else
   |  .ffunc_nnsse name
   |  sseconst_tobit xmm2, RBa
   |  addsd xmm0, xmm2
   |  addsd xmm1, xmm2
   |  movd RB, xmm0
   |  movd RA, xmm1
-  |.else
-  |  .ffunc_nn name
-  |  mov TMP1, TOBIT_BIAS
-  |  fadd TMP1
-  |  fstp FPARG3
-  |  fadd TMP1
-  |  fstp FPARG1
-  |  mov RA, ARG3
-  |  mov RB, ARG1
   |.endif
   |  ins RB, cl				// Assumes RA is ecx.
   |  jmp ->fff_resbit
@@ -3051,27 +2841,9 @@ static void build_subroutines(BuildCtx *ctx)
   |//-----------------------------------------------------------------------
   |
   |// FP value rounding. Called by math.floor/math.ceil fast functions
-  |// and from JIT code.
-  |
-  |// x87 variant: Arg/ret on x87 stack. No int/xmm registers modified.
-  |.macro vm_round_x87, mode1, mode2
-  |  fnstcw word [esp+4]		// Caveat: overwrites ARG1 and ARG2.
-  |  mov [esp+8], eax
-  |  mov ax, mode1
-  |  or ax, [esp+4]
-  |.if mode2 ~= 0xffff
-  |  and ax, mode2
-  |.endif
-  |  mov [esp+6], ax
-  |  fldcw word [esp+6]
-  |  frndint
-  |  fldcw word [esp+4]
-  |  mov eax, [esp+8]
-  |  ret
-  |.endmacro
-  |
-  |// SSE variant: arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
-  |.macro vm_round_sse, mode
+  |// and from JIT code. Arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
+  |.macro vm_round, name, mode
+  |->name .. _sse:
   |  sseconst_abs xmm2, RDa
   |  sseconst_2p52 xmm3, RDa
   |  movaps xmm1, xmm0
@@ -3107,22 +2879,21 @@ static void build_subroutines(BuildCtx *ctx)
   |  ret
   |.endmacro
   |
-  |.macro vm_round, name, ssemode, mode1, mode2
-  |->name:
-  |.if not SSE
-  |  vm_round_x87 mode1, mode2
+  |->vm_floor:
+  |.if not X64
+  |  movsd xmm0, qword [esp+4]
+  |  call ->vm_floor_sse
+  |  movsd qword [esp+4], xmm0  // Overwrite callee-owned arg.
+  |  fld qword [esp+4]
+  |  ret
   |.endif
-  |->name .. _sse:
-  |  vm_round_sse ssemode
-  |.endmacro
   |
-  |  vm_round vm_floor, 0, 0x0400, 0xf7ff
-  |  vm_round vm_ceil,  1, 0x0800, 0xfbff
-  |  vm_round vm_trunc, 2, 0x0c00, 0xffff
+  |  vm_round vm_floor, 0
+  |  vm_round vm_ceil,  1
+  |  vm_round vm_trunc, 2
   |
   |// FP modulo x%y. Called by BC_MOD* and vm_arith.
   |->vm_mod:
-  |.if SSE
   |// Args in xmm0/xmm1, return value in xmm0.
   |// Caveat: xmm0-xmm5 and RC (eax) modified!
   |  movaps xmm5, xmm0
@@ -3150,23 +2921,6 @@ static void build_subroutines(BuildCtx *ctx)
   |  movaps xmm0, xmm5
   |  subsd xmm0, xmm1
   |  ret
-  |.else
-  |// Args/ret on x87 stack (y on top). No xmm registers modified.
-  |// Caveat: needs 3 slots on x87 stack! RC (eax) modified!
-  |  fld st1
-  |  fdiv st1
-  |  fnstcw word [esp+4]
-  |  mov ax, 0x0400
-  |  or ax, [esp+4]
-  |  and ax, 0xf7ff
-  |  mov [esp+6], ax
-  |  fldcw word [esp+6]
-  |  frndint
-  |  fldcw word [esp+4]
-  |  fmulp st1
-  |  fsubp st1
-  |  ret
-  |.endif
   |
   |// FP log2(x). Called by math.log(x, base).
   |->vm_log2:
@@ -3217,96 +2971,6 @@ static void build_subroutines(BuildCtx *ctx)
   |
   |// Generic power function x^y. Called by BC_POW, math.pow fast function,
   |// and vm_arith.
-  |// Args/ret on x87 stack (y on top). RC (eax) modified.
-  |// Caveat: needs 3 slots on x87 stack!
-  |->vm_pow:
-  |.if not SSE
-  |  fist dword [esp+4]			// Store/reload int before comparison.
-  |  fild dword [esp+4]			// Integral exponent used in vm_powi.
-  |  fucomip st1
-  |  jnz >8				// Branch for FP exponents.
-  |  jp >9				// Branch for NaN exponent.
-  |  fpop				// Pop y and fallthrough to vm_powi.
-  |
-  |// FP/int power function x^i. Arg1/ret on x87 stack.
-  |// Arg2 (int) on C stack. RC (eax) modified.
-  |// Caveat: needs 2 slots on x87 stack!
-  |  mov eax, [esp+4]
-  |  cmp eax, 1; jle >6			// i<=1?
-  |  // Now 1 < (unsigned)i <= 0x80000000.
-  |1:  // Handle leading zeros.
-  |  test eax, 1; jnz >2
-  |  fmul st0
-  |  shr eax, 1
-  |  jmp <1
-  |2:
-  |  shr eax, 1; jz >5
-  |  fdup
-  |3:  // Handle trailing bits.
-  |  fmul st0
-  |  shr eax, 1; jz >4
-  |  jnc <3
-  |  fmul st1, st0
-  |  jmp <3
-  |4:
-  |  fmulp st1
-  |5:
-  |  ret
-  |6:
-  |  je <5				// x^1 ==> x
-  |  jb >7
-  |  fld1; fdivrp st1
-  |  neg eax
-  |  cmp eax, 1; je <5			// x^-1 ==> 1/x
-  |  jmp <1				// x^-i ==> (1/x)^i
-  |7:
-  |  fpop; fld1				// x^0 ==> 1
-  |  ret
-  |
-  |8:  // FP/FP power function x^y.
-  |  fst dword [esp+4]
-  |  fxch
-  |  fst dword [esp+8]
-  |  mov eax, [esp+4]; shl eax, 1
-  |  cmp eax, 0xff000000; je >2			// x^+-Inf?
-  |  mov eax, [esp+8]; shl eax, 1; je >4	// +-0^y?
-  |  cmp eax, 0xff000000; je >4			// +-Inf^y?
-  |  fyl2x
-  |  jmp ->vm_exp2raw
-  |
-  |9:  // Handle x^NaN.
-  |  fld1
-  |  fucomip st2
-  |  je >1				// 1^NaN ==> 1
-  |  fxch				// x^NaN ==> NaN
-  |1:
-  |  fpop
-  |  ret
-  |
-  |2:  // Handle x^+-Inf.
-  |  fabs
-  |  fld1
-  |  fucomip st1
-  |  je >3					// +-1^+-Inf ==> 1
-  |  fpop; fabs; fldz; mov eax, 0; setc al
-  |  ror eax, 1; xor eax, [esp+4]; jns >3	// |x|<>1, x^+-Inf ==> +Inf/0
-  |  fxch
-  |3:
-  |  fpop1; fabs
-  |  ret
-  |
-  |4:  // Handle +-0^y or +-Inf^y.
-  |  cmp dword [esp+4], 0; jge <3		// y >= 0, x^y ==> |x|
-  |  fpop; fpop
-  |  test eax, eax; jz >5			// y < 0, +-0^y ==> +Inf
-  |  fldz					// y < 0, +-Inf^y ==> 0
-  |  ret
-  |5:
-  |  mov dword [esp+4], 0x7f800000		// Return +Inf.
-  |  fld dword [esp+4]
-  |  ret
-  |.endif
-  |
   |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified.
   |// Needs 16 byte scratch area for x86. Also called from JIT code.
   |->vm_pow_sse:
@@ -3315,7 +2979,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  ucomisd xmm1, xmm2
   |  jnz >8				// Branch for FP exponents.
   |  jp >9				// Branch for NaN exponent.
-  |  // Fallthrough to vm_powi_sse.
+  |  // Fallthrough.
   |
   |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
   |->vm_powi_sse:
@@ -3437,8 +3101,8 @@ static void build_subroutines(BuildCtx *ctx)
   |  .else
   |    .define fpmop, CARG1d
   |  .endif
-  |  cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
-  |  cmp fpmop, 3; jb ->vm_trunc; ja >2
+  |  cmp fpmop, 1; jb ->vm_floor_sse; je ->vm_ceil_sse
+  |  cmp fpmop, 3; jb ->vm_trunc_sse; ja >2
   |  sqrtsd xmm0, xmm0; ret
   |2:
   |  .if X64WIN
@@ -3478,14 +3142,13 @@ static void build_subroutines(BuildCtx *ctx)
   |  ret
   |.else  // x86 calling convention.
   |  .define fpmop, eax
-  |.if SSE
   |  mov fpmop, [esp+12]
   |  movsd xmm0, qword [esp+4]
   |  cmp fpmop, 1; je >1; ja >2
-  |  call ->vm_floor; jmp >7
-  |1: ; call ->vm_ceil; jmp >7
+  |  call ->vm_floor_sse; jmp >7
+  |1: ; call ->vm_ceil_sse; jmp >7
   |2: ; cmp fpmop, 3; je >1; ja >2
-  |  call ->vm_trunc; jmp >7
+  |  call ->vm_trunc_sse; jmp >7
   |1:
   |  sqrtsd xmm0, xmm0
   |7:
@@ -3503,23 +3166,6 @@ static void build_subroutines(BuildCtx *ctx)
   |2: ; cmp fpmop, 11; je >1; ja >9
   |   fcos; ret
   |1: ; fptan; fpop; ret
-  |.else
-  |  mov fpmop, [esp+12]
-  |  fld qword [esp+4]
-  |  cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
-  |  cmp fpmop, 3; jb ->vm_trunc; ja >2
-  |  fsqrt; ret
-  |2: ; cmp fpmop, 5; jb ->vm_exp_x87; je ->vm_exp2_x87
-  |  cmp fpmop, 7; je >1; ja >2
-  |  fldln2; fxch; fyl2x; ret
-  |1: ; fld1; fxch; fyl2x; ret
-  |2: ; cmp fpmop, 9; je >1; ja >2
-  |  fldlg2; fxch; fyl2x; ret
-  |1: ; fsin; ret
-  |2: ; cmp fpmop, 11; je >1; ja >9
-  |   fcos; ret
-  |1: ; fptan; fpop; ret
-  |.endif
   |.endif
   |9: ; int3					// Bad fpm.
   |.endif
@@ -3541,7 +3187,7 @@ static void build_subroutines(BuildCtx *ctx)
   |2: ; cmp foldop, 3; je >1; ja >2
   |  mulsd xmm0, xmm1; ret
   |1: ; divsd xmm0, xmm1; ret
-  |2: ; cmp foldop, 5; jb ->vm_mod; je ->vm_pow
+  |2: ; cmp foldop, 5; jb ->vm_mod; je ->vm_pow_sse
   |  cmp foldop, 7; je >1; ja >2
   |  sseconst_sign xmm1, RDa; xorps xmm0, xmm1; ret
   |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; ret
@@ -3574,7 +3220,7 @@ static void build_subroutines(BuildCtx *ctx)
   |1: ; maxsd xmm0, xmm1; ret
   |9: ; int3				// Bad op.
   |
-  |.elif SSE  // x86 calling convention with SSE ops.
+  |.else  // x86 calling convention.
   |
   |  .define foldop, eax
   |  mov foldop, [esp+20]
@@ -3593,7 +3239,7 @@ static void build_subroutines(BuildCtx *ctx)
   |2: ; cmp foldop, 5
   |  je >1; ja >2
   |  call ->vm_mod; jmp <7
-  |1: ; pop edx; call ->vm_pow; push edx; jmp <7  // Writes to scratch area.
+  |1: ; pop edx; call ->vm_pow_sse; push edx; jmp <7  // Writes to scratch area.
   |2: ; cmp foldop, 7; je >1; ja >2
   |  sseconst_sign xmm1, RDa; xorps xmm0, xmm1; jmp <7
   |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; jmp <7
@@ -3608,29 +3254,6 @@ static void build_subroutines(BuildCtx *ctx)
   |1: ; maxsd xmm0, xmm1; jmp <7
   |9: ; int3				// Bad op.
   |
-  |.else  // x86 calling convention with x87 ops.
-  |
-  |  mov eax, [esp+20]
-  |  fld qword [esp+4]
-  |  fld qword [esp+12]
-  |  cmp eax, 1; je >1; ja >2
-  |  faddp st1; ret
-  |1: ; fsubp st1; ret
-  |2: ; cmp eax, 3; je >1; ja >2
-  |  fmulp st1; ret
-  |1: ; fdivp st1; ret
-  |2: ; cmp eax, 5; jb ->vm_mod; je ->vm_pow
-  |  cmp eax, 7; je >1; ja >2
-  |  fpop; fchs; ret
-  |1: ; fpop; fabs; ret
-  |2: ; cmp eax, 9; je >1; ja >2
-  |  fpatan; ret
-  |1: ; fxch; fscale; fpop1; ret
-  |2: ; cmp eax, 11; je >1; ja >9
-  |  fucomi st1; fcmovnbe st1; fpop1; ret
-  |1: ; fucomi st1; fcmovbe st1; fpop1; ret
-  |9: ; int3				// Bad op.
-  |
   |.endif
   |
   |//-----------------------------------------------------------------------
@@ -3943,19 +3566,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  // RA is a number.
     |  cmp dword [BASE+RD*8+4], LJ_TISNUM; jb >1; jne ->vmeta_comp
     |  // RA is a number, RD is an integer.
-    |.if SSE
     |  cvtsi2sd xmm0, dword [BASE+RD*8]
     |  jmp >2
-    |.else
-    |  fld qword [BASE+RA*8]
-    |  fild dword [BASE+RD*8]
-    |  jmp >3
-    |.endif
     |
     |8:  // RA is an integer, RD is not an integer.
     |  ja ->vmeta_comp
     |  // RA is an integer, RD is a number.
-    |.if SSE
     |  cvtsi2sd xmm1, dword [BASE+RA*8]
     |  movsd xmm0, qword [BASE+RD*8]
     |  add PC, 4
@@ -3963,29 +3579,15 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  jmp_comp jbe, ja, jb, jae, <9
     |  jmp <6
     |.else
-    |  fild dword [BASE+RA*8]
-    |  jmp >2
-    |.endif
-    |.else
     |  checknum RA, ->vmeta_comp
     |  checknum RD, ->vmeta_comp
     |.endif
-    |.if SSE
     |1:
     |  movsd xmm0, qword [BASE+RD*8]
     |2:
     |  add PC, 4
     |  ucomisd xmm0, qword [BASE+RA*8]
     |3:
-    |.else
-    |1:
-    |  fld qword [BASE+RA*8]		// Reverse order, i.e like cmp D, A.
-    |2:
-    |  fld qword [BASE+RD*8]
-    |3:
-    |  add PC, 4
-    |  fcomparepp
-    |.endif
     |  // Unordered: all of ZF CF PF set, ordered: PF clear.
     |  // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
     |.if DUALNUM
@@ -4025,43 +3627,25 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  // RD is a number.
     |  cmp dword [BASE+RA*8+4], LJ_TISNUM; jb >1; jne >5
     |  // RD is a number, RA is an integer.
-    |.if SSE
     |  cvtsi2sd xmm0, dword [BASE+RA*8]
-    |.else
-    |  fild dword [BASE+RA*8]
-    |.endif
     |  jmp >2
     |
     |8:  // RD is an integer, RA is not an integer.
     |  ja >5
     |  // RD is an integer, RA is a number.
-    |.if SSE
     |  cvtsi2sd xmm0, dword [BASE+RD*8]
     |  ucomisd xmm0, qword [BASE+RA*8]
-    |.else
-    |  fild dword [BASE+RD*8]
-    |  fld qword [BASE+RA*8]
-    |.endif
     |  jmp >4
     |
     |.else
     |  cmp RB, LJ_TISNUM; jae >5
     |  checknum RA, >5
     |.endif
-    |.if SSE
     |1:
     |  movsd xmm0, qword [BASE+RA*8]
     |2:
     |  ucomisd xmm0, qword [BASE+RD*8]
     |4:
-    |.else
-    |1:
-    |  fld qword [BASE+RA*8]
-    |2:
-    |  fld qword [BASE+RD*8]
-    |4:
-    |  fcomparepp
-    |.endif
   iseqne_fp:
     if (vk) {
       |  jp >2				// Unordered means not equal.
@@ -4184,39 +3768,21 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  // RA is a number.
     |  cmp dword [KBASE+RD*8+4], LJ_TISNUM; jb >1
     |  // RA is a number, RD is an integer.
-    |.if SSE
     |  cvtsi2sd xmm0, dword [KBASE+RD*8]
-    |.else
-    |  fild dword [KBASE+RD*8]
-    |.endif
     |  jmp >2
     |
     |8:  // RA is an integer, RD is a number.
-    |.if SSE
     |  cvtsi2sd xmm0, dword [BASE+RA*8]
     |  ucomisd xmm0, qword [KBASE+RD*8]
-    |.else
-    |  fild dword [BASE+RA*8]
-    |  fld qword [KBASE+RD*8]
-    |.endif
     |  jmp >4
     |.else
     |  cmp RB, LJ_TISNUM; jae >3
     |.endif
-    |.if SSE
     |1:
     |  movsd xmm0, qword [KBASE+RD*8]
     |2:
     |  ucomisd xmm0, qword [BASE+RA*8]
     |4:
-    |.else
-    |1:
-    |  fld qword [KBASE+RD*8]
-    |2:
-    |  fld qword [BASE+RA*8]
-    |4:
-    |  fcomparepp
-    |.endif
     goto iseqne_fp;
   case BC_ISEQP: case BC_ISNEP:
     vk = op == BC_ISEQP;
@@ -4310,16 +3876,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.else
     |  checknum RD, ->vmeta_unm
     |.endif
-    |.if SSE
     |  movsd xmm0, qword [BASE+RD*8]
     |  sseconst_sign xmm1, RDa
     |  xorps xmm0, xmm1
     |  movsd qword [BASE+RA*8], xmm0
-    |.else
-    |  fld qword [BASE+RD*8]
-    |  fchs
-    |  fstp qword [BASE+RA*8]
-    |.endif
     |.if DUALNUM
     |  jmp <9
     |.else
@@ -4335,15 +3895,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |1:
     |  mov dword [BASE+RA*8+4], LJ_TISNUM
     |  mov dword [BASE+RA*8], RD
-    |.elif SSE
+    |.else
     |  xorps xmm0, xmm0
     |  cvtsi2sd xmm0, dword STR:RD->len
     |1:
     |  movsd qword [BASE+RA*8], xmm0
-    |.else
-    |  fild dword STR:RD->len
-    |1:
-    |  fstp qword [BASE+RA*8]
     |.endif
     |  ins_next
     |2:
@@ -4361,11 +3917,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  // Length of table returned in eax (RD).
     |.if DUALNUM
     |  // Nothing to do.
-    |.elif SSE
-    |  cvtsi2sd xmm0, RD
     |.else
-    |  mov ARG1, RD
-    |  fild ARG1
+    |  cvtsi2sd xmm0, RD
     |.endif
     |  mov BASE, RB			// Restore BASE.
     |  movzx RA, PC_RA
@@ -4380,7 +3933,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
 
   /* -- Binary ops -------------------------------------------------------- */
 
-    |.macro ins_arithpre, x87ins, sseins, ssereg
+    |.macro ins_arithpre, sseins, ssereg
     |  ins_ABC
     ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
     ||switch (vk) {
@@ -4389,37 +3942,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   .if DUALNUM
     |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_vn
     |   .endif
-    |   .if SSE
-    |     movsd xmm0, qword [BASE+RB*8]
-    |     sseins ssereg, qword [KBASE+RC*8]
-    |   .else
-    |     fld qword [BASE+RB*8]
-    |     x87ins qword [KBASE+RC*8]
-    |   .endif
+    |   movsd xmm0, qword [BASE+RB*8]
+    |   sseins ssereg, qword [KBASE+RC*8]
     ||  break;
     ||case 1:
     |   checknum RB, ->vmeta_arith_nv
     |   .if DUALNUM
     |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_nv
     |   .endif
-    |   .if SSE
-    |     movsd xmm0, qword [KBASE+RC*8]
-    |     sseins ssereg, qword [BASE+RB*8]
-    |   .else
-    |     fld qword [KBASE+RC*8]
-    |     x87ins qword [BASE+RB*8]
-    |   .endif
+    |   movsd xmm0, qword [KBASE+RC*8]
+    |   sseins ssereg, qword [BASE+RB*8]
     ||  break;
     ||default:
     |   checknum RB, ->vmeta_arith_vv
     |   checknum RC, ->vmeta_arith_vv
-    |   .if SSE
-    |     movsd xmm0, qword [BASE+RB*8]
-    |     sseins ssereg, qword [BASE+RC*8]
-    |   .else
-    |     fld qword [BASE+RB*8]
-    |     x87ins qword [BASE+RC*8]
-    |   .endif
+    |   movsd xmm0, qword [BASE+RB*8]
+    |   sseins ssereg, qword [BASE+RC*8]
     ||  break;
     ||}
     |.endmacro
@@ -4457,54 +3995,50 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.endmacro
     |
     |.macro ins_arithpost
-    |.if SSE
     |  movsd qword [BASE+RA*8], xmm0
-    |.else
-    |  fstp qword [BASE+RA*8]
-    |.endif
     |.endmacro
     |
-    |.macro ins_arith, x87ins, sseins
-    |  ins_arithpre x87ins, sseins, xmm0
+    |.macro ins_arith, sseins
+    |  ins_arithpre sseins, xmm0
     |  ins_arithpost
     |  ins_next
     |.endmacro
     |
-    |.macro ins_arith, intins, x87ins, sseins
+    |.macro ins_arith, intins, sseins
     |.if DUALNUM
     |  ins_arithdn intins
     |.else
-    |  ins_arith, x87ins, sseins
+    |  ins_arith, sseins
     |.endif
     |.endmacro
 
     |  // RA = dst, RB = src1 or num const, RC = src2 or num const
   case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
-    |  ins_arith add, fadd, addsd
+    |  ins_arith add, addsd
     break;
   case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
-    |  ins_arith sub, fsub, subsd
+    |  ins_arith sub, subsd
     break;
   case BC_MULVN: case BC_MULNV: case BC_MULVV:
-    |  ins_arith imul, fmul, mulsd
+    |  ins_arith imul, mulsd
     break;
   case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
-    |  ins_arith fdiv, divsd
+    |  ins_arith divsd
     break;
   case BC_MODVN:
-    |  ins_arithpre fld, movsd, xmm1
+    |  ins_arithpre movsd, xmm1
     |->BC_MODVN_Z:
     |  call ->vm_mod
     |  ins_arithpost
     |  ins_next
     break;
   case BC_MODNV: case BC_MODVV:
-    |  ins_arithpre fld, movsd, xmm1
+    |  ins_arithpre movsd, xmm1
     |  jmp ->BC_MODVN_Z			// Avoid 3 copies. It's slow anyway.
     break;
   case BC_POW:
-    |  ins_arithpre fld, movsd, xmm1
-    |  call ->vm_pow
+    |  ins_arithpre movsd, xmm1
+    |  call ->vm_pow_sse
     |  ins_arithpost
     |  ins_next
     break;
@@ -4573,25 +4107,17 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  movsx RD, RDW
     |  mov dword [BASE+RA*8+4], LJ_TISNUM
     |  mov dword [BASE+RA*8], RD
-    |.elif SSE
+    |.else
     |  movsx RD, RDW			// Sign-extend literal.
     |  cvtsi2sd xmm0, RD
     |  movsd qword [BASE+RA*8], xmm0
-    |.else
-    |  fild PC_RD			// Refetch signed RD from instruction.
-    |  fstp qword [BASE+RA*8]
     |.endif
     |  ins_next
     break;
   case BC_KNUM:
     |  ins_AD	// RA = dst, RD = num const
-    |.if SSE
     |  movsd xmm0, qword [KBASE+RD*8]
     |  movsd qword [BASE+RA*8], xmm0
-    |.else
-    |  fld qword [KBASE+RD*8]
-    |  fstp qword [BASE+RA*8]
-    |.endif
     |  ins_next
     break;
   case BC_KPRI:
@@ -4698,18 +4224,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
   case BC_USETN:
     |  ins_AD	// RA = upvalue #, RD = num const
     |  mov LFUNC:RB, [BASE-8]
-    |.if SSE
     |  movsd xmm0, qword [KBASE+RD*8]
-    |.else
-    |  fld qword [KBASE+RD*8]
-    |.endif
     |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
     |  mov RA, UPVAL:RB->v
-    |.if SSE
     |  movsd qword [RA], xmm0
-    |.else
-    |  fstp qword [RA]
-    |.endif
     |  ins_next
     break;
   case BC_USETP:
@@ -4863,18 +4381,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.else
     |  // Convert number to int and back and compare.
     |  checknum RC, >5
-    |.if SSE
     |  movsd xmm0, qword [BASE+RC*8]
     |  cvtsd2si RC, xmm0
     |  cvtsi2sd xmm1, RC
     |  ucomisd xmm0, xmm1
-    |.else
-    |  fld qword [BASE+RC*8]
-    |  fist ARG1
-    |  fild ARG1
-    |  fcomparepp
-    |  mov RC, ARG1
-    |.endif
     |  jne ->vmeta_tgetv		// Generic numeric key? Use fallback.
     |.endif
     |  cmp RC, TAB:RB->asize	// Takes care of unordered, too.
@@ -5011,18 +4521,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.else
     |  // Convert number to int and back and compare.
     |  checknum RC, >5
-    |.if SSE
     |  movsd xmm0, qword [BASE+RC*8]
     |  cvtsd2si RC, xmm0
     |  cvtsi2sd xmm1, RC
     |  ucomisd xmm0, xmm1
-    |.else
-    |  fld qword [BASE+RC*8]
-    |  fist ARG1
-    |  fild ARG1
-    |  fcomparepp
-    |  mov RC, ARG1
-    |.endif
     |  jne ->vmeta_tsetv		// Generic numeric key? Use fallback.
     |.endif
     |  cmp RC, TAB:RB->asize		// Takes care of unordered, too.
@@ -5386,10 +4888,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.if DUALNUM
     |  mov dword [BASE+RA*8+4], LJ_TISNUM
     |  mov dword [BASE+RA*8], RC
-    |.elif SSE
-    |  cvtsi2sd xmm0, RC
     |.else
-    |  fild dword [BASE+RA*8-8]
+    |  cvtsi2sd xmm0, RC
     |.endif
     |  // Copy array slot to returned value.
     |.if X64
@@ -5405,10 +4905,8 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  // Return array index as a numeric key.
     |.if DUALNUM
     |  // See above.
-    |.elif SSE
-    |  movsd qword [BASE+RA*8], xmm0
     |.else
-    |  fstp qword [BASE+RA*8]
+    |  movsd qword [BASE+RA*8], xmm0
     |.endif
     |  mov [BASE+RA*8-8], RC		// Update control var.
     |2:
@@ -5421,9 +4919,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |
     |4:  // Skip holes in array part.
     |  add RC, 1
-    |.if not (DUALNUM or SSE)
-    |  mov [BASE+RA*8-8], RC
-    |.endif
     |  jmp <1
     |
     |5:  // Traverse hash part.
@@ -5757,7 +5252,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     if (!vk) {
       |  cmp RB, LJ_TISNUM; jae ->vmeta_for
     }
-    |.if SSE
     |  movsd xmm0, qword FOR_IDX
     |  movsd xmm1, qword FOR_STOP
     if (vk) {
@@ -5770,22 +5264,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  ucomisd xmm1, xmm0
     |1:
     |  movsd qword FOR_EXT, xmm0
-    |.else
-    |  fld qword FOR_STOP
-    |  fld qword FOR_IDX
-    if (vk) {
-      |  fadd qword FOR_STEP		// nidx = idx + step
-      |  fst qword FOR_IDX
-      |  fst qword FOR_EXT
-      |  test RB, RB; js >1
-    } else {
-      |  fst qword FOR_EXT
-      |  jl >1
-    }
-    |  fxch				// Swap lim/(n)idx if step non-negative.
-    |1:
-    |  fcomparepp
-    |.endif
     if (op == BC_FORI) {
       |.if DUALNUM
       |  jnb <7
@@ -5813,11 +5291,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |2:
     |  ins_next
     |.endif
-    |.if SSE
+    |
     |3:  // Invert comparison if step is negative.
     |  ucomisd xmm0, xmm1
     |  jmp <1
-    |.endif
     break;
 
   case BC_ITERL: