diff --git a/src/Makefile b/src/Makefile index a2be1a18..0150b049 100644 --- a/src/Makefile +++ b/src/Makefile @@ -331,7 +331,7 @@ LJCORE_O= lj_gc.o lj_err.o lj_char.o lj_bc.o lj_obj.o \ lj_state.o lj_dispatch.o lj_vmevent.o lj_api.o \ lj_lex.o lj_parse.o \ lj_ir.o lj_opt_mem.o lj_opt_fold.o lj_opt_narrow.o \ - lj_opt_dce.o lj_opt_loop.o \ + lj_opt_dce.o lj_opt_loop.o lj_opt_split.o \ lj_mcode.o lj_snap.o lj_record.o lj_crecord.o lj_ffrecord.o \ lj_asm.o lj_trace.o lj_gdbjit.o \ lj_ctype.o lj_cdata.o lj_cconv.o lj_ccall.o lj_carith.o lj_clib.o \ diff --git a/src/Makefile.dep b/src/Makefile.dep index 3d0c4239..1534ac27 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -128,6 +128,8 @@ lj_opt_mem.o: lj_opt_mem.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \ lj_arch.h lj_str.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \ lj_dispatch.h lj_traceerr.h +lj_opt_split.o: lj_opt_split.c lj_obj.h lua.h luaconf.h lj_def.h \ + lj_arch.h lj_parse.o: lj_parse.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_state.h \ lj_bc.h lj_ctype.h lj_lex.h lj_parse.h lj_vm.h lj_vmevent.h @@ -167,10 +169,11 @@ ljamalg.o: ljamalg.c lua.h luaconf.h lauxlib.h lj_gc.c lj_obj.h lj_def.h \ lj_cconv.h lj_cconv.c lj_ccall.c lj_ccall.h lj_carith.c lj_carith.h \ lj_clib.c lj_clib.h lj_cparse.c lj_cparse.h lj_lib.c lj_lib.h lj_ir.c \ lj_iropt.h lj_opt_mem.c lj_opt_fold.c lj_folddef.h lj_opt_narrow.c \ - lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_mcode.c lj_mcode.h lj_snap.c \ - lj_target.h lj_target_*.h lj_record.c lj_record.h lj_ffrecord.h \ - lj_crecord.c lj_crecord.h lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h \ - lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c lib_aux.c lib_base.c \ - lj_libdef.h lib_math.c lib_string.c lib_table.c lib_io.c lib_os.c \ - lib_package.c lib_debug.c lib_bit.c lib_jit.c lib_ffi.c lib_init.c + lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_opt_split.c lj_mcode.c \ + lj_mcode.h lj_snap.c lj_target.h lj_target_*.h lj_record.c lj_record.h \ + lj_ffrecord.h lj_crecord.c lj_crecord.h lj_ffrecord.c lj_recdef.h \ + lj_asm.c lj_asm.h lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c \ + lib_aux.c lib_base.c lj_libdef.h lib_math.c lib_string.c lib_table.c \ + lib_io.c lib_os.c lib_package.c lib_debug.c lib_bit.c lib_jit.c \ + lib_ffi.c lib_init.c luajit.o: luajit.c lua.h luaconf.h lauxlib.h lualib.h luajit.h lj_arch.h diff --git a/src/lj_asm.c b/src/lj_asm.c index cc2ae597..441700d4 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -347,6 +347,20 @@ static void emit_addptr(ASMState *as, Reg r, int32_t ofs) } } +/* op rm/mrm, i */ +static void emit_gmrmi(ASMState *as, x86Group xg, Reg rb, int32_t i) +{ + x86Op xo; + if (checki8(i)) { + emit_i8(as, i); + xo = XG_TOXOi8(xg); + } else { + emit_i32(as, i); + xo = XG_TOXOi(xg); + } + emit_mrm(as, xo, (Reg)(xg & 7) | (rb & REX_64), (rb & ~REX_64)); +} + /* -- Emit moves ---------------------------------------------------------- */ /* mov [base+ofs], i */ @@ -371,7 +385,10 @@ static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i) /* mov r, i / xor r, r */ static void emit_loadi(ASMState *as, Reg r, int32_t i) { - if (i == 0) { + /* XOR r,r is shorter, but modifies the flags. This is bad for HIOP. 
*/ + if (i == 0 && !(LJ_32 && (IR(as->curins)->o == IR_HIOP || + (as->curins+1 < as->T->nins && + IR(as->curins+1)->o == IR_HIOP)))) { emit_rr(as, XO_ARITH(XOg_XOR), r, r); } else { MCode *p = as->mcp; @@ -422,6 +439,19 @@ static void emit_loadn(ASMState *as, Reg r, cTValue *tv) /* Label for short jumps. */ typedef MCode *MCLabel; +#if LJ_32 && LJ_HASFFI +/* jmp short target */ +static void emit_sjmp(ASMState *as, MCLabel target) +{ + MCode *p = as->mcp; + ptrdiff_t delta = target - p; + lua_assert(delta == (int8_t)delta); + p[-1] = (MCode)(int8_t)delta; + p[-2] = XI_JMPs; + as->mcp = p - 2; +} +#endif + /* jcc short target */ static void emit_sjcc(ASMState *as, int cc, MCLabel target) { @@ -630,7 +660,7 @@ static Reg ra_rematk(ASMState *as, IRIns *ir) } else if (ir->o == IR_KPRI) { /* REF_NIL stores ASMREF_L register. */ lua_assert(irt_isnil(ir->t)); emit_getgl(as, r, jit_L); -#if LJ_64 /* NYI: 32 bit register pairs. */ +#if LJ_64 } else if (ir->o == IR_KINT64) { emit_loadu64(as, r, ir_kint64(ir)->u64); #endif @@ -681,8 +711,7 @@ static Reg ra_releasetmp(ASMState *as, IRRef ref) #if LJ_64 #define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0)) #else -/* NYI: 32 bit register pairs. */ -#define REX_64IR(ir, r) check_exp(!irt_is64((ir)->t), (r)) +#define REX_64IR(ir, r) (r) #endif /* Generic move between two regs. */ @@ -939,7 +968,7 @@ static void ra_left(ASMState *as, Reg dest, IRRef lref) emit_loadn(as, dest, tv); return; } -#if LJ_64 /* NYI: 32 bit register pairs. */ +#if LJ_64 } else if (ir->o == IR_KINT64) { emit_loadu64(as, dest, ir_kint64(ir)->u64); return; @@ -1463,7 +1492,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) #endif if (r) { /* Argument is in a register. */ if (r < RID_MAX_GPR && ref < ASMREF_TMP1) { -#if LJ_64 /* NYI: 32 bit register pairs. */ +#if LJ_64 if (ir->o == IR_KINT64) emit_loadu64(as, r, ir_kint64(ir)->u64); else @@ -1519,7 +1548,7 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci) ra_evictset(as, drop); /* Evictions must be performed first. */ if (ra_used(ir)) { if (irt_isfp(ir->t)) { - int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ + int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ #if LJ_64 if ((ci->flags & CCI_CASTU64)) { Reg dest = ir->r; @@ -1632,19 +1661,24 @@ static void asm_conv(ASMState *as, IRIns *ir) int stfp = (st == IRT_NUM || st == IRT_FLOAT); IRRef lref = ir->op1; lua_assert(irt_type(ir->t) != st); + lua_assert(!(LJ_32 && (irt_isint64(ir->t) || st64))); /* Handled by SPLIT. */ if (irt_isfp(ir->t)) { Reg dest = ra_dest(as, ir, RSET_FPR); if (stfp) { /* FP to FP conversion. */ Reg left = asm_fuseload(as, lref, RSET_FPR); emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left); if (left == dest) return; /* Avoid the XO_XORPS. */ -#if LJ_32 - } else if (st >= IRT_U32) { - /* NYI: 64 bit integer or uint32_t to number conversion. */ - setintV(&as->J->errinfo, ir->o); - lj_trace_err_info(as->J, LJ_TRERR_NYIIR); + } else if (LJ_32 && st == IRT_U32) { /* U32 to FP conversion on x86. */ + /* number = (2^52+2^51 .. u32) - (2^52+2^51) */ + cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000)); + Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest)); + if (irt_isfloat(ir->t)) + emit_rr(as, XO_CVTSD2SS, dest, dest); + emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */ + emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. 
*/ + emit_loadn(as, bias, k); + emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR)); return; -#endif } else { /* Integer to FP conversion. */ Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ? ra_alloc1(as, lref, RSET_GPR) : @@ -1663,41 +1697,47 @@ static void asm_conv(ASMState *as, IRIns *ir) emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */ } else if (stfp) { /* FP to integer conversion. */ if (irt_isguard(ir->t)) { - lua_assert(!irt_is64(ir->t)); /* No support for checked 64 bit conv. */ + /* Checked conversions are only supported from number to int. */ + lua_assert(irt_isint(ir->t) && st == IRT_NUM); asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR)); -#if LJ_32 - } else if (irt_isi64(ir->t) || irt_isu64(ir->t) || irt_isu32(ir->t)) { - /* NYI: number to 64 bit integer or uint32_t conversion. */ - setintV(&as->J->errinfo, ir->o); - lj_trace_err_info(as->J, LJ_TRERR_NYIIR); -#endif } else { Reg dest = ra_dest(as, ir, RSET_GPR); x86Op op = st == IRT_NUM ? ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) : ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI); - if (LJ_64 && irt_isu64(ir->t)) { - const void *k = lj_ir_k64_find(as->J, U64x(c3f00000,00000000)); - MCLabel l_end = emit_label(as); - Reg left = IR(lref)->r; + if (LJ_32 && irt_isu32(ir->t)) { /* FP to U32 conversion on x86. */ + /* u32 = (int32_t)(number - 2^31) + 2^31 */ + Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) : + ra_scratch(as, RSET_FPR); + emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000); + emit_rr(as, op, dest, tmp); + if (st == IRT_NUM) + emit_rma(as, XO_ADDSD, tmp, + lj_ir_k64_find(as->J, U64x(c1e00000,00000000))); + else + emit_rma(as, XO_ADDSS, tmp, + lj_ir_k64_find(as->J, U64x(00000000,cf000000))); + ra_left(as, tmp, lref); + } else if (LJ_64 && irt_isu64(ir->t)) { /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */ - if (ra_hasreg(left)) { - Reg tmpn = ra_scratch(as, rset_exclude(RSET_FPR, left)); - emit_rr(as, op, dest|REX_64, tmpn); - emit_rr(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, tmpn, left); - emit_rma(as, st == IRT_NUM ? XMM_MOVRM(as) : XO_MOVSS, tmpn, k); - } else { - left = ra_allocref(as, lref, RSET_FPR); - emit_rr(as, op, dest|REX_64, left); - emit_rma(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, left, k); - } + Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) : + ra_scratch(as, RSET_FPR); + MCLabel l_end = emit_label(as); + emit_rr(as, op, dest|REX_64, tmp); + if (st == IRT_NUM) + emit_rma(as, XO_ADDSD, tmp, + lj_ir_k64_find(as->J, U64x(c3f00000,00000000))); + else + emit_rma(as, XO_ADDSS, tmp, + lj_ir_k64_find(as->J, U64x(00000000,df800000))); emit_sjcc(as, CC_NS, l_end); emit_rr(as, XO_TEST, dest|REX_64, dest); /* Check if dest < 2^63. */ - emit_rr(as, op, dest|REX_64, left); + emit_rr(as, op, dest|REX_64, tmp); + ra_left(as, tmp, lref); } else { Reg left = asm_fuseload(as, lref, RSET_FPR); if (LJ_64 && irt_isu32(ir->t)) - emit_rr(as, XO_MOV, dest, dest); /* Zero upper 32 bits. */ + emit_rr(as, XO_MOV, dest, dest); /* Zero hiword. */ emit_mrm(as, op, dest|((LJ_64 && (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0), @@ -1728,12 +1768,10 @@ static void asm_conv(ASMState *as, IRIns *ir) emit_mrm(as, op, dest, left); } } else { /* 32/64 bit integer conversions. */ - if (irt_is64(ir->t)) { -#if LJ_32 - /* NYI: conversion to 64 bit integers. */ - setintV(&as->J->errinfo, ir->o); - lj_trace_err_info(as->J, LJ_TRERR_NYIIR); -#else + if (LJ_32) { /* Only need to handle 32/32 bit no-op (cast) on x86. 
*/ + Reg dest = ra_dest(as, ir, RSET_GPR); + ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */ + } else if (irt_is64(ir->t)) { Reg dest = ra_dest(as, ir, RSET_GPR); if (st64 || !(ir->op2 & IRCONV_SEXT)) { /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */ @@ -1742,21 +1780,14 @@ static void asm_conv(ASMState *as, IRIns *ir) Reg left = asm_fuseload(as, lref, RSET_GPR); emit_mrm(as, XO_MOVSXd, dest|REX_64, left); } -#endif } else { Reg dest = ra_dest(as, ir, RSET_GPR); if (st64) { -#if LJ_32 - /* NYI: conversion from 64 bit integers. */ - setintV(&as->J->errinfo, ir->o); - lj_trace_err_info(as->J, LJ_TRERR_NYIIR); -#else Reg left = asm_fuseload(as, lref, RSET_GPR); - /* This is either a 32 bit reg/reg mov which zeroes the hi-32 bits - ** or a load of the lower 32 bits from a 64 bit address. + /* This is either a 32 bit reg/reg mov which zeroes the hiword + ** or a load of the loword from a 64 bit address. */ emit_mrm(as, XO_MOV, dest, left); -#endif } else { /* 32/32 bit no-op (cast). */ ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */ } @@ -1764,6 +1795,93 @@ static void asm_conv(ASMState *as, IRIns *ir) } } +#if LJ_32 && LJ_HASFFI +/* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */ + +/* 64 bit integer to FP conversion in 32 bit mode. */ +static void asm_conv_fp_int64(ASMState *as, IRIns *ir) +{ + Reg hi = ra_alloc1(as, ir->op1, RSET_GPR); + Reg lo = ra_alloc1(as, (ir-1)->op1, rset_exclude(RSET_GPR, hi)); + int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ + Reg dest = ir->r; + if (ra_hasreg(dest)) { + ra_free(as, dest); + ra_modified(as, dest); + emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS, + dest, RID_ESP, ofs); + } + emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd, + irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs); + if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) { + /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */ + MCLabel l_end = emit_label(as); + emit_rma(as, XO_FADDq, XOg_FADDq, + lj_ir_k64_find(as->J, U64x(43f00000,00000000))); + emit_sjcc(as, CC_NS, l_end); + emit_rr(as, XO_TEST, hi, hi); /* Check if u64 >= 2^63. */ + } else { + lua_assert(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64); + } + emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0); + /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */ + emit_rmro(as, XO_MOVto, hi, RID_ESP, 4); + emit_rmro(as, XO_MOVto, lo, RID_ESP, 0); +} + +/* FP to 64 bit integer conversion in 32 bit mode. */ +static void asm_conv_int64_fp(ASMState *as, IRIns *ir) +{ + IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK); + IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH); + Reg lo, hi; + lua_assert(st == IRT_NUM || st == IRT_FLOAT); + lua_assert(dt == IRT_I64 || dt == IRT_U64); + lua_assert(((ir-1)->op2 & IRCONV_TRUNC)); + hi = ra_dest(as, ir, RSET_GPR); + lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi)); + if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0); + /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */ + if (!(as->flags & JIT_F_SSE3)) { /* Set FPU rounding mode to default. */ + emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 4); + emit_rmro(as, XO_MOVto, lo, RID_ESP, 4); + emit_gri(as, XG_ARITHi(XOg_AND), lo, 0xf3ff); + } + if (dt == IRT_U64) { + /* For inputs in [2^63,2^64-1] add -2^64 and convert again. 
*/ + MCLabel l_pop, l_end = emit_label(as); + emit_x87op(as, XI_FPOP); + l_pop = emit_label(as); + emit_sjmp(as, l_end); + emit_rmro(as, XO_MOV, hi, RID_ESP, 4); + if ((as->flags & JIT_F_SSE3)) + emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0); + else + emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0); + emit_rma(as, XO_FADDq, XOg_FADDq, + lj_ir_k64_find(as->J, U64x(c3f00000,00000000))); + emit_sjcc(as, CC_NS, l_pop); + emit_rr(as, XO_TEST, hi, hi); /* Check if out-of-range (2^63). */ + } + emit_rmro(as, XO_MOV, hi, RID_ESP, 4); + if ((as->flags & JIT_F_SSE3)) { /* Truncation is easy with SSE3. */ + emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0); + } else { /* Otherwise set FPU rounding mode to truncate before the store. */ + emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0); + emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 0); + emit_rmro(as, XO_MOVtow, lo, RID_ESP, 0); + emit_rmro(as, XO_ARITHw(XOg_OR), lo, RID_ESP, 0); + emit_loadi(as, lo, 0xc00); + emit_rmro(as, XO_FNSTCW, XOg_FNSTCW, RID_ESP, 0); + } + if (dt == IRT_U64) + emit_x87op(as, XI_FDUP); + emit_mrm(as, st == IRT_NUM ? XO_FLDq : XO_FLDd, + st == IRT_NUM ? XOg_FLDq: XOg_FLDd, + asm_fuseload(as, ir->op1, RSET_EMPTY)); +} +#endif + static void asm_strto(ASMState *as, IRIns *ir) { /* Force a spill slot for the destination register (if any). */ @@ -2644,6 +2762,18 @@ static void asm_powi(ASMState *as, IRIns *ir) ra_left(as, RID_EAX, ir->op2); } +#if LJ_64 && LJ_HASFFI +static void asm_arith64(ASMState *as, IRIns *ir, IRCallID id) +{ + const CCallInfo *ci = &lj_ir_callinfo[id]; + IRRef args[2]; + args[0] = ir->op1; + args[1] = ir->op2; + asm_setupresult(as, ir, ci); + asm_gencall(as, ci, args); +} +#endif + /* Find out whether swapping operands might be beneficial. */ static int swapops(ASMState *as, IRIns *ir) { @@ -2877,12 +3007,30 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs) /* -- Comparisons --------------------------------------------------------- */ /* Virtual flags for unordered FP comparisons. */ -#define VCC_U 0x100 /* Unordered. */ -#define VCC_P 0x200 /* Needs extra CC_P branch. */ -#define VCC_S 0x400 /* Swap avoids CC_P branch. */ +#define VCC_U 0x1000 /* Unordered. */ +#define VCC_P 0x2000 /* Needs extra CC_P branch. */ +#define VCC_S 0x4000 /* Swap avoids CC_P branch. */ #define VCC_PS (VCC_P|VCC_S) -static void asm_comp_(ASMState *as, IRIns *ir, int cc) +/* Map of comparisons to flags. ORDER IR. */ +#define COMPFLAGS(ci, cin, cu, cf) ((ci)+((cu)<<4)+((cin)<<8)+(cf)) +static const uint16_t asm_compmap[IR_ABC+1] = { + /* signed non-eq unsigned flags */ + /* LT */ COMPFLAGS(CC_GE, CC_G, CC_AE, VCC_PS), + /* GE */ COMPFLAGS(CC_L, CC_L, CC_B, 0), + /* LE */ COMPFLAGS(CC_G, CC_G, CC_A, VCC_PS), + /* GT */ COMPFLAGS(CC_LE, CC_L, CC_BE, 0), + /* ULT */ COMPFLAGS(CC_AE, CC_A, CC_AE, VCC_U), + /* UGE */ COMPFLAGS(CC_B, CC_B, CC_B, VCC_U|VCC_PS), + /* ULE */ COMPFLAGS(CC_A, CC_A, CC_A, VCC_U), + /* UGT */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS), + /* EQ */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P), + /* NE */ COMPFLAGS(CC_E, CC_E, CC_E, VCC_U|VCC_P), + /* ABC */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS) /* Same as UGT. */ +}; + +/* FP and integer comparisons. */ +static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc) { if (irt_isnum(ir->t)) { IRRef lref = ir->op1; @@ -3008,15 +3156,7 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc) if (irl+1 == ir) /* Referencing previous ins? */ as->testmcp = as->mcp; /* Set flag to drop test r,r if possible. 
*/ } else { - x86Op xo; - if (checki8(imm)) { - emit_i8(as, imm); - xo = XO_ARITHi8; - } else { - emit_i32(as, imm); - xo = XO_ARITHi; - } - emit_mrm(as, xo, r64 + XOg_CMP, left); + emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm); } } } else { @@ -3028,8 +3168,133 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc) } } -#define asm_comp(as, ir, ci, cf, cu) \ - asm_comp_(as, ir, (ci)+((cf)<<4)+(cu)) +#if LJ_32 && LJ_HASFFI +/* 64 bit integer comparisons in 32 bit mode. */ +static void asm_comp_int64(ASMState *as, IRIns *ir) +{ + uint32_t cc = asm_compmap[(ir-1)->o]; + RegSet allow = RSET_GPR; + Reg lefthi = RID_NONE, leftlo = RID_NONE; + Reg righthi = RID_NONE, rightlo = RID_NONE; + MCLabel l_around; + x86ModRM mrm; + + as->curins--; /* Skip loword ins. Avoids failing in noconflict(), too. */ + + /* Allocate/fuse hiword operands. */ + if (irref_isk(ir->op2)) { + lefthi = asm_fuseload(as, ir->op1, allow); + } else { + lefthi = ra_alloc1(as, ir->op1, allow); + righthi = asm_fuseload(as, ir->op2, allow); + if (righthi == RID_MRM) { + if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base); + if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx); + } else { + rset_clear(allow, righthi); + } + } + mrm = as->mrm; /* Save state for hiword instruction. */ + + /* Allocate/fuse loword operands. */ + if (irref_isk((ir-1)->op2)) { + leftlo = asm_fuseload(as, (ir-1)->op1, allow); + } else { + leftlo = ra_alloc1(as, (ir-1)->op1, allow); + rightlo = asm_fuseload(as, (ir-1)->op2, allow); + if (rightlo == RID_MRM) { + if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base); + if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx); + } else { + rset_clear(allow, rightlo); + } + } + + /* All register allocations must be performed _before_ this point. */ + l_around = emit_label(as); + as->invmcp = as->testmcp = NULL; /* Cannot use these optimizations. */ + + /* Loword comparison and branch. */ + asm_guardcc(as, cc >> 4); /* Always use unsigned compare for loword. */ + if (ra_noreg(rightlo)) { + int32_t imm = IR((ir-1)->op2)->i; + if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM) + emit_rr(as, XO_TEST, leftlo, leftlo); + else + emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm); + } else { + emit_mrm(as, XO_CMP, leftlo, rightlo); + } + + /* Hiword comparison and branches. */ + if ((cc & 15) != CC_NE) + emit_sjcc(as, CC_NE, l_around); /* Hiword unequal: skip loword compare. */ + if ((cc & 15) != CC_E) + asm_guardcc(as, cc >> 8); /* Hiword compare without equality check. */ + as->mrm = mrm; /* Restore state. */ + if (ra_noreg(righthi)) { + int32_t imm = IR(ir->op2)->i; + if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM) + emit_rr(as, XO_TEST, lefthi, lefthi); + else + emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm); + } else { + emit_mrm(as, XO_CMP, lefthi, righthi); + } +} +#endif + +/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */ + +/* Hiword op of a split 64 bit op. Previous op must be the loword op. */ +static void asm_hiop(ASMState *as, IRIns *ir) +{ +#if LJ_32 && LJ_HASFFI + /* HIOP is marked as a store because it needs its own DCE logic. */ + int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */ + if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1; + if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */ + if (usehi || uselo) { + if (irt_isfp(ir->t)) + asm_conv_fp_int64(as, ir); + else + asm_conv_int64_fp(as, ir); + } + as->curins--; /* Always skip the CONV. 
*/ + return; + } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */ + asm_comp_int64(as, ir); + return; + } + if (!usehi) return; /* Skip unused hiword op for all remaining ops. */ + switch ((ir-1)->o) { + case IR_ADD: + asm_intarith(as, ir, uselo ? XOg_ADC : XOg_ADD); + break; + case IR_SUB: + asm_intarith(as, ir, uselo ? XOg_SBB : XOg_SUB); + break; + case IR_NEG: { + Reg dest = ra_dest(as, ir, RSET_GPR); + emit_rr(as, XO_GROUP3, XOg_NEG, dest); + if (uselo) { + emit_i8(as, 0); + emit_rr(as, XO_ARITHi8, XOg_ADC, dest); + } + ra_left(as, dest, ir->op1); + break; + } + case IR_CALLN: + ra_destreg(as, ir, RID_RETHI); + if (!uselo) + ra_allocref(as, ir->op1, RID2RSET(RID_RET)); /* Mark call as used. */ + break; + default: lua_assert(0); break; + } +#else + UNUSED(as); UNUSED(ir); lua_assert(0); /* Unused on x64 or without FFI. */ +#endif +} /* -- Stack handling ------------------------------------------------------ */ @@ -3682,21 +3947,16 @@ static void asm_ir(ASMState *as, IRIns *ir) switch ((IROp)ir->o) { /* Miscellaneous ops. */ case IR_LOOP: asm_loop(as); break; - case IR_NOP: break; + case IR_NOP: lua_assert(!ra_used(ir)); break; case IR_PHI: asm_phi(as, ir); break; + case IR_HIOP: asm_hiop(as, ir); break; /* Guarded assertions. */ - case IR_LT: asm_comp(as, ir, CC_GE, CC_AE, VCC_PS); break; - case IR_GE: asm_comp(as, ir, CC_L, CC_B, 0); break; - case IR_LE: asm_comp(as, ir, CC_G, CC_A, VCC_PS); break; - case IR_GT: asm_comp(as, ir, CC_LE, CC_BE, 0); break; - case IR_ULT: asm_comp(as, ir, CC_AE, CC_AE, VCC_U); break; - case IR_UGE: asm_comp(as, ir, CC_B, CC_B, VCC_U|VCC_PS); break; - case IR_ULE: asm_comp(as, ir, CC_A, CC_A, VCC_U); break; - case IR_ABC: - case IR_UGT: asm_comp(as, ir, CC_BE, CC_BE, VCC_U|VCC_PS); break; - case IR_EQ: asm_comp(as, ir, CC_NE, CC_NE, VCC_P); break; - case IR_NE: asm_comp(as, ir, CC_E, CC_E, VCC_U|VCC_P); break; + case IR_LT: case IR_GE: case IR_LE: case IR_GT: + case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT: + case IR_EQ: case IR_NE: case IR_ABC: + asm_comp(as, ir, asm_compmap[ir->o]); + break; case IR_RETF: asm_retf(as, ir); break; @@ -3744,7 +4004,15 @@ static void asm_ir(ASMState *as, IRIns *ir) case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: asm_fpmath(as, ir); break; - case IR_POWI: asm_powi(as, ir); break; + case IR_POWI: +#if LJ_64 && LJ_HASFFI + if (!irt_isnum(ir->t)) + asm_arith64(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 : + IRCALL_lj_carith_powu64); + else +#endif + asm_powi(as, ir); + break; /* Overflow-checking arithmetic ops. Note: don't use LEA here! */ case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break; @@ -3801,6 +4069,7 @@ static void asm_trace(ASMState *as) { for (as->curins--; as->curins > as->stopins; as->curins--) { IRIns *ir = IR(as->curins); + lua_assert(!(LJ_32 && irt_isint64(ir->t))); /* Handled by SPLIT. */ if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE)) continue; /* Dead-code elimination can be soooo easy. */ if (irt_isguard(ir->t)) @@ -3864,11 +4133,10 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T) case IR_CALLN: case IR_CALLL: case IR_CALLS: { const CCallInfo *ci = &lj_ir_callinfo[ir->op2]; #if LJ_64 - /* NYI: add stack slots for x64 calls with many args. */ lua_assert(CCI_NARGS(ci) <= (LJ_ABI_WIN ? 4 : 6)); ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET); #else - /* NYI: not fastcall-aware, but doesn't matter (yet). 
*/ + lua_assert(!(ci->flags & CCI_FASTCALL) || CCI_NARGS(ci) <= 2); if (CCI_NARGS(ci) > (uint32_t)as->evenspill) /* Leave room for args. */ as->evenspill = (int32_t)CCI_NARGS(ci); ir->prev = REGSP_HINT(RID_RET); @@ -3878,6 +4146,12 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T) (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH; continue; } +#if LJ_32 && LJ_HASFFI + case IR_HIOP: + if ((ir-1)->o == IR_CALLN) + ir->prev = REGSP_HINT(RID_RETHI); + break; +#endif /* C calls evict all scratch regs and return results in RID_RET. */ case IR_SNEW: case IR_NEWREF: #if !LJ_64 @@ -3894,6 +4168,14 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T) as->modset = RSET_SCRATCH; break; case IR_POWI: +#if LJ_64 && LJ_HASFFI + if (!irt_isnum(ir->t)) { + ir->prev = REGSP_HINT(RID_RET); + if (inloop) + as->modset |= (RSET_SCRATCH & RSET_GPR); + continue; + } +#endif ir->prev = REGSP_HINT(RID_XMM0); if (inloop) as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX); diff --git a/src/lj_carith.c b/src/lj_carith.c index 46f07be7..134a61fb 100644 --- a/src/lj_carith.c +++ b/src/lj_carith.c @@ -230,6 +230,14 @@ int lj_carith_op(lua_State *L, MMS mm) /* -- 64 bit integer arithmetic helpers ----------------------------------- */ +#if LJ_32 +/* Signed/unsigned 64 bit multiply. */ +int64_t lj_carith_mul64(int64_t a, int64_t b) +{ + return a * b; +} +#endif + /* Unsigned 64 bit x^k. */ uint64_t lj_carith_powu64(uint64_t x, uint64_t k) { diff --git a/src/lj_carith.h b/src/lj_carith.h index 6870172b..14073603 100644 --- a/src/lj_carith.h +++ b/src/lj_carith.h @@ -12,6 +12,9 @@ LJ_FUNC int lj_carith_op(lua_State *L, MMS mm); +#if LJ_32 +LJ_FUNC int64_t lj_carith_mul64(int64_t x, int64_t k); +#endif LJ_FUNC uint64_t lj_carith_powu64(uint64_t x, uint64_t k); LJ_FUNC int64_t lj_carith_powi64(int64_t x, int64_t k); diff --git a/src/lj_crecord.c b/src/lj_crecord.c index 61210907..5eafa3a7 100644 --- a/src/lj_crecord.c +++ b/src/lj_crecord.c @@ -189,6 +189,7 @@ static void crec_ct_ct(jit_State *J, CType *d, CType *s, TRef dp, TRef sp, sp = emitconv(sp, dsize < 4 ? IRT_INT : dt, st, 0); #endif xstore: + if (dt == IRT_I64 || dt == IRT_U64) lj_needsplit(J); emitir(IRT(IR_XSTORE, dt), dp, sp); break; case CCX(I, C): @@ -311,6 +312,7 @@ static TRef crec_tv_ct(jit_State *J, CType *s, CTypeID sid, TRef sp) TRef ptr = emitir(IRT(IR_ADD, IRT_PTR), dp, lj_ir_kintp(J, sizeof(GCcdata))); emitir(IRT(IR_XSTORE, t), ptr, tr); + lj_needsplit(J); return dp; } else if ((sinfo & CTF_BOOL)) { /* Assume not equal to zero. Fixup and emit pending guard later. */ @@ -406,7 +408,10 @@ static void crec_ct_tv(jit_State *J, CType *d, TRef dp, TRef sp, TValue *sval) if (ctype_isenum(s->info)) s = ctype_child(cts, s); if (ctype_isnum(s->info)) { /* Load number value. 
*/ IRType t = crec_ct2irt(s); - if (t != IRT_CDATA) sp = emitir(IRT(IR_XLOAD, t), sp, 0); + if (t != IRT_CDATA) { + sp = emitir(IRT(IR_XLOAD, t), sp, 0); + if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J); + } } goto doconv; } @@ -499,8 +504,10 @@ void LJ_FASTCALL recff_cdata_index(jit_State *J, RecordFFData *rd) if (ctype_isinteger(ctk->info) && (t = crec_ct2irt(ctk)) != IRT_CDATA) { idx = emitir(IRT(IR_ADD, IRT_PTR), idx, lj_ir_kintp(J, sizeof(GCcdata))); idx = emitir(IRT(IR_XLOAD, t), idx, 0); - if (!LJ_64 && (t == IRT_I64 || t == IRT_U64)) + if (!LJ_64 && (t == IRT_I64 || t == IRT_U64)) { idx = emitconv(idx, IRT_INT, t, 0); + lj_needsplit(J); + } goto integer_key; } } else if (tref_isstr(idx)) { @@ -664,6 +671,7 @@ static TRef crec_arith_int64(jit_State *J, TRef *sp, CType **s, MMS mm) CTypeID id; TRef tr, dp, ptr; MSize i; + lj_needsplit(J); if (((s[0]->info & CTF_UNSIGNED) && s[0]->size == 8) || ((s[1]->info & CTF_UNSIGNED) && s[1]->size == 8)) { dt = IRT_U64; id = CTID_UINT64; @@ -691,9 +699,6 @@ static TRef crec_arith_int64(jit_State *J, TRef *sp, CType **s, MMS mm) lj_ir_set(J, IRTG(op, dt), sp[0], sp[1]); J->postproc = LJ_POST_FIXGUARD; return TREF_TRUE; - } else if (mm == MM_pow) { - tr = lj_ir_call(J, dt == IRT_I64 ? IRCALL_lj_carith_powi64 : - IRCALL_lj_carith_powu64, sp[0], sp[1]); } else { if (mm == MM_div || mm == MM_mod) return 0; /* NYI: integer div, mod. */ @@ -754,10 +759,11 @@ static TRef crec_arith_ptr(jit_State *J, TRef *sp, CType **s, MMS mm) tr = emitconv(tr, IRT_INTP, IRT_INT, ((t - IRT_I8) & 1) ? 0 : IRCONV_SEXT); #else - if (!tref_typerange(sp[1], IRT_I8, IRT_U32)) + if (!tref_typerange(sp[1], IRT_I8, IRT_U32)) { tr = emitconv(tr, IRT_INTP, t, (t == IRT_NUM || t == IRT_FLOAT) ? IRCONV_TRUNC|IRCONV_ANY : 0); + } #endif tr = emitir(IRT(IR_MUL, IRT_INTP), tr, lj_ir_kintp(J, sz)); tr = emitir(IRT(IR_ADD, IRT_PTR), sp[0], tr); @@ -790,6 +796,7 @@ void LJ_FASTCALL recff_cdata_arith(jit_State *J, RecordFFData *rd) if (ctype_isnum(ct->info)) { IRType t = crec_ct2irt(ct); if (t == IRT_CDATA) goto err_type; + if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J); tr = emitir(IRT(IR_XLOAD, t), tr, 0); } else if (!(ctype_isptr(ct->info) || ctype_isrefarray(ct->info))) { goto err_type; @@ -842,6 +849,7 @@ void LJ_FASTCALL lj_crecord_tonumber(jit_State *J, RecordFFData *rd) IRType t = crec_ct2irt(s); if (t != IRT_CDATA) { TRef tr = emitir(IRT(IR_XLOAD, t), sp, 0); /* Load number value. */ + if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J); if (t == IRT_FLOAT || t == IRT_U32 || t == IRT_I64 || t == IRT_U64) tr = emitconv(tr, IRT_NUM, t, 0); J->base[0] = tr; diff --git a/src/lj_ir.h b/src/lj_ir.h index 1cb3566e..286eb219 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -33,6 +33,7 @@ /* Miscellaneous ops. */ \ _(NOP, N , ___, ___) \ _(BASE, N , lit, lit) \ + _(HIOP, S , ref, ref) \ _(LOOP, S , ___, ___) \ _(PHI, S , ref, ref) \ _(RENAME, S , ref, lit) \ @@ -212,8 +213,9 @@ IRFLDEF(FLENUM) /* CONV mode, stored in op2. */ #define IRCONV_SRCMASK 0x001f /* Source IRType. */ #define IRCONV_DSTMASK 0x03e0 /* Dest. IRType (also in ir->t). */ -#define IRCONV_NUM_INT ((IRT_NUM<<5)|IRT_INT) -#define IRCONV_INT_NUM ((IRT_INT<<5)|IRT_NUM) +#define IRCONV_DSH 5 +#define IRCONV_NUM_INT ((IRT_NUM<ksimd[2*(n)] + 15) & ~(intptr_t)15)) +/* Set/reset flag to activate the SPLIT pass for the current trace. 
*/ +#if LJ_32 && LJ_HASFFI +#define lj_needsplit(J) (J->needsplit = 1) +#define lj_resetsplit(J) (J->needsplit = 0) +#else +#define lj_needsplit(J) UNUSED(J) +#define lj_resetsplit(J) UNUSED(J) +#endif + /* Fold state is used to fold instructions on-the-fly. */ typedef struct FoldState { IRIns ins; /* Currently emitted instruction. */ @@ -293,6 +302,9 @@ typedef struct jit_State { MSize sizesnapmap; /* Size of temp. snapshot map buffer. */ PostProc postproc; /* Required post-processing after execution. */ +#if LJ_32 && LJ_HASFFI + int needsplit; /* Need SPLIT pass. */ +#endif GCRef *trace; /* Array of traces. */ TraceNo freetrace; /* Start of scan for next free trace. */ diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 2d08e187..03caf80d 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -538,6 +538,13 @@ LJFOLDF(kfold_conv_knum_int_num) } } +LJFOLD(CONV KNUM IRCONV_U32_NUM) +LJFOLDF(kfold_conv_knum_u32_num) +{ + lua_assert((fins->op2 & IRCONV_TRUNC)); + return INTFOLD((int32_t)(uint32_t)knumleft); +} + LJFOLD(CONV KNUM IRCONV_I64_NUM) LJFOLDF(kfold_conv_knum_i64_num) { @@ -805,6 +812,7 @@ LJFOLDF(simplify_conv_u32_num) } LJFOLD(CONV CONV IRCONV_I64_NUM) /* _INT or _U32*/ +LJFOLD(CONV CONV IRCONV_U64_NUM) /* _INT or _U32*/ LJFOLDF(simplify_conv_i64_num) { PHIBARRIER(fleft); @@ -826,23 +834,6 @@ LJFOLDF(simplify_conv_i64_num) return NEXTFOLD; } -LJFOLD(CONV CONV IRCONV_U64_NUM) /* _U32*/ -LJFOLDF(simplify_conv_u64_num) -{ - PHIBARRIER(fleft); - if ((fleft->op2 & IRCONV_SRCMASK) == IRT_U32) { -#if LJ_TARGET_X64 - return fleft->op1; -#else - /* Reduce to a zero-extension. */ - fins->op1 = fleft->op1; - fins->op2 = (IRT_U64<<5)|IRT_U32; - return RETRYFOLD; -#endif - } - return NEXTFOLD; -} - /* Shortcut TOBIT + IRT_NUM <- IRT_INT/IRT_U32 conversion. */ LJFOLD(TOBIT CONV KNUM) LJFOLDF(simplify_tobit_conv) diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c new file mode 100644 index 00000000..3cb30514 --- /dev/null +++ b/src/lj_opt_split.c @@ -0,0 +1,343 @@ +/* +** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions. +** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h +*/ + +#define lj_opt_split_c +#define LUA_CORE + +#include "lj_obj.h" + +#if LJ_HASJIT && LJ_HASFFI && LJ_32 + +#include "lj_err.h" +#include "lj_str.h" +#include "lj_ir.h" +#include "lj_jit.h" +#include "lj_iropt.h" +#include "lj_vm.h" + +/* SPLIT pass: +** +** This pass splits up 64 bit IR instructions into multiple 32 bit IR +** instructions. It's only active for 32 bit CPUs which lack native 64 bit +** operations. The FFI is currently the only emitter for 64 bit +** instructions, so this pass is disabled if the FFI is disabled. +** +** Splitting the IR in a separate pass keeps each 32 bit IR assembler +** backend simple. Only a small amount of extra functionality needs to be +** implemented. This is much easier than adding support for allocating +** register pairs to each backend (believe me, I tried). A few simple, but +** important optimizations can be performed by the SPLIT pass, which would +** be tedious to do in the backend. +** +** The basic idea is to replace each 64 bit IR instruction with its 32 bit +** equivalent plus an extra HIOP instruction. The splitted IR is not passed +** through FOLD or any other optimizations, so each HIOP is guaranteed to +** immediately follow it's counterpart. The actual functionality of HIOP is +** inferred from the previous instruction. +** +** The operands of HIOP hold the hiword input references. 
The output of HIOP +** is the hiword output reference, which is also used to hold the hiword +** register or spill slot information. The register allocator treats this +** instruction independent of any other instruction, which improves code +** quality compared to using fixed register pairs. +** +** It's easier to split up some instructions into two regular 32 bit +** instructions. E.g. XLOAD is split up into two XLOADs with two different +** addresses. Obviously 64 bit constants need to be split up into two 32 bit +** constants, too. Some hiword instructions can be entirely omitted, e.g. +** when zero-extending a 32 bit value to 64 bits. +** +** Here's the IR and x64 machine code for 'x.b = x.a + 1' for a struct with +** two int64_t fields: +** +** 0100 p32 ADD base +8 +** 0101 i64 XLOAD 0100 +** 0102 i64 ADD 0101 +1 +** 0103 p32 ADD base +16 +** 0104 i64 XSTORE 0103 0102 +** +** mov rax, [esi+0x8] +** add rax, +0x01 +** mov [esi+0x10], rax +** +** Here's the transformed IR and the x86 machine code after the SPLIT pass: +** +** 0100 p32 ADD base +8 +** 0101 int XLOAD 0100 +** 0102 p32 ADD base +12 +** 0103 int XLOAD 0102 +** 0104 int ADD 0101 +1 +** 0105 int HIOP 0103 +0 +** 0106 p32 ADD base +16 +** 0107 int XSTORE 0106 0104 +** 0108 p32 ADD base +20 +** 0109 int XSTORE 0108 0105 +** +** mov eax, [esi+0x8] +** mov ecx, [esi+0xc] +** add eax, +0x01 +** adc ecx, +0x00 +** mov [esi+0x10], eax +** mov [esi+0x14], ecx +** +** You may notice the reassociated hiword address computation, which is +** later fused into the mov operands by the assembler. +*/ + +/* Some local macros to save typing. Undef'd at the end. */ +#define IR(ref) (&J->cur.ir[(ref)]) + +/* Directly emit the transformed IR without updating chains etc. */ +static IRRef split_emit(jit_State *J, uint16_t ot, IRRef1 op1, IRRef1 op2) +{ + IRRef nref = lj_ir_nextins(J); + IRIns *ir = IR(nref); + ir->ot = ot; + ir->op1 = op1; + ir->op2 = op2; + return nref; +} + +/* Emit a CALLN with two split 64 bit arguments. */ +static IRRef split_call64(jit_State *J, IRRef1 *hisubst, IRIns *oir, + IRIns *ir, IRCallID id) +{ + IRRef tmp, op1 = ir->op1, op2 = ir->op2; + J->cur.nins--; +#if LJ_LE + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), oir[op1].prev, hisubst[op1]); + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev); + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]); +#else + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev); + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]); + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev); +#endif + ir->prev = tmp = split_emit(J, IRTI(IR_CALLN), tmp, id); + return split_emit(J, IRTI(IR_HIOP), tmp, tmp); +} + +/* Get a pointer to the other 32 bit word (LE: hiword, BE: loword). */ +static IRRef split_ptr(jit_State *J, IRRef ref) +{ + IRIns *ir = IR(ref); + int32_t ofs = 4; + if (ir->o == IR_ADD && irref_isk(ir->op2)) { /* Reassociate address. */ + ofs += IR(ir->op2)->i; + ref = ir->op1; + if (ofs == 0) return ref; + } + return split_emit(J, IRTI(IR_ADD), ref, lj_ir_kint(J, ofs)); +} + +/* Transform the old IR to the new IR. */ +static void split_ir(jit_State *J) +{ + IRRef nins = J->cur.nins, nk = J->cur.nk; + MSize irlen = nins - nk; + MSize need = (irlen+1)*(sizeof(IRIns) + sizeof(IRRef1)); + IRIns *oir = (IRIns *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf, need); + IRRef1 *hisubst; + IRRef ref; + + /* Copy old IR to buffer. */ + memcpy(oir, IR(nk), irlen*sizeof(IRIns)); + /* Bias hiword substitution table and old IR. 
Loword kept in field prev. */ + hisubst = (IRRef1 *)&oir[irlen] - nk; + oir -= nk; + + /* Remove all IR instructions, but retain IR constants. */ + J->cur.nins = REF_FIRST; + + /* Process constants and fixed references. */ + for (ref = nk; ref <= REF_BASE; ref++) { + IRIns *ir = &oir[ref]; + if (ir->o == IR_KINT64) { /* Split up 64 bit constant. */ + TValue tv = *ir_k64(ir); + ir->prev = lj_ir_kint(J, (int32_t)tv.u32.lo); + hisubst[ref] = lj_ir_kint(J, (int32_t)tv.u32.hi); + } else { + ir->prev = (IRRef1)ref; /* Identity substitution for loword. */ + } + } + + /* Process old IR instructions. */ + for (ref = REF_FIRST; ref < nins; ref++) { + IRIns *ir = &oir[ref]; + IRRef nref = lj_ir_nextins(J); + IRIns *nir = IR(nref); + + /* Copy-substitute old instruction to new instruction. */ + nir->op1 = ir->op1 < nk ? ir->op1 : oir[ir->op1].prev; + nir->op2 = ir->op2 < nk ? ir->op2 : oir[ir->op2].prev; + ir->prev = nref; /* Loword substitution. */ + nir->o = ir->o; + nir->t.irt = ir->t.irt & ~(IRT_MARK|IRT_ISPHI); + + /* Split 64 bit instructions. */ + if (irt_isint64(ir->t)) { + IRRef hi = hisubst[ir->op1]; + nir->t.irt = IRT_INT | (nir->t.irt & IRT_GUARD); /* Turn into INT op. */ + switch (ir->o) { + case IR_ADD: + case IR_SUB: + /* Use plain op for hiword if loword cannot produce a carry/borrow. */ + if (irref_isk(nir->op2) && IR(nir->op2)->i == 0) { + ir->prev = nir->op1; /* Pass through loword. */ + nir->op1 = hi; nir->op2 = hisubst[ir->op2]; + hi = nref; + break; + } + /* fallthrough */ + case IR_NEG: + hi = split_emit(J, IRTI(IR_HIOP), hi, hisubst[ir->op2]); + break; + case IR_MUL: + hi = split_call64(J, hisubst, oir, ir, IRCALL_lj_carith_mul64); + break; + case IR_POWI: + hi = split_call64(J, hisubst, oir, ir, + irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 : + IRCALL_lj_carith_powu64); + break; + case IR_XLOAD: + hi = split_emit(J, IRTI(IR_XLOAD), split_ptr(J, nir->op1), ir->op2); +#if LJ_BE + ir->prev = hi; hi = nref; +#endif + break; + case IR_XSTORE: +#if LJ_LE + hi = hisubst[ir->op2]; +#else + hi = nir->op2; nir->op2 = hisubst[ir->op2]; +#endif + split_emit(J, IRTI(IR_XSTORE), split_ptr(J, nir->op1), hi); + continue; + case IR_CONV: { /* Conversion to 64 bit integer. Others handled below. */ + IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK); + if (st == IRT_NUM || st == IRT_FLOAT) { /* FP to 64 bit int conv. */ + hi = split_emit(J, IRTI(IR_HIOP), nir->op1, nref); + } else if (st == IRT_I64 || st == IRT_U64) { /* 64/64 bit cast. */ + /* Drop cast, since assembler doesn't care. */ + hisubst[ref] = hi; + goto fwdlo; + } else if ((ir->op2 & IRCONV_SEXT)) { /* Sign-extend to 64 bit. */ + IRRef k31 = lj_ir_kint(J, 31); + nir = IR(nref); /* May have been reallocated. */ + ir->prev = nir->op1; /* Pass through loword. */ + nir->o = IR_BSAR; /* hi = bsar(lo, 31). */ + nir->op2 = k31; + hi = nref; + } else { /* Zero-extend to 64 bit. */ + hisubst[ref] = lj_ir_kint(J, 0); + goto fwdlo; + } + break; + } + case IR_PHI: { + IRRef hi2; + if ((irref_isk(nir->op1) && irref_isk(nir->op2)) || + nir->op1 == nir->op2) + J->cur.nins--; /* Drop useless PHIs. */ + hi2 = hisubst[ir->op2]; + if (!((irref_isk(hi) && irref_isk(hi2)) || hi == hi2)) + split_emit(J, IRTI(IR_PHI), hi, hi2); + continue; + } + default: + lua_assert(ir->o <= IR_NE); + split_emit(J, IRTGI(IR_HIOP), hi, hisubst[ir->op2]); /* Comparisons. */ + continue; + } + hisubst[ref] = hi; /* Store hiword substitution. */ + } else if (ir->o == IR_CONV) { /* See above, too. 
*/ + IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK); + if (st == IRT_I64 || st == IRT_U64) { /* Conversion from 64 bit int. */ + if (irt_isfp(ir->t)) { /* 64 bit integer to FP conversion. */ + ir->prev = split_emit(J, IRT(IR_HIOP, irt_type(ir->t)), + hisubst[ir->op1], nref); + } else { /* Truncate to lower 32 bits. */ + fwdlo: + ir->prev = nir->op1; /* Forward loword. */ + /* Replace with NOP to avoid messing up the snapshot logic. */ + nir->ot = IRT(IR_NOP, IRT_NIL); + nir->op1 = nir->op2 = 0; + } + } + } else if (ir->o == IR_LOOP) { + J->loopref = nref; /* Needed by assembler. */ + } + } + + /* Add PHI marks. */ + for (ref = J->cur.nins-1; ref >= REF_FIRST; ref--) { + IRIns *ir = IR(ref); + if (ir->o != IR_PHI) break; + if (!irref_isk(ir->op1)) irt_setphi(IR(ir->op1)->t); + if (ir->op2 > J->loopref) irt_setphi(IR(ir->op2)->t); + } + + /* Substitute snapshot maps. */ + oir[nins].prev = J->cur.nins; /* Substitution for last snapshot. */ + { + SnapNo i, nsnap = J->cur.nsnap; + for (i = 0; i < nsnap; i++) { + SnapShot *snap = &J->cur.snap[i]; + SnapEntry *map = &J->cur.snapmap[snap->mapofs]; + MSize n, nent = snap->nent; + snap->ref = oir[snap->ref].prev; + for (n = 0; n < nent; n++) { + SnapEntry sn = map[n]; + map[n] = ((sn & 0xffff0000) | oir[snap_ref(sn)].prev); + } + } + } +} + +/* Protected callback for split pass. */ +static TValue *cpsplit(lua_State *L, lua_CFunction dummy, void *ud) +{ + jit_State *J = (jit_State *)ud; + split_ir(J); + UNUSED(L); UNUSED(dummy); + return NULL; +} + +#ifdef LUA_USE_ASSERT +/* Slow, but sure way to check whether a SPLIT pass is needed. */ +static int split_needsplit(jit_State *J) +{ + IRIns *ir, *irend; + IRRef ref; + for (ir = IR(REF_FIRST), irend = IR(J->cur.nins); ir < irend; ir++) + if (irt_isint64(ir->t)) + return 1; + for (ref = J->chain[IR_CONV]; ref; ref = IR(ref)->prev) + if ((IR(ref)->op2 & IRCONV_SRCMASK) == IRT_I64 || + (IR(ref)->op2 & IRCONV_SRCMASK) == IRT_U64) + return 1; + return 0; /* Nope. */ +} +#endif + +/* SPLIT pass. */ +void lj_opt_split(jit_State *J) +{ + lua_assert(J->needsplit >= split_needsplit(J)); /* Verify flag. */ + if (J->needsplit) { + int errcode = lj_vm_cpcall(J->L, NULL, J, cpsplit); + if (errcode) { + /* Completely reset the trace to avoid inconsistent dump on abort. */ + J->cur.nins = J->cur.nk = REF_BASE; + J->cur.nsnap = 0; + lj_err_throw(J->L, errcode); /* Propagate errors. */ + } + } +} + +#undef IR + +#endif diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h index 94ab3c32..37c68f4b 100644 --- a/src/lj_target_x86.h +++ b/src/lj_target_x86.h @@ -193,6 +193,7 @@ typedef enum { XI_FLD1 = 0xe8d9, XI_FLDLG2 = 0xecd9, XI_FLDLN2 = 0xedd9, + XI_FDUP = 0xc0d9, /* Really fld st0. */ XI_FPOP = 0xd8dd, /* Really fstp st0. */ XI_FPOP1 = 0xd9dd, /* Really fstp st1. */ XI_FRNDINT = 0xfcd9, @@ -263,10 +264,17 @@ typedef enum { XO_MOVD = XO_660f(6e), XO_MOVDto = XO_660f(7e), + XO_FLDd = XO_(d9), XOg_FLDd = 0, XO_FLDq = XO_(dd), XOg_FLDq = 0, XO_FILDd = XO_(db), XOg_FILDd = 0, + XO_FILDq = XO_(df), XOg_FILDq = 5, + XO_FSTPd = XO_(d9), XOg_FSTPd = 3, XO_FSTPq = XO_(dd), XOg_FSTPq = 3, XO_FISTPq = XO_(df), XOg_FISTPq = 7, + XO_FISTTPq = XO_(dd), XOg_FISTTPq = 1, + XO_FADDq = XO_(dc), XOg_FADDq = 0, + XO_FLDCW = XO_(d9), XOg_FLDCW = 5, + XO_FNSTCW = XO_(d9), XOg_FNSTCW = 7 } x86Op; /* x86 opcode groups. 
*/ @@ -278,6 +286,7 @@ typedef uint32_t x86Group; #define XG_TOXOi8(xg) ((x86Op)(0x000000fe + (((xg)<<8) & 0xff000000))) #define XO_ARITH(a) ((x86Op)(0x030000fe + ((a)<<27))) +#define XO_ARITHw(a) ((x86Op)(0x036600fd + ((a)<<27))) typedef enum { XOg_ADD, XOg_OR, XOg_ADC, XOg_SBB, XOg_AND, XOg_SUB, XOg_XOR, XOg_CMP, diff --git a/src/lj_trace.c b/src/lj_trace.c index da20f991..b67e8f75 100644 --- a/src/lj_trace.c +++ b/src/lj_trace.c @@ -394,6 +394,7 @@ static void trace_start(jit_State *J) J->bcskip = 0; J->guardemit.irt = 0; J->postproc = LJ_POST_NONE; + lj_resetsplit(J); setgcref(J->cur.startpt, obj2gco(J->pt)); L = J->L; @@ -592,6 +593,7 @@ static TValue *trace_state(lua_State *L, lua_CFunction dummy, void *ud) } J->loopref = J->chain[IR_LOOP]; /* Needed by assembler. */ } + lj_opt_split(J); J->state = LJ_TRACE_ASM; break; diff --git a/src/ljamalg.c b/src/ljamalg.c index 4d5f7600..5d90c002 100644 --- a/src/ljamalg.c +++ b/src/ljamalg.c @@ -58,6 +58,7 @@ #include "lj_opt_narrow.c" #include "lj_opt_dce.c" #include "lj_opt_loop.c" +#include "lj_opt_split.c" #include "lj_mcode.c" #include "lj_snap.c" #include "lj_record.c"
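
For reference (not part of the patch): a minimal C sketch of two ideas the patch relies on, using the hypothetical names pair64, add64_split and u32_to_double_bias. The first mirrors the ADD/HIOP pairing described in the lj_opt_split.c header comment, which the x86 backend lowers to add/adc; the second mirrors the U32-to-FP bias trick added to asm_conv. It is an illustration of the technique only, not code from the repository.

#include <stdint.h>

/* A 64 bit add split into a loword add plus a carry-propagating hiword add,
** the way SPLIT pairs ADD (loword) with HIOP (hiword).
*/
typedef struct pair64 { uint32_t lo, hi; } pair64;

static pair64 add64_split(pair64 a, pair64 b)
{
  pair64 r;
  r.lo = a.lo + b.lo;                  /* ADD: loword. */
  r.hi = a.hi + b.hi + (r.lo < a.lo);  /* HIOP: hiword plus carry (adc). */
  return r;
}

/* u32 to double without a 64 bit convert: merge the integer into the mantissa
** of the 2^52+2^51 bias constant, then subtract the bias. This is the logical
** effect of the XO_MOVD/XO_XORPS/XO_SUBSD sequence emitted by asm_conv.
*/
static double u32_to_double_bias(uint32_t u)
{
  union { uint64_t u64; double d; } x;
  x.u64 = 0x4338000000000000ULL | (uint64_t)u;  /* Merge bias and integer. */
  return x.d - 0x1.8p52;                        /* Subtract 2^52+2^51 bias. */
}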