From b613216efc7447dae645d8834e4d6f3185cd1bcc Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 2 Feb 2011 02:29:37 +0100 Subject: [PATCH] Add SPLIT pass to split 64 bit IR instructions for 32 bit CPUs. Add generic HIOP instruction for extra backend functionality. Add support for HIOP to x86 backend. Use POWI for 64 bit integer x^k, too. POWI is lowered to a call by SPLIT or the x64 backend. --- src/Makefile | 2 +- src/Makefile.dep | 15 +- src/lj_asm.c | 444 ++++++++++++++++++++++++++++++++++++-------- src/lj_carith.c | 8 + src/lj_carith.h | 3 + src/lj_crecord.c | 20 +- src/lj_ir.h | 21 ++- src/lj_iropt.h | 6 + src/lj_jit.h | 12 ++ src/lj_opt_fold.c | 25 +-- src/lj_opt_split.c | 343 ++++++++++++++++++++++++++++++++++ src/lj_target_x86.h | 9 + src/lj_trace.c | 2 + src/ljamalg.c | 1 + 14 files changed, 795 insertions(+), 116 deletions(-) create mode 100644 src/lj_opt_split.c diff --git a/src/Makefile b/src/Makefile index a2be1a18..0150b049 100644 --- a/src/Makefile +++ b/src/Makefile @@ -331,7 +331,7 @@ LJCORE_O= lj_gc.o lj_err.o lj_char.o lj_bc.o lj_obj.o \ lj_state.o lj_dispatch.o lj_vmevent.o lj_api.o \ lj_lex.o lj_parse.o \ lj_ir.o lj_opt_mem.o lj_opt_fold.o lj_opt_narrow.o \ - lj_opt_dce.o lj_opt_loop.o \ + lj_opt_dce.o lj_opt_loop.o lj_opt_split.o \ lj_mcode.o lj_snap.o lj_record.o lj_crecord.o lj_ffrecord.o \ lj_asm.o lj_trace.o lj_gdbjit.o \ lj_ctype.o lj_cdata.o lj_cconv.o lj_ccall.o lj_carith.o lj_clib.o \ diff --git a/src/Makefile.dep b/src/Makefile.dep index 3d0c4239..1534ac27 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -128,6 +128,8 @@ lj_opt_mem.o: lj_opt_mem.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \ lj_arch.h lj_str.h lj_bc.h lj_ir.h lj_jit.h lj_iropt.h lj_trace.h \ lj_dispatch.h lj_traceerr.h +lj_opt_split.o: lj_opt_split.c lj_obj.h lua.h luaconf.h lj_def.h \ + lj_arch.h lj_parse.o: lj_parse.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_gc.h lj_err.h lj_errmsg.h lj_str.h lj_tab.h lj_func.h lj_state.h \ lj_bc.h lj_ctype.h lj_lex.h lj_parse.h lj_vm.h lj_vmevent.h @@ -167,10 +169,11 @@ ljamalg.o: ljamalg.c lua.h luaconf.h lauxlib.h lj_gc.c lj_obj.h lj_def.h \ lj_cconv.h lj_cconv.c lj_ccall.c lj_ccall.h lj_carith.c lj_carith.h \ lj_clib.c lj_clib.h lj_cparse.c lj_cparse.h lj_lib.c lj_lib.h lj_ir.c \ lj_iropt.h lj_opt_mem.c lj_opt_fold.c lj_folddef.h lj_opt_narrow.c \ - lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_mcode.c lj_mcode.h lj_snap.c \ - lj_target.h lj_target_*.h lj_record.c lj_record.h lj_ffrecord.h \ - lj_crecord.c lj_crecord.h lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h \ - lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c lib_aux.c lib_base.c \ - lj_libdef.h lib_math.c lib_string.c lib_table.c lib_io.c lib_os.c \ - lib_package.c lib_debug.c lib_bit.c lib_jit.c lib_ffi.c lib_init.c + lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_opt_split.c lj_mcode.c \ + lj_mcode.h lj_snap.c lj_target.h lj_target_*.h lj_record.c lj_record.h \ + lj_ffrecord.h lj_crecord.c lj_crecord.h lj_ffrecord.c lj_recdef.h \ + lj_asm.c lj_asm.h lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c \ + lib_aux.c lib_base.c lj_libdef.h lib_math.c lib_string.c lib_table.c \ + lib_io.c lib_os.c lib_package.c lib_debug.c lib_bit.c lib_jit.c \ + lib_ffi.c lib_init.c luajit.o: luajit.c lua.h luaconf.h lauxlib.h lualib.h luajit.h lj_arch.h diff --git a/src/lj_asm.c b/src/lj_asm.c index cc2ae597..441700d4 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -347,6 +347,20 @@ static void emit_addptr(ASMState *as, Reg r, int32_t ofs) } } +/* op 
rm/mrm, i */ +static void emit_gmrmi(ASMState *as, x86Group xg, Reg rb, int32_t i) +{ + x86Op xo; + if (checki8(i)) { + emit_i8(as, i); + xo = XG_TOXOi8(xg); + } else { + emit_i32(as, i); + xo = XG_TOXOi(xg); + } + emit_mrm(as, xo, (Reg)(xg & 7) | (rb & REX_64), (rb & ~REX_64)); +} + /* -- Emit moves ---------------------------------------------------------- */ /* mov [base+ofs], i */ @@ -371,7 +385,10 @@ static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i) /* mov r, i / xor r, r */ static void emit_loadi(ASMState *as, Reg r, int32_t i) { - if (i == 0) { + /* XOR r,r is shorter, but modifies the flags. This is bad for HIOP. */ + if (i == 0 && !(LJ_32 && (IR(as->curins)->o == IR_HIOP || + (as->curins+1 < as->T->nins && + IR(as->curins+1)->o == IR_HIOP)))) { emit_rr(as, XO_ARITH(XOg_XOR), r, r); } else { MCode *p = as->mcp; @@ -422,6 +439,19 @@ static void emit_loadn(ASMState *as, Reg r, cTValue *tv) /* Label for short jumps. */ typedef MCode *MCLabel; +#if LJ_32 && LJ_HASFFI +/* jmp short target */ +static void emit_sjmp(ASMState *as, MCLabel target) +{ + MCode *p = as->mcp; + ptrdiff_t delta = target - p; + lua_assert(delta == (int8_t)delta); + p[-1] = (MCode)(int8_t)delta; + p[-2] = XI_JMPs; + as->mcp = p - 2; +} +#endif + /* jcc short target */ static void emit_sjcc(ASMState *as, int cc, MCLabel target) { @@ -630,7 +660,7 @@ static Reg ra_rematk(ASMState *as, IRIns *ir) } else if (ir->o == IR_KPRI) { /* REF_NIL stores ASMREF_L register. */ lua_assert(irt_isnil(ir->t)); emit_getgl(as, r, jit_L); -#if LJ_64 /* NYI: 32 bit register pairs. */ +#if LJ_64 } else if (ir->o == IR_KINT64) { emit_loadu64(as, r, ir_kint64(ir)->u64); #endif @@ -681,8 +711,7 @@ static Reg ra_releasetmp(ASMState *as, IRRef ref) #if LJ_64 #define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0)) #else -/* NYI: 32 bit register pairs. */ -#define REX_64IR(ir, r) check_exp(!irt_is64((ir)->t), (r)) +#define REX_64IR(ir, r) (r) #endif /* Generic move between two regs. */ @@ -939,7 +968,7 @@ static void ra_left(ASMState *as, Reg dest, IRRef lref) emit_loadn(as, dest, tv); return; } -#if LJ_64 /* NYI: 32 bit register pairs. */ +#if LJ_64 } else if (ir->o == IR_KINT64) { emit_loadu64(as, dest, ir_kint64(ir)->u64); return; @@ -1463,7 +1492,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) #endif if (r) { /* Argument is in a register. */ if (r < RID_MAX_GPR && ref < ASMREF_TMP1) { -#if LJ_64 /* NYI: 32 bit register pairs. */ +#if LJ_64 if (ir->o == IR_KINT64) emit_loadu64(as, r, ir_kint64(ir)->u64); else @@ -1519,7 +1548,7 @@ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci) ra_evictset(as, drop); /* Evictions must be performed first. */ if (ra_used(ir)) { if (irt_isfp(ir->t)) { - int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ + int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ #if LJ_64 if ((ci->flags & CCI_CASTU64)) { Reg dest = ir->r; @@ -1632,19 +1661,24 @@ static void asm_conv(ASMState *as, IRIns *ir) int stfp = (st == IRT_NUM || st == IRT_FLOAT); IRRef lref = ir->op1; lua_assert(irt_type(ir->t) != st); + lua_assert(!(LJ_32 && (irt_isint64(ir->t) || st64))); /* Handled by SPLIT. */ if (irt_isfp(ir->t)) { Reg dest = ra_dest(as, ir, RSET_FPR); if (stfp) { /* FP to FP conversion. */ Reg left = asm_fuseload(as, lref, RSET_FPR); emit_mrm(as, st == IRT_NUM ? XO_CVTSD2SS : XO_CVTSS2SD, dest, left); if (left == dest) return; /* Avoid the XO_XORPS. 
*/ -#if LJ_32 - } else if (st >= IRT_U32) { - /* NYI: 64 bit integer or uint32_t to number conversion. */ - setintV(&as->J->errinfo, ir->o); - lj_trace_err_info(as->J, LJ_TRERR_NYIIR); + } else if (LJ_32 && st == IRT_U32) { /* U32 to FP conversion on x86. */ + /* number = (2^52+2^51 .. u32) - (2^52+2^51) */ + cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000)); + Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest)); + if (irt_isfloat(ir->t)) + emit_rr(as, XO_CVTSD2SS, dest, dest); + emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */ + emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */ + emit_loadn(as, bias, k); + emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR)); return; -#endif } else { /* Integer to FP conversion. */ Reg left = (LJ_64 && (st == IRT_U32 || st == IRT_U64)) ? ra_alloc1(as, lref, RSET_GPR) : @@ -1663,41 +1697,47 @@ static void asm_conv(ASMState *as, IRIns *ir) emit_rr(as, XO_XORPS, dest, dest); /* Avoid partial register stall. */ } else if (stfp) { /* FP to integer conversion. */ if (irt_isguard(ir->t)) { - lua_assert(!irt_is64(ir->t)); /* No support for checked 64 bit conv. */ + /* Checked conversions are only supported from number to int. */ + lua_assert(irt_isint(ir->t) && st == IRT_NUM); asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR)); -#if LJ_32 - } else if (irt_isi64(ir->t) || irt_isu64(ir->t) || irt_isu32(ir->t)) { - /* NYI: number to 64 bit integer or uint32_t conversion. */ - setintV(&as->J->errinfo, ir->o); - lj_trace_err_info(as->J, LJ_TRERR_NYIIR); -#endif } else { Reg dest = ra_dest(as, ir, RSET_GPR); x86Op op = st == IRT_NUM ? ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSD2SI : XO_CVTSD2SI) : ((ir->op2 & IRCONV_TRUNC) ? XO_CVTTSS2SI : XO_CVTSS2SI); - if (LJ_64 && irt_isu64(ir->t)) { - const void *k = lj_ir_k64_find(as->J, U64x(c3f00000,00000000)); - MCLabel l_end = emit_label(as); - Reg left = IR(lref)->r; + if (LJ_32 && irt_isu32(ir->t)) { /* FP to U32 conversion on x86. */ + /* u32 = (int32_t)(number - 2^31) + 2^31 */ + Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) : + ra_scratch(as, RSET_FPR); + emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000); + emit_rr(as, op, dest, tmp); + if (st == IRT_NUM) + emit_rma(as, XO_ADDSD, tmp, + lj_ir_k64_find(as->J, U64x(c1e00000,00000000))); + else + emit_rma(as, XO_ADDSS, tmp, + lj_ir_k64_find(as->J, U64x(00000000,cf000000))); + ra_left(as, tmp, lref); + } else if (LJ_64 && irt_isu64(ir->t)) { /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */ - if (ra_hasreg(left)) { - Reg tmpn = ra_scratch(as, rset_exclude(RSET_FPR, left)); - emit_rr(as, op, dest|REX_64, tmpn); - emit_rr(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, tmpn, left); - emit_rma(as, st == IRT_NUM ? XMM_MOVRM(as) : XO_MOVSS, tmpn, k); - } else { - left = ra_allocref(as, lref, RSET_FPR); - emit_rr(as, op, dest|REX_64, left); - emit_rma(as, st == IRT_NUM ? XO_ADDSD : XO_ADDSS, left, k); - } + Reg tmp = ra_noreg(IR(lref)->r) ? ra_alloc1(as, lref, RSET_FPR) : + ra_scratch(as, RSET_FPR); + MCLabel l_end = emit_label(as); + emit_rr(as, op, dest|REX_64, tmp); + if (st == IRT_NUM) + emit_rma(as, XO_ADDSD, tmp, + lj_ir_k64_find(as->J, U64x(c3f00000,00000000))); + else + emit_rma(as, XO_ADDSS, tmp, + lj_ir_k64_find(as->J, U64x(00000000,df800000))); emit_sjcc(as, CC_NS, l_end); emit_rr(as, XO_TEST, dest|REX_64, dest); /* Check if dest < 2^63. 
*/ - emit_rr(as, op, dest|REX_64, left); + emit_rr(as, op, dest|REX_64, tmp); + ra_left(as, tmp, lref); } else { Reg left = asm_fuseload(as, lref, RSET_FPR); if (LJ_64 && irt_isu32(ir->t)) - emit_rr(as, XO_MOV, dest, dest); /* Zero upper 32 bits. */ + emit_rr(as, XO_MOV, dest, dest); /* Zero hiword. */ emit_mrm(as, op, dest|((LJ_64 && (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0), @@ -1728,12 +1768,10 @@ static void asm_conv(ASMState *as, IRIns *ir) emit_mrm(as, op, dest, left); } } else { /* 32/64 bit integer conversions. */ - if (irt_is64(ir->t)) { -#if LJ_32 - /* NYI: conversion to 64 bit integers. */ - setintV(&as->J->errinfo, ir->o); - lj_trace_err_info(as->J, LJ_TRERR_NYIIR); -#else + if (LJ_32) { /* Only need to handle 32/32 bit no-op (cast) on x86. */ + Reg dest = ra_dest(as, ir, RSET_GPR); + ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */ + } else if (irt_is64(ir->t)) { Reg dest = ra_dest(as, ir, RSET_GPR); if (st64 || !(ir->op2 & IRCONV_SEXT)) { /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */ @@ -1742,21 +1780,14 @@ static void asm_conv(ASMState *as, IRIns *ir) Reg left = asm_fuseload(as, lref, RSET_GPR); emit_mrm(as, XO_MOVSXd, dest|REX_64, left); } -#endif } else { Reg dest = ra_dest(as, ir, RSET_GPR); if (st64) { -#if LJ_32 - /* NYI: conversion from 64 bit integers. */ - setintV(&as->J->errinfo, ir->o); - lj_trace_err_info(as->J, LJ_TRERR_NYIIR); -#else Reg left = asm_fuseload(as, lref, RSET_GPR); - /* This is either a 32 bit reg/reg mov which zeroes the hi-32 bits - ** or a load of the lower 32 bits from a 64 bit address. + /* This is either a 32 bit reg/reg mov which zeroes the hiword + ** or a load of the loword from a 64 bit address. */ emit_mrm(as, XO_MOV, dest, left); -#endif } else { /* 32/32 bit no-op (cast). */ ra_left(as, dest, lref); /* Do nothing, but may need to move regs. */ } @@ -1764,6 +1795,93 @@ static void asm_conv(ASMState *as, IRIns *ir) } } +#if LJ_32 && LJ_HASFFI +/* No SSE conversions to/from 64 bit on x86, so resort to ugly x87 code. */ + +/* 64 bit integer to FP conversion in 32 bit mode. */ +static void asm_conv_fp_int64(ASMState *as, IRIns *ir) +{ + Reg hi = ra_alloc1(as, ir->op1, RSET_GPR); + Reg lo = ra_alloc1(as, (ir-1)->op1, rset_exclude(RSET_GPR, hi)); + int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ + Reg dest = ir->r; + if (ra_hasreg(dest)) { + ra_free(as, dest); + ra_modified(as, dest); + emit_rmro(as, irt_isnum(ir->t) ? XMM_MOVRM(as) : XO_MOVSS, + dest, RID_ESP, ofs); + } + emit_rmro(as, irt_isnum(ir->t) ? XO_FSTPq : XO_FSTPd, + irt_isnum(ir->t) ? XOg_FSTPq : XOg_FSTPd, RID_ESP, ofs); + if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) { + /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */ + MCLabel l_end = emit_label(as); + emit_rma(as, XO_FADDq, XOg_FADDq, + lj_ir_k64_find(as->J, U64x(43f00000,00000000))); + emit_sjcc(as, CC_NS, l_end); + emit_rr(as, XO_TEST, hi, hi); /* Check if u64 >= 2^63. */ + } else { + lua_assert(((ir-1)->op2 & IRCONV_SRCMASK) == IRT_I64); + } + emit_rmro(as, XO_FILDq, XOg_FILDq, RID_ESP, 0); + /* NYI: Avoid narrow-to-wide store-to-load forwarding stall. */ + emit_rmro(as, XO_MOVto, hi, RID_ESP, 4); + emit_rmro(as, XO_MOVto, lo, RID_ESP, 0); +} + +/* FP to 64 bit integer conversion in 32 bit mode. 
*/ +static void asm_conv_int64_fp(ASMState *as, IRIns *ir) +{ + IRType st = (IRType)((ir-1)->op2 & IRCONV_SRCMASK); + IRType dt = (((ir-1)->op2 & IRCONV_DSTMASK) >> IRCONV_DSH); + Reg lo, hi; + lua_assert(st == IRT_NUM || st == IRT_FLOAT); + lua_assert(dt == IRT_I64 || dt == IRT_U64); + lua_assert(((ir-1)->op2 & IRCONV_TRUNC)); + hi = ra_dest(as, ir, RSET_GPR); + lo = ra_dest(as, ir-1, rset_exclude(RSET_GPR, hi)); + if (ra_used(ir-1)) emit_rmro(as, XO_MOV, lo, RID_ESP, 0); + /* NYI: Avoid wide-to-narrow store-to-load forwarding stall. */ + if (!(as->flags & JIT_F_SSE3)) { /* Set FPU rounding mode to default. */ + emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 4); + emit_rmro(as, XO_MOVto, lo, RID_ESP, 4); + emit_gri(as, XG_ARITHi(XOg_AND), lo, 0xf3ff); + } + if (dt == IRT_U64) { + /* For inputs in [2^63,2^64-1] add -2^64 and convert again. */ + MCLabel l_pop, l_end = emit_label(as); + emit_x87op(as, XI_FPOP); + l_pop = emit_label(as); + emit_sjmp(as, l_end); + emit_rmro(as, XO_MOV, hi, RID_ESP, 4); + if ((as->flags & JIT_F_SSE3)) + emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0); + else + emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0); + emit_rma(as, XO_FADDq, XOg_FADDq, + lj_ir_k64_find(as->J, U64x(c3f00000,00000000))); + emit_sjcc(as, CC_NS, l_pop); + emit_rr(as, XO_TEST, hi, hi); /* Check if out-of-range (2^63). */ + } + emit_rmro(as, XO_MOV, hi, RID_ESP, 4); + if ((as->flags & JIT_F_SSE3)) { /* Truncation is easy with SSE3. */ + emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0); + } else { /* Otherwise set FPU rounding mode to truncate before the store. */ + emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0); + emit_rmro(as, XO_FLDCW, XOg_FLDCW, RID_ESP, 0); + emit_rmro(as, XO_MOVtow, lo, RID_ESP, 0); + emit_rmro(as, XO_ARITHw(XOg_OR), lo, RID_ESP, 0); + emit_loadi(as, lo, 0xc00); + emit_rmro(as, XO_FNSTCW, XOg_FNSTCW, RID_ESP, 0); + } + if (dt == IRT_U64) + emit_x87op(as, XI_FDUP); + emit_mrm(as, st == IRT_NUM ? XO_FLDq : XO_FLDd, + st == IRT_NUM ? XOg_FLDq: XOg_FLDd, + asm_fuseload(as, ir->op1, RSET_EMPTY)); +} +#endif + static void asm_strto(ASMState *as, IRIns *ir) { /* Force a spill slot for the destination register (if any). */ @@ -2644,6 +2762,18 @@ static void asm_powi(ASMState *as, IRIns *ir) ra_left(as, RID_EAX, ir->op2); } +#if LJ_64 && LJ_HASFFI +static void asm_arith64(ASMState *as, IRIns *ir, IRCallID id) +{ + const CCallInfo *ci = &lj_ir_callinfo[id]; + IRRef args[2]; + args[0] = ir->op1; + args[1] = ir->op2; + asm_setupresult(as, ir, ci); + asm_gencall(as, ci, args); +} +#endif + /* Find out whether swapping operands might be beneficial. */ static int swapops(ASMState *as, IRIns *ir) { @@ -2877,12 +3007,30 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs) /* -- Comparisons --------------------------------------------------------- */ /* Virtual flags for unordered FP comparisons. */ -#define VCC_U 0x100 /* Unordered. */ -#define VCC_P 0x200 /* Needs extra CC_P branch. */ -#define VCC_S 0x400 /* Swap avoids CC_P branch. */ +#define VCC_U 0x1000 /* Unordered. */ +#define VCC_P 0x2000 /* Needs extra CC_P branch. */ +#define VCC_S 0x4000 /* Swap avoids CC_P branch. */ #define VCC_PS (VCC_P|VCC_S) -static void asm_comp_(ASMState *as, IRIns *ir, int cc) +/* Map of comparisons to flags. ORDER IR. 
*/ +#define COMPFLAGS(ci, cin, cu, cf) ((ci)+((cu)<<4)+((cin)<<8)+(cf)) +static const uint16_t asm_compmap[IR_ABC+1] = { + /* signed non-eq unsigned flags */ + /* LT */ COMPFLAGS(CC_GE, CC_G, CC_AE, VCC_PS), + /* GE */ COMPFLAGS(CC_L, CC_L, CC_B, 0), + /* LE */ COMPFLAGS(CC_G, CC_G, CC_A, VCC_PS), + /* GT */ COMPFLAGS(CC_LE, CC_L, CC_BE, 0), + /* ULT */ COMPFLAGS(CC_AE, CC_A, CC_AE, VCC_U), + /* UGE */ COMPFLAGS(CC_B, CC_B, CC_B, VCC_U|VCC_PS), + /* ULE */ COMPFLAGS(CC_A, CC_A, CC_A, VCC_U), + /* UGT */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS), + /* EQ */ COMPFLAGS(CC_NE, CC_NE, CC_NE, VCC_P), + /* NE */ COMPFLAGS(CC_E, CC_E, CC_E, VCC_U|VCC_P), + /* ABC */ COMPFLAGS(CC_BE, CC_B, CC_BE, VCC_U|VCC_PS) /* Same as UGT. */ +}; + +/* FP and integer comparisons. */ +static void asm_comp(ASMState *as, IRIns *ir, uint32_t cc) { if (irt_isnum(ir->t)) { IRRef lref = ir->op1; @@ -3008,15 +3156,7 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc) if (irl+1 == ir) /* Referencing previous ins? */ as->testmcp = as->mcp; /* Set flag to drop test r,r if possible. */ } else { - x86Op xo; - if (checki8(imm)) { - emit_i8(as, imm); - xo = XO_ARITHi8; - } else { - emit_i32(as, imm); - xo = XO_ARITHi; - } - emit_mrm(as, xo, r64 + XOg_CMP, left); + emit_gmrmi(as, XG_ARITHi(XOg_CMP), r64 + left, imm); } } } else { @@ -3028,8 +3168,133 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc) } } -#define asm_comp(as, ir, ci, cf, cu) \ - asm_comp_(as, ir, (ci)+((cf)<<4)+(cu)) +#if LJ_32 && LJ_HASFFI +/* 64 bit integer comparisons in 32 bit mode. */ +static void asm_comp_int64(ASMState *as, IRIns *ir) +{ + uint32_t cc = asm_compmap[(ir-1)->o]; + RegSet allow = RSET_GPR; + Reg lefthi = RID_NONE, leftlo = RID_NONE; + Reg righthi = RID_NONE, rightlo = RID_NONE; + MCLabel l_around; + x86ModRM mrm; + + as->curins--; /* Skip loword ins. Avoids failing in noconflict(), too. */ + + /* Allocate/fuse hiword operands. */ + if (irref_isk(ir->op2)) { + lefthi = asm_fuseload(as, ir->op1, allow); + } else { + lefthi = ra_alloc1(as, ir->op1, allow); + righthi = asm_fuseload(as, ir->op2, allow); + if (righthi == RID_MRM) { + if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base); + if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx); + } else { + rset_clear(allow, righthi); + } + } + mrm = as->mrm; /* Save state for hiword instruction. */ + + /* Allocate/fuse loword operands. */ + if (irref_isk((ir-1)->op2)) { + leftlo = asm_fuseload(as, (ir-1)->op1, allow); + } else { + leftlo = ra_alloc1(as, (ir-1)->op1, allow); + rightlo = asm_fuseload(as, (ir-1)->op2, allow); + if (rightlo == RID_MRM) { + if (as->mrm.base != RID_NONE) rset_clear(allow, as->mrm.base); + if (as->mrm.idx != RID_NONE) rset_clear(allow, as->mrm.idx); + } else { + rset_clear(allow, rightlo); + } + } + + /* All register allocations must be performed _before_ this point. */ + l_around = emit_label(as); + as->invmcp = as->testmcp = NULL; /* Cannot use these optimizations. */ + + /* Loword comparison and branch. */ + asm_guardcc(as, cc >> 4); /* Always use unsigned compare for loword. */ + if (ra_noreg(rightlo)) { + int32_t imm = IR((ir-1)->op2)->i; + if (imm == 0 && ((cc >> 4) & 0xa) != 0x2 && leftlo != RID_MRM) + emit_rr(as, XO_TEST, leftlo, leftlo); + else + emit_gmrmi(as, XG_ARITHi(XOg_CMP), leftlo, imm); + } else { + emit_mrm(as, XO_CMP, leftlo, rightlo); + } + + /* Hiword comparison and branches. */ + if ((cc & 15) != CC_NE) + emit_sjcc(as, CC_NE, l_around); /* Hiword unequal: skip loword compare. 
*/ + if ((cc & 15) != CC_E) + asm_guardcc(as, cc >> 8); /* Hiword compare without equality check. */ + as->mrm = mrm; /* Restore state. */ + if (ra_noreg(righthi)) { + int32_t imm = IR(ir->op2)->i; + if (imm == 0 && (cc & 0xa) != 0x2 && lefthi != RID_MRM) + emit_rr(as, XO_TEST, lefthi, lefthi); + else + emit_gmrmi(as, XG_ARITHi(XOg_CMP), lefthi, imm); + } else { + emit_mrm(as, XO_CMP, lefthi, righthi); + } +} +#endif + +/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */ + +/* Hiword op of a split 64 bit op. Previous op must be the loword op. */ +static void asm_hiop(ASMState *as, IRIns *ir) +{ +#if LJ_32 && LJ_HASFFI + /* HIOP is marked as a store because it needs its own DCE logic. */ + int uselo = ra_used(ir-1), usehi = ra_used(ir); /* Loword/hiword used? */ + if (LJ_UNLIKELY(!(as->flags & JIT_F_OPT_DCE))) uselo = usehi = 1; + if ((ir-1)->o == IR_CONV) { /* Conversions to/from 64 bit. */ + if (usehi || uselo) { + if (irt_isfp(ir->t)) + asm_conv_fp_int64(as, ir); + else + asm_conv_int64_fp(as, ir); + } + as->curins--; /* Always skip the CONV. */ + return; + } else if ((ir-1)->o <= IR_NE) { /* 64 bit integer comparisons. ORDER IR. */ + asm_comp_int64(as, ir); + return; + } + if (!usehi) return; /* Skip unused hiword op for all remaining ops. */ + switch ((ir-1)->o) { + case IR_ADD: + asm_intarith(as, ir, uselo ? XOg_ADC : XOg_ADD); + break; + case IR_SUB: + asm_intarith(as, ir, uselo ? XOg_SBB : XOg_SUB); + break; + case IR_NEG: { + Reg dest = ra_dest(as, ir, RSET_GPR); + emit_rr(as, XO_GROUP3, XOg_NEG, dest); + if (uselo) { + emit_i8(as, 0); + emit_rr(as, XO_ARITHi8, XOg_ADC, dest); + } + ra_left(as, dest, ir->op1); + break; + } + case IR_CALLN: + ra_destreg(as, ir, RID_RETHI); + if (!uselo) + ra_allocref(as, ir->op1, RID2RSET(RID_RET)); /* Mark call as used. */ + break; + default: lua_assert(0); break; + } +#else + UNUSED(as); UNUSED(ir); lua_assert(0); /* Unused on x64 or without FFI. */ +#endif +} /* -- Stack handling ------------------------------------------------------ */ @@ -3682,21 +3947,16 @@ static void asm_ir(ASMState *as, IRIns *ir) switch ((IROp)ir->o) { /* Miscellaneous ops. */ case IR_LOOP: asm_loop(as); break; - case IR_NOP: break; + case IR_NOP: lua_assert(!ra_used(ir)); break; case IR_PHI: asm_phi(as, ir); break; + case IR_HIOP: asm_hiop(as, ir); break; /* Guarded assertions. */ - case IR_LT: asm_comp(as, ir, CC_GE, CC_AE, VCC_PS); break; - case IR_GE: asm_comp(as, ir, CC_L, CC_B, 0); break; - case IR_LE: asm_comp(as, ir, CC_G, CC_A, VCC_PS); break; - case IR_GT: asm_comp(as, ir, CC_LE, CC_BE, 0); break; - case IR_ULT: asm_comp(as, ir, CC_AE, CC_AE, VCC_U); break; - case IR_UGE: asm_comp(as, ir, CC_B, CC_B, VCC_U|VCC_PS); break; - case IR_ULE: asm_comp(as, ir, CC_A, CC_A, VCC_U); break; - case IR_ABC: - case IR_UGT: asm_comp(as, ir, CC_BE, CC_BE, VCC_U|VCC_PS); break; - case IR_EQ: asm_comp(as, ir, CC_NE, CC_NE, VCC_P); break; - case IR_NE: asm_comp(as, ir, CC_E, CC_E, VCC_U|VCC_P); break; + case IR_LT: case IR_GE: case IR_LE: case IR_GT: + case IR_ULT: case IR_UGE: case IR_ULE: case IR_UGT: + case IR_EQ: case IR_NE: case IR_ABC: + asm_comp(as, ir, asm_compmap[ir->o]); + break; case IR_RETF: asm_retf(as, ir); break; @@ -3744,7 +4004,15 @@ static void asm_ir(ASMState *as, IRIns *ir) case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: asm_fpmath(as, ir); break; - case IR_POWI: asm_powi(as, ir); break; + case IR_POWI: +#if LJ_64 && LJ_HASFFI + if (!irt_isnum(ir->t)) + asm_arith64(as, ir, irt_isi64(ir->t) ? 
IRCALL_lj_carith_powi64 : + IRCALL_lj_carith_powu64); + else +#endif + asm_powi(as, ir); + break; /* Overflow-checking arithmetic ops. Note: don't use LEA here! */ case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break; @@ -3801,6 +4069,7 @@ static void asm_trace(ASMState *as) { for (as->curins--; as->curins > as->stopins; as->curins--) { IRIns *ir = IR(as->curins); + lua_assert(!(LJ_32 && irt_isint64(ir->t))); /* Handled by SPLIT. */ if (!ra_used(ir) && !ir_sideeff(ir) && (as->flags & JIT_F_OPT_DCE)) continue; /* Dead-code elimination can be soooo easy. */ if (irt_isguard(ir->t)) @@ -3864,11 +4133,10 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T) case IR_CALLN: case IR_CALLL: case IR_CALLS: { const CCallInfo *ci = &lj_ir_callinfo[ir->op2]; #if LJ_64 - /* NYI: add stack slots for x64 calls with many args. */ lua_assert(CCI_NARGS(ci) <= (LJ_ABI_WIN ? 4 : 6)); ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET); #else - /* NYI: not fastcall-aware, but doesn't matter (yet). */ + lua_assert(!(ci->flags & CCI_FASTCALL) || CCI_NARGS(ci) <= 2); if (CCI_NARGS(ci) > (uint32_t)as->evenspill) /* Leave room for args. */ as->evenspill = (int32_t)CCI_NARGS(ci); ir->prev = REGSP_HINT(RID_RET); @@ -3878,6 +4146,12 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T) (RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH; continue; } +#if LJ_32 && LJ_HASFFI + case IR_HIOP: + if ((ir-1)->o == IR_CALLN) + ir->prev = REGSP_HINT(RID_RETHI); + break; +#endif /* C calls evict all scratch regs and return results in RID_RET. */ case IR_SNEW: case IR_NEWREF: #if !LJ_64 @@ -3894,6 +4168,14 @@ static void asm_setup_regsp(ASMState *as, GCtrace *T) as->modset = RSET_SCRATCH; break; case IR_POWI: +#if LJ_64 && LJ_HASFFI + if (!irt_isnum(ir->t)) { + ir->prev = REGSP_HINT(RID_RET); + if (inloop) + as->modset |= (RSET_SCRATCH & RSET_GPR); + continue; + } +#endif ir->prev = REGSP_HINT(RID_XMM0); if (inloop) as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX); diff --git a/src/lj_carith.c b/src/lj_carith.c index 46f07be7..134a61fb 100644 --- a/src/lj_carith.c +++ b/src/lj_carith.c @@ -230,6 +230,14 @@ int lj_carith_op(lua_State *L, MMS mm) /* -- 64 bit integer arithmetic helpers ----------------------------------- */ +#if LJ_32 +/* Signed/unsigned 64 bit multiply. */ +int64_t lj_carith_mul64(int64_t a, int64_t b) +{ + return a * b; +} +#endif + /* Unsigned 64 bit x^k. */ uint64_t lj_carith_powu64(uint64_t x, uint64_t k) { diff --git a/src/lj_carith.h b/src/lj_carith.h index 6870172b..14073603 100644 --- a/src/lj_carith.h +++ b/src/lj_carith.h @@ -12,6 +12,9 @@ LJ_FUNC int lj_carith_op(lua_State *L, MMS mm); +#if LJ_32 +LJ_FUNC int64_t lj_carith_mul64(int64_t x, int64_t k); +#endif LJ_FUNC uint64_t lj_carith_powu64(uint64_t x, uint64_t k); LJ_FUNC int64_t lj_carith_powi64(int64_t x, int64_t k); diff --git a/src/lj_crecord.c b/src/lj_crecord.c index 61210907..5eafa3a7 100644 --- a/src/lj_crecord.c +++ b/src/lj_crecord.c @@ -189,6 +189,7 @@ static void crec_ct_ct(jit_State *J, CType *d, CType *s, TRef dp, TRef sp, sp = emitconv(sp, dsize < 4 ? IRT_INT : dt, st, 0); #endif xstore: + if (dt == IRT_I64 || dt == IRT_U64) lj_needsplit(J); emitir(IRT(IR_XSTORE, dt), dp, sp); break; case CCX(I, C): @@ -311,6 +312,7 @@ static TRef crec_tv_ct(jit_State *J, CType *s, CTypeID sid, TRef sp) TRef ptr = emitir(IRT(IR_ADD, IRT_PTR), dp, lj_ir_kintp(J, sizeof(GCcdata))); emitir(IRT(IR_XSTORE, t), ptr, tr); + lj_needsplit(J); return dp; } else if ((sinfo & CTF_BOOL)) { /* Assume not equal to zero. 
Fixup and emit pending guard later. */ @@ -406,7 +408,10 @@ static void crec_ct_tv(jit_State *J, CType *d, TRef dp, TRef sp, TValue *sval) if (ctype_isenum(s->info)) s = ctype_child(cts, s); if (ctype_isnum(s->info)) { /* Load number value. */ IRType t = crec_ct2irt(s); - if (t != IRT_CDATA) sp = emitir(IRT(IR_XLOAD, t), sp, 0); + if (t != IRT_CDATA) { + sp = emitir(IRT(IR_XLOAD, t), sp, 0); + if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J); + } } goto doconv; } @@ -499,8 +504,10 @@ void LJ_FASTCALL recff_cdata_index(jit_State *J, RecordFFData *rd) if (ctype_isinteger(ctk->info) && (t = crec_ct2irt(ctk)) != IRT_CDATA) { idx = emitir(IRT(IR_ADD, IRT_PTR), idx, lj_ir_kintp(J, sizeof(GCcdata))); idx = emitir(IRT(IR_XLOAD, t), idx, 0); - if (!LJ_64 && (t == IRT_I64 || t == IRT_U64)) + if (!LJ_64 && (t == IRT_I64 || t == IRT_U64)) { idx = emitconv(idx, IRT_INT, t, 0); + lj_needsplit(J); + } goto integer_key; } } else if (tref_isstr(idx)) { @@ -664,6 +671,7 @@ static TRef crec_arith_int64(jit_State *J, TRef *sp, CType **s, MMS mm) CTypeID id; TRef tr, dp, ptr; MSize i; + lj_needsplit(J); if (((s[0]->info & CTF_UNSIGNED) && s[0]->size == 8) || ((s[1]->info & CTF_UNSIGNED) && s[1]->size == 8)) { dt = IRT_U64; id = CTID_UINT64; @@ -691,9 +699,6 @@ static TRef crec_arith_int64(jit_State *J, TRef *sp, CType **s, MMS mm) lj_ir_set(J, IRTG(op, dt), sp[0], sp[1]); J->postproc = LJ_POST_FIXGUARD; return TREF_TRUE; - } else if (mm == MM_pow) { - tr = lj_ir_call(J, dt == IRT_I64 ? IRCALL_lj_carith_powi64 : - IRCALL_lj_carith_powu64, sp[0], sp[1]); } else { if (mm == MM_div || mm == MM_mod) return 0; /* NYI: integer div, mod. */ @@ -754,10 +759,11 @@ static TRef crec_arith_ptr(jit_State *J, TRef *sp, CType **s, MMS mm) tr = emitconv(tr, IRT_INTP, IRT_INT, ((t - IRT_I8) & 1) ? 0 : IRCONV_SEXT); #else - if (!tref_typerange(sp[1], IRT_I8, IRT_U32)) + if (!tref_typerange(sp[1], IRT_I8, IRT_U32)) { tr = emitconv(tr, IRT_INTP, t, (t == IRT_NUM || t == IRT_FLOAT) ? IRCONV_TRUNC|IRCONV_ANY : 0); + } #endif tr = emitir(IRT(IR_MUL, IRT_INTP), tr, lj_ir_kintp(J, sz)); tr = emitir(IRT(IR_ADD, IRT_PTR), sp[0], tr); @@ -790,6 +796,7 @@ void LJ_FASTCALL recff_cdata_arith(jit_State *J, RecordFFData *rd) if (ctype_isnum(ct->info)) { IRType t = crec_ct2irt(ct); if (t == IRT_CDATA) goto err_type; + if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J); tr = emitir(IRT(IR_XLOAD, t), tr, 0); } else if (!(ctype_isptr(ct->info) || ctype_isrefarray(ct->info))) { goto err_type; @@ -842,6 +849,7 @@ void LJ_FASTCALL lj_crecord_tonumber(jit_State *J, RecordFFData *rd) IRType t = crec_ct2irt(s); if (t != IRT_CDATA) { TRef tr = emitir(IRT(IR_XLOAD, t), sp, 0); /* Load number value. */ + if (t == IRT_I64 || t == IRT_U64) lj_needsplit(J); if (t == IRT_FLOAT || t == IRT_U32 || t == IRT_I64 || t == IRT_U64) tr = emitconv(tr, IRT_NUM, t, 0); J->base[0] = tr; diff --git a/src/lj_ir.h b/src/lj_ir.h index 1cb3566e..286eb219 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -33,6 +33,7 @@ /* Miscellaneous ops. */ \ _(NOP, N , ___, ___) \ _(BASE, N , lit, lit) \ + _(HIOP, S , ref, ref) \ _(LOOP, S , ___, ___) \ _(PHI, S , ref, ref) \ _(RENAME, S , ref, lit) \ @@ -212,8 +213,9 @@ IRFLDEF(FLENUM) /* CONV mode, stored in op2. */ #define IRCONV_SRCMASK 0x001f /* Source IRType. */ #define IRCONV_DSTMASK 0x03e0 /* Dest. IRType (also in ir->t). 
*/ -#define IRCONV_NUM_INT ((IRT_NUM<<5)|IRT_INT) -#define IRCONV_INT_NUM ((IRT_INT<<5)|IRT_NUM) +#define IRCONV_DSH 5 +#define IRCONV_NUM_INT ((IRT_NUM<ksimd[2*(n)] + 15) & ~(intptr_t)15)) +/* Set/reset flag to activate the SPLIT pass for the current trace. */ +#if LJ_32 && LJ_HASFFI +#define lj_needsplit(J) (J->needsplit = 1) +#define lj_resetsplit(J) (J->needsplit = 0) +#else +#define lj_needsplit(J) UNUSED(J) +#define lj_resetsplit(J) UNUSED(J) +#endif + /* Fold state is used to fold instructions on-the-fly. */ typedef struct FoldState { IRIns ins; /* Currently emitted instruction. */ @@ -293,6 +302,9 @@ typedef struct jit_State { MSize sizesnapmap; /* Size of temp. snapshot map buffer. */ PostProc postproc; /* Required post-processing after execution. */ +#if LJ_32 && LJ_HASFFI + int needsplit; /* Need SPLIT pass. */ +#endif GCRef *trace; /* Array of traces. */ TraceNo freetrace; /* Start of scan for next free trace. */ diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 2d08e187..03caf80d 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -538,6 +538,13 @@ LJFOLDF(kfold_conv_knum_int_num) } } +LJFOLD(CONV KNUM IRCONV_U32_NUM) +LJFOLDF(kfold_conv_knum_u32_num) +{ + lua_assert((fins->op2 & IRCONV_TRUNC)); + return INTFOLD((int32_t)(uint32_t)knumleft); +} + LJFOLD(CONV KNUM IRCONV_I64_NUM) LJFOLDF(kfold_conv_knum_i64_num) { @@ -805,6 +812,7 @@ LJFOLDF(simplify_conv_u32_num) } LJFOLD(CONV CONV IRCONV_I64_NUM) /* _INT or _U32*/ +LJFOLD(CONV CONV IRCONV_U64_NUM) /* _INT or _U32*/ LJFOLDF(simplify_conv_i64_num) { PHIBARRIER(fleft); @@ -826,23 +834,6 @@ LJFOLDF(simplify_conv_i64_num) return NEXTFOLD; } -LJFOLD(CONV CONV IRCONV_U64_NUM) /* _U32*/ -LJFOLDF(simplify_conv_u64_num) -{ - PHIBARRIER(fleft); - if ((fleft->op2 & IRCONV_SRCMASK) == IRT_U32) { -#if LJ_TARGET_X64 - return fleft->op1; -#else - /* Reduce to a zero-extension. */ - fins->op1 = fleft->op1; - fins->op2 = (IRT_U64<<5)|IRT_U32; - return RETRYFOLD; -#endif - } - return NEXTFOLD; -} - /* Shortcut TOBIT + IRT_NUM <- IRT_INT/IRT_U32 conversion. */ LJFOLD(TOBIT CONV KNUM) LJFOLDF(simplify_tobit_conv) diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c new file mode 100644 index 00000000..3cb30514 --- /dev/null +++ b/src/lj_opt_split.c @@ -0,0 +1,343 @@ +/* +** SPLIT: Split 64 bit IR instructions into 32 bit IR instructions. +** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h +*/ + +#define lj_opt_split_c +#define LUA_CORE + +#include "lj_obj.h" + +#if LJ_HASJIT && LJ_HASFFI && LJ_32 + +#include "lj_err.h" +#include "lj_str.h" +#include "lj_ir.h" +#include "lj_jit.h" +#include "lj_iropt.h" +#include "lj_vm.h" + +/* SPLIT pass: +** +** This pass splits up 64 bit IR instructions into multiple 32 bit IR +** instructions. It's only active for 32 bit CPUs which lack native 64 bit +** operations. The FFI is currently the only emitter for 64 bit +** instructions, so this pass is disabled if the FFI is disabled. +** +** Splitting the IR in a separate pass keeps each 32 bit IR assembler +** backend simple. Only a small amount of extra functionality needs to be +** implemented. This is much easier than adding support for allocating +** register pairs to each backend (believe me, I tried). A few simple, but +** important optimizations can be performed by the SPLIT pass, which would +** be tedious to do in the backend. +** +** The basic idea is to replace each 64 bit IR instruction with its 32 bit +** equivalent plus an extra HIOP instruction. 
The splitted IR is not passed +** through FOLD or any other optimizations, so each HIOP is guaranteed to +** immediately follow it's counterpart. The actual functionality of HIOP is +** inferred from the previous instruction. +** +** The operands of HIOP hold the hiword input references. The output of HIOP +** is the hiword output reference, which is also used to hold the hiword +** register or spill slot information. The register allocator treats this +** instruction independent of any other instruction, which improves code +** quality compared to using fixed register pairs. +** +** It's easier to split up some instructions into two regular 32 bit +** instructions. E.g. XLOAD is split up into two XLOADs with two different +** addresses. Obviously 64 bit constants need to be split up into two 32 bit +** constants, too. Some hiword instructions can be entirely omitted, e.g. +** when zero-extending a 32 bit value to 64 bits. +** +** Here's the IR and x64 machine code for 'x.b = x.a + 1' for a struct with +** two int64_t fields: +** +** 0100 p32 ADD base +8 +** 0101 i64 XLOAD 0100 +** 0102 i64 ADD 0101 +1 +** 0103 p32 ADD base +16 +** 0104 i64 XSTORE 0103 0102 +** +** mov rax, [esi+0x8] +** add rax, +0x01 +** mov [esi+0x10], rax +** +** Here's the transformed IR and the x86 machine code after the SPLIT pass: +** +** 0100 p32 ADD base +8 +** 0101 int XLOAD 0100 +** 0102 p32 ADD base +12 +** 0103 int XLOAD 0102 +** 0104 int ADD 0101 +1 +** 0105 int HIOP 0103 +0 +** 0106 p32 ADD base +16 +** 0107 int XSTORE 0106 0104 +** 0108 p32 ADD base +20 +** 0109 int XSTORE 0108 0105 +** +** mov eax, [esi+0x8] +** mov ecx, [esi+0xc] +** add eax, +0x01 +** adc ecx, +0x00 +** mov [esi+0x10], eax +** mov [esi+0x14], ecx +** +** You may notice the reassociated hiword address computation, which is +** later fused into the mov operands by the assembler. +*/ + +/* Some local macros to save typing. Undef'd at the end. */ +#define IR(ref) (&J->cur.ir[(ref)]) + +/* Directly emit the transformed IR without updating chains etc. */ +static IRRef split_emit(jit_State *J, uint16_t ot, IRRef1 op1, IRRef1 op2) +{ + IRRef nref = lj_ir_nextins(J); + IRIns *ir = IR(nref); + ir->ot = ot; + ir->op1 = op1; + ir->op2 = op2; + return nref; +} + +/* Emit a CALLN with two split 64 bit arguments. */ +static IRRef split_call64(jit_State *J, IRRef1 *hisubst, IRIns *oir, + IRIns *ir, IRCallID id) +{ + IRRef tmp, op1 = ir->op1, op2 = ir->op2; + J->cur.nins--; +#if LJ_LE + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), oir[op1].prev, hisubst[op1]); + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev); + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]); +#else + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), hisubst[op1], oir[op1].prev); + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, hisubst[op2]); + tmp = split_emit(J, IRT(IR_CARG, IRT_NIL), tmp, oir[op2].prev); +#endif + ir->prev = tmp = split_emit(J, IRTI(IR_CALLN), tmp, id); + return split_emit(J, IRTI(IR_HIOP), tmp, tmp); +} + +/* Get a pointer to the other 32 bit word (LE: hiword, BE: loword). */ +static IRRef split_ptr(jit_State *J, IRRef ref) +{ + IRIns *ir = IR(ref); + int32_t ofs = 4; + if (ir->o == IR_ADD && irref_isk(ir->op2)) { /* Reassociate address. */ + ofs += IR(ir->op2)->i; + ref = ir->op1; + if (ofs == 0) return ref; + } + return split_emit(J, IRTI(IR_ADD), ref, lj_ir_kint(J, ofs)); +} + +/* Transform the old IR to the new IR. 
*/ +static void split_ir(jit_State *J) +{ + IRRef nins = J->cur.nins, nk = J->cur.nk; + MSize irlen = nins - nk; + MSize need = (irlen+1)*(sizeof(IRIns) + sizeof(IRRef1)); + IRIns *oir = (IRIns *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf, need); + IRRef1 *hisubst; + IRRef ref; + + /* Copy old IR to buffer. */ + memcpy(oir, IR(nk), irlen*sizeof(IRIns)); + /* Bias hiword substitution table and old IR. Loword kept in field prev. */ + hisubst = (IRRef1 *)&oir[irlen] - nk; + oir -= nk; + + /* Remove all IR instructions, but retain IR constants. */ + J->cur.nins = REF_FIRST; + + /* Process constants and fixed references. */ + for (ref = nk; ref <= REF_BASE; ref++) { + IRIns *ir = &oir[ref]; + if (ir->o == IR_KINT64) { /* Split up 64 bit constant. */ + TValue tv = *ir_k64(ir); + ir->prev = lj_ir_kint(J, (int32_t)tv.u32.lo); + hisubst[ref] = lj_ir_kint(J, (int32_t)tv.u32.hi); + } else { + ir->prev = (IRRef1)ref; /* Identity substitution for loword. */ + } + } + + /* Process old IR instructions. */ + for (ref = REF_FIRST; ref < nins; ref++) { + IRIns *ir = &oir[ref]; + IRRef nref = lj_ir_nextins(J); + IRIns *nir = IR(nref); + + /* Copy-substitute old instruction to new instruction. */ + nir->op1 = ir->op1 < nk ? ir->op1 : oir[ir->op1].prev; + nir->op2 = ir->op2 < nk ? ir->op2 : oir[ir->op2].prev; + ir->prev = nref; /* Loword substitution. */ + nir->o = ir->o; + nir->t.irt = ir->t.irt & ~(IRT_MARK|IRT_ISPHI); + + /* Split 64 bit instructions. */ + if (irt_isint64(ir->t)) { + IRRef hi = hisubst[ir->op1]; + nir->t.irt = IRT_INT | (nir->t.irt & IRT_GUARD); /* Turn into INT op. */ + switch (ir->o) { + case IR_ADD: + case IR_SUB: + /* Use plain op for hiword if loword cannot produce a carry/borrow. */ + if (irref_isk(nir->op2) && IR(nir->op2)->i == 0) { + ir->prev = nir->op1; /* Pass through loword. */ + nir->op1 = hi; nir->op2 = hisubst[ir->op2]; + hi = nref; + break; + } + /* fallthrough */ + case IR_NEG: + hi = split_emit(J, IRTI(IR_HIOP), hi, hisubst[ir->op2]); + break; + case IR_MUL: + hi = split_call64(J, hisubst, oir, ir, IRCALL_lj_carith_mul64); + break; + case IR_POWI: + hi = split_call64(J, hisubst, oir, ir, + irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 : + IRCALL_lj_carith_powu64); + break; + case IR_XLOAD: + hi = split_emit(J, IRTI(IR_XLOAD), split_ptr(J, nir->op1), ir->op2); +#if LJ_BE + ir->prev = hi; hi = nref; +#endif + break; + case IR_XSTORE: +#if LJ_LE + hi = hisubst[ir->op2]; +#else + hi = nir->op2; nir->op2 = hisubst[ir->op2]; +#endif + split_emit(J, IRTI(IR_XSTORE), split_ptr(J, nir->op1), hi); + continue; + case IR_CONV: { /* Conversion to 64 bit integer. Others handled below. */ + IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK); + if (st == IRT_NUM || st == IRT_FLOAT) { /* FP to 64 bit int conv. */ + hi = split_emit(J, IRTI(IR_HIOP), nir->op1, nref); + } else if (st == IRT_I64 || st == IRT_U64) { /* 64/64 bit cast. */ + /* Drop cast, since assembler doesn't care. */ + hisubst[ref] = hi; + goto fwdlo; + } else if ((ir->op2 & IRCONV_SEXT)) { /* Sign-extend to 64 bit. */ + IRRef k31 = lj_ir_kint(J, 31); + nir = IR(nref); /* May have been reallocated. */ + ir->prev = nir->op1; /* Pass through loword. */ + nir->o = IR_BSAR; /* hi = bsar(lo, 31). */ + nir->op2 = k31; + hi = nref; + } else { /* Zero-extend to 64 bit. */ + hisubst[ref] = lj_ir_kint(J, 0); + goto fwdlo; + } + break; + } + case IR_PHI: { + IRRef hi2; + if ((irref_isk(nir->op1) && irref_isk(nir->op2)) || + nir->op1 == nir->op2) + J->cur.nins--; /* Drop useless PHIs. 
*/ + hi2 = hisubst[ir->op2]; + if (!((irref_isk(hi) && irref_isk(hi2)) || hi == hi2)) + split_emit(J, IRTI(IR_PHI), hi, hi2); + continue; + } + default: + lua_assert(ir->o <= IR_NE); + split_emit(J, IRTGI(IR_HIOP), hi, hisubst[ir->op2]); /* Comparisons. */ + continue; + } + hisubst[ref] = hi; /* Store hiword substitution. */ + } else if (ir->o == IR_CONV) { /* See above, too. */ + IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK); + if (st == IRT_I64 || st == IRT_U64) { /* Conversion from 64 bit int. */ + if (irt_isfp(ir->t)) { /* 64 bit integer to FP conversion. */ + ir->prev = split_emit(J, IRT(IR_HIOP, irt_type(ir->t)), + hisubst[ir->op1], nref); + } else { /* Truncate to lower 32 bits. */ + fwdlo: + ir->prev = nir->op1; /* Forward loword. */ + /* Replace with NOP to avoid messing up the snapshot logic. */ + nir->ot = IRT(IR_NOP, IRT_NIL); + nir->op1 = nir->op2 = 0; + } + } + } else if (ir->o == IR_LOOP) { + J->loopref = nref; /* Needed by assembler. */ + } + } + + /* Add PHI marks. */ + for (ref = J->cur.nins-1; ref >= REF_FIRST; ref--) { + IRIns *ir = IR(ref); + if (ir->o != IR_PHI) break; + if (!irref_isk(ir->op1)) irt_setphi(IR(ir->op1)->t); + if (ir->op2 > J->loopref) irt_setphi(IR(ir->op2)->t); + } + + /* Substitute snapshot maps. */ + oir[nins].prev = J->cur.nins; /* Substitution for last snapshot. */ + { + SnapNo i, nsnap = J->cur.nsnap; + for (i = 0; i < nsnap; i++) { + SnapShot *snap = &J->cur.snap[i]; + SnapEntry *map = &J->cur.snapmap[snap->mapofs]; + MSize n, nent = snap->nent; + snap->ref = oir[snap->ref].prev; + for (n = 0; n < nent; n++) { + SnapEntry sn = map[n]; + map[n] = ((sn & 0xffff0000) | oir[snap_ref(sn)].prev); + } + } + } +} + +/* Protected callback for split pass. */ +static TValue *cpsplit(lua_State *L, lua_CFunction dummy, void *ud) +{ + jit_State *J = (jit_State *)ud; + split_ir(J); + UNUSED(L); UNUSED(dummy); + return NULL; +} + +#ifdef LUA_USE_ASSERT +/* Slow, but sure way to check whether a SPLIT pass is needed. */ +static int split_needsplit(jit_State *J) +{ + IRIns *ir, *irend; + IRRef ref; + for (ir = IR(REF_FIRST), irend = IR(J->cur.nins); ir < irend; ir++) + if (irt_isint64(ir->t)) + return 1; + for (ref = J->chain[IR_CONV]; ref; ref = IR(ref)->prev) + if ((IR(ref)->op2 & IRCONV_SRCMASK) == IRT_I64 || + (IR(ref)->op2 & IRCONV_SRCMASK) == IRT_U64) + return 1; + return 0; /* Nope. */ +} +#endif + +/* SPLIT pass. */ +void lj_opt_split(jit_State *J) +{ + lua_assert(J->needsplit >= split_needsplit(J)); /* Verify flag. */ + if (J->needsplit) { + int errcode = lj_vm_cpcall(J->L, NULL, J, cpsplit); + if (errcode) { + /* Completely reset the trace to avoid inconsistent dump on abort. */ + J->cur.nins = J->cur.nk = REF_BASE; + J->cur.nsnap = 0; + lj_err_throw(J->L, errcode); /* Propagate errors. */ + } + } +} + +#undef IR + +#endif diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h index 94ab3c32..37c68f4b 100644 --- a/src/lj_target_x86.h +++ b/src/lj_target_x86.h @@ -193,6 +193,7 @@ typedef enum { XI_FLD1 = 0xe8d9, XI_FLDLG2 = 0xecd9, XI_FLDLN2 = 0xedd9, + XI_FDUP = 0xc0d9, /* Really fld st0. */ XI_FPOP = 0xd8dd, /* Really fstp st0. */ XI_FPOP1 = 0xd9dd, /* Really fstp st1. 
*/ XI_FRNDINT = 0xfcd9, @@ -263,10 +264,17 @@ typedef enum { XO_MOVD = XO_660f(6e), XO_MOVDto = XO_660f(7e), + XO_FLDd = XO_(d9), XOg_FLDd = 0, XO_FLDq = XO_(dd), XOg_FLDq = 0, XO_FILDd = XO_(db), XOg_FILDd = 0, + XO_FILDq = XO_(df), XOg_FILDq = 5, + XO_FSTPd = XO_(d9), XOg_FSTPd = 3, XO_FSTPq = XO_(dd), XOg_FSTPq = 3, XO_FISTPq = XO_(df), XOg_FISTPq = 7, + XO_FISTTPq = XO_(dd), XOg_FISTTPq = 1, + XO_FADDq = XO_(dc), XOg_FADDq = 0, + XO_FLDCW = XO_(d9), XOg_FLDCW = 5, + XO_FNSTCW = XO_(d9), XOg_FNSTCW = 7 } x86Op; /* x86 opcode groups. */ @@ -278,6 +286,7 @@ typedef uint32_t x86Group; #define XG_TOXOi8(xg) ((x86Op)(0x000000fe + (((xg)<<8) & 0xff000000))) #define XO_ARITH(a) ((x86Op)(0x030000fe + ((a)<<27))) +#define XO_ARITHw(a) ((x86Op)(0x036600fd + ((a)<<27))) typedef enum { XOg_ADD, XOg_OR, XOg_ADC, XOg_SBB, XOg_AND, XOg_SUB, XOg_XOR, XOg_CMP, diff --git a/src/lj_trace.c b/src/lj_trace.c index da20f991..b67e8f75 100644 --- a/src/lj_trace.c +++ b/src/lj_trace.c @@ -394,6 +394,7 @@ static void trace_start(jit_State *J) J->bcskip = 0; J->guardemit.irt = 0; J->postproc = LJ_POST_NONE; + lj_resetsplit(J); setgcref(J->cur.startpt, obj2gco(J->pt)); L = J->L; @@ -592,6 +593,7 @@ static TValue *trace_state(lua_State *L, lua_CFunction dummy, void *ud) } J->loopref = J->chain[IR_LOOP]; /* Needed by assembler. */ } + lj_opt_split(J); J->state = LJ_TRACE_ASM; break; diff --git a/src/ljamalg.c b/src/ljamalg.c index 4d5f7600..5d90c002 100644 --- a/src/ljamalg.c +++ b/src/ljamalg.c @@ -58,6 +58,7 @@ #include "lj_opt_narrow.c" #include "lj_opt_dce.c" #include "lj_opt_loop.c" +#include "lj_opt_split.c" #include "lj_mcode.c" #include "lj_snap.c" #include "lj_record.c"
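
A minimal standalone C sketch (not part of the patch) of the arithmetic the SPLIT pass models: a 64 bit ADD is rewritten as a 32 bit loword ADD plus a hiword HIOP, which the x86 backend lowers to ADD/ADC, matching the 'x.b = x.a + 1' example in the lj_opt_split.c comment above. The struct and function names below are illustrative only.

/* Illustrative sketch only -- not part of the patch. Models the loword/hiword
** decomposition performed by the SPLIT pass: the loword ADD may produce a
** carry, which the hiword op (HIOP, lowered to ADC on x86) must consume.
*/
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

typedef struct { uint32_t lo, hi; } Split64;  /* 64 bit value as two 32 bit words. */

static Split64 split_add64(Split64 a, Split64 b)
{
  Split64 r;
  r.lo = a.lo + b.lo;                  /* Loword ADD (wraps modulo 2^32). */
  r.hi = a.hi + b.hi + (r.lo < a.lo);  /* Hiword ADD plus carry, i.e. ADC. */
  return r;
}

int main(void)
{
  /* 'x.b = x.a + 1' with x.a = 2^32-1, so the carry propagates to the hiword. */
  Split64 a = { 0xffffffffu, 0u }, one = { 1u, 0u };
  Split64 b = split_add64(a, one);
  printf("0x%08" PRIx32 "%08" PRIx32 "\n", b.hi, b.lo);  /* 0x0000000100000000 */
  return 0;
}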