From 892887e5841fc91d8f954e780310a66404cbaadc Mon Sep 17 00:00:00 2001
From: Mike Pall
Date: Mon, 28 Mar 2016 23:05:20 +0200
Subject: [PATCH] x86: Generate BMI2 shifts and rotates, if available.

Contributed by Peter Cawley.
---
 src/jit/dis_x86.lua | 3 +++
 src/lj_asm.c | 5 ++++-
 src/lj_asm_x86.h | 28 ++++++++++++++++++++++------
 src/lj_emit_x86.h | 11 +++++++++++
 src/lj_target_x86.h | 11 +++++++++++
 5 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/src/jit/dis_x86.lua b/src/jit/dis_x86.lua
index f8a21ff3..d564988e 100644
--- a/src/jit/dis_x86.lua
+++ b/src/jit/dis_x86.lua
@@ -244,6 +244,7 @@ nil,"||psrlvVSXrvm","||psravdXrvm","||psllvVSXrvm",
 [0xde] = "||aesdecXrvm", [0xdf] = "||aesdeclastXrvm",
 --Fx
 [0xf0] = "|||crc32TrBmt",[0xf1] = "|||crc32TrVmt",
+[0xf7] = "|shlxVrmv|sarxVrmv|shrxVrmv",
 },
 ["3a"] = { -- [66] 0f 3a xx
@@ -273,6 +274,8 @@ nil,nil,nil,nil,
 [0x60] = "||pcmpestrmXrmu",[0x61] = "||pcmpestriXrmu",
 [0x62] = "||pcmpistrmXrmu",[0x63] = "||pcmpistriXrmu",
 [0xdf] = "||aeskeygenassistXrmu",
+--Fx
+[0xf0] = "|||rorxVrmu",
 },
 }
diff --git a/src/lj_asm.c b/src/lj_asm.c
index 93f6bcd6..94d7bfc4 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -2150,7 +2150,10 @@ static void asm_setup_regsp(ASMState *as)
 #endif
 #if LJ_TARGET_X86ORX64
     /* Non-constant shift counts need to be in RID_ECX on x86/x64. */
-    case IR_BSHL: case IR_BSHR: case IR_BSAR: case IR_BROL: case IR_BROR:
+    case IR_BSHL: case IR_BSHR: case IR_BSAR:
+      if ((as->flags & JIT_F_BMI2))  /* Except if BMI2 is available. */
+        break;
+    case IR_BROL: case IR_BROR:
       if (!irref_isk(ir->op2) && !ra_hashint(IR(ir->op2)->r)) {
         IR(ir->op2)->r = REGSP_HINT(RID_ECX);
         if (inloop)
diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h
index 512e0534..718cb12e 100644
--- a/src/lj_asm_x86.h
+++ b/src/lj_asm_x86.h
@@ -1956,7 +1956,7 @@ static void asm_bswap(ASMState *as, IRIns *ir)
 #define asm_bor(as, ir)  asm_intarith(as, ir, XOg_OR)
 #define asm_bxor(as, ir) asm_intarith(as, ir, XOg_XOR)
 
-static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
+static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs, x86Op xv)
 {
   IRRef rref = ir->op2;
   IRIns *irr = IR(rref);
@@ -1965,11 +1965,27 @@ static void asm_bitshift(ASMState *as, IRIns *ir, x86Shift xs)
     int shift;
     dest = ra_dest(as, ir, RSET_GPR);
     shift = irr->i & (irt_is64(ir->t) ? 63 : 31);
+    if (!xv && shift && (as->flags & JIT_F_BMI2)) {
+      Reg left = asm_fuseloadm(as, ir->op1, RSET_GPR, irt_is64(ir->t));
+      if (left != dest) {  /* BMI2 rotate right by constant. */
+        emit_i8(as, xs == XOg_ROL ? -shift : shift);
+        emit_mrm(as, VEX_64IR(ir, XV_RORX), dest, left);
+        return;
+      }
+    }
     switch (shift) {
     case 0: break;
     case 1: emit_rr(as, XO_SHIFT1, REX_64IR(ir, xs), dest); break;
     default: emit_shifti(as, REX_64IR(ir, xs), dest, shift); break;
     }
+  } else if ((as->flags & JIT_F_BMI2) && xv) {  /* BMI2 variable shifts. */
+    Reg left, right;
+    dest = ra_dest(as, ir, RSET_GPR);
+    right = ra_alloc1(as, rref, RSET_GPR);
+    left = asm_fuseloadm(as, ir->op1, rset_exclude(RSET_GPR, right),
+                         irt_is64(ir->t));
+    emit_mrm(as, VEX_64IR(ir, xv) ^ (right << 19), dest, left);
+    return;
   } else {  /* Variable shifts implicitly use register cl (i.e. ecx). */
     Reg right;
     dest = ra_dest(as, ir, rset_exclude(RSET_GPR, RID_ECX));
@@ -1995,11 +2011,11 @@
   */
 }
 
-#define asm_bshl(as, ir) asm_bitshift(as, ir, XOg_SHL)
-#define asm_bshr(as, ir) asm_bitshift(as, ir, XOg_SHR)
-#define asm_bsar(as, ir) asm_bitshift(as, ir, XOg_SAR)
-#define asm_brol(as, ir) asm_bitshift(as, ir, XOg_ROL)
-#define asm_bror(as, ir) asm_bitshift(as, ir, XOg_ROR)
+#define asm_bshl(as, ir) asm_bitshift(as, ir, XOg_SHL, XV_SHLX)
+#define asm_bshr(as, ir) asm_bitshift(as, ir, XOg_SHR, XV_SHRX)
+#define asm_bsar(as, ir) asm_bitshift(as, ir, XOg_SAR, XV_SARX)
+#define asm_brol(as, ir) asm_bitshift(as, ir, XOg_ROL, 0)
+#define asm_bror(as, ir) asm_bitshift(as, ir, XOg_ROR, 0)
 
 /* -- Comparisons --------------------------------------------------------- */
diff --git a/src/lj_emit_x86.h b/src/lj_emit_x86.h
index caf30859..cbaf4e85 100644
--- a/src/lj_emit_x86.h
+++ b/src/lj_emit_x86.h
@@ -13,10 +13,12 @@
   if (rex != 0x40) *--(p) = rex; }
 #define FORCE_REX 0x200
 #define REX_64 (FORCE_REX|0x080000)
+#define VEX_64 0x800000
 #else
 #define REXRB(p, rr, rb) ((void)0)
 #define FORCE_REX 0
 #define REX_64 0
+#define VEX_64 0
 #endif
 
 #define emit_i8(as, i) (*--as->mcp = (MCode)(i))
@@ -31,6 +33,13 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx,
                                  MCode *p, int delta)
 {
   int n = (int8_t)xo;
+  if (n == -60) {  /* VEX-encoded instruction */
+#if LJ_64
+    xo ^= (((rr>>1)&4)+((rx>>2)&2)+((rb>>3)&1))<<13;
+#endif
+    *(uint32_t *)(p+delta-5) = (uint32_t)xo;
+    return p+delta-5;
+  }
 #if defined(__GNUC__)
   if (__builtin_constant_p(xo) && n == -2)
     p[delta-2] = (MCode)(xo >> 24);
@@ -412,8 +421,10 @@ static void emit_call_(ASMState *as, MCode *target)
 /* Use 64 bit operations to handle 64 bit IR types. */
 #if LJ_64
 #define REX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? REX_64 : 0))
+#define VEX_64IR(ir, r) ((r) + (irt_is64((ir)->t) ? VEX_64 : 0))
 #else
 #define REX_64IR(ir, r) (r)
+#define VEX_64IR(ir, r) (r)
 #endif
 
 /* Generic move between two regs. */
diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h
index 289f83e1..e29f4748 100644
--- a/src/lj_target_x86.h
+++ b/src/lj_target_x86.h
@@ -189,6 +189,11 @@ typedef struct {
 #define XO_f20f(o) ((uint32_t)(0x0ff2fc + (0x##o<<24)))
 #define XO_f30f(o) ((uint32_t)(0x0ff3fc + (0x##o<<24)))
 
+#define XV_660f38(o) ((uint32_t)(0x79e2c4 + (0x##o<<24)))
+#define XV_f20f38(o) ((uint32_t)(0x7be2c4 + (0x##o<<24)))
+#define XV_f20f3a(o) ((uint32_t)(0x7be3c4 + (0x##o<<24)))
+#define XV_f30f38(o) ((uint32_t)(0x7ae2c4 + (0x##o<<24)))
+
 /* This list of x86 opcodes is not intended to be complete. Opcodes are only
 ** included when needed. Take a look at DynASM or jit.dis_x86 to see the
 ** whole mess.
@@ -231,6 +236,12 @@ typedef enum {
   XI_FSCALE = 0xfdd9,
   XI_FYL2X = 0xf1d9,
 
+  /* VEX-encoded instructions. XV_* prefix. */
+  XV_RORX = XV_f20f3a(f0),
+  XV_SARX = XV_f30f38(f7),
+  XV_SHLX = XV_660f38(f7),
+  XV_SHRX = XV_f20f38(f7),
+
   /* Variable-length opcodes. XO_* prefix. */
   XO_MOV = XO_(8b),
   XO_MOVto = XO_(89),