diff --git a/src/lj_asm.c b/src/lj_asm.c index 5d48b6be..ce42c437 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -2482,6 +2482,31 @@ static void asm_intarith(ASMState *as, IRIns *ir, x86Arith xa) ra_left(as, dest, lref); } +static void asm_intmul(ASMState *as, IRIns *ir) +{ + IRRef lref = ir->op1; + IRRef rref = ir->op2; + int32_t k = 0; + if (asm_isk32(as, rref, &k)) { + /* NYI: use lea/shl/add/sub (FOLD only does 2^k) depending on CPU. */ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = asm_fuseload(as, lref, RSET_GPR); + x86Op xo; + if (checki8(k)) { + emit_i8(as, k); + xo = XO_IMULi8; + } else { + emit_i32(as, k); + xo = XO_IMULi; + } + emit_rr(as, xo, REX_64IR(ir, dest), left); + } else { + /* NYI: integer multiply of non-constant operands. */ + setintV(&as->J->errinfo, ir->o); + lj_trace_err_info(as->J, LJ_TRERR_NYIIR); + } +} + /* LEA is really a 4-operand ADD with an independent destination register, ** up to two source registers and an immediate. One register can be scaled ** by 1, 2, 4 or 8. This can be used to avoid moves or to fuse several @@ -3445,7 +3470,12 @@ static void asm_ir(ASMState *as, IRIns *ir) else /* Note: no need for LEA trick here. i-k is encoded as i+(-k). */ asm_intarith(as, ir, XOg_SUB); break; - case IR_MUL: asm_fparith(as, ir, XO_MULSD); break; + case IR_MUL: + if (irt_isnum(ir->t)) + asm_fparith(as, ir, XO_MULSD); + else + asm_intmul(as, ir); + break; case IR_DIV: asm_fparith(as, ir, XO_DIVSD); break; case IR_NEG: asm_fparith(as, ir, XO_XORPS); break; diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 22d211e1..cae4e5e4 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -197,6 +197,7 @@ static int32_t kfold_intop(int32_t k1, int32_t k2, IROp op) switch (op) { case IR_ADD: k1 += k2; break; case IR_SUB: k1 -= k2; break; + case IR_MUL: k1 *= k2; break; case IR_BAND: k1 &= k2; break; case IR_BOR: k1 |= k2; break; case IR_BXOR: k1 ^= k2; break; @@ -212,6 +213,7 @@ static int32_t kfold_intop(int32_t k1, int32_t k2, IROp op) LJFOLD(ADD KINT KINT) LJFOLD(SUB KINT KINT) +LJFOLD(MUL KINT KINT) LJFOLD(BAND KINT KINT) LJFOLD(BOR KINT KINT) LJFOLD(BXOR KINT KINT) @@ -680,6 +682,43 @@ LJFOLDF(simplify_intsub_k64) return RETRYFOLD; } +static TRef simplify_intmul_k(jit_State *J, int32_t k) +{ + /* Note: many more simplifications are possible, e.g. 2^k1 +- 2^k2. + ** But this is mainly intended for simple address arithmetic. + ** Also it's easier for the backend to optimize the original multiplies. + */ + if (k == 1) { /* i * 1 ==> i */ + return LEFTFOLD; + } else if ((k & (k-1)) == 0) { /* i * 2^k ==> i << k */ + fins->o = IR_BSHL; + fins->op2 = lj_ir_kint(J, lj_fls((uint32_t)k)); + return RETRYFOLD; + } + return NEXTFOLD; +} + +LJFOLD(MUL any KINT) +LJFOLDF(simplify_intmul_k32) +{ + if (fright->i == 0) /* i * 0 ==> 0 */ + return INTFOLD(0); + else if (fright->i > 0) + return simplify_intmul_k(J, fright->i); + return NEXTFOLD; +} + +LJFOLD(MUL any KINT64) +LJFOLDF(simplify_intmul_k64) + +{ + if (ir_kint64(fright)->u64 == 0) /* i * 0 ==> 0 */ + return lj_ir_kint64(J, 0); + else if (ir_kint64(fright)->u64 < 0x80000000u) + return simplify_intmul_k(J, (int32_t)ir_kint64(fright)->u64); + return NEXTFOLD; +} + LJFOLD(SUB any any) LJFOLD(SUBOV any any) LJFOLDF(simplify_intsub) @@ -816,16 +855,17 @@ LJFOLD(BROL any KINT) LJFOLD(BROR any KINT) LJFOLDF(simplify_shift_ik) { - int32_t k = (fright->i & 31); + int32_t mask = irt_is64(fins->t) ? 63 : 31; + int32_t k = (fright->i & mask); if (k == 0) /* i o 0 ==> i */ return LEFTFOLD; - if (k != fright->i) { /* i o k ==> i o (k & 31) */ + if (k != fright->i) { /* i o k ==> i o (k & mask) */ fins->op2 = (IRRef1)lj_ir_kint(J, k); return RETRYFOLD; } - if (fins->o == IR_BROR) { /* bror(i, k) ==> brol(i, (-k)&31) */ + if (fins->o == IR_BROR) { /* bror(i, k) ==> brol(i, (-k)&mask) */ fins->o = IR_BROL; - fins->op2 = (IRRef1)lj_ir_kint(J, (-k)&31); + fins->op2 = (IRRef1)lj_ir_kint(J, (-k)&mask); return RETRYFOLD; } return NEXTFOLD; @@ -841,9 +881,10 @@ LJFOLDF(simplify_shift_andk) IRIns *irk = IR(fright->op2); PHIBARRIER(fright); if ((fins->o < IR_BROL ? LJ_TARGET_MASKSHIFT : LJ_TARGET_MASKROT) && - irk->o == IR_KINT) { /* i o (j & 31) ==> i o j */ - int32_t k = irk->i & 31; - if (k == 31) { + irk->o == IR_KINT) { /* i o (j & mask) ==> i o j */ + int32_t mask = irt_is64(fins->t) ? 63 : 31; + int32_t k = irk->i & mask; + if (k == mask) { fins->op2 = fright->op1; return RETRYFOLD; } @@ -870,9 +911,29 @@ LJFOLDF(simplify_shift2_ki) return NEXTFOLD; } +LJFOLD(BSHL KINT64 any) +LJFOLD(BSHR KINT64 any) +LJFOLDF(simplify_shift1_ki64) +{ + if (ir_kint64(fleft)->u64 == 0) /* 0 o i ==> 0 */ + return LEFTFOLD; + return NEXTFOLD; +} + +LJFOLD(BSAR KINT64 any) +LJFOLD(BROL KINT64 any) +LJFOLD(BROR KINT64 any) +LJFOLDF(simplify_shift2_ki64) +{ + if (ir_kint64(fleft)->u64 == 0 || (int64_t)ir_kint64(fleft)->u64 == -1) + return LEFTFOLD; /* 0 o i ==> 0; -1 o i ==> -1 */ + return NEXTFOLD; +} + /* -- Reassociation ------------------------------------------------------- */ LJFOLD(ADD ADD KINT) +LJFOLD(MUL MUL KINT) LJFOLD(BAND BAND KINT) LJFOLD(BOR BOR KINT) LJFOLD(BXOR BXOR KINT) @@ -924,14 +985,15 @@ LJFOLDF(reassoc_shift) IRIns *irk = IR(fleft->op2); PHIBARRIER(fleft); /* The (shift any KINT) rule covers k2 == 0 and more. */ if (irk->o == IR_KINT) { /* (i o k1) o k2 ==> i o (k1 + k2) */ - int32_t k = (irk->i & 31) + (fright->i & 31); - if (k > 31) { /* Combined shift too wide? */ + int32_t mask = irt_is64(fins->t) ? 63 : 31; + int32_t k = (irk->i & mask) + (fright->i & mask); + if (k > mask) { /* Combined shift too wide? */ if (fins->o == IR_BSHL || fins->o == IR_BSHR) - return INTFOLD(0); + return mask == 31 ? INTFOLD(0) : lj_ir_kint64(J, 0); else if (fins->o == IR_BSAR) - k = 31; + k = mask; else - k &= 31; + k &= mask; } fins->op1 = fleft->op1; fins->op2 = (IRRef1)lj_ir_kint(J, k); diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h index 0fcee6dd..d3956a09 100644 --- a/src/lj_target_x86.h +++ b/src/lj_target_x86.h @@ -218,6 +218,7 @@ typedef enum { XO_SHIFTi = XO_(c1), XO_SHIFT1 = XO_(d1), XO_SHIFTcl = XO_(d3), + XO_IMULi = XO_(69), XO_IMULi8 = XO_(6b), XO_CMP = XO_(3b), XO_TEST = XO_(85),