mirror of
https://github.com/LuaJIT/LuaJIT.git
synced 2025-02-07 15:14:08 +00:00
Disable FMA by default. Use -Ofma or jit.opt.start("+fma") to enable.
See the discussion in #918 for the rationale.
This commit is contained in:
parent
7d5d4a1b1a
commit
de2e1ca9d3
@ -220,6 +220,12 @@ mix the three forms, but note that setting an optimization level
|
|||||||
overrides all earlier flags.
|
overrides all earlier flags.
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
|
Note that <tt>-Ofma</tt> is not enabled by default at any level,
|
||||||
|
because it affects floating-point result accuracy. Only enable this
|
||||||
|
if you fully understand the trade-offs of FMA for performance (higher),
|
||||||
|
determinism (lower) and numerical accuracy (higher).
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
Here are the available flags and at what optimization levels they
|
Here are the available flags and at what optimization levels they
|
||||||
are enabled:
|
are enabled:
|
||||||
</p>
|
</p>
|
||||||
@ -251,6 +257,8 @@ are enabled:
|
|||||||
<td class="flag_name">sink</td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_level">•</td><td class="flag_desc">Allocation/Store Sinking</td></tr>
|
<td class="flag_name">sink</td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_level">•</td><td class="flag_desc">Allocation/Store Sinking</td></tr>
|
||||||
<tr class="even">
|
<tr class="even">
|
||||||
<td class="flag_name">fuse</td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_level">•</td><td class="flag_desc">Fusion of operands into instructions</td></tr>
|
<td class="flag_name">fuse</td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_level">•</td><td class="flag_desc">Fusion of operands into instructions</td></tr>
|
||||||
|
<tr class="odd">
|
||||||
|
<td class="flag_name">fma </td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_desc">Fused multiply-add</td></tr>
|
||||||
</table>
|
</table>
|
||||||
<p>
|
<p>
|
||||||
Here are the parameters and their default settings:
|
Here are the parameters and their default settings:
|
||||||
|
@ -313,7 +313,11 @@ static void asm_fusexref(ASMState *as, ARMIns ai, Reg rd, IRRef ref,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if !LJ_SOFTFP
|
#if !LJ_SOFTFP
|
||||||
/* Fuse to multiply-add/sub instruction. */
|
/*
|
||||||
|
** Fuse to multiply-add/sub instruction.
|
||||||
|
** VMLA rounds twice (UMA, not FMA) -- no need to check for JIT_F_OPT_FMA.
|
||||||
|
** VFMA needs VFPv4, which is uncommon on the remaining ARM32 targets.
|
||||||
|
*/
|
||||||
static int asm_fusemadd(ASMState *as, IRIns *ir, ARMIns ai, ARMIns air)
|
static int asm_fusemadd(ASMState *as, IRIns *ir, ARMIns ai, ARMIns air)
|
||||||
{
|
{
|
||||||
IRRef lref = ir->op1, rref = ir->op2;
|
IRRef lref = ir->op1, rref = ir->op2;
|
||||||
|
@ -337,7 +337,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air)
|
|||||||
{
|
{
|
||||||
IRRef lref = ir->op1, rref = ir->op2;
|
IRRef lref = ir->op1, rref = ir->op2;
|
||||||
IRIns *irm;
|
IRIns *irm;
|
||||||
if (lref != rref &&
|
if ((as->flags & JIT_F_OPT_FMA) &&
|
||||||
|
lref != rref &&
|
||||||
((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
|
((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
|
||||||
ra_noreg(irm->r)) ||
|
ra_noreg(irm->r)) ||
|
||||||
(mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
|
(mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
|
||||||
|
@ -235,7 +235,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pir)
|
|||||||
{
|
{
|
||||||
IRRef lref = ir->op1, rref = ir->op2;
|
IRRef lref = ir->op1, rref = ir->op2;
|
||||||
IRIns *irm;
|
IRIns *irm;
|
||||||
if (lref != rref &&
|
if ((as->flags & JIT_F_OPT_FMA) &&
|
||||||
|
lref != rref &&
|
||||||
((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
|
((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
|
||||||
ra_noreg(irm->r)) ||
|
ra_noreg(irm->r)) ||
|
||||||
(mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
|
(mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
|
||||||
|
@ -87,10 +87,11 @@
|
|||||||
#define JIT_F_OPT_ABC (JIT_F_OPT << 7)
|
#define JIT_F_OPT_ABC (JIT_F_OPT << 7)
|
||||||
#define JIT_F_OPT_SINK (JIT_F_OPT << 8)
|
#define JIT_F_OPT_SINK (JIT_F_OPT << 8)
|
||||||
#define JIT_F_OPT_FUSE (JIT_F_OPT << 9)
|
#define JIT_F_OPT_FUSE (JIT_F_OPT << 9)
|
||||||
|
#define JIT_F_OPT_FMA (JIT_F_OPT << 10)
|
||||||
|
|
||||||
/* Optimization names for -O. Must match the order above. */
|
/* Optimization names for -O. Must match the order above. */
|
||||||
#define JIT_F_OPTSTRING \
|
#define JIT_F_OPTSTRING \
|
||||||
"\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse"
|
"\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse\3fma"
|
||||||
|
|
||||||
/* Optimization levels set a fixed combination of flags. */
|
/* Optimization levels set a fixed combination of flags. */
|
||||||
#define JIT_F_OPT_0 0
|
#define JIT_F_OPT_0 0
|
||||||
@ -99,6 +100,7 @@
|
|||||||
#define JIT_F_OPT_3 (JIT_F_OPT_2|\
|
#define JIT_F_OPT_3 (JIT_F_OPT_2|\
|
||||||
JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE)
|
JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE)
|
||||||
#define JIT_F_OPT_DEFAULT JIT_F_OPT_3
|
#define JIT_F_OPT_DEFAULT JIT_F_OPT_3
|
||||||
|
/* Note: FMA is not set by default. */
|
||||||
|
|
||||||
/* -- JIT engine parameters ----------------------------------------------- */
|
/* -- JIT engine parameters ----------------------------------------------- */
|
||||||
|
|
||||||
|
@ -36,6 +36,17 @@ LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); }
|
|||||||
|
|
||||||
/* -- Helper functions ---------------------------------------------------- */
|
/* -- Helper functions ---------------------------------------------------- */
|
||||||
|
|
||||||
|
/* Required to prevent the C compiler from applying FMA optimizations.
|
||||||
|
**
|
||||||
|
** Yes, there's -ffp-contract and the FP_CONTRACT pragma ... in theory.
|
||||||
|
** But the current state of C compilers is a mess in this regard.
|
||||||
|
** Also, this function is not performance sensitive at all.
|
||||||
|
*/
|
||||||
|
LJ_NOINLINE static double lj_vm_floormul(double x, double y)
|
||||||
|
{
|
||||||
|
return lj_vm_floor(x / y) * y;
|
||||||
|
}
|
||||||
|
|
||||||
double lj_vm_foldarith(double x, double y, int op)
|
double lj_vm_foldarith(double x, double y, int op)
|
||||||
{
|
{
|
||||||
switch (op) {
|
switch (op) {
|
||||||
@ -43,7 +54,7 @@ double lj_vm_foldarith(double x, double y, int op)
|
|||||||
case IR_SUB - IR_ADD: return x-y; break;
|
case IR_SUB - IR_ADD: return x-y; break;
|
||||||
case IR_MUL - IR_ADD: return x*y; break;
|
case IR_MUL - IR_ADD: return x*y; break;
|
||||||
case IR_DIV - IR_ADD: return x/y; break;
|
case IR_DIV - IR_ADD: return x/y; break;
|
||||||
case IR_MOD - IR_ADD: return x-lj_vm_floor(x/y)*y; break;
|
case IR_MOD - IR_ADD: return x-lj_vm_floormul(x, y); break;
|
||||||
case IR_POW - IR_ADD: return pow(x, y); break;
|
case IR_POW - IR_ADD: return pow(x, y); break;
|
||||||
case IR_NEG - IR_ADD: return -x; break;
|
case IR_NEG - IR_ADD: return -x; break;
|
||||||
case IR_ABS - IR_ADD: return fabs(x); break;
|
case IR_ABS - IR_ADD: return fabs(x); break;
|
||||||
|
@ -2636,7 +2636,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|
|||||||
|.macro ins_arithmod, res, reg1, reg2
|
|.macro ins_arithmod, res, reg1, reg2
|
||||||
| fdiv d2, reg1, reg2
|
| fdiv d2, reg1, reg2
|
||||||
| frintm d2, d2
|
| frintm d2, d2
|
||||||
| fmsub res, d2, reg2, reg1
|
| // Cannot use fmsub, because FMA is not enabled by default.
|
||||||
|
| fmul d2, d2, reg2
|
||||||
|
| fsub res, reg1, d2
|
||||||
|.endmacro
|
|.endmacro
|
||||||
|
|
|
|
||||||
|.macro ins_arithdn, intins, fpins
|
|.macro ins_arithdn, intins, fpins
|
||||||
|
Loading…
Reference in New Issue
Block a user