diff --git a/src/vm_s390x.dasc b/src/vm_s390x.dasc
index f5471117..f6f1adb1 100644
--- a/src/vm_s390x.dasc
+++ b/src/vm_s390x.dasc
@@ -1,4 +1,4 @@
-|// Low-level VM code for IBM z/Architecture (s390x) CPUs.
+|// Low-level VM code for IBM z/Architecture (s390x) CPUs in LJ_GC64 mode.
 |// Bytecode interpreter, fast functions and helper functions.
 |// Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
 |
@@ -32,7 +32,7 @@
 |.define BASE, r7 // Base of current Lua stack frame.
 |.define KBASE, r8 // Constants of current Lua function.
 |.define PC, r9 // Next PC.
-|.define GLREG, r10 // Global state.
+|.define DISPATCH, r10 // Opcode dispatch table.
 |.define LREG, r11 // Register holding lua_State (also in SAVE_L).
 |
 |// The following temporaries are not saved across C calls, except for RD.
@@ -56,6 +56,8 @@
 |.define CRET1, r2
 |
 |.define SP, r15
+|.define OP, r2
+|.define TMP1, r3
 |
 |// Stack layout while in interpreter. Must match with lj_frame.h.
 |.define CFRAME_SPACE, 240 // Delta for SP, 8 byte aligned.
@@ -134,14 +136,31 @@
 |.macro ins_A; .endmacro
 |.macro ins_AD; .endmacro
 |.macro ins_AJ; .endmacro
-|.macro ins_ABC; .endmacro
-|.macro ins_AB_; .endmacro
+|.macro ins_ABC; .endmacro
+|.macro ins_AB_; .endmacro
 |.macro ins_A_C; .endmacro
 |.macro ins_AND; .endmacro
 |
-|// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
+|// Instruction decode+dispatch.
+|  // TODO: tune this, right now we always decode RA-D even if they aren't used.
 |.macro ins_NEXT
-
+|  l RD, (PC)
+|  // 32                          63
+|  // [  B   |  C   |  A   |  OP  ]
+|  // [      D      |  A   |  OP  ]
+|  llhr RA, RD
+|  srl RA, #8 // RA=A
+|  llcr OP, RD // OP=opcode
+|  srl RD, #16 // RD=D
+|  lr RB, RD
+|  srl RB, #8 // RB=B
+|  llcr RC, RD // RC=C
+|  la PC, 4(PC)
+|  llgfr TMP1, OP
+|  sll TMP1, #3 // TMP1=OP*8
+|  // A branch jumps to its operand address, so load the handler from the table first.
+|  lg TMP1, 0(TMP1, DISPATCH)
+|  br TMP1
 |.endmacro
 |
 |// Instruction footer.
@@ -151,8 +170,6 @@
 | .define ins_next_, ins_NEXT
 |.else
 | // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch.
-| // Affects only certain kinds of benchmarks (and only with -j off).
-| // Around 10%-30% slower on Core2, a lot more slower on P4.
 | .macro ins_next
 | jmp ->ins_next
 | .endmacro