diff --git a/src/vm_s390x.dasc b/src/vm_s390x.dasc
index f5471117..f6f1adb1 100644
--- a/src/vm_s390x.dasc
+++ b/src/vm_s390x.dasc
@@ -1,4 +1,4 @@
-|// Low-level VM code for IBM z/Architecture (s390x) CPUs.
+|// Low-level VM code for IBM z/Architecture (s390x) CPUs in LJ_GC64 mode.
 |// Bytecode interpreter, fast functions and helper functions.
 |// Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h
 |
@@ -32,7 +32,7 @@
 |.define BASE,			r7	// Base of current Lua stack frame.
 |.define KBASE,			r8	// Constants of current Lua function.
 |.define PC,			r9	// Next PC.
-|.define GLREG,			r10	// Global state.
+|.define DISPATCH,		r10	// Opcode dispatch table (ins_NEXT indexes it by OP*8).
 |.define LREG,			r11	// Register holding lua_State (also in SAVE_L).
 |
 |// The following temporaries are not saved across C calls, except for RD.
@@ -56,6 +56,8 @@
 |.define CRET1,			r2
 |
 |.define SP,			r15
+|.define OP,			r2	// Opcode decoded by ins_NEXT (same register as CRET1).
+|.define TMP1,			r3	// Scratch; ins_NEXT uses it for the dispatch table offset.
 |
 |// Stack layout while in interpreter. Must match with lj_frame.h.
 |.define CFRAME_SPACE,	240	// Delta for SP, 8 byte aligned.
@@ -134,14 +136,33 @@
 |.macro ins_A; .endmacro
 |.macro ins_AD; .endmacro
 |.macro ins_AJ; .endmacro
-|.macro ins_ABC;  .endmacro
-|.macro ins_AB_;  .endmacro
+|.macro ins_ABC; .endmacro
+|.macro ins_AB_; .endmacro
 |.macro ins_A_C; .endmacro
 |.macro ins_AND; .endmacro
 |
-|// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
+|// Instruction decode+dispatch.
+|  // TODO: tune this, right now RA, RB, RC and RD are always decoded, even
+|  // when the next instruction does not use them.
 |.macro ins_NEXT
-
+|  l RD, (PC)			// RD = 32 bit instruction word at PC.
+|  // 32                 63
+|  // [  B |  C |  A | OP ]
+|  // [    D    |  A | OP ]
+|  llhr RA, RD
+|  srl RA, #8			// RA = A.
+|  llcr OP, RD			// OP = opcode (lowest byte).
+|  srl RD, #16			// RD = D.
+|  lr RB, RD
+|  srl RB, #8			// RB = B.
+|  llcr RC, RD			// RC = C.
+|  la PC, 4(PC)			// Advance PC past the decoded instruction.
+|  llgfr TMP1, OP
+|  sll TMP1, #3 // TMP1=OP*8
+|  // DISPATCH holds 8 byte handler addresses: load the entry, then branch to
+|  // it. Branching to 0(TMP1, DISPATCH) itself would jump into the table.
+|  lg TMP1, 0(TMP1, DISPATCH)
+|  br TMP1
 |.endmacro
 |
 |// Instruction footer.
@@ -151,8 +168,6 @@
 |  .define ins_next_, ins_NEXT
 |.else
 |  // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch.
-|  // Affects only certain kinds of benchmarks (and only with -j off).
-|  // Around 10%-30% slower on Core2, a lot more slower on P4.
 |  .macro ins_next
 |    jmp ->ins_next
 |  .endmacro