Use execute rather than loop for mvc and avoid jumps in fast path.

Not sure whether this works: the tests don't exercise the stack-copy code.
2025-02-08 15:34:09 +00:00 · 2017-01-10 14:12:06 -05:00 · 2017-01-10 14:12:06 -05:00 · e933353feb
commit e933353feb
parent 660ddd1db2
2 changed files with 31 additions and 29 deletions
--- a/src/Makefile
+++ b/src/Makefile
@ -56,7 +56,7 @@ CCOPT_mips=
 #
 CCDEBUG=
 # Uncomment the next line to generate debug information:
-#CCDEBUG= -g
+CCDEBUG= -g
 #
 CCWARN= -Wall
 # Uncomment the next line to enable more warnings:
--- a/src/vm_s390x.dasc
+++ b/src/vm_s390x.dasc
@ -2142,47 +2142,26 @@ static void build_subroutines(BuildCtx *ctx)
  |->vm_ffi_call:			// Call C function via FFI.
  |  // Caveat: needs special frame unwinding, see below.
  |.if FFI
-  |  .type CCSTATE, CCallState, r10
-  |  stmg r6, r15, 48(sp) // TODO: need to save r6, but might be better in separate store?
+  |  .type CCSTATE, CCallState, r8
+  |  stmg r6, r15, 48(sp)
  |  lgr CCSTATE, CARG1
+  |  lg r7, CCSTATE->func // TODO: move further up?
  |
  |  // Readjust stack.
  |  sgf sp, CCSTATE->spadj
  |
  |  // Copy stack slots.
-  |  llgc r0, CCSTATE->nsp
-  |  cghi r0, 0
-  |  jle >3
-  |  lay r1, (offsetof(CCallState, stack))(CCSTATE)	// Source.
-  |  lay r11, (CCALL_SPS_EXTRA*8)(sp)			// Destination.
+  |  llgc r1, CCSTATE->nsp
+  |  chi r1, 0
+  |  jh >2
  |1:
-  |  cghi r0, 256
-  |  jl >2
-  |  mvc 0(256, r11), 0(r1)
-  |  aghi r1, 256*8
-  |  aghi r11, 256*8
-  |  aghi r0, -256
-  |  j <1
-  |2:
-  |  cghi r0, 0
-  |  je >3
-  |  // TODO: exrl mvc rather than loop.
-  |  mvc 0(8, r11), 0(r1)
-  |  aghi r1, 8
-  |  aghi r11, 8
-  |  aghi r0, -1
-  |  j <2
-  |3:
-  |
  |  lmg CARG1, CARG5, CCSTATE->gpr[0]
  |  // TODO: conditionally load FPRs?
  |  ld FARG1, CCSTATE->fpr[0]
  |  ld FARG2, CCSTATE->fpr[1]
  |  ld FARG3, CCSTATE->fpr[2]
  |  ld FARG4, CCSTATE->fpr[3]
-  |5:
-  |  lg r1, CCSTATE->func // TODO: move further up?
-  |  basr r14, r1
+  |  basr r14, r7
  |
  |  stg CRET1, CCSTATE->gpr[0]
  |  stg f0, CCSTATE->fpr[0]
@ -2190,6 +2169,29 @@ static void build_subroutines(BuildCtx *ctx)
  |  agf sp, CCSTATE->spadj
  |  lmg r6, r15, 48(sp)
  |  br r14
+  |
+  |2:
+  |  lay r10, (offsetof(CCallState, stack))(CCSTATE)	// Source.
+  |  lay r11, (CCALL_SPS_EXTRA*8)(sp)			// Destination.
+  |3:
+  |  chi r1, 256
+  |  jl >4
+  |  mvc 0(256, r11), 0(r10)
+  |  la r10, 256*8(r10)
+  |  la r11, 256*8(r11)
+  |  ahi r1, -256
+  |  j <3
+  |
+  |4:
+  |  ahi r1, -1
+  |  jl <1
+  |  larl r9, >5
+  |  ex r1, 0(r9) // TODO: exrl is faster but needs z10.
+  |  j <1
+  |
+  |5:
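+  |  // Target of the ex above (ex, not exrl). NOTE(review): length comes from r1 = nsp-1, so this seems to copy nsp bytes, while the removed loop copied 8 bytes per slot -- confirm.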
+  |  mvc 0(1, r11), 0(r10)
  |.endif
  |// Note: vm_ffi_call must be the last function in this object file!
  |