From cf903edb30e0cbd620ebd4bac02d4e2b4410fd02 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 29 Aug 2023 02:12:13 +0200 Subject: [PATCH] FFI: Unify stack setup for C calls in interpreter. --- src/lj_ccall.c | 57 +++++++++++++++++++++++++--------------------- src/lj_ccall.h | 7 +++--- src/vm_arm.dasc | 8 +++---- src/vm_arm64.dasc | 8 +++---- src/vm_mips.dasc | 1 - src/vm_mips64.dasc | 1 - src/vm_ppc.dasc | 3 +-- src/vm_x64.dasc | 8 +++---- src/vm_x86.dasc | 24 +++++++++++-------- 9 files changed, 63 insertions(+), 54 deletions(-) diff --git a/src/lj_ccall.c b/src/lj_ccall.c index 04e306eb..9001cb5a 100644 --- a/src/lj_ccall.c +++ b/src/lj_ccall.c @@ -20,12 +20,15 @@ #if LJ_TARGET_X86 /* -- x86 calling conventions --------------------------------------------- */ +#define CCALL_PUSH(arg) \ + *(GPRArg *)((uint8_t *)cc->stack + nsp) = (GPRArg)(arg), nsp += CTSIZE_PTR + #if LJ_ABI_WIN #define CCALL_HANDLE_STRUCTRET \ /* Return structs bigger than 8 by reference (on stack only). */ \ cc->retref = (sz > 8); \ - if (cc->retref) cc->stack[nsp++] = (GPRArg)dp; + if (cc->retref) CCALL_PUSH(dp); #define CCALL_HANDLE_COMPLEXRET CCALL_HANDLE_STRUCTRET @@ -40,7 +43,7 @@ if (ngpr < maxgpr) \ cc->gpr[ngpr++] = (GPRArg)dp; \ else \ - cc->stack[nsp++] = (GPRArg)dp; \ + CCALL_PUSH(dp); \ } else { /* Struct with single FP field ends up in FPR. */ \ cc->resx87 = ccall_classify_struct(cts, ctr); \ } @@ -56,7 +59,7 @@ if (ngpr < maxgpr) \ cc->gpr[ngpr++] = (GPRArg)dp; \ else \ - cc->stack[nsp++] = (GPRArg)dp; + CCALL_PUSH(dp); #endif @@ -67,7 +70,7 @@ if (ngpr < maxgpr) \ cc->gpr[ngpr++] = (GPRArg)dp; \ else \ - cc->stack[nsp++] = (GPRArg)dp; \ + CCALL_PUSH(dp); \ } #endif @@ -278,8 +281,8 @@ if (ngpr < maxgpr) { \ dp = &cc->gpr[ngpr]; \ if (ngpr + n > maxgpr) { \ - nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \ - if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \ + nsp += (ngpr + n - maxgpr) * CTSIZE_PTR; /* Assumes contiguous gpr/stack fields. */ \ + if (nsp > CCALL_SIZE_STACK) goto err_nyi; /* Too many arguments. */ \ ngpr = maxgpr; \ } else { \ ngpr += n; \ @@ -471,8 +474,8 @@ if (ngpr < maxgpr) { \ dp = &cc->gpr[ngpr]; \ if (ngpr + n > maxgpr) { \ - nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \ - if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \ + nsp += (ngpr + n - maxgpr) * CTSIZE_PTR; /* Assumes contiguous gpr/stack fields. */ \ + if (nsp > CCALL_SIZE_STACK) goto err_nyi; /* Too many arguments. */ \ ngpr = maxgpr; \ } else { \ ngpr += n; \ @@ -565,8 +568,8 @@ if (ngpr < maxgpr) { \ dp = &cc->gpr[ngpr]; \ if (ngpr + n > maxgpr) { \ - nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \ - if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \ + nsp += (ngpr + n - maxgpr) * CTSIZE_PTR; /* Assumes contiguous gpr/stack fields. */ \ + if (nsp > CCALL_SIZE_STACK) goto err_nyi; /* Too many arguments. */ \ ngpr = maxgpr; \ } else { \ ngpr += n; \ @@ -698,10 +701,11 @@ static int ccall_struct_arg(CCallState *cc, CTState *cts, CType *d, int *rcl, lj_cconv_ct_tv(cts, d, (uint8_t *)dp, o, CCF_ARG(narg)); if (ccall_struct_reg(cc, cts, dp, rcl)) { /* Register overflow? Pass on stack. */ - MSize nsp = cc->nsp, n = rcl[1] ? 2 : 1; - if (nsp + n > CCALL_MAXSTACK) return 1; /* Too many arguments. */ - cc->nsp = nsp + n; - memcpy(&cc->stack[nsp], dp, n*CTSIZE_PTR); + MSize nsp = cc->nsp, sz = rcl[1] ? 2*CTSIZE_PTR : CTSIZE_PTR; + if (nsp + sz > CCALL_SIZE_STACK) + return 1; /* Too many arguments. */ + cc->nsp = nsp + sz; + memcpy((uint8_t *)cc->stack + nsp, dp, sz); } return 0; /* Ok. */ } @@ -1022,22 +1026,23 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, } else { sz = CTSIZE_PTR; } - sz = (sz + CTSIZE_PTR-1) & ~(CTSIZE_PTR-1); - n = sz / CTSIZE_PTR; /* Number of GPRs or stack slots needed. */ + n = (sz + CTSIZE_PTR-1) / CTSIZE_PTR; /* Number of GPRs or stack slots needed. */ CCALL_HANDLE_REGARG /* Handle register arguments. */ /* Otherwise pass argument on stack. */ - if (CCALL_ALIGN_STACKARG && !rp && (d->info & CTF_ALIGN) > CTALIGN_PTR) { - MSize align = (1u << ctype_align(d->info-CTALIGN_PTR)) -1; - nsp = (nsp + align) & ~align; /* Align argument on stack. */ + if (CCALL_ALIGN_STACKARG) { /* Align argument on stack. */ + MSize align = (1u << ctype_align(d->info)) - 1; + if (rp) + align = CTSIZE_PTR-1; + nsp = (nsp + align) & ~align; } - if (nsp + n > CCALL_MAXSTACK) { /* Too many arguments. */ + dp = ((uint8_t *)cc->stack) + nsp; + nsp += n * CTSIZE_PTR; + if (nsp > CCALL_SIZE_STACK) { /* Too many arguments. */ err_nyi: lj_err_caller(L, LJ_ERR_FFI_NYICALL); } - dp = &cc->stack[nsp]; - nsp += n; isva = 0; done: @@ -1099,10 +1104,10 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, #if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP) cc->nfpr = nfpr; /* Required for vararg functions. */ #endif - cc->nsp = nsp; - cc->spadj = (CCALL_SPS_FREE + CCALL_SPS_EXTRA)*CTSIZE_PTR; - if (nsp > CCALL_SPS_FREE) - cc->spadj += (((nsp-CCALL_SPS_FREE)*CTSIZE_PTR + 15u) & ~15u); + cc->nsp = (nsp + CTSIZE_PTR-1) & ~(CTSIZE_PTR-1); + cc->spadj = (CCALL_SPS_FREE + CCALL_SPS_EXTRA) * CTSIZE_PTR; + if (cc->nsp > CCALL_SPS_FREE * CTSIZE_PTR) + cc->spadj += (((cc->nsp - CCALL_SPS_FREE * CTSIZE_PTR) + 15u) & ~15u); return gcsteps; } diff --git a/src/lj_ccall.h b/src/lj_ccall.h index 547415f7..57300817 100644 --- a/src/lj_ccall.h +++ b/src/lj_ccall.h @@ -152,14 +152,15 @@ typedef union FPRArg { LJ_STATIC_ASSERT(CCALL_NUM_GPR <= CCALL_MAX_GPR); LJ_STATIC_ASSERT(CCALL_NUM_FPR <= CCALL_MAX_FPR); -#define CCALL_MAXSTACK 32 +#define CCALL_NUM_STACK 31 +#define CCALL_SIZE_STACK (CCALL_NUM_STACK * CTSIZE_PTR) /* -- C call state -------------------------------------------------------- */ typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE) struct CCallState { void (*func)(void); /* Pointer to called function. */ uint32_t spadj; /* Stack pointer adjustment. */ - uint8_t nsp; /* Number of stack slots. */ + uint8_t nsp; /* Number of bytes on stack. */ uint8_t retref; /* Return value by reference. */ #if LJ_TARGET_X64 uint8_t ngpr; /* Number of arguments in GPRs. */ @@ -178,7 +179,7 @@ typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE) struct CCallState { FPRArg fpr[CCALL_NUM_FPR]; /* Arguments/results in FPRs. */ #endif GPRArg gpr[CCALL_NUM_GPR]; /* Arguments/results in GPRs. */ - GPRArg stack[CCALL_MAXSTACK]; /* Stack slots. */ + GPRArg stack[CCALL_NUM_STACK]; /* Stack slots. */ } CCallState; /* -- C call handling ----------------------------------------------------- */ diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc index 4f0798e0..0d1ea95f 100644 --- a/src/vm_arm.dasc +++ b/src/vm_arm.dasc @@ -2571,16 +2571,16 @@ static void build_subroutines(BuildCtx *ctx) |.endif | mov r11, sp | sub sp, sp, CARG1 // Readjust stack. - | subs CARG2, CARG2, #1 + | subs CARG2, CARG2, #4 |.if HFABI | vldm RB, {d0-d7} |.endif | ldr RB, CCSTATE->func | bmi >2 |1: // Copy stack slots. - | ldr CARG4, [CARG3, CARG2, lsl #2] - | str CARG4, [sp, CARG2, lsl #2] - | subs CARG2, CARG2, #1 + | ldr CARG4, [CARG3, CARG2] + | str CARG4, [sp, CARG2] + | subs CARG2, CARG2, #4 | bpl <1 |2: | ldrd CARG12, CCSTATE->gpr[0] diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index a7a9392c..698b4210 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -2222,14 +2222,14 @@ static void build_subroutines(BuildCtx *ctx) | ldr TMP0w, CCSTATE:x0->spadj | ldrb TMP1w, CCSTATE->nsp | add TMP2, CCSTATE, #offsetof(CCallState, stack) - | subs TMP1, TMP1, #1 + | subs TMP1, TMP1, #8 | ldr TMP3, CCSTATE->func | sub sp, sp, TMP0 | bmi >2 |1: // Copy stack slots - | ldr TMP0, [TMP2, TMP1, lsl #3] - | str TMP0, [sp, TMP1, lsl #3] - | subs TMP1, TMP1, #1 + | ldr TMP0, [TMP2, TMP1] + | str TMP0, [sp, TMP1] + | subs TMP1, TMP1, #8 | bpl <1 |2: | ldp x0, x1, CCSTATE->gpr[0] diff --git a/src/vm_mips.dasc b/src/vm_mips.dasc index 94a878b9..f276745c 100644 --- a/src/vm_mips.dasc +++ b/src/vm_mips.dasc @@ -2951,7 +2951,6 @@ static void build_subroutines(BuildCtx *ctx) | move TMP2, sp | subu sp, sp, TMP1 | sw ra, -4(TMP2) - | sll CARG2, CARG2, 2 | sw r16, -8(TMP2) | sw CCSTATE, -12(TMP2) | move r16, TMP2 diff --git a/src/vm_mips64.dasc b/src/vm_mips64.dasc index f8e181ee..6c215f2b 100644 --- a/src/vm_mips64.dasc +++ b/src/vm_mips64.dasc @@ -3065,7 +3065,6 @@ static void build_subroutines(BuildCtx *ctx) | move TMP2, sp | dsubu sp, sp, TMP1 | sd ra, -8(TMP2) - | sll CARG2, CARG2, 3 | sd r16, -16(TMP2) | sd CCSTATE, -24(TMP2) | move r16, TMP2 diff --git a/src/vm_ppc.dasc b/src/vm_ppc.dasc index 73a70a00..f2e5a08f 100644 --- a/src/vm_ppc.dasc +++ b/src/vm_ppc.dasc @@ -3269,14 +3269,13 @@ static void build_subroutines(BuildCtx *ctx) | stw TMP0, 4(sp) | cmpwi cr1, CARG3, 0 | mr TMP2, sp - | addic. CARG2, CARG2, -1 + | addic. CARG2, CARG2, -4 | stwux sp, sp, TMP1 | crnot 4*cr1+eq, 4*cr1+eq // For vararg calls. | stw r14, -4(TMP2) | stw CCSTATE, -8(TMP2) | mr r14, TMP2 | la TMP1, CCSTATE->stack - | slwi CARG2, CARG2, 2 | blty >2 | la TMP2, 8(sp) |1: diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc index a8649b4e..3635ba28 100644 --- a/src/vm_x64.dasc +++ b/src/vm_x64.dasc @@ -2755,12 +2755,12 @@ static void build_subroutines(BuildCtx *ctx) | | // Copy stack slots. | movzx ecx, byte CCSTATE->nsp - | sub ecx, 1 + | sub ecx, 8 | js >2 |1: - | mov rax, [CCSTATE+rcx*8+offsetof(CCallState, stack)] - | mov [rsp+rcx*8+CCALL_SPS_EXTRA*8], rax - | sub ecx, 1 + | mov rax, [CCSTATE+rcx+offsetof(CCallState, stack)] + | mov [rsp+rcx+CCALL_SPS_EXTRA*8], rax + | sub ecx, 8 | jns <1 |2: | diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc index bda9d7d7..c44a24ff 100644 --- a/src/vm_x86.dasc +++ b/src/vm_x86.dasc @@ -3314,19 +3314,25 @@ static void build_subroutines(BuildCtx *ctx) | | // Copy stack slots. | movzx ecx, byte CCSTATE->nsp - | sub ecx, 1 + |.if X64 + | sub ecx, 8 | js >2 |1: - |.if X64 - | mov rax, [CCSTATE+rcx*8+offsetof(CCallState, stack)] - | mov [rsp+rcx*8+CCALL_SPS_EXTRA*8], rax - |.else - | mov eax, [CCSTATE+ecx*4+offsetof(CCallState, stack)] - | mov [esp+ecx*4], eax - |.endif - | sub ecx, 1 + | mov rax, [CCSTATE+rcx+offsetof(CCallState, stack)] + | mov [rsp+rcx+CCALL_SPS_EXTRA*8], rax + | sub ecx, 8 | jns <1 |2: + |.else + | sub ecx, 4 + | js >2 + |1: + | mov eax, [CCSTATE+ecx+offsetof(CCallState, stack)] + | mov [esp+ecx], eax + | sub ecx, 4 + | jns <1 + |2: + |.endif | |.if X64 | movzx eax, byte CCSTATE->nfpr