From 096a7cf4e4fde11749b7bdd978355fabaa84d6bb Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 13 Apr 2016 16:10:03 +0200 Subject: [PATCH 01/57] x64/LJ_GC64: Fix BC_UCLO check for fast-path. Thanks to Vyacheslav Egorov. --- src/vm_x64.dasc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc index c7a7740b..759e30ec 100644 --- a/src/vm_x64.dasc +++ b/src/vm_x64.dasc @@ -3516,7 +3516,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | ins_AD // RA = level, RD = target | branchPC RD // Do this first to free RD. | mov L:RB, SAVE_L - | cmp dword L:RB->openupval, 0 + | cmp aword L:RB->openupval, 0 | je >1 | mov L:RB->base, BASE | lea CARG2, [BASE+RA*8] // Caveat: CARG2 == BASE From e5b5e079c364bb429a85f6c740c478e2dd820381 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 14 Apr 2016 00:14:42 +0200 Subject: [PATCH 02/57] MIPS: Fix BC_ISNEXT fallback path. Thanks to RT-RK.com. --- src/vm_mips.dasc | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/vm_mips.dasc b/src/vm_mips.dasc index b135fbe0..7fc1fb2c 100644 --- a/src/vm_mips.dasc +++ b/src/vm_mips.dasc @@ -3619,24 +3619,24 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_ISNEXT: | // RA = base*8, RD = target (points to ITERN) | addu RA, BASE, RA - | lw TMP0, -24+HI(RA) - | lw CFUNC:TMP1, -24+LO(RA) - | lw TMP2, -16+HI(RA) - | lw TMP3, -8+HI(RA) + | srl TMP0, RD, 1 + | lw CARG1, -24+HI(RA) + | lw CFUNC:CARG2, -24+LO(RA) + | addu TMP0, PC, TMP0 + | lw CARG3, -16+HI(RA) + | lw CARG4, -8+HI(RA) | li AT, LJ_TFUNC - | bne TMP0, AT, >5 - |. addiu TMP2, TMP2, -LJ_TTAB - | lbu TMP1, CFUNC:TMP1->ffid - | addiu TMP3, TMP3, -LJ_TNIL - | srl TMP0, RD, 1 - | or TMP2, TMP2, TMP3 - | addiu TMP1, TMP1, -FF_next_N - | addu TMP0, PC, TMP0 - | or TMP1, TMP1, TMP2 - | bnez TMP1, >5 - |. lui TMP2, (-(BCBIAS_J*4 >> 16) & 65535) + | bne CARG1, AT, >5 + |. lui TMP2, (-(BCBIAS_J*4 >> 16) & 65535) + | lbu CARG2, CFUNC:CARG2->ffid + | addiu CARG3, CARG3, -LJ_TTAB + | addiu CARG4, CARG4, -LJ_TNIL + | or CARG3, CARG3, CARG4 + | addiu CARG2, CARG2, -FF_next_N + | or CARG2, CARG2, CARG3 + | bnez CARG2, >5 + |. lui TMP1, 0xfffe | addu PC, TMP0, TMP2 - | lui TMP1, 0xfffe | ori TMP1, TMP1, 0x7fff | sw r0, -8+LO(RA) // Initialize control var. | sw TMP1, -8+HI(RA) @@ -3646,7 +3646,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | li TMP3, BC_JMP | li TMP1, BC_ITERC | sb TMP3, -4+OFS_OP(PC) - | addu PC, TMP0, TMP2 + | addu PC, TMP0, TMP2 | b <1 |. sb TMP1, OFS_OP(PC) break; From 0c6fdc1039a3a4450d366fba7af4b29de73f0dc6 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 18 Apr 2016 10:57:49 +0200 Subject: [PATCH 03/57] Rewrite memory block allocator. Use a mix of linear probing and pseudo-random probing. Workaround for 1GB MAP_32BIT limit on Linux/x64. Now 2GB with !LJ_GC64. Enforce 128TB LJ_GC64 limit for > 47 bit memory layouts (ARM64). --- src/lj_alloc.c | 267 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 176 insertions(+), 91 deletions(-) diff --git a/src/lj_alloc.c b/src/lj_alloc.c index 32de45ec..5d0834ec 100644 --- a/src/lj_alloc.c +++ b/src/lj_alloc.c @@ -72,13 +72,56 @@ #define IS_DIRECT_BIT (SIZE_T_ONE) + +/* Determine system-specific block allocation method. */ #if LJ_TARGET_WINDOWS #define WIN32_LEAN_AND_MEAN #include -#if LJ_64 && !LJ_GC64 +#define LJ_ALLOC_VIRTUALALLOC 1 +#if LJ_64 && !LJ_GC64 +#define LJ_ALLOC_NTAVM 1 +#endif + +#else + +#include +/* If this include fails, then rebuild with: -DLUAJIT_USE_SYSMALLOC */ +#include + +#define LJ_ALLOC_MMAP 1 + +#if LJ_64 + +#define LJ_ALLOC_MMAP_PROBE 1 + +#if LJ_GC64 +#define LJ_ALLOC_MBITS 47 /* 128 TB in LJ_GC64 mode. */ +#elif LJ_TARGET_X64 && LJ_HASJIT +/* Due to limitations in the x64 compiler backend. */ +#define LJ_ALLOC_MBITS 31 /* 2 GB on x64 with !LJ_GC64. */ +#else +#define LJ_ALLOC_MBITS 32 /* 4 GB on other archs with !LJ_GC64. */ +#endif + +#endif + +#if LJ_64 && !LJ_GC64 && defined(MAP_32BIT) +#define LJ_ALLOC_MMAP32 1 +#endif + +#if LJ_TARGET_LINUX +#define LJ_ALLOC_MREMAP 1 +#endif + +#endif + + +#if LJ_ALLOC_VIRTUALALLOC + +#if LJ_ALLOC_NTAVM /* Undocumented, but hey, that's what we all love so much about Windows. */ typedef long (*PNTAVM)(HANDLE handle, void **addr, ULONG zbits, size_t *size, ULONG alloctype, ULONG prot); @@ -89,14 +132,15 @@ static PNTAVM ntavm; */ #define NTAVM_ZEROBITS 1 -static void INIT_MMAP(void) +static void init_mmap(void) { ntavm = (PNTAVM)GetProcAddress(GetModuleHandleA("ntdll.dll"), "NtAllocateVirtualMemory"); } +#define INIT_MMAP() init_mmap() /* Win64 32 bit MMAP via NtAllocateVirtualMemory. */ -static LJ_AINLINE void *CALL_MMAP(size_t size) +static void *CALL_MMAP(size_t size) { DWORD olderr = GetLastError(); void *ptr = NULL; @@ -107,7 +151,7 @@ static LJ_AINLINE void *CALL_MMAP(size_t size) } /* For direct MMAP, use MEM_TOP_DOWN to minimize interference */ -static LJ_AINLINE void *DIRECT_MMAP(size_t size) +static void *DIRECT_MMAP(size_t size) { DWORD olderr = GetLastError(); void *ptr = NULL; @@ -119,10 +163,8 @@ static LJ_AINLINE void *DIRECT_MMAP(size_t size) #else -#define INIT_MMAP() ((void)0) - /* Win32 MMAP via VirtualAlloc */ -static LJ_AINLINE void *CALL_MMAP(size_t size) +static void *CALL_MMAP(size_t size) { DWORD olderr = GetLastError(); void *ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); @@ -131,7 +173,7 @@ static LJ_AINLINE void *CALL_MMAP(size_t size) } /* For direct MMAP, use MEM_TOP_DOWN to minimize interference */ -static LJ_AINLINE void *DIRECT_MMAP(size_t size) +static void *DIRECT_MMAP(size_t size) { DWORD olderr = GetLastError(); void *ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN, @@ -143,7 +185,7 @@ static LJ_AINLINE void *DIRECT_MMAP(size_t size) #endif /* This function supports releasing coalesed segments */ -static LJ_AINLINE int CALL_MUNMAP(void *ptr, size_t size) +static int CALL_MUNMAP(void *ptr, size_t size) { DWORD olderr = GetLastError(); MEMORY_BASIC_INFORMATION minfo; @@ -163,10 +205,7 @@ static LJ_AINLINE int CALL_MUNMAP(void *ptr, size_t size) return 0; } -#else - -#include -#include +#elif LJ_ALLOC_MMAP #define MMAP_PROT (PROT_READ|PROT_WRITE) #if !defined(MAP_ANONYMOUS) && defined(MAP_ANON) @@ -174,107 +213,145 @@ static LJ_AINLINE int CALL_MUNMAP(void *ptr, size_t size) #endif #define MMAP_FLAGS (MAP_PRIVATE|MAP_ANONYMOUS) -#if LJ_64 && !LJ_GC64 -/* 64 bit mode with 32 bit pointers needs special support for allocating -** memory in the lower 2GB. +#if LJ_ALLOC_MMAP_PROBE + +#define LJ_ALLOC_MMAP_PROBE_MAX 30 +#define LJ_ALLOC_MMAP_PROBE_LINEAR 5 + +#define LJ_ALLOC_MMAP_PROBE_LOWER ((uintptr_t)0x4000) + +/* No point in a giant ifdef mess. Just try to open /dev/urandom. +** It doesn't really matter if this fails, since we get some ASLR bits from +** every unsuitable allocation, too. And we prefer linear allocation, anyway. */ +#include +#include -#if defined(MAP_32BIT) - -#if defined(__sun__) -#define MMAP_REGION_START ((uintptr_t)0x1000) -#else -/* Actually this only gives us max. 1GB in current Linux kernels. */ -#define MMAP_REGION_START ((uintptr_t)0) -#endif - -static LJ_AINLINE void *CALL_MMAP(size_t size) +static uintptr_t mmap_probe_seed(void) { - int olderr = errno; - void *ptr = mmap((void *)MMAP_REGION_START, size, MMAP_PROT, MAP_32BIT|MMAP_FLAGS, -1, 0); - errno = olderr; - return ptr; + uintptr_t val; + int fd = open("/dev/urandom", O_RDONLY); + if (fd != -1) { + int ok = ((size_t)read(fd, &val, sizeof(val)) == sizeof(val)); + (void)close(fd); + if (ok) return val; + } + return 1; /* Punt. */ } -#elif LJ_TARGET_OSX || LJ_TARGET_PS4 || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__sun__) || LJ_TARGET_CYGWIN - -/* OSX and FreeBSD mmap() use a naive first-fit linear search. -** That's perfect for us. Except that -pagezero_size must be set for OSX, -** otherwise the lower 4GB are blocked. And the 32GB RLIMIT_DATA needs -** to be reduced to 250MB on FreeBSD. -*/ -#if LJ_TARGET_OSX || defined(__DragonFly__) -#define MMAP_REGION_START ((uintptr_t)0x10000) -#elif LJ_TARGET_PS4 -#define MMAP_REGION_START ((uintptr_t)0x4000) -#else -#define MMAP_REGION_START ((uintptr_t)0x10000000) -#endif -#define MMAP_REGION_END ((uintptr_t)0x80000000) - -#if (defined(__FreeBSD__) || defined(__FreeBSD_kernel__)) && !LJ_TARGET_PS4 -#include -#endif - -static LJ_AINLINE void *CALL_MMAP(size_t size) +static void *mmap_probe(size_t size) { - int olderr = errno; /* Hint for next allocation. Doesn't need to be thread-safe. */ - static uintptr_t alloc_hint = MMAP_REGION_START; - int retry = 0; -#if (defined(__FreeBSD__) || defined(__FreeBSD_kernel__)) && !LJ_TARGET_PS4 - static int rlimit_modified = 0; - if (LJ_UNLIKELY(rlimit_modified == 0)) { - struct rlimit rlim; - rlim.rlim_cur = rlim.rlim_max = MMAP_REGION_START; - setrlimit(RLIMIT_DATA, &rlim); /* Ignore result. May fail below. */ - rlimit_modified = 1; - } -#endif - for (;;) { - void *p = mmap((void *)alloc_hint, size, MMAP_PROT, MMAP_FLAGS, -1, 0); - if ((uintptr_t)p >= MMAP_REGION_START && - (uintptr_t)p + size < MMAP_REGION_END) { - alloc_hint = (uintptr_t)p + size; + static uintptr_t hint_addr = 0; + static uintptr_t hint_prng = 0; + int olderr = errno; + int retry; + for (retry = 0; retry < LJ_ALLOC_MMAP_PROBE_MAX; retry++) { + void *p = mmap((void *)hint_addr, size, MMAP_PROT, MMAP_FLAGS, -1, 0); + uintptr_t addr = (uintptr_t)p; + if ((addr >> LJ_ALLOC_MBITS) == 0 && addr >= LJ_ALLOC_MMAP_PROBE_LOWER) { + /* We got a suitable address. Bump the hint address. */ + hint_addr = addr + size; errno = olderr; return p; } - if (p != CMFAIL) munmap(p, size); -#if defined(__sun__) || defined(__DragonFly__) - alloc_hint += 0x1000000; /* Need near-exhaustive linear scan. */ - if (alloc_hint + size < MMAP_REGION_END) continue; -#endif - if (retry) break; - retry = 1; - alloc_hint = MMAP_REGION_START; + if (p != MFAIL) { + munmap(p, size); + } else if (errno == ENOMEM) { + return MFAIL; + } + if (hint_addr) { + /* First, try linear probing. */ + if (retry < LJ_ALLOC_MMAP_PROBE_LINEAR) { + hint_addr += 0x1000000; + if (((hint_addr + size) >> LJ_ALLOC_MBITS) != 0) + hint_addr = 0; + continue; + } else if (retry == LJ_ALLOC_MMAP_PROBE_LINEAR) { + /* Next, try a no-hint probe to get back an ASLR address. */ + hint_addr = 0; + continue; + } + } + /* Finally, try pseudo-random probing. */ + if (LJ_UNLIKELY(hint_prng == 0)) { + hint_prng = mmap_probe_seed(); + } + /* The unsuitable address we got has some ASLR PRNG bits. */ + hint_addr ^= addr & ~((uintptr_t)(LJ_PAGESIZE-1)); + do { /* The PRNG itself is very weak, but see above. */ + hint_prng = hint_prng * 1103515245 + 12345; + hint_addr ^= hint_prng * (uintptr_t)LJ_PAGESIZE; + hint_addr &= (((uintptr_t)1 << LJ_ALLOC_MBITS)-1); + } while (hint_addr < LJ_ALLOC_MMAP_PROBE_LOWER); } errno = olderr; - return CMFAIL; + return MFAIL; } -#else - -#error "NYI: need an equivalent of MAP_32BIT for this 64 bit OS" - #endif -#else +#if LJ_ALLOC_MMAP32 -/* 32 bit mode and GC64 mode is easy. */ -static LJ_AINLINE void *CALL_MMAP(size_t size) +#if defined(__sun__) +#define LJ_ALLOC_MMAP32_START ((uintptr_t)0x1000) +#else +#define LJ_ALLOC_MMAP32_START ((uintptr_t)0) +#endif + +static void *mmap_map32(size_t size) +{ +#if LJ_ALLOC_MMAP_PROBE + static int fallback = 0; + if (fallback) + return mmap_probe(size); +#endif + { + int olderr = errno; + void *ptr = mmap((void *)LJ_ALLOC_MMAP32_START, size, MMAP_PROT, MAP_32BIT|MMAP_FLAGS, -1, 0); + errno = olderr; + /* This only allows 1GB on Linux. So fallback to probing to get 2GB. */ +#if LJ_ALLOC_MMAP_PROBE + if (ptr == MFAIL) { + fallback = 1; + return mmap_probe(size); + } +#endif + return ptr; + } +} + +#endif + +#if LJ_ALLOC_MMAP32 +#define CALL_MMAP(size) mmap_map32(size) +#elif LJ_ALLOC_MMAP_PROBE +#define CALL_MMAP(size) mmap_probe(size) +#else +static void *CALL_MMAP(size_t size) { int olderr = errno; void *ptr = mmap(NULL, size, MMAP_PROT, MMAP_FLAGS, -1, 0); errno = olderr; return ptr; } +#endif + +#if (defined(__FreeBSD__) || defined(__FreeBSD_kernel__)) && !LJ_TARGET_PS4 + +#include + +static void init_mmap(void) +{ + struct rlimit rlim; + rlim.rlim_cur = rlim.rlim_max = 0x10000; + setrlimit(RLIMIT_DATA, &rlim); /* Ignore result. May fail later. */ +} +#define INIT_MMAP() init_mmap() #endif -#define INIT_MMAP() ((void)0) -#define DIRECT_MMAP(s) CALL_MMAP(s) - -static LJ_AINLINE int CALL_MUNMAP(void *ptr, size_t size) +static int CALL_MUNMAP(void *ptr, size_t size) { int olderr = errno; int ret = munmap(ptr, size); @@ -282,10 +359,9 @@ static LJ_AINLINE int CALL_MUNMAP(void *ptr, size_t size) return ret; } -#if LJ_TARGET_LINUX +#if LJ_ALLOC_MREMAP /* Need to define _GNU_SOURCE to get the mremap prototype. */ -static LJ_AINLINE void *CALL_MREMAP_(void *ptr, size_t osz, size_t nsz, - int flags) +static void *CALL_MREMAP_(void *ptr, size_t osz, size_t nsz, int flags) { int olderr = errno; ptr = mremap(ptr, osz, nsz, flags); @@ -305,6 +381,15 @@ static LJ_AINLINE void *CALL_MREMAP_(void *ptr, size_t osz, size_t nsz, #endif + +#ifndef INIT_MMAP +#define INIT_MMAP() ((void)0) +#endif + +#ifndef DIRECT_MMAP +#define DIRECT_MMAP(s) CALL_MMAP(s) +#endif + #ifndef CALL_MREMAP #define CALL_MREMAP(addr, osz, nsz, mv) ((void)osz, MFAIL) #endif From 73680a5fc760cb39760e4bbfce1166ce75de237f Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 18 Apr 2016 11:16:13 +0200 Subject: [PATCH 04/57] x86/x64: Search for exit jumps with instruction length decoder. Contributed by Peter Cawley. --- src/lj_asm_x86.h | 113 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 104 insertions(+), 9 deletions(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index ffd59d33..39a792c2 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -2776,6 +2776,106 @@ static void asm_setup_target(ASMState *as) /* -- Trace patching ------------------------------------------------------ */ +static const uint8_t map_op1[256] = { +0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x20, +0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x51,0x51, +0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51, +0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51,0x92,0x92,0x92,0x92,0x52,0x45,0x10,0x51, +#if LJ_64 +0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x14,0x14,0x14,0x14,0x14,0x14,0x14,0x14, +#else +0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51, +#endif +0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51, +0x51,0x51,0x92,0x92,0x10,0x10,0x12,0x11,0x45,0x86,0x52,0x93,0x51,0x51,0x51,0x51, +0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52, +0x93,0x86,0x93,0x93,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92, +0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x51,0x47,0x51,0x51,0x51,0x51,0x51, +#if LJ_64 +0x59,0x59,0x59,0x59,0x51,0x51,0x51,0x51,0x52,0x45,0x51,0x51,0x51,0x51,0x51,0x51, +#else +0x55,0x55,0x55,0x55,0x51,0x51,0x51,0x51,0x52,0x45,0x51,0x51,0x51,0x51,0x51,0x51, +#endif +0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x05,0x05,0x05,0x05,0x05,0x05,0x05,0x05, +0x93,0x93,0x53,0x51,0x70,0x71,0x93,0x86,0x54,0x51,0x53,0x51,0x51,0x52,0x51,0x51, +0x92,0x92,0x92,0x92,0x52,0x52,0x51,0x51,0x92,0x92,0x92,0x92,0x92,0x92,0x92,0x92, +0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x45,0x45,0x47,0x52,0x51,0x51,0x51,0x51, +0x10,0x51,0x10,0x10,0x51,0x51,0x63,0x66,0x51,0x51,0x51,0x51,0x51,0x51,0x92,0x92 +}; + +static const uint8_t map_op2[256] = { +0x93,0x93,0x93,0x93,0x52,0x52,0x52,0x52,0x52,0x52,0x51,0x52,0x51,0x93,0x52,0x94, +0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, +0x53,0x53,0x53,0x53,0x53,0x53,0x53,0x53,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, +0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x34,0x51,0x35,0x51,0x51,0x51,0x51,0x51, +0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, +0x53,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, +0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, +0x94,0x54,0x54,0x54,0x93,0x93,0x93,0x52,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, +0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46,0x46, +0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, +0x52,0x52,0x52,0x93,0x94,0x93,0x51,0x51,0x52,0x52,0x52,0x93,0x94,0x93,0x93,0x93, +0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x94,0x93,0x93,0x93,0x93,0x93, +0x93,0x93,0x94,0x93,0x94,0x94,0x94,0x93,0x52,0x52,0x52,0x52,0x52,0x52,0x52,0x52, +0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, +0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93, +0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x93,0x52 +}; + +static uint32_t asm_x86_inslen(const uint8_t* p) +{ + uint32_t result = 0; + uint32_t prefixes = 0; + uint32_t x = map_op1[*p]; + for (;;) { + switch (x >> 4) { + case 0: return result + x + (prefixes & 4); + case 1: prefixes |= x; x = map_op1[*++p]; result++; break; + case 2: x = map_op2[*++p]; break; + case 3: p++; goto mrm; + case 4: result -= (prefixes & 2); /* fallthrough */ + case 5: return result + (x & 15); + case 6: /* Group 3. */ + if (p[1] & 0x38) return result + 2; + if ((prefixes & 2) && (x == 0x66)) return result + 4; + return result + (x & 15); + case 7: /* VEX c4/c5. */ + if (LJ_32 && p[1] < 0xc0) { + x = 2; + goto mrm; + } + if (x == 0x70) { + x = *++p & 0x1f; + result++; + if (x >= 2) { + p += 2; + result += 2; + goto mrm; + } + } + p++; + result++; + x = map_op2[*++p]; + break; + case 8: result -= (prefixes & 2); /* fallthrough */ + case 9: mrm: /* ModR/M and possibly SIB. */ + result += (x & 15); + x = *++p; + switch (x >> 6) { + case 0: if ((x & 7) == 5) return result + 4; break; + case 1: result++; break; + case 2: result += 4; break; + case 3: return result; + } + if ((x & 7) == 4) { + result++; + if (x < 0x40 && (p[1] & 7) == 5) result += 4; + } + return result; + } + } +} + /* Patch exit jumps of existing machine code to a new target. */ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) { @@ -2788,18 +2888,13 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px) *(int32_t *)(p+len-4) = jmprel(p+len, target); /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */ - for (; p < pe; p++) - if (*(uint32_t *)(p+(LJ_64 ? 3 : 2)) == stateaddr && p[0] == XI_MOVmi) { - p += LJ_64 ? 11 : 10; + for (; p < pe; p += asm_x86_inslen(p)) + if (*(uint32_t *)(p+(LJ_64 ? 3 : 2)) == stateaddr && p[0] == XI_MOVmi) break; - } lua_assert(p < pe); - for (; p < pe; p++) { - if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px) { + for (; p < pe; p += asm_x86_inslen(p)) + if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px) *(int32_t *)(p+2) = jmprel(p+6, target); - p += 5; - } - } lj_mcode_sync(T->mcode, T->mcode + T->szmcode); lj_mcode_patch(J, mcarea, 1); } From cc4f5d056ab93521451631f28501015f054d8976 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 18 Apr 2016 13:40:49 +0200 Subject: [PATCH 05/57] Whitespace. --- src/lj_asm_x86.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 39a792c2..86a5b0a8 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -2841,17 +2841,17 @@ static uint32_t asm_x86_inslen(const uint8_t* p) return result + (x & 15); case 7: /* VEX c4/c5. */ if (LJ_32 && p[1] < 0xc0) { - x = 2; - goto mrm; + x = 2; + goto mrm; } if (x == 0x70) { - x = *++p & 0x1f; - result++; - if (x >= 2) { - p += 2; - result += 2; - goto mrm; - } + x = *++p & 0x1f; + result++; + if (x >= 2) { + p += 2; + result += 2; + goto mrm; + } } p++; result++; @@ -2868,8 +2868,8 @@ static uint32_t asm_x86_inslen(const uint8_t* p) case 3: return result; } if ((x & 7) == 4) { - result++; - if (x < 0x40 && (p[1] & 7) == 5) result += 4; + result++; + if (x < 0x40 && (p[1] & 7) == 5) result += 4; } return result; } From 2f0001fad05731ea3787b27cf9b19e5293c358b8 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 21 Apr 2016 17:00:58 +0200 Subject: [PATCH 06/57] Fix handling of non-numeric strings in arithmetic coercions. Thanks to Vyacheslav Egorov. --- src/lj_ffrecord.c | 6 ++---- src/lj_iropt.h | 4 ++-- src/lj_opt_narrow.c | 42 ++++++++++++++++++++++-------------------- src/lj_record.c | 4 ++-- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index c258aee1..99d54233 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -539,10 +539,8 @@ static void LJ_FASTCALL recff_math_degrad(jit_State *J, RecordFFData *rd) static void LJ_FASTCALL recff_math_pow(jit_State *J, RecordFFData *rd) { - TRef tr = lj_ir_tonum(J, J->base[0]); - if (!tref_isnumber_str(J->base[1])) - lj_trace_err(J, LJ_TRERR_BADTYPE); - J->base[0] = lj_opt_narrow_pow(J, tr, J->base[1], &rd->argv[1]); + J->base[0] = lj_opt_narrow_pow(J, J->base[0], J->base[1], + &rd->argv[0], &rd->argv[1]); UNUSED(rd); } diff --git a/src/lj_iropt.h b/src/lj_iropt.h index 1836e1b0..4bf95f15 100644 --- a/src/lj_iropt.h +++ b/src/lj_iropt.h @@ -142,8 +142,8 @@ LJ_FUNC TRef LJ_FASTCALL lj_opt_narrow_cindex(jit_State *J, TRef key); LJ_FUNC TRef lj_opt_narrow_arith(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc, IROp op); LJ_FUNC TRef lj_opt_narrow_unm(jit_State *J, TRef rc, TValue *vc); -LJ_FUNC TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc, TValue *vc); -LJ_FUNC TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vc); +LJ_FUNC TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc); +LJ_FUNC TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc); LJ_FUNC IRType lj_opt_narrow_forl(jit_State *J, cTValue *forbase); /* Optimization passes. */ diff --git a/src/lj_opt_narrow.c b/src/lj_opt_narrow.c index b1ab5ba8..36be66ee 100644 --- a/src/lj_opt_narrow.c +++ b/src/lj_opt_narrow.c @@ -517,18 +517,24 @@ static int numisint(lua_Number n) return (n == (lua_Number)lj_num2int(n)); } +/* Convert string to number. Error out for non-numeric string values. */ +static TRef conv_str_tonum(jit_State *J, TRef tr, TValue *o) +{ + if (tref_isstr(tr)) { + tr = emitir(IRTG(IR_STRTO, IRT_NUM), tr, 0); + /* Would need an inverted STRTO for this rare and useless case. */ + if (!lj_strscan_num(strV(o), o)) /* Convert in-place. Value used below. */ + lj_trace_err(J, LJ_TRERR_BADTYPE); /* Punt if non-numeric. */ + } + return tr; +} + /* Narrowing of arithmetic operations. */ TRef lj_opt_narrow_arith(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc, IROp op) { - if (tref_isstr(rb)) { - rb = emitir(IRTG(IR_STRTO, IRT_NUM), rb, 0); - lj_strscan_num(strV(vb), vb); - } - if (tref_isstr(rc)) { - rc = emitir(IRTG(IR_STRTO, IRT_NUM), rc, 0); - lj_strscan_num(strV(vc), vc); - } + rb = conv_str_tonum(J, rb, vb); + rc = conv_str_tonum(J, rc, vc); /* Must not narrow MUL in non-DUALNUM variant, because it loses -0. */ if ((op >= IR_ADD && op <= (LJ_DUALNUM ? IR_MUL : IR_SUB)) && tref_isinteger(rb) && tref_isinteger(rc) && @@ -543,10 +549,7 @@ TRef lj_opt_narrow_arith(jit_State *J, TRef rb, TRef rc, /* Narrowing of unary minus operator. */ TRef lj_opt_narrow_unm(jit_State *J, TRef rc, TValue *vc) { - if (tref_isstr(rc)) { - rc = emitir(IRTG(IR_STRTO, IRT_NUM), rc, 0); - lj_strscan_num(strV(vc), vc); - } + rc = conv_str_tonum(J, rc, vc); if (tref_isinteger(rc)) { if ((uint32_t)numberVint(vc) != 0x80000000u) return emitir(IRTGI(IR_SUBOV), lj_ir_kint(J, 0), rc); @@ -556,11 +559,11 @@ TRef lj_opt_narrow_unm(jit_State *J, TRef rc, TValue *vc) } /* Narrowing of modulo operator. */ -TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc, TValue *vc) +TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc) { TRef tmp; - if (tvisstr(vc) && !lj_strscan_num(strV(vc), vc)) - lj_trace_err(J, LJ_TRERR_BADTYPE); + rb = conv_str_tonum(J, rb, vb); + rc = conv_str_tonum(J, rc, vc); if ((LJ_DUALNUM || (J->flags & JIT_F_OPT_NARROW)) && tref_isinteger(rb) && tref_isinteger(rc) && (tvisint(vc) ? intV(vc) != 0 : !tviszero(vc))) { @@ -577,10 +580,11 @@ TRef lj_opt_narrow_mod(jit_State *J, TRef rb, TRef rc, TValue *vc) } /* Narrowing of power operator or math.pow. */ -TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vc) +TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc) { - if (tvisstr(vc) && !lj_strscan_num(strV(vc), vc)) - lj_trace_err(J, LJ_TRERR_BADTYPE); + rb = conv_str_tonum(J, rb, vb); + rb = lj_ir_tonum(J, rb); /* Left arg is always treated as an FP number. */ + rc = conv_str_tonum(J, rc, vc); /* Narrowing must be unconditional to preserve (-x)^i semantics. */ if (tvisint(vc) || numisint(numV(vc))) { int checkrange = 0; @@ -591,8 +595,6 @@ TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vc) checkrange = 1; } if (!tref_isinteger(rc)) { - if (tref_isstr(rc)) - rc = emitir(IRTG(IR_STRTO, IRT_NUM), rc, 0); /* Guarded conversion to integer! */ rc = emitir(IRTGI(IR_CONV), rc, IRCONV_INT_NUM|IRCONV_CHECK); } diff --git a/src/lj_record.c b/src/lj_record.c index 9b51c51f..ff7825ee 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -1884,14 +1884,14 @@ void lj_record_ins(jit_State *J) case BC_MODVN: case BC_MODVV: recmod: if (tref_isnumber_str(rb) && tref_isnumber_str(rc)) - rc = lj_opt_narrow_mod(J, rb, rc, rcv); + rc = lj_opt_narrow_mod(J, rb, rc, rbv, rcv); else rc = rec_mm_arith(J, &ix, MM_mod); break; case BC_POW: if (tref_isnumber_str(rb) && tref_isnumber_str(rc)) - rc = lj_opt_narrow_pow(J, lj_ir_tonum(J, rb), rc, rcv); + rc = lj_opt_narrow_pow(J, rb, rc, rbv, rcv); else rc = rec_mm_arith(J, &ix, MM_pow); break; From 7b26e9c998095ef9fbc4540908df6c30a693baa0 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 24 Apr 2016 17:10:24 +0200 Subject: [PATCH 07/57] Fix GCC 6 -Wmisleading-indentation warnings. Thanks to Roman Tsisyk. --- src/lj_cparse.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/lj_cparse.c b/src/lj_cparse.c index 5b359306..c82b10d8 100644 --- a/src/lj_cparse.c +++ b/src/lj_cparse.c @@ -310,13 +310,17 @@ static CPToken cp_next_(CPState *cp) else return '/'; break; case '|': - if (cp_get(cp) != '|') return '|'; cp_get(cp); return CTOK_OROR; + if (cp_get(cp) != '|') return '|'; + cp_get(cp); return CTOK_OROR; case '&': - if (cp_get(cp) != '&') return '&'; cp_get(cp); return CTOK_ANDAND; + if (cp_get(cp) != '&') return '&'; + cp_get(cp); return CTOK_ANDAND; case '=': - if (cp_get(cp) != '=') return '='; cp_get(cp); return CTOK_EQ; + if (cp_get(cp) != '=') return '='; + cp_get(cp); return CTOK_EQ; case '!': - if (cp_get(cp) != '=') return '!'; cp_get(cp); return CTOK_NE; + if (cp_get(cp) != '=') return '!'; + cp_get(cp); return CTOK_NE; case '<': if (cp_get(cp) == '=') { cp_get(cp); return CTOK_LE; } else if (cp->c == '<') { cp_get(cp); return CTOK_SHL; } @@ -326,7 +330,8 @@ static CPToken cp_next_(CPState *cp) else if (cp->c == '>') { cp_get(cp); return CTOK_SHR; } return '>'; case '-': - if (cp_get(cp) != '>') return '-'; cp_get(cp); return CTOK_DEREF; + if (cp_get(cp) != '>') return '-'; + cp_get(cp); return CTOK_DEREF; case '$': return cp_param(cp); case '\0': return CTOK_EOF; From ac42037db0ea0e0c8f4934b5103db522ab405129 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 24 Apr 2016 17:32:12 +0200 Subject: [PATCH 08/57] Constrain value range of lj_ir_kptr() to unsigned 32 bit pointers. Thanks to Peter Cawley. --- src/lj_ffrecord.c | 8 +------- src/lj_ir.c | 2 +- src/lj_obj.h | 8 ++++++-- src/lj_record.c | 7 +------ 4 files changed, 9 insertions(+), 16 deletions(-) diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index a960ea50..942ecdb2 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -104,7 +104,6 @@ static void recff_stitch(jit_State *J) TValue *base = L->base; const BCIns *pc = frame_pc(base-1); TValue *pframe = frame_prevl(base-1); - TRef trcont; lua_assert(!LJ_FR2); /* TODO_FR2: handle frame shift. */ /* Move func + args up in Lua stack and insert continuation. */ @@ -118,12 +117,7 @@ static void recff_stitch(jit_State *J) /* Ditto for the IR. */ memmove(&J->base[1], &J->base[-1], sizeof(TRef)*(J->maxslot+1)); -#if LJ_64 - trcont = lj_ir_kptr(J, (void *)((int64_t)cont-(int64_t)lj_vm_asm_begin)); -#else - trcont = lj_ir_kptr(J, (void *)cont); -#endif - J->base[0] = trcont | TREF_CONT; + J->base[0] = lj_ir_kptr(J, contptr(cont)) | TREF_CONT; J->ktracep = lj_ir_k64_reserve(J); lua_assert(irt_toitype_(IRT_P64) == LJ_TTRACE); J->base[-1] = emitir(IRT(IR_XLOAD, IRT_P64), lj_ir_kptr(J, &J->ktracep->gcr), 0); diff --git a/src/lj_ir.c b/src/lj_ir.c index 63c98254..b4087aa7 100644 --- a/src/lj_ir.c +++ b/src/lj_ir.c @@ -345,7 +345,7 @@ TRef lj_ir_kptr_(jit_State *J, IROp op, void *ptr) { IRIns *ir, *cir = J->cur.ir; IRRef ref; - lua_assert((void *)(intptr_t)i32ptr(ptr) == ptr); + lua_assert((void *)(uintptr_t)u32ptr(ptr) == ptr); for (ref = J->chain[op]; ref; ref = cir[ref].prev) if (mref(cir[ref].ptr, void) == ptr) goto found; diff --git a/src/lj_obj.h b/src/lj_obj.h index 059eb132..25da9455 100644 --- a/src/lj_obj.h +++ b/src/lj_obj.h @@ -843,12 +843,16 @@ static LJ_AINLINE void setlightudV(TValue *o, void *p) #endif #if LJ_FR2 -#define setcont(o, f) ((o)->u64 = (uint64_t)(uintptr_t)(void *)(f)) +#define contptr(f) ((void *)(f)) +#define setcont(o, f) ((o)->u64 = (uint64_t)(uintptr_t)contptr(f)) #elif LJ_64 +#define contptr(f) \ + ((void *)(uintptr_t)(uint32_t)((intptr_t)(f) - (intptr_t)lj_vm_asm_begin)) #define setcont(o, f) \ ((o)->u64 = (uint64_t)(void *)(f) - (uint64_t)lj_vm_asm_begin) #else -#define setcont(o, f) setlightudV((o), (void *)(f)) +#define contptr(f) ((void *)(f)) +#define setcont(o, f) setlightudV((o), contptr(f)) #endif #define tvchecklive(L, o) \ diff --git a/src/lj_record.c b/src/lj_record.c index 306a85cb..8a72b0c9 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -882,12 +882,7 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults) static BCReg rec_mm_prep(jit_State *J, ASMFunction cont) { BCReg s, top = cont == lj_cont_cat ? J->maxslot : curr_proto(J->L)->framesize; -#if LJ_64 - TRef trcont = lj_ir_kptr(J, (void *)((int64_t)cont-(int64_t)lj_vm_asm_begin)); -#else - TRef trcont = lj_ir_kptr(J, (void *)cont); -#endif - J->base[top] = trcont | TREF_CONT; + J->base[top] = lj_ir_kptr(J, contptr(cont)) | TREF_CONT; J->framedepth++; for (s = J->maxslot; s < top; s++) J->base[s] = 0; /* Clear frame gap to avoid resurrecting previous refs. */ From 221268b17d950d1cdfc36ee85b82c89c83a974fb Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Tue, 3 May 2016 18:30:01 +0200 Subject: [PATCH 09/57] Use the GDB JIT API in a thread-safe manner. Thanks to Peter Cawley. --- src/lj_gdbjit.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/lj_gdbjit.c b/src/lj_gdbjit.c index f2bd865e..8b72be7d 100644 --- a/src/lj_gdbjit.c +++ b/src/lj_gdbjit.c @@ -719,6 +719,20 @@ static void gdbjit_buildobj(GDBJITctx *ctx) /* -- Interface to GDB JIT API -------------------------------------------- */ +static int gdbjit_lock; + +static void gdbjit_lock_acquire() +{ + while (__sync_lock_test_and_set(&gdbjit_lock, 1)) { + /* Just spin; futexes or pthreads aren't worth the portability cost. */ + } +} + +static void gdbjit_lock_release() +{ + __sync_lock_release(&gdbjit_lock); +} + /* Add new entry to GDB JIT symbol chain. */ static void gdbjit_newentry(lua_State *L, GDBJITctx *ctx) { @@ -730,6 +744,7 @@ static void gdbjit_newentry(lua_State *L, GDBJITctx *ctx) ctx->T->gdbjit_entry = (void *)eo; /* Link new entry to chain and register it. */ eo->entry.prev_entry = NULL; + gdbjit_lock_acquire(); eo->entry.next_entry = __jit_debug_descriptor.first_entry; if (eo->entry.next_entry) eo->entry.next_entry->prev_entry = &eo->entry; @@ -739,6 +754,7 @@ static void gdbjit_newentry(lua_State *L, GDBJITctx *ctx) __jit_debug_descriptor.relevant_entry = &eo->entry; __jit_debug_descriptor.action_flag = GDBJIT_REGISTER; __jit_debug_register_code(); + gdbjit_lock_release(); } /* Add debug info for newly compiled trace and notify GDB. */ @@ -770,6 +786,7 @@ void lj_gdbjit_deltrace(jit_State *J, GCtrace *T) { GDBJITentryobj *eo = (GDBJITentryobj *)T->gdbjit_entry; if (eo) { + gdbjit_lock_acquire(); if (eo->entry.prev_entry) eo->entry.prev_entry->next_entry = eo->entry.next_entry; else @@ -779,6 +796,7 @@ void lj_gdbjit_deltrace(jit_State *J, GCtrace *T) __jit_debug_descriptor.relevant_entry = &eo->entry; __jit_debug_descriptor.action_flag = GDBJIT_UNREGISTER; __jit_debug_register_code(); + gdbjit_lock_release(); lj_mem_free(J2G(J), eo, eo->sz); } } From f05280e4156df2d13d87b1639157c63ed4e3b393 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 6 May 2016 12:08:00 +0200 Subject: [PATCH 10/57] x86/x64: Fix instruction length decoder. Thanks to Peter Cawley. --- src/lj_asm_x86.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 86a5b0a8..02918e23 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -2836,9 +2836,9 @@ static uint32_t asm_x86_inslen(const uint8_t* p) case 4: result -= (prefixes & 2); /* fallthrough */ case 5: return result + (x & 15); case 6: /* Group 3. */ - if (p[1] & 0x38) return result + 2; - if ((prefixes & 2) && (x == 0x66)) return result + 4; - return result + (x & 15); + if (p[1] & 0x38) x = 2; + else if ((prefixes & 2) && (x == 0x66)) x = 4; + goto mrm; case 7: /* VEX c4/c5. */ if (LJ_32 && p[1] < 0xc0) { x = 2; From 35b09e692ead67181755795352f1df12547fd4fa Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 7 May 2016 12:32:15 +0200 Subject: [PATCH 11/57] Windows/x86: Add full exception interoperability. Contributed by Peter Cawley. --- doc/extensions.html | 27 +++++++------- src/host/buildvm.c | 2 +- src/host/buildvm_peobj.c | 28 +++++++++++++-- src/lj_err.c | 38 +++++++++++++++----- src/lj_frame.h | 12 +++++++ src/lj_vm.h | 4 +++ src/vm_x86.dasc | 77 ++++++++++++++++++++++++++++++++++++++-- 7 files changed, 159 insertions(+), 29 deletions(-) diff --git a/doc/extensions.html b/doc/extensions.html index 7f712a62..368b527c 100644 --- a/doc/extensions.html +++ b/doc/extensions.html @@ -365,25 +365,30 @@ the toolchain used to compile LuaJIT: POSIX/x64, DWARF2 unwinding -GCC 4.3+ +GCC 4.3+, Clang Full -Other platforms, DWARF2 unwinding -GCC -Limited +ARM -DLUAJIT_UNWIND_EXTERNAL +GCC, Clang +Full +Other platforms, DWARF2 unwinding +GCC, Clang +Limited + + Windows/x64 MSVC or WinSDK Full - + Windows/x86 Any -No +Full - + Other platforms Other compilers No @@ -432,14 +437,6 @@ C++ destructors.
  • Lua errors cannot be caught on the C++ side.
  • Throwing Lua errors across C++ frames will not call C++ destructors.
  • -
  • Additionally, on Windows/x86 with SEH-based C++ exceptions: -it's not safe to throw a Lua error across any frames containing -a C++ function with any try/catch construct or using variables with -(implicit) destructors. This also applies to any functions which may be -inlined in such a function. It doesn't matter whether lua_error() -is called inside or outside of a try/catch or whether any object actually -needs to be destroyed: the SEH chain is corrupted and this will eventually -lead to the termination of the process.

  • diff --git a/src/host/buildvm.c b/src/host/buildvm.c index 6d9e09e1..57b4dc97 100644 --- a/src/host/buildvm.c +++ b/src/host/buildvm.c @@ -110,7 +110,7 @@ static const char *sym_decorate(BuildCtx *ctx, if (p) { #if LJ_TARGET_X86ORX64 if (!LJ_64 && (ctx->mode == BUILD_coffasm || ctx->mode == BUILD_peobj)) - name[0] = '@'; + name[0] = name[1] == 'R' ? '_' : '@'; /* Just for _RtlUnwind@16. */ else *p = '\0'; #elif LJ_TARGET_PPC && !LJ_TARGET_CONSOLE diff --git a/src/host/buildvm_peobj.c b/src/host/buildvm_peobj.c index e8c927d8..42f6ac84 100644 --- a/src/host/buildvm_peobj.c +++ b/src/host/buildvm_peobj.c @@ -109,6 +109,8 @@ enum { #if LJ_TARGET_X64 PEOBJ_SECT_PDATA, PEOBJ_SECT_XDATA, +#elif LJ_TARGET_X86 + PEOBJ_SECT_SXDATA, #endif PEOBJ_SECT_RDATA_Z, PEOBJ_NSECTIONS @@ -208,6 +210,13 @@ void emit_peobj(BuildCtx *ctx) sofs += (pesect[PEOBJ_SECT_XDATA].nreloc = 1) * PEOBJ_RELOC_SIZE; /* Flags: 40 = read, 30 = align4, 40 = initialized data. */ pesect[PEOBJ_SECT_XDATA].flags = 0x40300040; +#elif LJ_TARGET_X86 + memcpy(pesect[PEOBJ_SECT_SXDATA].name, ".sxdata", sizeof(".sxdata")-1); + pesect[PEOBJ_SECT_SXDATA].ofs = sofs; + sofs += (pesect[PEOBJ_SECT_SXDATA].size = 4); + pesect[PEOBJ_SECT_SXDATA].relocofs = sofs; + /* Flags: 40 = read, 30 = align4, 02 = lnk_info, 40 = initialized data. */ + pesect[PEOBJ_SECT_SXDATA].flags = 0x40300240; #endif memcpy(pesect[PEOBJ_SECT_RDATA_Z].name, ".rdata$Z", sizeof(".rdata$Z")-1); @@ -232,7 +241,7 @@ void emit_peobj(BuildCtx *ctx) nrsym = ctx->nrelocsym; pehdr.nsyms = 1+PEOBJ_NSECTIONS*2 + 1+ctx->nsym + nrsym; #if LJ_TARGET_X64 - pehdr.nsyms += 1; /* Symbol for lj_err_unwind_win64. */ + pehdr.nsyms += 1; /* Symbol for lj_err_unwind_win. */ #endif /* Write PE object header and all sections. */ @@ -312,6 +321,19 @@ void emit_peobj(BuildCtx *ctx) reloc.type = PEOBJ_RELOC_ADDR32NB; owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); } +#elif LJ_TARGET_X86 + /* Write .sxdata section. */ + for (i = 0; i < nrsym; i++) { + if (!strcmp(ctx->relocsym[i], "_lj_err_unwind_win")) { + uint32_t symidx = 1+2+i; + owrite(ctx, &symidx, 4); + break; + } + } + if (i == nrsym) { + fprintf(stderr, "Error: extern lj_err_unwind_win not used\n"); + exit(1); + } #endif /* Write .rdata$Z section. */ @@ -333,8 +355,10 @@ void emit_peobj(BuildCtx *ctx) #if LJ_TARGET_X64 emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_PDATA); emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_XDATA); - emit_peobj_sym(ctx, "lj_err_unwind_win64", 0, + emit_peobj_sym(ctx, "lj_err_unwind_win", 0, PEOBJ_SECT_UNDEF, PEOBJ_TYPE_FUNC, PEOBJ_SCL_EXTERN); +#elif LJ_TARGET_X86 + emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_SXDATA); #endif emit_peobj_sym(ctx, ctx->beginsym, 0, diff --git a/src/lj_err.c b/src/lj_err.c index a847ca07..1314c8db 100644 --- a/src/lj_err.c +++ b/src/lj_err.c @@ -46,7 +46,8 @@ ** the wrapper function feature. Lua errors thrown through C++ frames ** cannot be caught by C++ code and C++ destructors are not run. ** -** EXT is the default on x64 systems, INT is the default on all other systems. +** EXT is the default on x64 systems and on Windows, INT is the default on all +** other systems. ** ** EXT can be manually enabled on POSIX systems using GCC and DWARF2 stack ** unwinding with -DLUAJIT_UNWIND_EXTERNAL. *All* C code must be compiled @@ -55,7 +56,6 @@ ** and all C libraries that have callbacks which may be used to call back ** into Lua. C++ code must *not* be compiled with -fno-exceptions. ** -** EXT cannot be enabled on WIN32 since system exceptions use code-driven SEH. ** EXT is mandatory on WIN64 since the calling convention has an abundance ** of callee-saved registers (rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm15). ** The POSIX/x64 interpreter only saves r12/r13 for INT (e.g. PS4). @@ -63,7 +63,7 @@ #if defined(__GNUC__) && (LJ_TARGET_X64 || defined(LUAJIT_UNWIND_EXTERNAL)) && !LJ_NO_UNWIND #define LJ_UNWIND_EXT 1 -#elif LJ_TARGET_X64 && LJ_TARGET_WINDOWS +#elif LJ_TARGET_WINDOWS #define LJ_UNWIND_EXT 1 #endif @@ -384,7 +384,7 @@ static void err_raise_ext(int errcode) #endif /* LJ_TARGET_ARM */ -#elif LJ_TARGET_X64 && LJ_ABI_WIN +#elif LJ_ABI_WIN /* ** Someone in Redmond owes me several days of my life. A lot of this is @@ -402,6 +402,7 @@ static void err_raise_ext(int errcode) #define WIN32_LEAN_AND_MEAN #include +#if LJ_TARGET_X64 /* Taken from: http://www.nynaeve.net/?p=99 */ typedef struct UndocumentedDispatcherContext { ULONG64 ControlPc; @@ -416,11 +417,14 @@ typedef struct UndocumentedDispatcherContext { ULONG ScopeIndex; ULONG Fill0; } UndocumentedDispatcherContext; +#else +typedef void *UndocumentedDispatcherContext; +#endif /* Another wild guess. */ extern void __DestructExceptionObject(EXCEPTION_RECORD *rec, int nothrow); -#ifdef MINGW_SDK_INIT +#if LJ_TARGET_X64 && defined(MINGW_SDK_INIT) /* Workaround for broken MinGW64 declaration. */ VOID RtlUnwindEx_FIXED(PVOID,PVOID,PVOID,PVOID,PVOID,PVOID) asm("RtlUnwindEx"); #define RtlUnwindEx RtlUnwindEx_FIXED @@ -434,10 +438,15 @@ VOID RtlUnwindEx_FIXED(PVOID,PVOID,PVOID,PVOID,PVOID,PVOID) asm("RtlUnwindEx"); #define LJ_EXCODE_CHECK(cl) (((cl) ^ LJ_EXCODE) <= 0xff) #define LJ_EXCODE_ERRCODE(cl) ((int)((cl) & 0xff)) -/* Win64 exception handler for interpreter frame. */ -LJ_FUNCA EXCEPTION_DISPOSITION lj_err_unwind_win64(EXCEPTION_RECORD *rec, - void *cf, CONTEXT *ctx, UndocumentedDispatcherContext *dispatch) +/* Windows exception handler for interpreter frame. */ +LJ_FUNCA EXCEPTION_DISPOSITION lj_err_unwind_win(EXCEPTION_RECORD *rec, + void *f, CONTEXT *ctx, UndocumentedDispatcherContext *dispatch) { +#if LJ_TARGET_X64 + void *cf = f; +#else + void *cf = (char *)f - CFRAME_OFS_SEH; +#endif lua_State *L = cframe_L(cf); int errcode = LJ_EXCODE_CHECK(rec->ExceptionCode) ? LJ_EXCODE_ERRCODE(rec->ExceptionCode) : LUA_ERRRUN; @@ -457,6 +466,7 @@ LJ_FUNCA EXCEPTION_DISPOSITION lj_err_unwind_win64(EXCEPTION_RECORD *rec, /* Don't catch access violations etc. */ return ExceptionContinueSearch; } +#if LJ_TARGET_X64 /* Unwind the stack and call all handlers for all lower C frames ** (including ourselves) again with EH_UNWINDING set. Then set ** rsp = cf, rax = errcode and jump to the specified target. @@ -466,6 +476,18 @@ LJ_FUNCA EXCEPTION_DISPOSITION lj_err_unwind_win64(EXCEPTION_RECORD *rec, lj_vm_unwind_c_eh), rec, (void *)(uintptr_t)errcode, ctx, dispatch->HistoryTable); /* RtlUnwindEx should never return. */ +#else + UNUSED(ctx); + UNUSED(dispatch); + /* Call all handlers for all lower C frames (including ourselves) again + ** with EH_UNWINDING set. Then call the specified function, passing cf + ** and errcode. + */ + lj_vm_rtlunwind(cf, (void *)rec, + (cframe_unwind_ff(cf2) && errcode != LUA_YIELD) ? + (void *)lj_vm_unwind_ff : (void *)lj_vm_unwind_c, errcode); + /* lj_vm_rtlunwind does not return. */ +#endif } } return ExceptionContinueSearch; diff --git a/src/lj_frame.h b/src/lj_frame.h index fc0e281c..db2e4da1 100644 --- a/src/lj_frame.h +++ b/src/lj_frame.h @@ -116,6 +116,17 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK }; /* Special continuations. */ /* These definitions must match with the arch-specific *.dasc files. */ #if LJ_TARGET_X86 +#if LJ_ABI_WIN +#define CFRAME_OFS_ERRF (19*4) +#define CFRAME_OFS_NRES (18*4) +#define CFRAME_OFS_PREV (17*4) +#define CFRAME_OFS_L (16*4) +#define CFRAME_OFS_SEH (9*4) +#define CFRAME_OFS_PC (6*4) +#define CFRAME_OFS_MULTRES (5*4) +#define CFRAME_SIZE (16*4) +#define CFRAME_SHIFT_MULTRES 0 +#else #define CFRAME_OFS_ERRF (15*4) #define CFRAME_OFS_NRES (14*4) #define CFRAME_OFS_PREV (13*4) @@ -124,6 +135,7 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK }; /* Special continuations. */ #define CFRAME_OFS_MULTRES (5*4) #define CFRAME_SIZE (12*4) #define CFRAME_SHIFT_MULTRES 0 +#endif #elif LJ_TARGET_X64 #if LJ_ABI_WIN #define CFRAME_OFS_PREV (13*8) diff --git a/src/lj_vm.h b/src/lj_vm.h index be35295d..d605b143 100644 --- a/src/lj_vm.h +++ b/src/lj_vm.h @@ -17,6 +17,10 @@ LJ_ASMF int lj_vm_cpcall(lua_State *L, lua_CFunction func, void *ud, LJ_ASMF int lj_vm_resume(lua_State *L, TValue *base, int nres1, ptrdiff_t ef); LJ_ASMF_NORET void LJ_FASTCALL lj_vm_unwind_c(void *cframe, int errcode); LJ_ASMF_NORET void LJ_FASTCALL lj_vm_unwind_ff(void *cframe); +#if LJ_ABI_WIN && LJ_TARGET_X86 +LJ_ASMF_NORET void LJ_FASTCALL lj_vm_rtlunwind(void *cframe, void *excptrec, + void *unwinder, int errcode); +#endif LJ_ASMF void lj_vm_unwind_c_eh(void); LJ_ASMF void lj_vm_unwind_ff_eh(void); #if LJ_TARGET_X86ORX64 diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc index f108c0b5..39ccaa2e 100644 --- a/src/vm_x86.dasc +++ b/src/vm_x86.dasc @@ -121,19 +121,68 @@ |//----------------------------------------------------------------------- |.if not X64 // x86 stack layout. | +|.if WIN +| +|.define CFRAME_SPACE, aword*9 // Delta for esp (see <--). +|.macro saveregs_ +| push edi; push esi; push ebx +| push extern lj_err_unwind_win +| fs; push dword [0] +| fs; mov [0], esp +| sub esp, CFRAME_SPACE +|.endmacro +|.macro restoreregs +| add esp, CFRAME_SPACE +| fs; pop dword [0] +| pop edi // Short for esp += 4. +| pop ebx; pop esi; pop edi; pop ebp +|.endmacro +| +|.else +| |.define CFRAME_SPACE, aword*7 // Delta for esp (see <--). |.macro saveregs_ | push edi; push esi; push ebx | sub esp, CFRAME_SPACE |.endmacro -|.macro saveregs -| push ebp; saveregs_ -|.endmacro |.macro restoreregs | add esp, CFRAME_SPACE | pop ebx; pop esi; pop edi; pop ebp |.endmacro | +|.endif +| +|.macro saveregs +| push ebp; saveregs_ +|.endmacro +| +|.if WIN +|.define SAVE_ERRF, aword [esp+aword*19] // vm_pcall/vm_cpcall only. +|.define SAVE_NRES, aword [esp+aword*18] +|.define SAVE_CFRAME, aword [esp+aword*17] +|.define SAVE_L, aword [esp+aword*16] +|//----- 16 byte aligned, ^^^ arguments from C caller +|.define SAVE_RET, aword [esp+aword*15] //<-- esp entering interpreter. +|.define SAVE_R4, aword [esp+aword*14] +|.define SAVE_R3, aword [esp+aword*13] +|.define SAVE_R2, aword [esp+aword*12] +|//----- 16 byte aligned +|.define SAVE_R1, aword [esp+aword*11] +|.define SEH_FUNC, aword [esp+aword*10] +|.define SEH_NEXT, aword [esp+aword*9] //<-- esp after register saves. +|.define UNUSED2, aword [esp+aword*8] +|//----- 16 byte aligned +|.define UNUSED1, aword [esp+aword*7] +|.define SAVE_PC, aword [esp+aword*6] +|.define TMP2, aword [esp+aword*5] +|.define TMP1, aword [esp+aword*4] +|//----- 16 byte aligned +|.define ARG4, aword [esp+aword*3] +|.define ARG3, aword [esp+aword*2] +|.define ARG2, aword [esp+aword*1] +|.define ARG1, aword [esp] //<-- esp while in interpreter. +|//----- 16 byte aligned, ^^^ arguments for C callee +|.else |.define SAVE_ERRF, aword [esp+aword*15] // vm_pcall/vm_cpcall only. |.define SAVE_NRES, aword [esp+aword*14] |.define SAVE_CFRAME, aword [esp+aword*13] @@ -154,6 +203,7 @@ |.define ARG2, aword [esp+aword*1] |.define ARG1, aword [esp] //<-- esp while in interpreter. |//----- 16 byte aligned, ^^^ arguments for C callee +|.endif | |// FPARGx overlaps ARGx and ARG(x+1) on x86. |.define FPARG3, qword [esp+qword*1] @@ -554,6 +604,10 @@ static void build_subroutines(BuildCtx *ctx) |.else | mov eax, FCARG2 // Error return status for vm_pcall. | mov esp, FCARG1 + |.if WIN + | lea FCARG1, SEH_NEXT + | fs; mov [0], FCARG1 + |.endif |.endif |->vm_unwind_c_eh: // Landing pad for external unwinder. | mov L:RB, SAVE_L @@ -577,6 +631,10 @@ static void build_subroutines(BuildCtx *ctx) |.else | and FCARG1, CFRAME_RAWMASK | mov esp, FCARG1 + |.if WIN + | lea FCARG1, SEH_NEXT + | fs; mov [0], FCARG1 + |.endif |.endif |->vm_unwind_ff_eh: // Landing pad for external unwinder. | mov L:RB, SAVE_L @@ -590,6 +648,19 @@ static void build_subroutines(BuildCtx *ctx) | set_vmstate INTERP | jmp ->vm_returnc // Increments RD/MULTRES and returns. | + |.if WIN and not X64 + |->vm_rtlunwind@16: // Thin layer around RtlUnwind. + | // (void *cframe, void *excptrec, void *unwinder, int errcode) + | mov [esp], FCARG1 // Return value for RtlUnwind. + | push FCARG2 // Exception record for RtlUnwind. + | push 0 // Ignored by RtlUnwind. + | push dword [FCARG1+CFRAME_OFS_SEH] + | call extern RtlUnwind@16 // Violates ABI (clobbers too much). + | mov FCARG1, eax + | mov FCARG2, [esp+4] // errcode (for vm_unwind_c). + | ret // Jump to unwinder. + |.endif + | |//----------------------------------------------------------------------- |//-- Grow stack for calls ----------------------------------------------- |//----------------------------------------------------------------------- From 573daa9c97ee83e371f232ba3623e592af1dfee3 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 19 May 2016 15:09:08 +0200 Subject: [PATCH 12/57] Fix dependencies. --- src/Makefile.dep | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/Makefile.dep b/src/Makefile.dep index 2c329f55..76b29108 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -215,19 +215,19 @@ ljamalg.o: ljamalg.c lua.h luaconf.h lauxlib.h lj_gc.c lj_obj.h lj_def.h \ lj_func.c lj_udata.c lj_meta.c lj_strscan.h lj_lib.h lj_debug.c \ lj_state.c lj_lex.h lj_alloc.h luajit.h lj_dispatch.c lj_ccallback.h \ lj_profile.h lj_vmevent.c lj_vmevent.h lj_vmmath.c lj_strscan.c \ - lj_strfmt.c lj_api.c lj_profile.c lj_lex.c lualib.h lj_parse.h \ - lj_parse.c lj_bcread.c lj_bcdump.h lj_bcwrite.c lj_load.c lj_ctype.c \ - lj_cdata.c lj_cconv.h lj_cconv.c lj_ccall.c lj_ccall.h lj_ccallback.c \ - lj_target.h lj_target_*.h lj_mcode.h lj_carith.c lj_carith.h lj_clib.c \ - lj_clib.h lj_cparse.c lj_cparse.h lj_lib.c lj_ir.c lj_ircall.h \ - lj_iropt.h lj_opt_mem.c lj_opt_fold.c lj_folddef.h lj_opt_narrow.c \ - lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_opt_split.c lj_opt_sink.c \ - lj_mcode.c lj_snap.c lj_record.c lj_record.h lj_ffrecord.h lj_crecord.c \ - lj_crecord.h lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h lj_emit_*.h \ - lj_asm_*.h lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c lib_aux.c \ - lib_base.c lj_libdef.h lib_math.c lib_string.c lib_table.c lib_io.c \ - lib_os.c lib_package.c lib_debug.c lib_bit.c lib_jit.c lib_ffi.c \ - lib_init.c + lj_strfmt.c lj_strfmt_num.c lj_api.c lj_profile.c lj_lex.c lualib.h \ + lj_parse.h lj_parse.c lj_bcread.c lj_bcdump.h lj_bcwrite.c lj_load.c \ + lj_ctype.c lj_cdata.c lj_cconv.h lj_cconv.c lj_ccall.c lj_ccall.h \ + lj_ccallback.c lj_target.h lj_target_*.h lj_mcode.h lj_carith.c \ + lj_carith.h lj_clib.c lj_clib.h lj_cparse.c lj_cparse.h lj_lib.c lj_ir.c \ + lj_ircall.h lj_iropt.h lj_opt_mem.c lj_opt_fold.c lj_folddef.h \ + lj_opt_narrow.c lj_opt_dce.c lj_opt_loop.c lj_snap.h lj_opt_split.c \ + lj_opt_sink.c lj_mcode.c lj_snap.c lj_record.c lj_record.h lj_ffrecord.h \ + lj_crecord.c lj_crecord.h lj_ffrecord.c lj_recdef.h lj_asm.c lj_asm.h \ + lj_emit_*.h lj_asm_*.h lj_trace.c lj_gdbjit.h lj_gdbjit.c lj_alloc.c \ + lib_aux.c lib_base.c lj_libdef.h lib_math.c lib_string.c lib_table.c \ + lib_io.c lib_os.c lib_package.c lib_debug.c lib_bit.c lib_jit.c \ + lib_ffi.c lib_init.c luajit.o: luajit.c lua.h luaconf.h lauxlib.h lualib.h luajit.h lj_arch.h host/buildvm.o: host/buildvm.c host/buildvm.h lj_def.h lua.h luaconf.h \ lj_arch.h lj_obj.h lj_def.h lj_arch.h lj_gc.h lj_obj.h lj_bc.h lj_ir.h \ From 5837c2a2fb1ba66510c9100a296966020f1610a3 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 20 May 2016 19:43:34 +0200 Subject: [PATCH 13/57] Remove assumption that lj_math_random_step() doesn't clobber FPRs. --- src/lj_ircall.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/lj_ircall.h b/src/lj_ircall.h index 916df462..ed1b8d68 100644 --- a/src/lj_ircall.h +++ b/src/lj_ircall.h @@ -86,12 +86,6 @@ typedef struct CCallInfo { #define IRCALLCOND_FFI32(x) NULL #endif -#if LJ_TARGET_X86 -#define CCI_RANDFPR 0 /* Clang on OSX/x86 is overzealous. */ -#else -#define CCI_RANDFPR CCI_NOFPRCLOBBER -#endif - #if LJ_SOFTFP #define ARG1_FP 2 /* Treat as 2 32 bit arguments. */ #else @@ -118,7 +112,7 @@ typedef struct CCallInfo { _(ANY, lj_gc_step_jit, 2, FS, NIL, CCI_L) \ _(ANY, lj_gc_barrieruv, 2, FS, NIL, 0) \ _(ANY, lj_mem_newgco, 2, FS, P32, CCI_L) \ - _(ANY, lj_math_random_step, 1, FS, NUM, CCI_CASTU64|CCI_RANDFPR)\ + _(ANY, lj_math_random_step, 1, FS, NUM, CCI_CASTU64) \ _(ANY, lj_vm_modi, 2, FN, INT, 0) \ _(ANY, sinh, ARG1_FP, N, NUM, 0) \ _(ANY, cosh, ARG1_FP, N, NUM, 0) \ From d4f3b1136b08ec65cf7e76691d63a7730d832ddc Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 20 May 2016 19:45:38 +0200 Subject: [PATCH 14/57] Workaround for MinGW headers lacking some exception definitions. --- src/lj_err.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lj_err.c b/src/lj_err.c index 1314c8db..600e6ee6 100644 --- a/src/lj_err.c +++ b/src/lj_err.c @@ -439,7 +439,7 @@ VOID RtlUnwindEx_FIXED(PVOID,PVOID,PVOID,PVOID,PVOID,PVOID) asm("RtlUnwindEx"); #define LJ_EXCODE_ERRCODE(cl) ((int)((cl) & 0xff)) /* Windows exception handler for interpreter frame. */ -LJ_FUNCA EXCEPTION_DISPOSITION lj_err_unwind_win(EXCEPTION_RECORD *rec, +LJ_FUNCA int lj_err_unwind_win(EXCEPTION_RECORD *rec, void *f, CONTEXT *ctx, UndocumentedDispatcherContext *dispatch) { #if LJ_TARGET_X64 @@ -464,7 +464,7 @@ LJ_FUNCA EXCEPTION_DISPOSITION lj_err_unwind_win(EXCEPTION_RECORD *rec, setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP)); } else if (!LJ_EXCODE_CHECK(rec->ExceptionCode)) { /* Don't catch access violations etc. */ - return ExceptionContinueSearch; + return 1; /* ExceptionContinueSearch */ } #if LJ_TARGET_X64 /* Unwind the stack and call all handlers for all lower C frames @@ -490,7 +490,7 @@ LJ_FUNCA EXCEPTION_DISPOSITION lj_err_unwind_win(EXCEPTION_RECORD *rec, #endif } } - return ExceptionContinueSearch; + return 1; /* ExceptionContinueSearch */ } /* Raise Windows exception. */ From 37e1e70313367d0264be9a2b9e563a8a94745303 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 20 May 2016 20:24:06 +0200 Subject: [PATCH 15/57] Add guard for obscure aliasing between open upvalues and SSA slots. Thanks to Peter Cawley. --- doc/status.html | 6 ------ src/lj_asm_arm.h | 1 - src/lj_asm_mips.h | 1 - src/lj_asm_ppc.h | 1 - src/lj_asm_x86.h | 1 - src/lj_record.c | 10 ++++++++-- 6 files changed, 8 insertions(+), 12 deletions(-) diff --git a/doc/status.html b/doc/status.html index c305f47a..b3524397 100644 --- a/doc/status.html +++ b/doc/status.html @@ -89,12 +89,6 @@ hooks for non-Lua functions) and shows slightly different behavior in LuaJIT (no per-coroutine hooks, no tail call counting).
  • -Some checks are missing in the JIT-compiled code for obscure situations -with open upvalues aliasing one of the SSA slots later on (or -vice versa). Bonus points, if you can find a real world test case for -this. -
  • -
  • Currently some out-of-memory errors from on-trace code are not handled correctly. The error may fall through an on-trace pcall or it may be passed on to the function set with diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h index ff688746..a722a11f 100644 --- a/src/lj_asm_arm.h +++ b/src/lj_asm_arm.h @@ -976,7 +976,6 @@ static void asm_newref(ASMState *as, IRIns *ir) static void asm_uref(ASMState *as, IRIns *ir) { - /* NYI: Check that UREFO is still open and not aliasing a slot. */ Reg dest = ra_dest(as, ir, RSET_GPR); if (irref_isk(ir->op1)) { GCfunc *fn = ir_kfunc(IR(ir->op1)); diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h index 4045fe80..66953aac 100644 --- a/src/lj_asm_mips.h +++ b/src/lj_asm_mips.h @@ -793,7 +793,6 @@ static void asm_newref(ASMState *as, IRIns *ir) static void asm_uref(ASMState *as, IRIns *ir) { - /* NYI: Check that UREFO is still open and not aliasing a slot. */ Reg dest = ra_dest(as, ir, RSET_GPR); if (irref_isk(ir->op1)) { GCfunc *fn = ir_kfunc(IR(ir->op1)); diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h index e8f3d08b..e1b0c980 100644 --- a/src/lj_asm_ppc.h +++ b/src/lj_asm_ppc.h @@ -789,7 +789,6 @@ static void asm_newref(ASMState *as, IRIns *ir) static void asm_uref(ASMState *as, IRIns *ir) { - /* NYI: Check that UREFO is still open and not aliasing a slot. */ Reg dest = ra_dest(as, ir, RSET_GPR); if (irref_isk(ir->op1)) { GCfunc *fn = ir_kfunc(IR(ir->op1)); diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 02918e23..db3e49f8 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -1215,7 +1215,6 @@ static void asm_newref(ASMState *as, IRIns *ir) static void asm_uref(ASMState *as, IRIns *ir) { - /* NYI: Check that UREFO is still open and not aliasing a slot. */ Reg dest = ra_dest(as, ir, RSET_GPR); if (irref_isk(ir->op1)) { GCfunc *fn = ir_kfunc(IR(ir->op1)); diff --git a/src/lj_record.c b/src/lj_record.c index ff7825ee..44b3667f 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -1343,13 +1343,17 @@ noconstify: /* Note: this effectively limits LJ_MAX_UPVAL to 127. */ uv = (uv << 8) | (hashrot(uvp->dhash, uvp->dhash + HASH_BIAS) & 0xff); if (!uvp->closed) { + uref = tref_ref(emitir(IRTG(IR_UREFO, IRT_P32), fn, uv)); /* In current stack? */ if (uvval(uvp) >= tvref(J->L->stack) && uvval(uvp) < tvref(J->L->maxstack)) { int32_t slot = (int32_t)(uvval(uvp) - (J->L->base - J->baseslot)); if (slot >= 0) { /* Aliases an SSA slot? */ + emitir(IRTG(IR_EQ, IRT_P32), + REF_BASE, + emitir(IRT(IR_ADD, IRT_P32), uref, + lj_ir_kint(J, (slot - 1) * -8))); slot -= (int32_t)J->baseslot; /* Note: slot number may be negative! */ - /* NYI: add IR to guard that it's still aliasing the same slot. */ if (val == 0) { return getslot(J, slot); } else { @@ -1359,7 +1363,9 @@ noconstify: } } } - uref = tref_ref(emitir(IRTG(IR_UREFO, IRT_P32), fn, uv)); + emitir(IRTG(IR_UGT, IRT_P32), + emitir(IRT(IR_SUB, IRT_P32), uref, REF_BASE), + lj_ir_kint(J, (J->baseslot + J->maxslot) * 8)); } else { needbarrier = 1; uref = tref_ref(emitir(IRTG(IR_UREFC, IRT_P32), fn, uv)); From 1931b38da5a9ea075df73a966630308d3988bb96 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 20 May 2016 22:41:42 +0200 Subject: [PATCH 16/57] LJ_GC64: Introduce IRT_PGC. Contributed by Peter Cawley. --- src/lj_asm.c | 2 +- src/lj_ffrecord.c | 38 ++++++++++++++++---------------- src/lj_ir.h | 3 ++- src/lj_ircall.h | 40 ++++++++++++++++----------------- src/lj_opt_fold.c | 2 +- src/lj_record.c | 56 +++++++++++++++++++++++------------------------ 6 files changed, 71 insertions(+), 70 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index 94d7bfc4..9cddd0c9 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -1055,7 +1055,7 @@ static void asm_bufhdr(ASMState *as, IRIns *ir) } } else { Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, sb)); - /* Passing ir isn't strictly correct, but it's an IRT_P32, too. */ + /* Passing ir isn't strictly correct, but it's an IRT_PGC, too. */ emit_storeofs(as, ir, tmp, sb, offsetof(SBuf, p)); emit_loadofs(as, ir, tmp, sb, offsetof(SBuf, b)); } diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index 942ecdb2..56497778 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -173,7 +173,7 @@ static void LJ_FASTCALL recff_nyi(jit_State *J, RecordFFData *rd) /* Emit BUFHDR for the global temporary buffer. */ static TRef recff_bufhdr(jit_State *J) { - return emitir(IRT(IR_BUFHDR, IRT_P32), + return emitir(IRT(IR_BUFHDR, IRT_PGC), lj_ir_kptr(J, &J2G(J)->tmpbuf), IRBUFHDR_RESET); } @@ -223,7 +223,7 @@ static void LJ_FASTCALL recff_setmetatable(jit_State *J, RecordFFData *rd) ix.tab = tr; copyTV(J->L, &ix.tabv, &rd->argv[0]); lj_record_mm_lookup(J, &ix, MM_metatable); /* Guard for no __metatable. */ - fref = emitir(IRT(IR_FREF, IRT_P32), tr, IRFL_TAB_META); + fref = emitir(IRT(IR_FREF, IRT_PGC), tr, IRFL_TAB_META); mtref = tref_isnil(mt) ? lj_ir_knull(J, IRT_TAB) : mt; emitir(IRT(IR_FSTORE, IRT_TAB), fref, mtref); if (!tref_isnil(mt)) @@ -289,7 +289,7 @@ int32_t lj_ffrecord_select_mode(jit_State *J, TRef tr, TValue *tv) if (strV(tv)->len == 1) { emitir(IRTG(IR_EQ, IRT_STR), tr, lj_ir_kstr(J, strV(tv))); } else { - TRef trptr = emitir(IRT(IR_STRREF, IRT_P32), tr, lj_ir_kint(J, 0)); + TRef trptr = emitir(IRT(IR_STRREF, IRT_PGC), tr, lj_ir_kint(J, 0)); TRef trchar = emitir(IRT(IR_XLOAD, IRT_U8), trptr, IRXLOAD_READONLY); emitir(IRTG(IR_EQ, IRT_INT), trchar, lj_ir_kint(J, '#')); } @@ -814,7 +814,7 @@ static void LJ_FASTCALL recff_string_range(jit_State *J, RecordFFData *rd) /* Also handle empty range here, to avoid extra traces. */ TRef trptr, trslen = emitir(IRTI(IR_SUB), trend, trstart); emitir(IRTGI(IR_GE), trslen, tr0); - trptr = emitir(IRT(IR_STRREF, IRT_P32), trstr, trstart); + trptr = emitir(IRT(IR_STRREF, IRT_PGC), trstr, trstart); J->base[0] = emitir(IRT(IR_SNEW, IRT_STR), trptr, trslen); } else { /* Range underflow: return empty string. */ emitir(IRTGI(IR_LT), trend, trstart); @@ -830,7 +830,7 @@ static void LJ_FASTCALL recff_string_range(jit_State *J, RecordFFData *rd) rd->nres = len; for (i = 0; i < len; i++) { TRef tmp = emitir(IRTI(IR_ADD), trstart, lj_ir_kint(J, (int32_t)i)); - tmp = emitir(IRT(IR_STRREF, IRT_P32), trstr, tmp); + tmp = emitir(IRT(IR_STRREF, IRT_PGC), trstr, tmp); J->base[i] = emitir(IRT(IR_XLOAD, IRT_U8), tmp, IRXLOAD_READONLY); } } else { /* Empty range or range underflow: return no results. */ @@ -852,7 +852,7 @@ static void LJ_FASTCALL recff_string_char(jit_State *J, RecordFFData *rd) if (i > 1) { /* Concatenate the strings, if there's more than one. */ TRef hdr = recff_bufhdr(J), tr = hdr; for (i = 0; J->base[i] != 0; i++) - tr = emitir(IRT(IR_BUFPUT, IRT_P32), tr, J->base[i]); + tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr, J->base[i]); J->base[0] = emitir(IRT(IR_BUFSTR, IRT_STR), tr, hdr); } UNUSED(rd); @@ -869,14 +869,14 @@ static void LJ_FASTCALL recff_string_rep(jit_State *J, RecordFFData *rd) emitir(IRTGI(vrep > 1 ? IR_GT : IR_LE), rep, lj_ir_kint(J, 1)); if (vrep > 1) { TRef hdr2 = recff_bufhdr(J); - TRef tr2 = emitir(IRT(IR_BUFPUT, IRT_P32), hdr2, sep); - tr2 = emitir(IRT(IR_BUFPUT, IRT_P32), tr2, str); + TRef tr2 = emitir(IRT(IR_BUFPUT, IRT_PGC), hdr2, sep); + tr2 = emitir(IRT(IR_BUFPUT, IRT_PGC), tr2, str); str2 = emitir(IRT(IR_BUFSTR, IRT_STR), tr2, hdr2); } } tr = hdr = recff_bufhdr(J); if (str2) { - tr = emitir(IRT(IR_BUFPUT, IRT_P32), tr, str); + tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr, str); str = str2; rep = emitir(IRTI(IR_ADD), rep, lj_ir_kint(J, -1)); } @@ -927,8 +927,8 @@ static void LJ_FASTCALL recff_string_find(jit_State *J, RecordFFData *rd) if ((J->base[2] && tref_istruecond(J->base[3])) || (emitir(IRTG(IR_EQ, IRT_STR), trpat, lj_ir_kstr(J, pat)), !lj_str_haspattern(pat))) { /* Search for fixed string. */ - TRef trsptr = emitir(IRT(IR_STRREF, IRT_P32), trstr, trstart); - TRef trpptr = emitir(IRT(IR_STRREF, IRT_P32), trpat, tr0); + TRef trsptr = emitir(IRT(IR_STRREF, IRT_PGC), trstr, trstart); + TRef trpptr = emitir(IRT(IR_STRREF, IRT_PGC), trpat, tr0); TRef trslen = emitir(IRTI(IR_SUB), trlen, trstart); TRef trplen = emitir(IRTI(IR_FLOAD), trpat, IRFL_STR_LEN); TRef tr = lj_ir_call(J, IRCALL_lj_str_find, trsptr, trpptr, trslen, trplen); @@ -936,13 +936,13 @@ static void LJ_FASTCALL recff_string_find(jit_State *J, RecordFFData *rd) if (lj_str_find(strdata(str)+(MSize)start, strdata(pat), str->len-(MSize)start, pat->len)) { TRef pos; - emitir(IRTG(IR_NE, IRT_P32), tr, trp0); - pos = emitir(IRTI(IR_SUB), tr, emitir(IRT(IR_STRREF, IRT_P32), trstr, tr0)); + emitir(IRTG(IR_NE, IRT_PGC), tr, trp0); + pos = emitir(IRTI(IR_SUB), tr, emitir(IRT(IR_STRREF, IRT_PGC), trstr, tr0)); J->base[0] = emitir(IRTI(IR_ADD), pos, lj_ir_kint(J, 1)); J->base[1] = emitir(IRTI(IR_ADD), pos, trplen); rd->nres = 2; } else { - emitir(IRTG(IR_EQ, IRT_P32), tr, trp0); + emitir(IRTG(IR_EQ, IRT_PGC), tr, trp0); J->base[0] = TREF_NIL; } } else { /* Search for pattern. */ @@ -969,7 +969,7 @@ static void LJ_FASTCALL recff_string_format(jit_State *J, RecordFFData *rd) IRCallID id; switch (STRFMT_TYPE(sf)) { case STRFMT_LIT: - tr = emitir(IRT(IR_BUFPUT, IRT_P32), tr, + tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr, lj_ir_kstr(J, lj_str_new(J->L, fs.str, fs.len))); break; case STRFMT_INT: @@ -978,7 +978,7 @@ static void LJ_FASTCALL recff_string_format(jit_State *J, RecordFFData *rd) if (!tref_isinteger(tra)) goto handle_num; if (sf == STRFMT_INT) { /* Shortcut for plain %d. */ - tr = emitir(IRT(IR_BUFPUT, IRT_P32), tr, + tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr, emitir(IRT(IR_TOSTR, IRT_STR), tra, IRTOSTR_INT)); } else { #if LJ_HASFFI @@ -1008,7 +1008,7 @@ static void LJ_FASTCALL recff_string_format(jit_State *J, RecordFFData *rd) return; } if (sf == STRFMT_STR) /* Shortcut for plain %s. */ - tr = emitir(IRT(IR_BUFPUT, IRT_P32), tr, tra); + tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr, tra); else if ((sf & STRFMT_T_QUOTED)) tr = lj_ir_call(J, IRCALL_lj_strfmt_putquoted, tr, tra); else @@ -1017,7 +1017,7 @@ static void LJ_FASTCALL recff_string_format(jit_State *J, RecordFFData *rd) case STRFMT_CHAR: tra = lj_opt_narrow_toint(J, tra); if (sf == STRFMT_CHAR) /* Shortcut for plain %c. */ - tr = emitir(IRT(IR_BUFPUT, IRT_P32), tr, + tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr, emitir(IRT(IR_TOSTR, IRT_STR), tra, IRTOSTR_CHAR)); else tr = lj_ir_call(J, IRCALL_lj_strfmt_putfchar, tr, trsf, tra); @@ -1125,7 +1125,7 @@ static void LJ_FASTCALL recff_io_write(jit_State *J, RecordFFData *rd) ptrdiff_t i = rd->data == 0 ? 1 : 0; for (; J->base[i]; i++) { TRef str = lj_ir_tostr(J, J->base[i]); - TRef buf = emitir(IRT(IR_STRREF, IRT_P32), str, zero); + TRef buf = emitir(IRT(IR_STRREF, IRT_PGC), str, zero); TRef len = emitir(IRTI(IR_FLOAD), str, IRFL_STR_LEN); if (tref_isk(len) && IR(tref_ref(len))->i == 1) { IRIns *irs = IR(tref_ref(str)); diff --git a/src/lj_ir.h b/src/lj_ir.h index cd8df59d..8a655b64 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -318,9 +318,10 @@ IRTDEF(IRTENUM) /* Native pointer type and the corresponding integer type. */ IRT_PTR = LJ_64 ? IRT_P64 : IRT_P32, + IRT_PGC = LJ_GC64 ? IRT_P64 : IRT_P32, + IRT_IGC = LJ_GC64 ? IRT_I64 : IRT_INT, IRT_INTP = LJ_64 ? IRT_I64 : IRT_INT, IRT_UINTP = LJ_64 ? IRT_U64 : IRT_U32, - /* TODO_GC64: major changes required for all uses of IRT_P32. */ /* Additional flags. */ IRT_MARK = 0x20, /* Marker for misc. purposes. */ diff --git a/src/lj_ircall.h b/src/lj_ircall.h index aa625915..f2a6ecec 100644 --- a/src/lj_ircall.h +++ b/src/lj_ircall.h @@ -123,39 +123,39 @@ typedef struct CCallInfo { /* Function definitions for CALL* instructions. */ #define IRCALLDEF(_) \ _(ANY, lj_str_cmp, 2, FN, INT, CCI_NOFPRCLOBBER) \ - _(ANY, lj_str_find, 4, N, P32, 0) \ + _(ANY, lj_str_find, 4, N, PGC, 0) \ _(ANY, lj_str_new, 3, S, STR, CCI_L) \ _(ANY, lj_strscan_num, 2, FN, INT, 0) \ _(ANY, lj_strfmt_int, 2, FN, STR, CCI_L) \ _(ANY, lj_strfmt_num, 2, FN, STR, CCI_L) \ _(ANY, lj_strfmt_char, 2, FN, STR, CCI_L) \ - _(ANY, lj_strfmt_putint, 2, FL, P32, 0) \ - _(ANY, lj_strfmt_putnum, 2, FL, P32, 0) \ - _(ANY, lj_strfmt_putquoted, 2, FL, P32, 0) \ - _(ANY, lj_strfmt_putfxint, 3, L, P32, XA_64) \ - _(ANY, lj_strfmt_putfnum_int, 3, L, P32, XA_FP) \ - _(ANY, lj_strfmt_putfnum_uint, 3, L, P32, XA_FP) \ - _(ANY, lj_strfmt_putfnum, 3, L, P32, XA_FP) \ - _(ANY, lj_strfmt_putfstr, 3, L, P32, 0) \ - _(ANY, lj_strfmt_putfchar, 3, L, P32, 0) \ - _(ANY, lj_buf_putmem, 3, S, P32, 0) \ - _(ANY, lj_buf_putstr, 2, FL, P32, 0) \ - _(ANY, lj_buf_putchar, 2, FL, P32, 0) \ - _(ANY, lj_buf_putstr_reverse, 2, FL, P32, 0) \ - _(ANY, lj_buf_putstr_lower, 2, FL, P32, 0) \ - _(ANY, lj_buf_putstr_upper, 2, FL, P32, 0) \ - _(ANY, lj_buf_putstr_rep, 3, L, P32, 0) \ - _(ANY, lj_buf_puttab, 5, L, P32, 0) \ + _(ANY, lj_strfmt_putint, 2, FL, PGC, 0) \ + _(ANY, lj_strfmt_putnum, 2, FL, PGC, 0) \ + _(ANY, lj_strfmt_putquoted, 2, FL, PGC, 0) \ + _(ANY, lj_strfmt_putfxint, 3, L, PGC, XA_64) \ + _(ANY, lj_strfmt_putfnum_int, 3, L, PGC, XA_FP) \ + _(ANY, lj_strfmt_putfnum_uint, 3, L, PGC, XA_FP) \ + _(ANY, lj_strfmt_putfnum, 3, L, PGC, XA_FP) \ + _(ANY, lj_strfmt_putfstr, 3, L, PGC, 0) \ + _(ANY, lj_strfmt_putfchar, 3, L, PGC, 0) \ + _(ANY, lj_buf_putmem, 3, S, PGC, 0) \ + _(ANY, lj_buf_putstr, 2, FL, PGC, 0) \ + _(ANY, lj_buf_putchar, 2, FL, PGC, 0) \ + _(ANY, lj_buf_putstr_reverse, 2, FL, PGC, 0) \ + _(ANY, lj_buf_putstr_lower, 2, FL, PGC, 0) \ + _(ANY, lj_buf_putstr_upper, 2, FL, PGC, 0) \ + _(ANY, lj_buf_putstr_rep, 3, L, PGC, 0) \ + _(ANY, lj_buf_puttab, 5, L, PGC, 0) \ _(ANY, lj_buf_tostr, 1, FL, STR, 0) \ _(ANY, lj_tab_new_ah, 3, A, TAB, CCI_L) \ _(ANY, lj_tab_new1, 2, FS, TAB, CCI_L) \ _(ANY, lj_tab_dup, 2, FS, TAB, CCI_L) \ _(ANY, lj_tab_clear, 1, FS, NIL, 0) \ - _(ANY, lj_tab_newkey, 3, S, P32, CCI_L) \ + _(ANY, lj_tab_newkey, 3, S, PGC, CCI_L) \ _(ANY, lj_tab_len, 1, FL, INT, 0) \ _(ANY, lj_gc_step_jit, 2, FS, NIL, CCI_L) \ _(ANY, lj_gc_barrieruv, 2, FS, NIL, 0) \ - _(ANY, lj_mem_newgco, 2, FS, P32, CCI_L) \ + _(ANY, lj_mem_newgco, 2, FS, PGC, CCI_L) \ _(ANY, lj_math_random_step, 1, FS, NUM, CCI_CASTU64) \ _(ANY, lj_vm_modi, 2, FN, INT, 0) \ _(ANY, sinh, 1, N, NUM, XA_FP) \ diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index e1d13691..95b58a1a 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -502,7 +502,7 @@ LJFOLDF(kfold_strref_snew) PHIBARRIER(ir); fins->op2 = emitir(IRTI(IR_ADD), ir->op2, fins->op2); /* Clobbers fins! */ fins->op1 = str; - fins->ot = IRT(IR_STRREF, IRT_P32); + fins->ot = IRT(IR_STRREF, IRT_PGC); return RETRYFOLD; } } diff --git a/src/lj_record.c b/src/lj_record.c index 568b73aa..f7c53567 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -643,8 +643,8 @@ static TRef rec_call_specialize(jit_State *J, GCfunc *fn, TRef tr) GCproto *pt = funcproto(fn); /* Too many closures created? Probably not a monomorphic function. */ if (pt->flags >= PROTO_CLC_POLY) { /* Specialize to prototype instead. */ - TRef trpt = emitir(IRT(IR_FLOAD, IRT_P32), tr, IRFL_FUNC_PC); - emitir(IRTG(IR_EQ, IRT_P32), trpt, lj_ir_kptr(J, proto_bc(pt))); + TRef trpt = emitir(IRT(IR_FLOAD, IRT_PGC), tr, IRFL_FUNC_PC); + emitir(IRTG(IR_EQ, IRT_PGC), trpt, lj_ir_kptr(J, proto_bc(pt))); (void)lj_ir_kgc(J, obj2gco(pt), IRT_PROTO); /* Prevent GC of proto. */ return tr; } @@ -905,7 +905,7 @@ int lj_record_mm_lookup(jit_State *J, RecordIndex *ix, MMS mm) cTValue *mo; if (LJ_HASFFI && udtype == UDTYPE_FFI_CLIB) { /* Specialize to the C library namespace object. */ - emitir(IRTG(IR_EQ, IRT_P32), ix->tab, lj_ir_kptr(J, udataV(&ix->tabv))); + emitir(IRTG(IR_EQ, IRT_PGC), ix->tab, lj_ir_kptr(J, udataV(&ix->tabv))); } else { /* Specialize to the type of userdata. */ TRef tr = emitir(IRT(IR_FLOAD, IRT_U8), ix->tab, IRFL_UDATA_UDTYPE); @@ -1252,8 +1252,8 @@ static TRef rec_idx_key(jit_State *J, RecordIndex *ix, IRRef *rbref, if ((MSize)k < t->asize) { /* Currently an array key? */ TRef arrayref; rec_idx_abc(J, asizeref, ikey, t->asize); - arrayref = emitir(IRT(IR_FLOAD, IRT_P32), ix->tab, IRFL_TAB_ARRAY); - return emitir(IRT(IR_AREF, IRT_P32), arrayref, ikey); + arrayref = emitir(IRT(IR_FLOAD, IRT_PGC), ix->tab, IRFL_TAB_ARRAY); + return emitir(IRT(IR_AREF, IRT_PGC), arrayref, ikey); } else { /* Currently not in array (may be an array extension)? */ emitir(IRTGI(IR_ULE), asizeref, ikey); /* Inv. bounds check. */ if (k == 0 && tref_isk(key)) @@ -1293,13 +1293,13 @@ static TRef rec_idx_key(jit_State *J, RecordIndex *ix, IRRef *rbref, *rbguard = J->guardemit; hm = emitir(IRTI(IR_FLOAD), ix->tab, IRFL_TAB_HMASK); emitir(IRTGI(IR_EQ), hm, lj_ir_kint(J, (int32_t)t->hmask)); - node = emitir(IRT(IR_FLOAD, IRT_P32), ix->tab, IRFL_TAB_NODE); + node = emitir(IRT(IR_FLOAD, IRT_PGC), ix->tab, IRFL_TAB_NODE); kslot = lj_ir_kslot(J, key, hslot / sizeof(Node)); - return emitir(IRTG(IR_HREFK, IRT_P32), node, kslot); + return emitir(IRTG(IR_HREFK, IRT_PGC), node, kslot); } } /* Fall back to a regular hash lookup. */ - return emitir(IRT(IR_HREF, IRT_P32), ix->tab, key); + return emitir(IRT(IR_HREF, IRT_PGC), ix->tab, key); } /* Determine whether a key is NOT one of the fast metamethod names. */ @@ -1382,7 +1382,7 @@ TRef lj_record_idx(jit_State *J, RecordIndex *ix) IRType t = itype2irt(oldv); TRef res; if (oldv == niltvg(J2G(J))) { - emitir(IRTG(IR_EQ, IRT_P32), xref, lj_ir_kkptr(J, niltvg(J2G(J)))); + emitir(IRTG(IR_EQ, IRT_PGC), xref, lj_ir_kkptr(J, niltvg(J2G(J)))); res = TREF_NIL; } else { res = emitir(IRTG(loadop, t), xref, 0); @@ -1412,7 +1412,7 @@ TRef lj_record_idx(jit_State *J, RecordIndex *ix) if (hasmm) emitir(IRTG(loadop, IRT_NIL), xref, 0); /* Guard for nil value. */ else if (xrefop == IR_HREF) - emitir(IRTG(oldv == niltvg(J2G(J)) ? IR_EQ : IR_NE, IRT_P32), + emitir(IRTG(oldv == niltvg(J2G(J)) ? IR_EQ : IR_NE, IRT_PGC), xref, lj_ir_kkptr(J, niltvg(J2G(J)))); if (ix->idxchain && lj_record_mm_lookup(J, ix, MM_newindex)) { lua_assert(hasmm); @@ -1423,7 +1423,7 @@ TRef lj_record_idx(jit_State *J, RecordIndex *ix) TRef key = ix->key; if (tref_isinteger(key)) /* NEWREF needs a TValue as a key. */ key = emitir(IRTN(IR_CONV), key, IRCONV_NUM_INT); - xref = emitir(IRT(IR_NEWREF, IRT_P32), ix->tab, key); + xref = emitir(IRT(IR_NEWREF, IRT_PGC), ix->tab, key); keybarrier = 0; /* NEWREF already takes care of the key barrier. */ #ifdef LUAJIT_ENABLE_TABLE_BUMP if ((J->flags & JIT_F_OPT_SINK)) /* Avoid a separate flag. */ @@ -1433,7 +1433,7 @@ TRef lj_record_idx(jit_State *J, RecordIndex *ix) } else if (!lj_opt_fwd_wasnonnil(J, loadop, tref_ref(xref))) { /* Cannot derive that the previous value was non-nil, must do checks. */ if (xrefop == IR_HREF) /* Guard against store to niltv. */ - emitir(IRTG(IR_NE, IRT_P32), xref, lj_ir_kkptr(J, niltvg(J2G(J)))); + emitir(IRTG(IR_NE, IRT_PGC), xref, lj_ir_kkptr(J, niltvg(J2G(J)))); if (ix->idxchain) { /* Metamethod lookup required? */ /* A check for NULL metatable is cheaper (hoistable) than a load. */ if (!mt) { @@ -1455,7 +1455,7 @@ TRef lj_record_idx(jit_State *J, RecordIndex *ix) emitir(IRT(IR_TBAR, IRT_NIL), ix->tab, 0); /* Invalidate neg. metamethod cache for stores with certain string keys. */ if (!nommstr(J, ix->key)) { - TRef fref = emitir(IRT(IR_FREF, IRT_P32), ix->tab, IRFL_TAB_NOMM); + TRef fref = emitir(IRT(IR_FREF, IRT_PGC), ix->tab, IRFL_TAB_NOMM); emitir(IRT(IR_FSTORE, IRT_U8), fref, lj_ir_kint(J, 0)); } J->needsnap = 1; @@ -1541,15 +1541,15 @@ noconstify: /* Note: this effectively limits LJ_MAX_UPVAL to 127. */ uv = (uv << 8) | (hashrot(uvp->dhash, uvp->dhash + HASH_BIAS) & 0xff); if (!uvp->closed) { - uref = tref_ref(emitir(IRTG(IR_UREFO, IRT_P32), fn, uv)); + uref = tref_ref(emitir(IRTG(IR_UREFO, IRT_PGC), fn, uv)); /* In current stack? */ if (uvval(uvp) >= tvref(J->L->stack) && uvval(uvp) < tvref(J->L->maxstack)) { int32_t slot = (int32_t)(uvval(uvp) - (J->L->base - J->baseslot)); if (slot >= 0) { /* Aliases an SSA slot? */ - emitir(IRTG(IR_EQ, IRT_P32), + emitir(IRTG(IR_EQ, IRT_PGC), REF_BASE, - emitir(IRT(IR_ADD, IRT_P32), uref, + emitir(IRT(IR_ADD, IRT_PGC), uref, lj_ir_kint(J, (slot - 1 - LJ_FR2) * -8))); slot -= (int32_t)J->baseslot; /* Note: slot number may be negative! */ if (val == 0) { @@ -1561,12 +1561,12 @@ noconstify: } } } - emitir(IRTG(IR_UGT, IRT_P32), - emitir(IRT(IR_SUB, IRT_P32), uref, REF_BASE), + emitir(IRTG(IR_UGT, IRT_PGC), + emitir(IRT(IR_SUB, IRT_PGC), uref, REF_BASE), lj_ir_kint(J, (J->baseslot + J->maxslot) * 8)); } else { needbarrier = 1; - uref = tref_ref(emitir(IRTG(IR_UREFC, IRT_P32), fn, uv)); + uref = tref_ref(emitir(IRTG(IR_UREFC, IRT_PGC), fn, uv)); } if (val == 0) { /* Upvalue load */ IRType t = itype2irt(uvval(uvp)); @@ -1733,11 +1733,11 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults) else emitir(IRTGI(IR_EQ), fr, lj_ir_kint(J, (int32_t)frame_ftsz(J->L->base-1))); - vbase = emitir(IRTI(IR_SUB), REF_BASE, fr); - vbase = emitir(IRT(IR_ADD, IRT_P32), vbase, lj_ir_kint(J, frofs-8)); + vbase = emitir(IRT(IR_SUB, IRT_IGC), REF_BASE, fr); + vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase, lj_ir_kint(J, frofs-8)); for (i = 0; i < nload; i++) { IRType t = itype2irt(&J->L->base[i-1-nvararg]); - TRef aref = emitir(IRT(IR_AREF, IRT_P32), + TRef aref = emitir(IRT(IR_AREF, IRT_PGC), vbase, lj_ir_kint(J, (int32_t)i)); TRef tr = emitir(IRTG(IR_VLOAD, t), aref, 0); if (irtype_ispri(t)) tr = TREF_PRI(t); /* Canonicalize primitives. */ @@ -1783,10 +1783,10 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults) } if (idx != 0 && idx <= nvararg) { IRType t; - TRef aref, vbase = emitir(IRTI(IR_SUB), REF_BASE, fr); - vbase = emitir(IRT(IR_ADD, IRT_P32), vbase, lj_ir_kint(J, frofs-8)); + TRef aref, vbase = emitir(IRT(IR_SUB, IRT_IGC), REF_BASE, fr); + vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase, lj_ir_kint(J, frofs-8)); t = itype2irt(&J->L->base[idx-2-nvararg]); - aref = emitir(IRT(IR_AREF, IRT_P32), vbase, tridx); + aref = emitir(IRT(IR_AREF, IRT_PGC), vbase, tridx); tr = emitir(IRTG(IR_VLOAD, t), aref, 0); if (irtype_ispri(t)) tr = TREF_PRI(t); /* Canonicalize primitives. */ } @@ -1840,10 +1840,10 @@ static TRef rec_cat(jit_State *J, BCReg baseslot, BCReg topslot) break; } xbase = ++trp; - tr = hdr = emitir(IRT(IR_BUFHDR, IRT_P32), + tr = hdr = emitir(IRT(IR_BUFHDR, IRT_PGC), lj_ir_kptr(J, &J2G(J)->tmpbuf), IRBUFHDR_RESET); do { - tr = emitir(IRT(IR_BUFPUT, IRT_P32), tr, *trp++); + tr = emitir(IRT(IR_BUFPUT, IRT_PGC), tr, *trp++); } while (trp <= top); tr = emitir(IRT(IR_BUFSTR, IRT_STR), tr, hdr); J->maxslot = (BCReg)(xbase - J->base); @@ -2481,7 +2481,7 @@ void lj_record_setup(jit_State *J) J->bc_extent = ~(MSize)0; /* Emit instructions for fixed references. Also triggers initial IR alloc. */ - emitir_raw(IRT(IR_BASE, IRT_P32), J->parent, J->exitno); + emitir_raw(IRT(IR_BASE, IRT_PGC), J->parent, J->exitno); for (i = 0; i <= 2; i++) { IRIns *ir = IR(REF_NIL-i); ir->i = 0; From cfa188f1349ba4c843394b53f270cb64635b9805 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 21 May 2016 00:02:45 +0200 Subject: [PATCH 17/57] Move common 32/64 bit in-memory FP constants to jit_State. Prerequisite for immovable IR. Contributed by Peter Cawley. --- src/lj_asm_mips.h | 12 ++++-------- src/lj_asm_ppc.h | 14 +++++--------- src/lj_asm_x86.h | 16 ++++++---------- src/lj_ir.c | 10 +++++----- src/lj_jit.h | 35 ++++++++++++++++++++++++++++++++++- src/lj_trace.c | 24 +++++++++++++++++++++++- 6 files changed, 77 insertions(+), 34 deletions(-) diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h index ecb38c5d..d37bc132 100644 --- a/src/lj_asm_mips.h +++ b/src/lj_asm_mips.h @@ -459,12 +459,10 @@ static void asm_conv(ASMState *as, IRIns *ir) dest, dest); if (irt_isfloat(ir->t)) emit_lsptr(as, MIPSI_LWC1, (tmp & 31), - (void *)lj_ir_k64_find(as->J, U64x(4f000000,4f000000)), - RSET_GPR); + (void *)&as->J->k32[LJ_K32_2P31], RSET_GPR); else emit_lsptr(as, MIPSI_LDC1, (tmp & 31), - (void *)lj_ir_k64_find(as->J, U64x(41e00000,00000000)), - RSET_GPR); + (void *)&as->J->k64[LJ_K64_2P31], RSET_GPR); emit_tg(as, MIPSI_MTC1, RID_TMP, dest); emit_dst(as, MIPSI_XOR, RID_TMP, RID_TMP, left); emit_ti(as, MIPSI_LUI, RID_TMP, 0x8000); @@ -494,12 +492,10 @@ static void asm_conv(ASMState *as, IRIns *ir) tmp, left, tmp); if (st == IRT_FLOAT) emit_lsptr(as, MIPSI_LWC1, (tmp & 31), - (void *)lj_ir_k64_find(as->J, U64x(4f000000,4f000000)), - RSET_GPR); + (void *)&as->J->k32[LJ_K32_2P31], RSET_GPR); else emit_lsptr(as, MIPSI_LDC1, (tmp & 31), - (void *)lj_ir_k64_find(as->J, U64x(41e00000,00000000)), - RSET_GPR); + (void *)&as->J->k64[LJ_K64_2P31], RSET_GPR); } else { emit_tg(as, MIPSI_MFC1, dest, tmp); emit_fg(as, st == IRT_FLOAT ? MIPSI_TRUNC_W_S : MIPSI_TRUNC_W_D, diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h index 4cf1649a..e270b36c 100644 --- a/src/lj_asm_ppc.h +++ b/src/lj_asm_ppc.h @@ -393,8 +393,7 @@ static void asm_tointg(ASMState *as, IRIns *ir, Reg left) emit_asi(as, PPCI_XORIS, RID_TMP, dest, 0x8000); emit_tai(as, PPCI_LWZ, dest, RID_SP, SPOFS_TMPLO); emit_lsptr(as, PPCI_LFS, (fbias & 31), - (void *)lj_ir_k64_find(as->J, U64x(59800004,59800000)), - RSET_GPR); + (void *)&as->J->k32[LJ_K32_2P52_2P31], RSET_GPR); emit_fai(as, PPCI_STFD, tmp, RID_SP, SPOFS_TMP); emit_fb(as, PPCI_FCTIWZ, tmp, left); } @@ -433,13 +432,11 @@ static void asm_conv(ASMState *as, IRIns *ir) Reg left = ra_alloc1(as, lref, allow); Reg hibias = ra_allock(as, 0x43300000, rset_clear(allow, left)); Reg fbias = ra_scratch(as, rset_exclude(RSET_FPR, dest)); - const float *kbias; if (irt_isfloat(ir->t)) emit_fb(as, PPCI_FRSP, dest, dest); emit_fab(as, PPCI_FSUB, dest, dest, fbias); emit_fai(as, PPCI_LFD, dest, RID_SP, SPOFS_TMP); - kbias = (const float *)lj_ir_k64_find(as->J, U64x(59800004,59800000)); - if (st == IRT_U32) kbias++; - emit_lsptr(as, PPCI_LFS, (fbias & 31), (void *)kbias, + emit_lsptr(as, PPCI_LFS, (fbias & 31), + &as->J->k32[st == IRT_U32 ? LJ_K32_2P52 : LJ_K32_2P52_2P31], rset_clear(allow, hibias)); emit_tai(as, PPCI_STW, st == IRT_U32 ? left : RID_TMP, RID_SP, SPOFS_TMPLO); @@ -472,8 +469,7 @@ static void asm_conv(ASMState *as, IRIns *ir) emit_fb(as, PPCI_FCTIWZ, tmp, tmp); emit_fab(as, PPCI_FSUB, tmp, left, tmp); emit_lsptr(as, PPCI_LFS, (tmp & 31), - (void *)lj_ir_k64_find(as->J, U64x(4f000000,00000000)), - RSET_GPR); + (void *)&as->J->k32[LJ_K32_2P31], RSET_GPR); } else { emit_tai(as, PPCI_LWZ, dest, RID_SP, SPOFS_TMPLO); emit_fai(as, PPCI_STFD, tmp, RID_SP, SPOFS_TMP); @@ -974,7 +970,7 @@ static void asm_sload(ASMState *as, IRIns *ir) emit_fab(as, PPCI_FSUB, dest, dest, fbias); emit_fai(as, PPCI_LFD, dest, RID_SP, SPOFS_TMP); emit_lsptr(as, PPCI_LFS, (fbias & 31), - (void *)lj_ir_k64_find(as->J, U64x(59800004,59800000)), + (void *)&as->J->k32[LJ_K32_2P52_2P31], rset_clear(allow, hibias)); emit_tai(as, PPCI_STW, tmp, RID_SP, SPOFS_TMPLO); emit_tai(as, PPCI_STW, hibias, RID_SP, SPOFS_TMPHI); diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 6cd3800d..e3ed7554 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -696,7 +696,7 @@ static void asm_conv(ASMState *as, IRIns *ir) if (left == dest) return; /* Avoid the XO_XORPS. */ } else if (LJ_32 && st == IRT_U32) { /* U32 to FP conversion on x86. */ /* number = (2^52+2^51 .. u32) - (2^52+2^51) */ - cTValue *k = lj_ir_k64_find(as->J, U64x(43380000,00000000)); + cTValue *k = &as->J->k64[LJ_K64_TOBIT]; Reg bias = ra_scratch(as, rset_exclude(RSET_FPR, dest)); if (irt_isfloat(ir->t)) emit_rr(as, XO_CVTSD2SS, dest, dest); @@ -711,7 +711,7 @@ static void asm_conv(ASMState *as, IRIns *ir) asm_fuseloadm(as, lref, RSET_GPR, st64); if (LJ_64 && st == IRT_U64) { MCLabel l_end = emit_label(as); - const void *k = lj_ir_k64_find(as->J, U64x(43f00000,00000000)); + cTValue *k = &as->J->k64[LJ_K64_2P64]; emit_rma(as, XO_ADDSD, dest, k); /* Add 2^64 to compensate. */ emit_sjcc(as, CC_NS, l_end); emit_rr(as, XO_TEST, left|REX_64, left); /* Check if u64 >= 2^63. */ @@ -738,11 +738,9 @@ static void asm_conv(ASMState *as, IRIns *ir) emit_gri(as, XG_ARITHi(XOg_ADD), dest, (int32_t)0x80000000); emit_rr(as, op, dest|REX_64, tmp); if (st == IRT_NUM) - emit_rma(as, XO_ADDSD, tmp, lj_ir_k64_find(as->J, - LJ_64 ? U64x(c3f00000,00000000) : U64x(c1e00000,00000000))); + emit_rma(as, XO_ADDSD, tmp, &as->J->k64[LJ_K64_M2P64_31]); else - emit_rma(as, XO_ADDSS, tmp, lj_ir_k64_find(as->J, - LJ_64 ? U64x(00000000,df800000) : U64x(00000000,cf000000))); + emit_rma(as, XO_ADDSS, tmp, &as->J->k32[LJ_K32_M2P64_31]); emit_sjcc(as, CC_NS, l_end); emit_rr(as, XO_TEST, dest|REX_64, dest); /* Check if dest negative. */ emit_rr(as, op, dest|REX_64, tmp); @@ -828,8 +826,7 @@ static void asm_conv_fp_int64(ASMState *as, IRIns *ir) if (((ir-1)->op2 & IRCONV_SRCMASK) == IRT_U64) { /* For inputs in [2^63,2^64-1] add 2^64 to compensate. */ MCLabel l_end = emit_label(as); - emit_rma(as, XO_FADDq, XOg_FADDq, - lj_ir_k64_find(as->J, U64x(43f00000,00000000))); + emit_rma(as, XO_FADDq, XOg_FADDq, &as->J->k64[LJ_K64_2P64]); emit_sjcc(as, CC_NS, l_end); emit_rr(as, XO_TEST, hi, hi); /* Check if u64 >= 2^63. */ } else { @@ -869,8 +866,7 @@ static void asm_conv_int64_fp(ASMState *as, IRIns *ir) emit_rmro(as, XO_FISTTPq, XOg_FISTTPq, RID_ESP, 0); else emit_rmro(as, XO_FISTPq, XOg_FISTPq, RID_ESP, 0); - emit_rma(as, XO_FADDq, XOg_FADDq, - lj_ir_k64_find(as->J, U64x(c3f00000,00000000))); + emit_rma(as, XO_FADDq, XOg_FADDq, &as->J->k64[LJ_K64_M2P64]); emit_sjcc(as, CC_NS, l_pop); emit_rr(as, XO_TEST, hi, hi); /* Check if out-of-range (2^63). */ } diff --git a/src/lj_ir.c b/src/lj_ir.c index b4087aa7..6a1ecc13 100644 --- a/src/lj_ir.c +++ b/src/lj_ir.c @@ -204,12 +204,12 @@ typedef struct K64Array { void lj_ir_k64_freeall(jit_State *J) { K64Array *k; - for (k = mref(J->k64, K64Array); k; ) { + for (k = mref(J->k64p, K64Array); k; ) { K64Array *next = mref(k->next, K64Array); lj_mem_free(J2G(J), k, sizeof(K64Array)); k = next; } - setmref(J->k64, NULL); + setmref(J->k64p, NULL); } /* Get new 64 bit constant slot. */ @@ -223,7 +223,7 @@ static TValue *ir_k64_add(jit_State *J, K64Array *kp, uint64_t u64) if (kp) setmref(kp->next, kn); /* Chain to the end of the list. */ else - setmref(J->k64, kn); /* Link first array. */ + setmref(J->k64p, kn); /* Link first array. */ kp = kn; } ntv = &kp->k[kp->numk++]; /* Add to current array. */ @@ -237,7 +237,7 @@ cTValue *lj_ir_k64_find(jit_State *J, uint64_t u64) K64Array *k, *kp = NULL; MSize idx; /* Search for the constant in the whole chain of arrays. */ - for (k = mref(J->k64, K64Array); k; k = mref(k->next, K64Array)) { + for (k = mref(J->k64p, K64Array); k; k = mref(k->next, K64Array)) { kp = k; /* Remember previous element in list. */ for (idx = 0; idx < k->numk; idx++) { /* Search one array. */ TValue *tv = &k->k[idx]; @@ -254,7 +254,7 @@ TValue *lj_ir_k64_reserve(jit_State *J) K64Array *k, *kp = NULL; lj_ir_k64_find(J, 0); /* Intern dummy 0 to protect the reserved slot. */ /* Find last K64Array, if any. */ - for (k = mref(J->k64, K64Array); k; k = mref(k->next, K64Array)) kp = k; + for (k = mref(J->k64p, K64Array); k; k = mref(k->next, K64Array)) kp = k; return ir_k64_add(J, kp, 0); /* Set to 0. Final value is set later. */ } diff --git a/src/lj_jit.h b/src/lj_jit.h index 2d2e833a..6a47961b 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -308,6 +308,37 @@ enum { LJ_KSIMD__MAX }; +enum { +#if LJ_TARGET_X86ORX64 + LJ_K64_TOBIT, /* 2^52 + 2^51 */ + LJ_K64_2P64, /* 2^64 */ + LJ_K64_M2P64, /* -2^64 */ +#if LJ_32 + LJ_K64_M2P64_31, /* -2^64 or -2^31 */ +#else + LJ_K64_M2P64_31 = LJ_K64_M2P64, +#endif +#endif +#if LJ_TARGET_MIPS + LJ_K64_2P31, /* 2^31 */ +#endif + LJ_K64__MAX, +}; + +enum { +#if LJ_TARGET_X86ORX64 + LJ_K32_M2P64_31, /* -2^64 or -2^31 */ +#endif +#if LJ_TARGET_PPC + LJ_K32_2P52_2P31, /* 2^52 + 2^31 */ + LJ_K32_2P52, /* 2^52 */ +#endif +#if LJ_TARGET_PPC || LJ_TARGET_MIPS + LJ_K32_2P31, /* 2^31 */ +#endif + LJ_K32__MAX +}; + /* Get 16 byte aligned pointer to SIMD constant. */ #define LJ_KSIMD(J, n) \ ((TValue *)(((intptr_t)&J->ksimd[2*(n)] + 15) & ~(intptr_t)15)) @@ -360,8 +391,10 @@ typedef struct jit_State { int32_t framedepth; /* Current frame depth. */ int32_t retdepth; /* Return frame depth (count of RETF). */ - MRef k64; /* Pointer to chained array of 64 bit constants. */ + MRef k64p; /* Pointer to chained array of 64 bit constants. */ TValue ksimd[LJ_KSIMD__MAX*2+1]; /* 16 byte aligned SIMD constants. */ + TValue k64[LJ_K64__MAX]; /* Common 8 byte constants used by backends. */ + uint32_t k32[LJ_K32__MAX]; /* Ditto for 4 byte constants. */ IRIns *irbuf; /* Temp. IR instruction buffer. Biased with REF_BIAS. */ IRRef irtoplim; /* Upper limit of instuction buffer (biased). */ diff --git a/src/lj_trace.c b/src/lj_trace.c index 7970aba6..0d54c0af 100644 --- a/src/lj_trace.c +++ b/src/lj_trace.c @@ -297,13 +297,35 @@ void lj_trace_initstate(global_State *g) { jit_State *J = G2J(g); TValue *tv; - /* Initialize SIMD constants. */ + + /* Initialize aligned SIMD constants. */ tv = LJ_KSIMD(J, LJ_KSIMD_ABS); tv[0].u64 = U64x(7fffffff,ffffffff); tv[1].u64 = U64x(7fffffff,ffffffff); tv = LJ_KSIMD(J, LJ_KSIMD_NEG); tv[0].u64 = U64x(80000000,00000000); tv[1].u64 = U64x(80000000,00000000); + + /* Initialize 32/64 bit constants. */ +#if LJ_TARGET_X86ORX64 + J->k64[LJ_K64_TOBIT].u64 = U64x(43380000,00000000); + J->k64[LJ_K64_2P64].u64 = U64x(43f00000,00000000); + J->k64[LJ_K64_M2P64].u64 = U64x(c3f00000,00000000); +#if LJ_32 + J->k64[LJ_K64_M2P64_31].u64 = U64x(c1e00000,00000000); +#endif + J->k32[LJ_K32_M2P64_31] = LJ_64 ? 0xdf800000 : 0xcf000000; +#endif +#if LJ_TARGET_PPC + J->k32[LJ_K32_2P52_2P31] = 0x59800004; + J->k32[LJ_K32_2P52] = 0x59800000; +#endif +#if LJ_TARGET_PPC || LJ_TARGET_MIPS + J->k32[LJ_K32_2P31] = 0x4f000000; +#endif +#if LJ_TARGET_MIPS + J->k64[LJ_K64_2P31].u64 = U64x(41e00000,00000000); +#endif } /* Free everything associated with the JIT compiler state. */ From 786dbb2ebdde16eadd7464cd5cbeb5d95a5e46f0 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 21 May 2016 00:30:36 +0200 Subject: [PATCH 18/57] Add IR_FLOAD with REF_NIL for field loads from GG_State. Contributed by Peter Cawley. --- src/Makefile.dep | 4 ++-- src/lj_asm_arm.h | 32 ++++++++++++++++++-------------- src/lj_asm_mips.h | 20 +++++++++++++------- src/lj_asm_ppc.h | 20 +++++++++++++------- src/lj_asm_x86.h | 11 ++++++++++- src/lj_ir.c | 8 ++++++++ src/lj_iropt.h | 2 ++ src/lj_opt_split.c | 6 ++++++ 8 files changed, 72 insertions(+), 31 deletions(-) diff --git a/src/Makefile.dep b/src/Makefile.dep index 76b29108..1df1ce6c 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -162,8 +162,8 @@ lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \ lj_opt_sink.o: lj_opt_sink.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_ir.h lj_jit.h lj_iropt.h lj_target.h lj_target_*.h lj_opt_split.o: lj_opt_split.c lj_obj.h lua.h luaconf.h lj_def.h \ - lj_arch.h lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h lj_ir.h \ - lj_jit.h lj_ircall.h lj_iropt.h lj_vm.h + lj_arch.h lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h lj_dispatch.h \ + lj_bc.h lj_jit.h lj_ir.h lj_ircall.h lj_iropt.h lj_vm.h lj_parse.o: lj_parse.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_gc.h lj_err.h lj_errmsg.h lj_debug.h lj_buf.h lj_str.h lj_tab.h \ lj_func.h lj_state.h lj_bc.h lj_ctype.h lj_strfmt.h lj_lex.h lj_parse.h \ diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h index c64d59e7..23f42919 100644 --- a/src/lj_asm_arm.h +++ b/src/lj_asm_arm.h @@ -997,22 +997,26 @@ static ARMIns asm_fxstoreins(IRIns *ir) static void asm_fload(ASMState *as, IRIns *ir) { - Reg dest = ra_dest(as, ir, RSET_GPR); - Reg idx = ra_alloc1(as, ir->op1, RSET_GPR); - ARMIns ai = asm_fxloadins(ir); - int32_t ofs; - if (ir->op2 == IRFL_TAB_ARRAY) { - ofs = asm_fuseabase(as, ir->op1); - if (ofs) { /* Turn the t->array load into an add for colocated arrays. */ - emit_dn(as, ARMI_ADD|ARMI_K12|ofs, dest, idx); - return; + if (ir->op1 == REF_NIL) { + lua_assert(!ra_used(ir)); /* We can end up here if DCE is turned off. */ + } else { + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg idx = ra_alloc1(as, ir->op1, RSET_GPR); + ARMIns ai = asm_fxloadins(ir); + int32_t ofs; + if (ir->op2 == IRFL_TAB_ARRAY) { + ofs = asm_fuseabase(as, ir->op1); + if (ofs) { /* Turn the t->array load into an add for colocated arrays. */ + emit_dn(as, ARMI_ADD|ARMI_K12|ofs, dest, idx); + return; + } } + ofs = field_ofs[ir->op2]; + if ((ai & 0x04000000)) + emit_lso(as, ai, dest, idx, ofs); + else + emit_lsox(as, ai, dest, idx, ofs); } - ofs = field_ofs[ir->op2]; - if ((ai & 0x04000000)) - emit_lso(as, ai, dest, idx, ofs); - else - emit_lsox(as, ai, dest, idx, ofs); } static void asm_fstore(ASMState *as, IRIns *ir) diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h index d37bc132..6094be68 100644 --- a/src/lj_asm_mips.h +++ b/src/lj_asm_mips.h @@ -896,17 +896,23 @@ static MIPSIns asm_fxstoreins(IRIns *ir) static void asm_fload(ASMState *as, IRIns *ir) { Reg dest = ra_dest(as, ir, RSET_GPR); - Reg idx = ra_alloc1(as, ir->op1, RSET_GPR); MIPSIns mi = asm_fxloadins(ir); + Reg idx; int32_t ofs; - if (ir->op2 == IRFL_TAB_ARRAY) { - ofs = asm_fuseabase(as, ir->op1); - if (ofs) { /* Turn the t->array load into an add for colocated arrays. */ - emit_tsi(as, MIPSI_ADDIU, dest, idx, ofs); - return; + if (ir->op1 == REF_NIL) { + idx = RID_JGL; + ofs = ir->op2 - 32768; + } else { + idx = ra_alloc1(as, ir->op1, RSET_GPR); + if (ir->op2 == IRFL_TAB_ARRAY) { + ofs = asm_fuseabase(as, ir->op1); + if (ofs) { /* Turn the t->array load into an add for colocated arrays. */ + emit_tsi(as, MIPSI_ADDIU, dest, idx, ofs); + return; + } } + ofs = field_ofs[ir->op2]; } - ofs = field_ofs[ir->op2]; lua_assert(!irt_isfp(ir->t)); emit_tsi(as, mi, dest, idx, ofs); } diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h index e270b36c..46821515 100644 --- a/src/lj_asm_ppc.h +++ b/src/lj_asm_ppc.h @@ -804,17 +804,23 @@ static PPCIns asm_fxstoreins(IRIns *ir) static void asm_fload(ASMState *as, IRIns *ir) { Reg dest = ra_dest(as, ir, RSET_GPR); - Reg idx = ra_alloc1(as, ir->op1, RSET_GPR); PPCIns pi = asm_fxloadins(ir); + Reg idx; int32_t ofs; - if (ir->op2 == IRFL_TAB_ARRAY) { - ofs = asm_fuseabase(as, ir->op1); - if (ofs) { /* Turn the t->array load into an add for colocated arrays. */ - emit_tai(as, PPCI_ADDI, dest, idx, ofs); - return; + if (ir->op1 == REF_NIL) { + idx = RID_JGL; + ofs = ir->op2 - 32768; + } else { + idx = ra_alloc1(as, ir->op1, RSET_GPR); + if (ir->op2 == IRFL_TAB_ARRAY) { + ofs = asm_fuseabase(as, ir->op1); + if (ofs) { /* Turn the t->array load into an add for colocated arrays. */ + emit_tai(as, PPCI_ADDI, dest, idx, ofs); + return; + } } + ofs = field_ofs[ir->op2]; } - ofs = field_ofs[ir->op2]; lua_assert(!irt_isi8(ir->t)); emit_tai(as, pi, dest, idx, ofs); } diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index e3ed7554..69d1256e 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -205,8 +205,13 @@ static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow) static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow) { lua_assert(ir->o == IR_FLOAD || ir->o == IR_FREF); - as->mrm.ofs = field_ofs[ir->op2]; as->mrm.idx = RID_NONE; + if (ir->op1 == REF_NIL) { + as->mrm.ofs = (int32_t)ir->op2 + ptr2addr(J2GG(as->J)); + as->mrm.base = RID_NONE; + return; + } + as->mrm.ofs = field_ofs[ir->op2]; if (irref_isk(ir->op1)) { as->mrm.ofs += IR(ir->op1)->i; as->mrm.base = RID_NONE; @@ -369,6 +374,10 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) return RID_MRM; } } + if (ir->o == IR_FLOAD && ir->op1 == REF_NIL) { + asm_fusefref(as, ir, RSET_EMPTY); + return RID_MRM; + } if (!(as->freeset & allow) && !irref_isk(ref) && (allow == RSET_EMPTY || ra_hasspill(ir->s) || iscrossref(as, ref))) goto fusespill; diff --git a/src/lj_ir.c b/src/lj_ir.c index 6a1ecc13..593b4127 100644 --- a/src/lj_ir.c +++ b/src/lj_ir.c @@ -145,6 +145,14 @@ TRef lj_ir_call(jit_State *J, IRCallID id, ...) return emitir(CCI_OPTYPE(ci), tr, id); } +/* Load field of type t from GG_State + offset. */ +LJ_FUNC TRef lj_ir_ggfload(jit_State *J, IRType t, uintptr_t ofs) +{ + lua_assert(ofs >= IRFL__MAX && ofs < REF_BIAS); + lj_ir_set(J, IRT(IR_FLOAD, t), REF_NIL, ofs); + return lj_opt_fold(J); +} + /* -- Interning of constants ---------------------------------------------- */ /* diff --git a/src/lj_iropt.h b/src/lj_iropt.h index 67221c6f..3e758b2e 100644 --- a/src/lj_iropt.h +++ b/src/lj_iropt.h @@ -36,6 +36,8 @@ static LJ_AINLINE IRRef lj_ir_nextins(jit_State *J) return ref; } +LJ_FUNC TRef lj_ir_ggfload(jit_State *J, IRType t, uintptr_t ofs); + /* Interning of constants. */ LJ_FUNC TRef LJ_FASTCALL lj_ir_kint(jit_State *J, int32_t k); LJ_FUNC void lj_ir_k64_freeall(jit_State *J); diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c index 6def4161..49c9ae47 100644 --- a/src/lj_opt_split.c +++ b/src/lj_opt_split.c @@ -12,6 +12,7 @@ #include "lj_err.h" #include "lj_buf.h" +#include "lj_dispatch.h" #include "lj_ir.h" #include "lj_jit.h" #include "lj_ircall.h" @@ -448,6 +449,11 @@ static void split_ir(jit_State *J) case IR_STRTO: hi = split_emit(J, IRT(IR_HIOP, IRT_SOFTFP), nref, nref); break; + case IR_FLOAD: + lua_assert(ir->op1 == REF_NIL); + hi = lj_ir_kint(J, *(int32_t*)((char*)J2GG(J) + ir->op2 + LJ_LE*4)); + nir->op2 += LJ_BE*4; + break; case IR_XLOAD: { IRIns inslo = *nir; /* Save/undo the emit of the lo XLOAD. */ J->cur.nins--; From ccae333844c7aad0934f13f7698894c883a6b561 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 21 May 2016 01:04:17 +0200 Subject: [PATCH 19/57] Load SIMD constants with IR_FLOAD from GG_State. Contributed by Peter Cawley. --- src/lj_ffrecord.c | 2 +- src/lj_iropt.h | 4 ++-- src/lj_opt_fold.c | 2 +- src/lj_opt_narrow.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index 56497778..14fde4d9 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -498,7 +498,7 @@ static void LJ_FASTCALL recff_getfenv(jit_State *J, RecordFFData *rd) static void LJ_FASTCALL recff_math_abs(jit_State *J, RecordFFData *rd) { TRef tr = lj_ir_tonum(J, J->base[0]); - J->base[0] = emitir(IRTN(IR_ABS), tr, lj_ir_knum_abs(J)); + J->base[0] = emitir(IRTN(IR_ABS), tr, lj_ir_ksimd(J, LJ_KSIMD_ABS)); UNUSED(rd); } diff --git a/src/lj_iropt.h b/src/lj_iropt.h index 3e758b2e..46933671 100644 --- a/src/lj_iropt.h +++ b/src/lj_iropt.h @@ -77,8 +77,8 @@ static LJ_AINLINE TRef lj_ir_knum(jit_State *J, lua_Number n) #define lj_ir_knum_tobit(J) lj_ir_knum_u64(J, U64x(43380000,00000000)) /* Special 128 bit SIMD constants. */ -#define lj_ir_knum_abs(J) lj_ir_k64(J, IR_KNUM, LJ_KSIMD(J, LJ_KSIMD_ABS)) -#define lj_ir_knum_neg(J) lj_ir_k64(J, IR_KNUM, LJ_KSIMD(J, LJ_KSIMD_NEG)) +#define lj_ir_ksimd(J, idx) \ + lj_ir_ggfload(J, IRT_NUM, (uintptr_t)LJ_KSIMD(J, idx) - (uintptr_t)J2GG(J)) /* Access to constants. */ LJ_FUNC void lj_ir_kvalue(lua_State *L, TValue *tv, const IRIns *ir); diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 95b58a1a..c102f2db 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -999,7 +999,7 @@ LJFOLDF(simplify_nummuldiv_k) return LEFTFOLD; } else if (n == -1.0) { /* x o -1 ==> -x */ fins->o = IR_NEG; - fins->op2 = (IRRef1)lj_ir_knum_neg(J); + fins->op2 = (IRRef1)lj_ir_ksimd(J, LJ_KSIMD_NEG); return RETRYFOLD; } else if (fins->o == IR_MUL && n == 2.0) { /* x * 2 ==> x + x */ fins->o = IR_ADD; diff --git a/src/lj_opt_narrow.c b/src/lj_opt_narrow.c index 36be66ee..ca0a0f49 100644 --- a/src/lj_opt_narrow.c +++ b/src/lj_opt_narrow.c @@ -555,7 +555,7 @@ TRef lj_opt_narrow_unm(jit_State *J, TRef rc, TValue *vc) return emitir(IRTGI(IR_SUBOV), lj_ir_kint(J, 0), rc); rc = emitir(IRTN(IR_CONV), rc, IRCONV_NUM_INT); } - return emitir(IRTN(IR_NEG), rc, lj_ir_knum_neg(J)); + return emitir(IRTN(IR_NEG), rc, lj_ir_ksimd(J, LJ_KSIMD_NEG)); } /* Narrowing of modulo operator. */ From 513587656a36362e08fb99a28a280a9d412ef1bc Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 21 May 2016 01:45:18 +0200 Subject: [PATCH 20/57] Add ra_addrename(). Contributed by Peter Cawley. --- src/lj_asm.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index 9cddd0c9..5e38d254 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -619,10 +619,21 @@ static Reg ra_alloc1(ASMState *as, IRRef ref, RegSet allow) return r; } +/* Add a register rename to the IR. */ +static void ra_addrename(ASMState *as, Reg down, IRRef ref, SnapNo snapno) +{ + IRRef ren; + lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), ref, snapno); + ren = tref_ref(lj_ir_emit(as->J)); + as->ir = as->T->ir; /* The IR may have been reallocated. */ + IR(ren)->r = (uint8_t)down; + IR(ren)->s = SPS_NONE; +} + /* Rename register allocation and emit move. */ static void ra_rename(ASMState *as, Reg down, Reg up) { - IRRef ren, ref = regcost_ref(as->cost[up] = as->cost[down]); + IRRef ref = regcost_ref(as->cost[up] = as->cost[down]); IRIns *ir = IR(ref); ir->r = (uint8_t)up; as->cost[down] = 0; @@ -635,11 +646,7 @@ static void ra_rename(ASMState *as, Reg down, Reg up) RA_DBGX((as, "rename $f $r $r", regcost_ref(as->cost[up]), down, up)); emit_movrr(as, ir, down, up); /* Backwards codegen needs inverse move. */ if (!ra_hasspill(IR(ref)->s)) { /* Add the rename to the IR. */ - lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), ref, as->snapno); - ren = tref_ref(lj_ir_emit(as->J)); - as->ir = as->T->ir; /* The IR may have been reallocated. */ - IR(ren)->r = (uint8_t)down; - IR(ren)->s = SPS_NONE; + ra_addrename(as, down, ref, as->snapno); } } @@ -1472,12 +1479,7 @@ static void asm_phi_fixup(ASMState *as) irt_clearmark(ir->t); /* Left PHI gained a spill slot before the loop? */ if (ra_hasspill(ir->s)) { - IRRef ren; - lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), lref, as->loopsnapno); - ren = tref_ref(lj_ir_emit(as->J)); - as->ir = as->T->ir; /* The IR may have been reallocated. */ - IR(ren)->r = (uint8_t)r; - IR(ren)->s = SPS_NONE; + ra_addrename(as, r, lref, as->loopsnapno); } } rset_clear(work, r); From a657fa01869e63508afce88cc8088c3d2e2fb47c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 22 May 2016 23:25:28 +0200 Subject: [PATCH 21/57] Make the IR immovable after assembly. This allows embedding pointers to IR constants in the machine code. Contributed by Peter Cawley. --- src/lj_asm.c | 95 +++++++++++++++++++++++++++++++++++++------------- src/lj_jit.h | 1 + src/lj_trace.c | 33 +++++++++++++----- src/lj_trace.h | 1 + 4 files changed, 97 insertions(+), 33 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index 5e38d254..5235dd00 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -625,9 +625,8 @@ static void ra_addrename(ASMState *as, Reg down, IRRef ref, SnapNo snapno) IRRef ren; lj_ir_set(as->J, IRT(IR_RENAME, IRT_NIL), ref, snapno); ren = tref_ref(lj_ir_emit(as->J)); - as->ir = as->T->ir; /* The IR may have been reallocated. */ - IR(ren)->r = (uint8_t)down; - IR(ren)->s = SPS_NONE; + as->J->cur.ir[ren].r = (uint8_t)down; + as->J->cur.ir[ren].s = SPS_NONE; } /* Rename register allocation and emit move. */ @@ -948,7 +947,7 @@ static void asm_snap_prep(ASMState *as) } else { /* Process any renames above the highwater mark. */ for (; as->snaprename < as->T->nins; as->snaprename++) { - IRIns *ir = IR(as->snaprename); + IRIns *ir = &as->T->ir[as->snaprename]; if (asm_snap_checkrename(as, ir->op1)) ir->op2 = REF_BIAS-1; /* Kill rename. */ } @@ -1967,8 +1966,9 @@ static void asm_setup_regsp(ASMState *as) ir = IR(nins-1); if (ir->o == IR_RENAME) { + /* Remove any renames left over from ASM restart due to LJ_TRERR_MCODELM. */ do { ir--; nins--; } while (ir->o == IR_RENAME); - T->nins = nins; /* Remove any renames left over from ASM restart. */ + T->nins = nins; } as->snaprename = nins; as->snapref = nins; @@ -2202,13 +2202,14 @@ void lj_asm_trace(jit_State *J, GCtrace *T) MCode *origtop; /* Ensure an initialized instruction beyond the last one for HIOP checks. */ - J->cur.nins = lj_ir_nextins(J); - J->cur.ir[J->cur.nins].o = IR_NOP; + /* This also allows one RENAME to be added without reallocating curfinal. */ + as->orignins = lj_ir_nextins(J); + J->cur.ir[as->orignins].o = IR_NOP; /* Setup initial state. Copy some fields to reduce indirections. */ as->J = J; as->T = T; - as->ir = T->ir; + J->curfinal = lj_trace_alloc(J->L, T); /* This copies the IR, too. */ as->flags = J->flags; as->loopref = J->loopref; as->realign = NULL; @@ -2221,12 +2222,41 @@ void lj_asm_trace(jit_State *J, GCtrace *T) as->mclim = as->mcbot + MCLIM_REDZONE; asm_setup_target(as); - do { + /* + ** This is a loop, because the MCode may have to be (re-)assembled + ** multiple times: + ** + ** 1. as->realign is set (and the assembly aborted), if the arch-specific + ** backend wants the MCode to be aligned differently. + ** + ** This is currently only the case on x86/x64, where small loops get + ** an aligned loop body plus a short branch. Not much effort is wasted, + ** because the abort happens very quickly and only once. + ** + ** 2. The IR is immovable, since the MCode embeds pointers to various + ** constants inside the IR. But RENAMEs may need to be added to the IR + ** during assembly, which might grow and reallocate the IR. We check + ** at the end if the IR (in J->cur.ir) has actually grown, resize the + ** copy (in J->curfinal.ir) and try again. + ** + ** 95% of all traces have zero RENAMEs, 3% have one RENAME, 1.5% have + ** 2 RENAMEs and only 0.5% have more than that. That's why we opt to + ** always have one spare slot in the IR (see above), which means we + ** have to redo the assembly for only ~2% of all traces. + ** + ** Very, very rarely, this needs to be done repeatedly, since the + ** location of constants inside the IR (actually, reachability from + ** a global pointer) may affect register allocation and thus the + ** number of RENAMEs. + */ + for (;;) { as->mcp = as->mctop; #ifdef LUA_USE_ASSERT as->mcp_prev = as->mcp; #endif - as->curins = T->nins; + as->ir = J->curfinal->ir; /* Use the copied IR. */ + as->curins = J->cur.nins = as->orignins; + RA_DBG_START(); RA_DBGX((as, "===== STOP =====")); @@ -2254,22 +2284,39 @@ void lj_asm_trace(jit_State *J, GCtrace *T) checkmclim(as); asm_ir(as, ir); } - } while (as->realign); /* Retry in case the MCode needs to be realigned. */ - /* Emit head of trace. */ - RA_DBG_REF(); - checkmclim(as); - if (as->gcsteps > 0) { - as->curins = as->T->snap[0].ref; - asm_snap_prep(as); /* The GC check is a guard. */ - asm_gc_check(as); + if (as->realign && J->curfinal->nins >= T->nins) + continue; /* Retry in case only the MCode needs to be realigned. */ + + /* Emit head of trace. */ + RA_DBG_REF(); + checkmclim(as); + if (as->gcsteps > 0) { + as->curins = as->T->snap[0].ref; + asm_snap_prep(as); /* The GC check is a guard. */ + asm_gc_check(as); + } + ra_evictk(as); + if (as->parent) + asm_head_side(as); + else + asm_head_root(as); + asm_phi_fixup(as); + + if (J->curfinal->nins >= T->nins) { /* IR didn't grow? */ + lua_assert(J->curfinal->nk == T->nk); + memcpy(J->curfinal->ir + as->orignins, T->ir + as->orignins, + (T->nins - as->orignins) * sizeof(IRIns)); /* Copy RENAMEs. */ + T->nins = J->curfinal->nins; + break; /* Done. */ + } + + /* Otherwise try again with a bigger IR. */ + lj_trace_free(J2G(J), J->curfinal); + J->curfinal = NULL; /* In case lj_trace_alloc() OOMs. */ + J->curfinal = lj_trace_alloc(J->L, T); + as->realign = NULL; } - ra_evictk(as); - if (as->parent) - asm_head_side(as); - else - asm_head_root(as); - asm_phi_fixup(as); RA_DBGX((as, "===== START ====")); RA_DBG_FLUSH(); diff --git a/src/lj_jit.h b/src/lj_jit.h index 6a47961b..ad9d62af 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -362,6 +362,7 @@ typedef struct FoldState { /* JIT compiler state. */ typedef struct jit_State { GCtrace cur; /* Current trace. */ + GCtrace *curfinal; /* Final address of current trace (set during asm). */ lua_State *L; /* Current Lua state. */ const BCIns *pc; /* Current PC. */ diff --git a/src/lj_trace.c b/src/lj_trace.c index 0d54c0af..19ddba41 100644 --- a/src/lj_trace.c +++ b/src/lj_trace.c @@ -117,15 +117,26 @@ static void perftools_addtrace(GCtrace *T) } #endif -/* Allocate space for copy of trace. */ -static GCtrace *trace_save_alloc(jit_State *J) +/* Allocate space for copy of T. */ +GCtrace * LJ_FASTCALL lj_trace_alloc(lua_State *L, GCtrace *T) { size_t sztr = ((sizeof(GCtrace)+7)&~7); - size_t szins = (J->cur.nins-J->cur.nk)*sizeof(IRIns); + size_t szins = (T->nins-T->nk)*sizeof(IRIns); size_t sz = sztr + szins + - J->cur.nsnap*sizeof(SnapShot) + - J->cur.nsnapmap*sizeof(SnapEntry); - return lj_mem_newt(J->L, (MSize)sz, GCtrace); + T->nsnap*sizeof(SnapShot) + + T->nsnapmap*sizeof(SnapEntry); + GCtrace *T2 = lj_mem_newt(L, (MSize)sz, GCtrace); + char *p = (char *)T2 + sztr; + T2->gct = ~LJ_TTRACE; + T2->marked = 0; + T2->traceno = 0; + T2->ir = (IRIns *)p - T->nk; + T2->nins = T->nins; + T2->nk = T->nk; + T2->nsnap = T->nsnap; + T2->nsnapmap = T->nsnapmap; + memcpy(p, T->ir + T->nk, szins); + return T2; } /* Save current trace by copying and compacting it. */ @@ -139,12 +150,12 @@ static void trace_save(jit_State *J, GCtrace *T) setgcrefp(J2G(J)->gc.root, T); newwhite(J2G(J), T); T->gct = ~LJ_TTRACE; - T->ir = (IRIns *)p - J->cur.nk; - memcpy(p, J->cur.ir+J->cur.nk, szins); + T->ir = (IRIns *)p - J->cur.nk; /* The IR has already been copied above. */ p += szins; TRACE_APPENDVEC(snap, nsnap, SnapShot) TRACE_APPENDVEC(snapmap, nsnapmap, SnapEntry) J->cur.traceno = 0; + J->curfinal = NULL; setgcrefp(J->trace[T->traceno], T); lj_gc_barriertrace(J2G(J), T->traceno); lj_gdbjit_addtrace(J, T); @@ -449,7 +460,7 @@ static void trace_stop(jit_State *J) BCOp op = bc_op(J->cur.startins); GCproto *pt = &gcref(J->cur.startpt)->pt; TraceNo traceno = J->cur.traceno; - GCtrace *T = trace_save_alloc(J); /* Do this first. May throw OOM. */ + GCtrace *T = J->curfinal; lua_State *L; switch (op) { @@ -537,6 +548,10 @@ static int trace_abort(jit_State *J) J->postproc = LJ_POST_NONE; lj_mcode_abort(J); + if (J->curfinal) { + lj_trace_free(J2G(J), J->curfinal); + J->curfinal = NULL; + } if (tvisnumber(L->top-1)) e = (TraceError)numberVint(L->top-1); if (e == LJ_TRERR_MCODELM) { diff --git a/src/lj_trace.h b/src/lj_trace.h index 6faa1aa3..5658d8a5 100644 --- a/src/lj_trace.h +++ b/src/lj_trace.h @@ -23,6 +23,7 @@ LJ_FUNC_NORET void lj_trace_err(jit_State *J, TraceError e); LJ_FUNC_NORET void lj_trace_err_info(jit_State *J, TraceError e); /* Trace management. */ +LJ_FUNC GCtrace * LJ_FASTCALL lj_trace_alloc(lua_State *L, GCtrace *T); LJ_FUNC void LJ_FASTCALL lj_trace_free(global_State *g, GCtrace *T); LJ_FUNC void lj_trace_reenableproto(GCproto *pt); LJ_FUNC void lj_trace_flushproto(global_State *g, GCproto *pt); From 3152ed98ea3d99579940eb5e72da687ce66792de Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 22 May 2016 23:40:37 +0200 Subject: [PATCH 22/57] Simplify GCtrace * reference embedding for trace stitching. This is now possible due to the immovable IR. Contributed by Peter Cawley. --- src/lj_asm.c | 5 +++++ src/lj_ffrecord.c | 5 ++--- src/lj_ir.c | 12 ++++++++++++ src/lj_iropt.h | 1 + src/lj_jit.h | 2 +- src/lj_trace.c | 5 +---- 6 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index 5235dd00..9b394beb 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -1933,6 +1933,11 @@ static void asm_tail_link(ASMState *as) } emit_addptr(as, RID_BASE, 8*(int32_t)baseslot); + if (as->J->ktrace) { /* Patch ktrace slot with the final GCtrace pointer. */ + setgcref(IR(as->J->ktrace)->gcr, obj2gco(as->J->curfinal)); + IR(as->J->ktrace)->o = IR_KGC; + } + /* Sync the interpreter state with the on-trace state. */ asm_stack_restore(as, snap); diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index 14fde4d9..ae567622 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -118,9 +118,8 @@ static void recff_stitch(jit_State *J) /* Ditto for the IR. */ memmove(&J->base[1], &J->base[-1], sizeof(TRef)*(J->maxslot+1)); J->base[0] = lj_ir_kptr(J, contptr(cont)) | TREF_CONT; - J->ktracep = lj_ir_k64_reserve(J); - lua_assert(irt_toitype_(IRT_P64) == LJ_TTRACE); - J->base[-1] = emitir(IRT(IR_XLOAD, IRT_P64), lj_ir_kptr(J, &J->ktracep->gcr), 0); + J->base[-1] = lj_ir_ktrace(J); + J->ktrace = tref_ref(J->base[-1]); J->base += 2; J->baseslot += 2; J->framedepth++; diff --git a/src/lj_ir.c b/src/lj_ir.c index 593b4127..acb39463 100644 --- a/src/lj_ir.c +++ b/src/lj_ir.c @@ -348,6 +348,18 @@ found: return TREF(ref, t); } +/* Allocate GCtrace constant placeholder (no interning). */ +TRef lj_ir_ktrace(jit_State *J) +{ + IRRef ref = ir_nextk(J); + IRIns *ir = IR(ref); + lua_assert(irt_toitype_(IRT_P64) == LJ_TTRACE); + ir->t.irt = IRT_P64; + ir->o = IR_KNULL; /* Not IR_KGC yet, but same size. */ + ir->prev = 0; + return TREF(ref, IRT_P64); +} + /* Intern 32 bit pointer constant. */ TRef lj_ir_kptr_(jit_State *J, IROp op, void *ptr) { diff --git a/src/lj_iropt.h b/src/lj_iropt.h index 46933671..fdc5f0d2 100644 --- a/src/lj_iropt.h +++ b/src/lj_iropt.h @@ -51,6 +51,7 @@ LJ_FUNC TRef lj_ir_kgc(jit_State *J, GCobj *o, IRType t); LJ_FUNC TRef lj_ir_kptr_(jit_State *J, IROp op, void *ptr); LJ_FUNC TRef lj_ir_knull(jit_State *J, IRType t); LJ_FUNC TRef lj_ir_kslot(jit_State *J, TRef key, IRRef slot); +LJ_FUNC TRef lj_ir_ktrace(jit_State *J); #if LJ_64 #define lj_ir_kintp(J, k) lj_ir_kint64(J, (uint64_t)(k)) diff --git a/src/lj_jit.h b/src/lj_jit.h index ad9d62af..eafbc327 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -416,7 +416,7 @@ typedef struct jit_State { GCRef *trace; /* Array of traces. */ TraceNo freetrace; /* Start of scan for next free trace. */ MSize sizetrace; /* Size of trace array. */ - TValue *ktracep; /* Pointer to K64Array slot with GCtrace pointer. */ + IRRef1 ktrace; /* Reference to KGC with GCtrace. */ IRRef1 chain[IR__MAX]; /* IR instruction skip-list chain anchors. */ TRef slot[LJ_MAX_JSLOTS+LJ_STACK_EXTRA]; /* Stack slot map. */ diff --git a/src/lj_trace.c b/src/lj_trace.c index 19ddba41..eaf9365c 100644 --- a/src/lj_trace.c +++ b/src/lj_trace.c @@ -436,7 +436,7 @@ static void trace_start(jit_State *J) J->postproc = LJ_POST_NONE; lj_resetsplit(J); J->retryrec = 0; - J->ktracep = NULL; + J->ktrace = 0; setgcref(J->cur.startpt, obj2gco(J->pt)); L = J->L; @@ -512,9 +512,6 @@ static void trace_stop(jit_State *J) lj_mcode_commit(J, J->cur.mcode); J->postproc = LJ_POST_NONE; trace_save(J, T); - if (J->ktracep) { /* Patch K64Array slot with the final GCtrace pointer. */ - setgcV(J->L, J->ktracep, obj2gco(T), LJ_TTRACE); - } L = J->L; lj_vmevent_send(L, TRACE, From a4067978b6d1c2a25d91d82b1b0d384d98abdbe5 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 22 May 2016 23:45:40 +0200 Subject: [PATCH 23/57] Always walk IR constants in ascending order. Prerequisite for embedding 64 bit constants directly in the IR. Contributed by Peter Cawley. --- src/lj_opt_sink.c | 8 ++++++-- src/lj_record.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/lj_opt_sink.c b/src/lj_opt_sink.c index 975ee831..49e13784 100644 --- a/src/lj_opt_sink.c +++ b/src/lj_opt_sink.c @@ -166,8 +166,8 @@ static void sink_remark_phi(jit_State *J) /* Sweep instructions and tag sunken allocations and stores. */ static void sink_sweep_ins(jit_State *J) { - IRIns *ir, *irfirst = IR(J->cur.nk); - for (ir = IR(J->cur.nins-1) ; ir >= irfirst; ir--) { + IRIns *ir, *irbase = IR(REF_BASE); + for (ir = IR(J->cur.nins-1) ; ir >= irbase; ir--) { switch (ir->o) { case IR_ASTORE: case IR_HSTORE: case IR_FSTORE: case IR_XSTORE: { IRIns *ira = sink_checkalloc(J, ir); @@ -217,6 +217,10 @@ static void sink_sweep_ins(jit_State *J) break; } } + for (ir = IR(J->cur.nk); ir < irbase; ir++) { + irt_clearmark(ir->t); + ir->prev = REGSP_INIT; + } } /* Allocation sinking and store sinking. diff --git a/src/lj_record.c b/src/lj_record.c index f7c53567..b5fb6649 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -51,7 +51,7 @@ static void rec_check_ir(jit_State *J) { IRRef i, nins = J->cur.nins, nk = J->cur.nk; lua_assert(nk <= REF_BIAS && nins >= REF_BIAS && nins < 65536); - for (i = nins-1; i >= nk; i--) { + for (i = nk; i < nins; i++) { IRIns *ir = IR(i); uint32_t mode = lj_ir_mode[ir->o]; IRRef op1 = ir->op1; From 7fb75ccc4cf17825c1c8fe9f44ebfb0668a1b033 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 23 May 2016 00:25:29 +0200 Subject: [PATCH 24/57] Embed 64 bit constants directly in the IR, using two slots. Contributed by Peter Cawley. --- src/lj_asm.c | 17 +++++++++++------ src/lj_asm_x86.h | 26 +++++++++++++++----------- src/lj_emit_arm.h | 3 ++- src/lj_emit_mips.h | 4 ++-- src/lj_emit_ppc.h | 4 ++-- src/lj_emit_x86.h | 22 ++++++++++++++++------ src/lj_gc.c | 2 ++ src/lj_ir.c | 36 +++++++++++++++++++++++------------- src/lj_ir.h | 15 +++++++++------ src/lj_iropt.h | 2 +- src/lj_jit.h | 4 ++-- src/lj_opt_fold.c | 8 ++++++-- src/lj_opt_mem.c | 4 ++-- src/lj_opt_sink.c | 2 ++ src/lj_opt_split.c | 2 ++ src/lj_record.c | 5 ++++- src/lj_snap.c | 9 ++++----- 17 files changed, 105 insertions(+), 60 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index 9b394beb..0b3e770a 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -334,7 +334,7 @@ static Reg ra_rematk(ASMState *as, IRRef ref) RA_DBGX((as, "remat $i $r", ir, r)); #if !LJ_SOFTFP if (ir->o == IR_KNUM) { - emit_loadn(as, r, ir_knum(ir)); + emit_loadk64(as, r, ir); } else #endif if (emit_canremat(REF_BASE) && ir->o == IR_BASE) { @@ -695,15 +695,14 @@ static void ra_left(ASMState *as, Reg dest, IRRef lref) if (ra_noreg(left)) { if (irref_isk(lref)) { if (ir->o == IR_KNUM) { - cTValue *tv = ir_knum(ir); /* FP remat needs a load except for +0. Still better than eviction. */ - if (tvispzero(tv) || !(as->freeset & RSET_FPR)) { - emit_loadn(as, dest, tv); + if (tvispzero(ir_knum(ir)) || !(as->freeset & RSET_FPR)) { + emit_loadk64(as, dest, ir); return; } #if LJ_64 } else if (ir->o == IR_KINT64) { - emit_loadu64(as, dest, ir_kint64(ir)->u64); + emit_loadk64(as, dest, ir); return; #endif } else if (ir->o != IR_KPRI) { @@ -1963,8 +1962,14 @@ static void asm_setup_regsp(ASMState *as) ra_setup(as); /* Clear reg/sp for constants. */ - for (ir = IR(T->nk), lastir = IR(REF_BASE); ir < lastir; ir++) + for (ir = IR(T->nk), lastir = IR(REF_BASE); ir < lastir; ir++) { ir->prev = REGSP_INIT; + if (irt_is64(ir->t) && ir->o != IR_KNULL) { + /* Make life easier for backends by putting address of constant in i. */ + ir->i = (int32_t)(intptr_t)(ir+1); + ir++; + } + } /* REF_BASE is used for implicit references to the BASE register. */ lastir->prev = REGSP_HINT(RID_BASE); diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 69d1256e..0361a965 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -306,6 +306,16 @@ static void asm_fusexref(ASMState *as, IRRef ref, RegSet allow) } } +/* Fuse load of 64 bit IR constant into memory operand. */ +static Reg asm_fuseloadk64(ASMState *as, IRIns *ir) +{ + const uint64_t *k = &ir_k64(ir)->u64; + as->mrm.ofs = ptr2addr(k); + as->mrm.base = RID_NONE; + as->mrm.idx = RID_NONE; + return RID_MRM; +} + /* Fuse load into memory operand. */ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) { @@ -325,19 +335,13 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) if (ir->o == IR_KNUM) { RegSet avail = as->freeset & ~as->modset & RSET_FPR; lua_assert(allow != RSET_EMPTY); - if (!(avail & (avail-1))) { /* Fuse if less than two regs available. */ - as->mrm.ofs = ptr2addr(ir_knum(ir)); - as->mrm.base = as->mrm.idx = RID_NONE; - return RID_MRM; - } + if (!(avail & (avail-1))) /* Fuse if less than two regs available. */ + return asm_fuseloadk64(as, ir); } else if (ir->o == IR_KINT64) { RegSet avail = as->freeset & ~as->modset & RSET_GPR; lua_assert(allow != RSET_EMPTY); - if (!(avail & (avail-1))) { /* Fuse if less than two regs available. */ - as->mrm.ofs = ptr2addr(ir_kint64(ir)); - as->mrm.base = as->mrm.idx = RID_NONE; - return RID_MRM; - } + if (!(avail & (avail-1))) /* Fuse if less than two regs available. */ + return asm_fuseloadk64(as, ir); } else if (mayfuse(as, ref)) { RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR; if (ir->o == IR_SLOAD) { @@ -711,7 +715,7 @@ static void asm_conv(ASMState *as, IRIns *ir) emit_rr(as, XO_CVTSD2SS, dest, dest); emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */ emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */ - emit_loadn(as, bias, k); + emit_rma(as, XO_MOVSD, bias, k); emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR)); return; } else { /* Integer to FP conversion. */ diff --git a/src/lj_emit_arm.h b/src/lj_emit_arm.h index 47fee5fc..dff9fac4 100644 --- a/src/lj_emit_arm.h +++ b/src/lj_emit_arm.h @@ -219,8 +219,9 @@ static void emit_lsptr(ASMState *as, ARMIns ai, Reg r, void *p) #if !LJ_SOFTFP /* Load a number constant into an FPR. */ -static void emit_loadn(ASMState *as, Reg r, cTValue *tv) +static void emit_loadk64(ASMState *as, Reg r, IRIns *ir) { + cTValue *tv = ir_knum(ir); int32_t i; if ((as->flags & JIT_F_VFPV3) && !tv->u32.lo) { uint32_t hi = tv->u32.hi; diff --git a/src/lj_emit_mips.h b/src/lj_emit_mips.h index fdebe94b..29079ea3 100644 --- a/src/lj_emit_mips.h +++ b/src/lj_emit_mips.h @@ -112,8 +112,8 @@ static void emit_lsptr(ASMState *as, MIPSIns mi, Reg r, void *p, RegSet allow) emit_tsi(as, mi, r, base, i); } -#define emit_loadn(as, r, tv) \ - emit_lsptr(as, MIPSI_LDC1, ((r) & 31), (void *)(tv), RSET_GPR) +#define emit_loadk64(as, r, ir) \ + emit_lsptr(as, MIPSI_LDC1, ((r) & 31), (void *)&ir_knum((ir))->u64, RSET_GPR) /* Get/set global_State fields. */ static void emit_lsglptr(ASMState *as, MIPSIns mi, Reg r, int32_t ofs) diff --git a/src/lj_emit_ppc.h b/src/lj_emit_ppc.h index 4eb933ea..5163012a 100644 --- a/src/lj_emit_ppc.h +++ b/src/lj_emit_ppc.h @@ -115,8 +115,8 @@ static void emit_lsptr(ASMState *as, PPCIns pi, Reg r, void *p, RegSet allow) emit_tai(as, pi, r, base, i); } -#define emit_loadn(as, r, tv) \ - emit_lsptr(as, PPCI_LFD, ((r) & 31), (void *)(tv), RSET_GPR) +#define emit_loadk64(as, r, ir) \ + emit_lsptr(as, PPCI_LFD, ((r) & 31), (void *)&ir_knum((ir))->u64, RSET_GPR) /* Get/set global_State fields. */ static void emit_lsglptr(ASMState *as, PPCIns pi, Reg r, int32_t ofs) diff --git a/src/lj_emit_x86.h b/src/lj_emit_x86.h index cbaf4e85..3d6f13f4 100644 --- a/src/lj_emit_x86.h +++ b/src/lj_emit_x86.h @@ -313,13 +313,23 @@ static void emit_loadu64(ASMState *as, Reg r, uint64_t u64) } #endif -/* movsd r, [&tv->n] / xorps r, r */ -static void emit_loadn(ASMState *as, Reg r, cTValue *tv) +/* Load 64 bit IR constant into register. */ +static void emit_loadk64(ASMState *as, Reg r, IRIns *ir) { - if (tvispzero(tv)) /* Use xor only for +0. */ - emit_rr(as, XO_XORPS, r, r); - else - emit_rma(as, XO_MOVSD, r, &tv->n); + const uint64_t *k = &ir_k64(ir)->u64; + if (rset_test(RSET_FPR, r)) { + if (*k == 0) { + emit_rr(as, XO_XORPS, r, r); + } else { + emit_rma(as, XO_MOVSD, r, k); + } + } else { + if (*k == 0) { + emit_rr(as, XO_ARITH(XOg_XOR), r, r); + } else { + emit_rma(as, XO_MOV, r | REX_64, k); + } + } } /* -- Emit control-flow instructions -------------------------------------- */ diff --git a/src/lj_gc.c b/src/lj_gc.c index 53f1d974..7c707462 100644 --- a/src/lj_gc.c +++ b/src/lj_gc.c @@ -238,6 +238,8 @@ static void gc_traverse_trace(global_State *g, GCtrace *T) IRIns *ir = &T->ir[ref]; if (ir->o == IR_KGC) gc_markobj(g, ir_kgc(ir)); + if (irt_is64(ir->t) && ir->o != IR_KNULL) + ref++; } if (T->link) gc_marktrace(g, T->link); if (T->nextroot) gc_marktrace(g, T->nextroot); diff --git a/src/lj_ir.c b/src/lj_ir.c index acb39463..124d5791 100644 --- a/src/lj_ir.c +++ b/src/lj_ir.c @@ -91,7 +91,7 @@ static void lj_ir_growbot(jit_State *J) IRIns *baseir = J->irbuf + J->irbotlim; MSize szins = J->irtoplim - J->irbotlim; lua_assert(szins != 0); - lua_assert(J->cur.nk == J->irbotlim); + lua_assert(J->cur.nk == J->irbotlim || J->cur.nk-1 == J->irbotlim); if (J->cur.nins + (szins >> 1) < J->irtoplim) { /* More than half of the buffer is free on top: shift up by a quarter. */ MSize ofs = szins >> 2; @@ -173,6 +173,18 @@ static LJ_AINLINE IRRef ir_nextk(jit_State *J) return ref; } +/* Get ref of next 64 bit IR constant and optionally grow IR. +** Note: this may invalidate all IRIns *! +*/ +static LJ_AINLINE IRRef ir_nextk64(jit_State *J) +{ + IRRef ref = J->cur.nk - 2; + lua_assert(J->state != LJ_TRACE_ASM); + if (LJ_UNLIKELY(ref < J->irbotlim)) lj_ir_growbot(J); + J->cur.nk = ref; + return ref; +} + /* Intern int32_t constant. */ TRef LJ_FASTCALL lj_ir_kint(jit_State *J, int32_t k) { @@ -266,19 +278,18 @@ TValue *lj_ir_k64_reserve(jit_State *J) return ir_k64_add(J, kp, 0); /* Set to 0. Final value is set later. */ } -/* Intern 64 bit constant, given by its address. */ -TRef lj_ir_k64(jit_State *J, IROp op, cTValue *tv) +/* Intern 64 bit constant, given by its 64 bit pattern. */ +TRef lj_ir_k64(jit_State *J, IROp op, uint64_t u64) { IRIns *ir, *cir = J->cur.ir; IRRef ref; IRType t = op == IR_KNUM ? IRT_NUM : IRT_I64; for (ref = J->chain[op]; ref; ref = cir[ref].prev) - if (ir_k64(&cir[ref]) == tv) + if (ir_k64(&cir[ref])->u64 == u64) goto found; - ref = ir_nextk(J); + ref = ir_nextk64(J); ir = IR(ref); - lua_assert(checkptrGC(tv)); - setmref(ir->ptr, tv); + ir[1].tv.u64 = u64; ir->t.irt = t; ir->o = op; ir->prev = J->chain[op]; @@ -290,13 +301,13 @@ found: /* Intern FP constant, given by its 64 bit pattern. */ TRef lj_ir_knum_u64(jit_State *J, uint64_t u64) { - return lj_ir_k64(J, IR_KNUM, lj_ir_k64_find(J, u64)); + return lj_ir_k64(J, IR_KNUM, u64); } /* Intern 64 bit integer constant. */ TRef lj_ir_kint64(jit_State *J, uint64_t u64) { - return lj_ir_k64(J, IR_KINT64, lj_ir_k64_find(J, u64)); + return lj_ir_k64(J, IR_KINT64, u64); } /* Check whether a number is int and return it. -0 is NOT considered an int. */ @@ -367,7 +378,7 @@ TRef lj_ir_kptr_(jit_State *J, IROp op, void *ptr) IRRef ref; lua_assert((void *)(uintptr_t)u32ptr(ptr) == ptr); for (ref = J->chain[op]; ref; ref = cir[ref].prev) - if (mref(cir[ref].ptr, void) == ptr) + if (ir_kptr(&cir[ref]) == ptr) goto found; ref = ir_nextk(J); ir = IR(ref); @@ -432,9 +443,8 @@ void lj_ir_kvalue(lua_State *L, TValue *tv, const IRIns *ir) case IR_KPRI: setpriV(tv, irt_toitype(ir->t)); break; case IR_KINT: setintV(tv, ir->i); break; case IR_KGC: setgcV(L, tv, ir_kgc(ir), irt_toitype(ir->t)); break; - case IR_KPTR: case IR_KKPTR: case IR_KNULL: - setlightudV(tv, mref(ir->ptr, void)); - break; + case IR_KPTR: case IR_KKPTR: setlightudV(tv, ir_kptr(ir)); break; + case IR_KNULL: setlightudV(tv, NULL); break; case IR_KNUM: setnumV(tv, ir_knum(ir)->n); break; #if LJ_HASFFI case IR_KINT64: { diff --git a/src/lj_ir.h b/src/lj_ir.h index 8a655b64..03377ec1 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -522,7 +522,9 @@ typedef uint32_t TRef; ** +-------+-------+---+---+---+---+ ** | op1 | op2 | t | o | r | s | ** +-------+-------+---+---+---+---+ -** | op12/i/gco | ot | prev | (alternative fields in union) +** | op12/i/gco32 | ot | prev | (alternative fields in union) +** +-------+-------+---+---+---+---+ +** | TValue/gco64 | (2nd IR slot for 64 bit constants) ** +---------------+-------+-------+ ** 32 16 16 ** @@ -550,8 +552,9 @@ typedef union IRIns { ) }; int32_t i; /* 32 bit signed integer literal (overlaps op12). */ - GCRef gcr; /* GCobj constant (overlaps op12). */ - MRef ptr; /* Pointer constant (overlaps op12). */ + GCRef gcr; /* GCobj constant (overlaps op12 or entire slot). */ + MRef ptr; /* Pointer constant (overlaps op12 or entire slot). */ + TValue tv; /* TValue constant (overlaps entire slot). */ } IRIns; /* TODO_GC64: major changes required. */ @@ -560,10 +563,10 @@ typedef union IRIns { #define ir_ktab(ir) (gco2tab(ir_kgc((ir)))) #define ir_kfunc(ir) (gco2func(ir_kgc((ir)))) #define ir_kcdata(ir) (gco2cd(ir_kgc((ir)))) -#define ir_knum(ir) check_exp((ir)->o == IR_KNUM, mref((ir)->ptr, cTValue)) -#define ir_kint64(ir) check_exp((ir)->o == IR_KINT64, mref((ir)->ptr,cTValue)) +#define ir_knum(ir) check_exp((ir)->o == IR_KNUM, &(ir)[1].tv) +#define ir_kint64(ir) check_exp((ir)->o == IR_KINT64, &(ir)[1].tv) #define ir_k64(ir) \ - check_exp((ir)->o == IR_KNUM || (ir)->o == IR_KINT64, mref((ir)->ptr,cTValue)) + check_exp((ir)->o == IR_KNUM || (ir)->o == IR_KINT64, &(ir)[1].tv) #define ir_kptr(ir) \ check_exp((ir)->o == IR_KPTR || (ir)->o == IR_KKPTR, mref((ir)->ptr, void)) diff --git a/src/lj_iropt.h b/src/lj_iropt.h index fdc5f0d2..219d391a 100644 --- a/src/lj_iropt.h +++ b/src/lj_iropt.h @@ -41,7 +41,7 @@ LJ_FUNC TRef lj_ir_ggfload(jit_State *J, IRType t, uintptr_t ofs); /* Interning of constants. */ LJ_FUNC TRef LJ_FASTCALL lj_ir_kint(jit_State *J, int32_t k); LJ_FUNC void lj_ir_k64_freeall(jit_State *J); -LJ_FUNC TRef lj_ir_k64(jit_State *J, IROp op, cTValue *tv); +LJ_FUNC TRef lj_ir_k64(jit_State *J, IROp op, uint64_t u64); LJ_FUNC TValue *lj_ir_k64_reserve(jit_State *J); LJ_FUNC cTValue *lj_ir_k64_find(jit_State *J, uint64_t u64); LJ_FUNC TRef lj_ir_knum_u64(jit_State *J, uint64_t u64); diff --git a/src/lj_jit.h b/src/lj_jit.h index eafbc327..e9ab319e 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -355,8 +355,8 @@ enum { /* Fold state is used to fold instructions on-the-fly. */ typedef struct FoldState { IRIns ins; /* Currently emitted instruction. */ - IRIns left; /* Instruction referenced by left operand. */ - IRIns right; /* Instruction referenced by right operand. */ + IRIns left[2]; /* Instruction referenced by left operand. */ + IRIns right[2]; /* Instruction referenced by right operand. */ } FoldState; /* JIT compiler state. */ diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index c102f2db..73a368ed 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -136,8 +136,8 @@ /* Some local macros to save typing. Undef'd at the end. */ #define IR(ref) (&J->cur.ir[(ref)]) #define fins (&J->fold.ins) -#define fleft (&J->fold.left) -#define fright (&J->fold.right) +#define fleft (J->fold.left) +#define fright (J->fold.right) #define knumleft (ir_knum(fleft)->n) #define knumright (ir_knum(fright)->n) @@ -2393,10 +2393,14 @@ retry: if (fins->op1 >= J->cur.nk) { key += (uint32_t)IR(fins->op1)->o << 10; *fleft = *IR(fins->op1); + if (fins->op1 < REF_TRUE) + fleft[1] = IR(fins->op1)[1]; } if (fins->op2 >= J->cur.nk) { key += (uint32_t)IR(fins->op2)->o; *fright = *IR(fins->op2); + if (fins->op2 < REF_TRUE) + fright[1] = IR(fins->op2)[1]; } else { key += (fins->op2 & 0x3ffu); /* Literal mask. Must include IRCONV_*MASK. */ } diff --git a/src/lj_opt_mem.c b/src/lj_opt_mem.c index 5549b0d0..92ecbb48 100644 --- a/src/lj_opt_mem.c +++ b/src/lj_opt_mem.c @@ -22,8 +22,8 @@ /* Some local macros to save typing. Undef'd at the end. */ #define IR(ref) (&J->cur.ir[(ref)]) #define fins (&J->fold.ins) -#define fleft (&J->fold.left) -#define fright (&J->fold.right) +#define fleft (J->fold.left) +#define fright (J->fold.right) /* ** Caveat #1: return value is not always a TRef -- only use with tref_ref(). diff --git a/src/lj_opt_sink.c b/src/lj_opt_sink.c index 49e13784..1b775f2d 100644 --- a/src/lj_opt_sink.c +++ b/src/lj_opt_sink.c @@ -220,6 +220,8 @@ static void sink_sweep_ins(jit_State *J) for (ir = IR(J->cur.nk); ir < irbase; ir++) { irt_clearmark(ir->t); ir->prev = REGSP_INIT; + if (irt_is64(ir->t) && ir->o != IR_KNULL) + ir++; } } diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c index 49c9ae47..19818660 100644 --- a/src/lj_opt_split.c +++ b/src/lj_opt_split.c @@ -354,6 +354,8 @@ static void split_ir(jit_State *J) ir->prev = ref; /* Identity substitution for loword. */ hisubst[ref] = 0; } + if (irt_is64(ir->t) && ir->o != IR_KNULL) + ref++; } /* Process old IR instructions. */ diff --git a/src/lj_record.c b/src/lj_record.c index b5fb6649..3b754897 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -61,7 +61,10 @@ static void rec_check_ir(jit_State *J) case IRMref: lua_assert(op1 >= nk); lua_assert(i >= REF_BIAS ? op1 < i : op1 > i); break; case IRMlit: break; - case IRMcst: lua_assert(i < REF_BIAS); continue; + case IRMcst: lua_assert(i < REF_BIAS); + if (irt_is64(ir->t) && ir->o != IR_KNULL) + i++; + continue; } switch (irm_op2(mode)) { case IRMnone: lua_assert(op2 == 0); break; diff --git a/src/lj_snap.c b/src/lj_snap.c index 8638d9ed..6199b1f0 100644 --- a/src/lj_snap.c +++ b/src/lj_snap.c @@ -371,8 +371,8 @@ static TRef snap_replay_const(jit_State *J, IRIns *ir) case IR_KPRI: return TREF_PRI(irt_type(ir->t)); case IR_KINT: return lj_ir_kint(J, ir->i); case IR_KGC: return lj_ir_kgc(J, ir_kgc(ir), irt_t(ir->t)); - case IR_KNUM: return lj_ir_k64(J, IR_KNUM, ir_knum(ir)); - case IR_KINT64: return lj_ir_k64(J, IR_KINT64, ir_kint64(ir)); + case IR_KNUM: case IR_KINT64: + return lj_ir_k64(J, (IROp)ir->o, ir_k64(ir)->u64); case IR_KPTR: return lj_ir_kptr(J, ir_kptr(ir)); /* Continuation. */ default: lua_assert(0); return TREF_NIL; break; } @@ -555,8 +555,7 @@ void lj_snap_replay(jit_State *J, GCtrace *T) if (irref_isk(irs->op2) && irref_isk((irs+1)->op2)) { uint64_t k = (uint32_t)T->ir[irs->op2].i + ((uint64_t)T->ir[(irs+1)->op2].i << 32); - val = lj_ir_k64(J, t == IRT_I64 ? IR_KINT64 : IR_KNUM, - lj_ir_k64_find(J, k)); + val = lj_ir_k64(J, t == IRT_I64 ? IR_KINT64 : IR_KNUM, k); } else { val = emitir_raw(IRT(IR_HIOP, t), val, snap_pref(J, T, map, nent, seen, (irs+1)->op2)); @@ -651,7 +650,7 @@ static void snap_restoredata(GCtrace *T, ExitState *ex, uint64_t tmp; if (irref_isk(ref)) { if (ir->o == IR_KNUM || ir->o == IR_KINT64) { - src = mref(ir->ptr, int32_t); + src = (int32_t *)&ir[1]; } else if (sz == 8) { tmp = (uint64_t)(uint32_t)ir->i; src = (int32_t *)&tmp; From 9e99ccc360bc9784ebe5ce29d5fa2c72acfc5777 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 23 May 2016 00:27:51 +0200 Subject: [PATCH 25/57] Strip out old infrastructure for 64 bit constants. Contributed by Peter Cawley. --- src/lj_ir.c | 74 -------------------------------------------------- src/lj_iropt.h | 3 -- src/lj_jit.h | 1 - src/lj_trace.c | 2 -- 4 files changed, 80 deletions(-) diff --git a/src/lj_ir.c b/src/lj_ir.c index 124d5791..9c0a2224 100644 --- a/src/lj_ir.c +++ b/src/lj_ir.c @@ -204,80 +204,6 @@ found: return TREF(ref, IRT_INT); } -/* The MRef inside the KNUM/KINT64 IR instructions holds the address of the -** 64 bit constant. The constants themselves are stored in a chained array -** and shared across traces. -** -** Rationale for choosing this data structure: -** - The address of the constants is embedded in the generated machine code -** and must never move. A resizable array or hash table wouldn't work. -** - Most apps need very few non-32 bit integer constants (less than a dozen). -** - Linear search is hard to beat in terms of speed and low complexity. -*/ -typedef struct K64Array { - MRef next; /* Pointer to next list. */ - MSize numk; /* Number of used elements in this array. */ - TValue k[LJ_MIN_K64SZ]; /* Array of constants. */ -} K64Array; - -/* Free all chained arrays. */ -void lj_ir_k64_freeall(jit_State *J) -{ - K64Array *k; - for (k = mref(J->k64p, K64Array); k; ) { - K64Array *next = mref(k->next, K64Array); - lj_mem_free(J2G(J), k, sizeof(K64Array)); - k = next; - } - setmref(J->k64p, NULL); -} - -/* Get new 64 bit constant slot. */ -static TValue *ir_k64_add(jit_State *J, K64Array *kp, uint64_t u64) -{ - TValue *ntv; - if (!(kp && kp->numk < LJ_MIN_K64SZ)) { /* Allocate a new array. */ - K64Array *kn = lj_mem_newt(J->L, sizeof(K64Array), K64Array); - setmref(kn->next, NULL); - kn->numk = 0; - if (kp) - setmref(kp->next, kn); /* Chain to the end of the list. */ - else - setmref(J->k64p, kn); /* Link first array. */ - kp = kn; - } - ntv = &kp->k[kp->numk++]; /* Add to current array. */ - ntv->u64 = u64; - return ntv; -} - -/* Find 64 bit constant in chained array or add it. */ -cTValue *lj_ir_k64_find(jit_State *J, uint64_t u64) -{ - K64Array *k, *kp = NULL; - MSize idx; - /* Search for the constant in the whole chain of arrays. */ - for (k = mref(J->k64p, K64Array); k; k = mref(k->next, K64Array)) { - kp = k; /* Remember previous element in list. */ - for (idx = 0; idx < k->numk; idx++) { /* Search one array. */ - TValue *tv = &k->k[idx]; - if (tv->u64 == u64) /* Needed for +-0/NaN/absmask. */ - return tv; - } - } - /* Otherwise add a new constant. */ - return ir_k64_add(J, kp, u64); -} - -TValue *lj_ir_k64_reserve(jit_State *J) -{ - K64Array *k, *kp = NULL; - lj_ir_k64_find(J, 0); /* Intern dummy 0 to protect the reserved slot. */ - /* Find last K64Array, if any. */ - for (k = mref(J->k64p, K64Array); k; k = mref(k->next, K64Array)) kp = k; - return ir_k64_add(J, kp, 0); /* Set to 0. Final value is set later. */ -} - /* Intern 64 bit constant, given by its 64 bit pattern. */ TRef lj_ir_k64(jit_State *J, IROp op, uint64_t u64) { diff --git a/src/lj_iropt.h b/src/lj_iropt.h index 219d391a..8b7a43de 100644 --- a/src/lj_iropt.h +++ b/src/lj_iropt.h @@ -40,10 +40,7 @@ LJ_FUNC TRef lj_ir_ggfload(jit_State *J, IRType t, uintptr_t ofs); /* Interning of constants. */ LJ_FUNC TRef LJ_FASTCALL lj_ir_kint(jit_State *J, int32_t k); -LJ_FUNC void lj_ir_k64_freeall(jit_State *J); LJ_FUNC TRef lj_ir_k64(jit_State *J, IROp op, uint64_t u64); -LJ_FUNC TValue *lj_ir_k64_reserve(jit_State *J); -LJ_FUNC cTValue *lj_ir_k64_find(jit_State *J, uint64_t u64); LJ_FUNC TRef lj_ir_knum_u64(jit_State *J, uint64_t u64); LJ_FUNC TRef lj_ir_knumint(jit_State *J, lua_Number n); LJ_FUNC TRef lj_ir_kint64(jit_State *J, uint64_t u64); diff --git a/src/lj_jit.h b/src/lj_jit.h index e9ab319e..55fbea8b 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -392,7 +392,6 @@ typedef struct jit_State { int32_t framedepth; /* Current frame depth. */ int32_t retdepth; /* Return frame depth (count of RETF). */ - MRef k64p; /* Pointer to chained array of 64 bit constants. */ TValue ksimd[LJ_KSIMD__MAX*2+1]; /* 16 byte aligned SIMD constants. */ TValue k64[LJ_K64__MAX]; /* Common 8 byte constants used by backends. */ uint32_t k32[LJ_K32__MAX]; /* Ditto for 4 byte constants. */ diff --git a/src/lj_trace.c b/src/lj_trace.c index eaf9365c..87146832 100644 --- a/src/lj_trace.c +++ b/src/lj_trace.c @@ -295,7 +295,6 @@ int lj_trace_flushall(lua_State *L) memset(J->penalty, 0, sizeof(J->penalty)); /* Free the whole machine code and invalidate all exit stub groups. */ lj_mcode_free(J); - lj_ir_k64_freeall(J); memset(J->exitstubgroup, 0, sizeof(J->exitstubgroup)); lj_vmevent_send(L, TRACE, setstrV(L, L->top++, lj_str_newlit(L, "flush")); @@ -351,7 +350,6 @@ void lj_trace_freestate(global_State *g) } #endif lj_mcode_free(J); - lj_ir_k64_freeall(J); lj_mem_freevec(g, J->snapmapbuf, J->sizesnapmap, SnapEntry); lj_mem_freevec(g, J->snapbuf, J->sizesnap, SnapShot); lj_mem_freevec(g, J->irbuf + J->irbotlim, J->irtoplim - J->irbotlim, IRIns); From f26679c7195e99b32f95ea9f25b47b9981e15d3d Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 23 May 2016 00:34:05 +0200 Subject: [PATCH 26/57] LJ_GC64: Add support for 64 bit GCobj constants in the IR. Contributed by Peter Cawley. --- src/lj_asm.c | 11 ++++++++++- src/lj_ir.c | 29 ++++++++++++++++++++--------- src/lj_ir.h | 6 +++--- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index 0b3e770a..9f784cc8 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -704,6 +704,11 @@ static void ra_left(ASMState *as, Reg dest, IRRef lref) } else if (ir->o == IR_KINT64) { emit_loadk64(as, dest, ir); return; +#if LJ_GC64 + } else if (ir->o == IR_KGC || ir->o == IR_KPTR || ir->o == IR_KKPTR) { + emit_loadk64(as, dest, ir); + return; +#endif #endif } else if (ir->o != IR_KPRI) { lua_assert(ir->o == IR_KINT || ir->o == IR_KGC || @@ -1933,7 +1938,7 @@ static void asm_tail_link(ASMState *as) emit_addptr(as, RID_BASE, 8*(int32_t)baseslot); if (as->J->ktrace) { /* Patch ktrace slot with the final GCtrace pointer. */ - setgcref(IR(as->J->ktrace)->gcr, obj2gco(as->J->curfinal)); + setgcref(IR(as->J->ktrace)[LJ_GC64].gcr, obj2gco(as->J->curfinal)); IR(as->J->ktrace)->o = IR_KGC; } @@ -1965,8 +1970,12 @@ static void asm_setup_regsp(ASMState *as) for (ir = IR(T->nk), lastir = IR(REF_BASE); ir < lastir; ir++) { ir->prev = REGSP_INIT; if (irt_is64(ir->t) && ir->o != IR_KNULL) { +#if LJ_GC64 + ir->i = 0; /* Will become non-zero only for RIP-relative addresses. */ +#else /* Make life easier for backends by putting address of constant in i. */ ir->i = (int32_t)(intptr_t)(ir+1); +#endif ir++; } } diff --git a/src/lj_ir.c b/src/lj_ir.c index 9c0a2224..0a206ebb 100644 --- a/src/lj_ir.c +++ b/src/lj_ir.c @@ -185,6 +185,12 @@ static LJ_AINLINE IRRef ir_nextk64(jit_State *J) return ref; } +#if LJ_GC64 +#define ir_nextkgc ir_nextk64 +#else +#define ir_nextkgc ir_nextk +#endif + /* Intern int32_t constant. */ TRef LJ_FASTCALL lj_ir_kint(jit_State *J, int32_t k) { @@ -268,15 +274,14 @@ TRef lj_ir_kgc(jit_State *J, GCobj *o, IRType t) { IRIns *ir, *cir = J->cur.ir; IRRef ref; - lua_assert(!LJ_GC64); /* TODO_GC64: major changes required. */ lua_assert(!isdead(J2G(J), o)); for (ref = J->chain[IR_KGC]; ref; ref = cir[ref].prev) if (ir_kgc(&cir[ref]) == o) goto found; - ref = ir_nextk(J); + ref = ir_nextkgc(J); ir = IR(ref); /* NOBARRIER: Current trace is a GC root. */ - setgcref(ir->gcr, o); + setgcref(ir[LJ_GC64].gcr, o); ir->t.irt = (uint8_t)t; ir->o = IR_KGC; ir->prev = J->chain[IR_KGC]; @@ -288,33 +293,39 @@ found: /* Allocate GCtrace constant placeholder (no interning). */ TRef lj_ir_ktrace(jit_State *J) { - IRRef ref = ir_nextk(J); + IRRef ref = ir_nextkgc(J); IRIns *ir = IR(ref); lua_assert(irt_toitype_(IRT_P64) == LJ_TTRACE); ir->t.irt = IRT_P64; - ir->o = IR_KNULL; /* Not IR_KGC yet, but same size. */ + ir->o = LJ_GC64 ? IR_KNUM : IR_KNULL; /* Not IR_KGC yet, but same size. */ ir->prev = 0; return TREF(ref, IRT_P64); } -/* Intern 32 bit pointer constant. */ +/* Intern pointer constant. */ TRef lj_ir_kptr_(jit_State *J, IROp op, void *ptr) { IRIns *ir, *cir = J->cur.ir; IRRef ref; +#if LJ_64 && !LJ_GC64 lua_assert((void *)(uintptr_t)u32ptr(ptr) == ptr); +#endif for (ref = J->chain[op]; ref; ref = cir[ref].prev) if (ir_kptr(&cir[ref]) == ptr) goto found; +#if LJ_GC64 + ref = ir_nextk64(J); +#else ref = ir_nextk(J); +#endif ir = IR(ref); - setmref(ir->ptr, ptr); - ir->t.irt = IRT_P32; + setmref(ir[LJ_GC64].ptr, ptr); + ir->t.irt = IRT_PGC; ir->o = op; ir->prev = J->chain[op]; J->chain[op] = (IRRef1)ref; found: - return TREF(ref, IRT_P32); + return TREF(ref, IRT_PGC); } /* Intern typed NULL constant. */ diff --git a/src/lj_ir.h b/src/lj_ir.h index 03377ec1..8b2fe638 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -557,8 +557,7 @@ typedef union IRIns { TValue tv; /* TValue constant (overlaps entire slot). */ } IRIns; -/* TODO_GC64: major changes required. */ -#define ir_kgc(ir) check_exp((ir)->o == IR_KGC, gcref((ir)->gcr)) +#define ir_kgc(ir) check_exp((ir)->o == IR_KGC, gcref((ir)[LJ_GC64].gcr)) #define ir_kstr(ir) (gco2str(ir_kgc((ir)))) #define ir_ktab(ir) (gco2tab(ir_kgc((ir)))) #define ir_kfunc(ir) (gco2func(ir_kgc((ir)))) @@ -568,7 +567,8 @@ typedef union IRIns { #define ir_k64(ir) \ check_exp((ir)->o == IR_KNUM || (ir)->o == IR_KINT64, &(ir)[1].tv) #define ir_kptr(ir) \ - check_exp((ir)->o == IR_KPTR || (ir)->o == IR_KKPTR, mref((ir)->ptr, void)) + check_exp((ir)->o == IR_KPTR || (ir)->o == IR_KKPTR, \ + mref((ir)[LJ_GC64].ptr, void)) /* A store or any other op with a non-weak guard has a side-effect. */ static LJ_AINLINE int ir_sideeff(IRIns *ir) From 8f868a9d02340bae8b3b4a703118b324213f5c6d Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 23 May 2016 00:38:18 +0200 Subject: [PATCH 27/57] LJ_GC64: Update IR type sizes. Contributed by Peter Cawley. --- src/lj_ir.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/lj_ir.h b/src/lj_ir.h index 8b2fe638..3de57046 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -294,7 +294,9 @@ LJ_DATA const uint8_t lj_ir_mode[IR__MAX+1]; /* -- IR instruction types ------------------------------------------------ */ -/* Map of itypes to non-negative numbers. ORDER LJ_T. +#define IRTSIZE_PGC (LJ_GC64 ? 8 : 4) + +/* Map of itypes to non-negative numbers and their sizes. ORDER LJ_T. ** LJ_TUPVAL/LJ_TTRACE never appear in a TValue. Use these itypes for ** IRT_P32 and IRT_P64, which never escape the IR. ** The various integers are only used in the IR and can only escape to @@ -302,12 +304,13 @@ LJ_DATA const uint8_t lj_ir_mode[IR__MAX+1]; ** contiguous and next to IRT_NUM (see the typerange macros below). */ #define IRTDEF(_) \ - _(NIL, 4) _(FALSE, 4) _(TRUE, 4) _(LIGHTUD, LJ_64 ? 8 : 4) _(STR, 4) \ - _(P32, 4) _(THREAD, 4) _(PROTO, 4) _(FUNC, 4) _(P64, 8) _(CDATA, 4) \ - _(TAB, 4) _(UDATA, 4) \ + _(NIL, 4) _(FALSE, 4) _(TRUE, 4) _(LIGHTUD, LJ_64 ? 8 : 4) \ + _(STR, IRTSIZE_PGC) _(P32, 4) _(THREAD, IRTSIZE_PGC) _(PROTO, IRTSIZE_PGC) \ + _(FUNC, IRTSIZE_PGC) _(P64, 8) _(CDATA, IRTSIZE_PGC) _(TAB, IRTSIZE_PGC) \ + _(UDATA, IRTSIZE_PGC) \ _(FLOAT, 4) _(NUM, 8) _(I8, 1) _(U8, 1) _(I16, 2) _(U16, 2) \ _(INT, 4) _(U32, 4) _(I64, 8) _(U64, 8) \ - _(SOFTFP, 4) /* There is room for 9 more types. */ + _(SOFTFP, 4) /* There is room for 8 more types. */ /* IR result type and flags (8 bit). */ typedef enum { From 6c8258d74b7d4ae7f288897518f23c809b9395f2 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 23 May 2016 01:49:00 +0200 Subject: [PATCH 28/57] LJ_FR2: Add support for trace recording and snapshots. Contributed by Peter Cawley. --- src/jit/dump.lua | 10 +- src/lj_arch.h | 2 +- src/lj_asm.c | 6 +- src/lj_asm_x86.h | 22 ++++- src/lj_crecord.c | 30 ++++-- src/lj_def.h | 2 + src/lj_ffrecord.c | 49 ++++++---- src/lj_ir.h | 2 +- src/lj_jit.h | 14 ++- src/lj_record.c | 231 ++++++++++++++++++++++++++++++---------------- src/lj_snap.c | 61 +++++++++--- 11 files changed, 291 insertions(+), 138 deletions(-) diff --git a/src/jit/dump.lua b/src/jit/dump.lua index 9a722f73..a635af10 100644 --- a/src/jit/dump.lua +++ b/src/jit/dump.lua @@ -310,15 +310,17 @@ local function fmtfunc(func, pc) end end -local function formatk(tr, idx) +local function formatk(tr, idx, sn) local k, t, slot = tracek(tr, idx) local tn = type(k) local s if tn == "number" then - if k == 2^52+2^51 then + if band(sn or 0, 0x30000) ~= 0 then + s = band(sn, 0x20000) ~= 0 and "contpc" or "ftsz" + elseif k == 2^52+2^51 then s = "bias" else - s = format("%+.14g", k) + s = format(0 < k and k < 0x1p-1026 and "%+a" or "%+.14g", k) end elseif tn == "string" then s = format(#k > 20 and '"%.20s"~' or '"%s"', gsub(k, "%c", ctlsub)) @@ -354,7 +356,7 @@ local function printsnap(tr, snap) n = n + 1 local ref = band(sn, 0xffff) - 0x8000 -- REF_BIAS if ref < 0 then - out:write(formatk(tr, ref)) + out:write(formatk(tr, ref, sn)) elseif band(sn, 0x80000) ~= 0 then -- SNAP_SOFTFPNUM out:write(colorize(format("%04d/%04d", ref, ref+1), 14)) else diff --git a/src/lj_arch.h b/src/lj_arch.h index 612c7303..72622a21 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -453,7 +453,7 @@ #endif /* Disable or enable the JIT compiler. */ -#if defined(LUAJIT_DISABLE_JIT) || defined(LJ_ARCH_NOJIT) || defined(LJ_OS_NOJIT) || LJ_FR2 || LJ_GC64 +#if defined(LUAJIT_DISABLE_JIT) || defined(LJ_ARCH_NOJIT) || defined(LJ_OS_NOJIT) || LJ_GC64 #define LJ_HASJIT 0 #else #define LJ_HASJIT 1 diff --git a/src/lj_asm.c b/src/lj_asm.c index 9f784cc8..5dd7ca3a 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -1893,7 +1893,7 @@ static BCReg asm_baseslot(ASMState *as, SnapShot *snap, int *gotframe) SnapEntry sn = map[n-1]; if ((sn & SNAP_FRAME)) { *gotframe = 1; - return snap_slot(sn); + return snap_slot(sn) - LJ_FR2; } } return 0; @@ -1913,7 +1913,7 @@ static void asm_tail_link(ASMState *as) if (as->T->link == 0) { /* Setup fixed registers for exit to interpreter. */ - const BCIns *pc = snap_pc(as->T->snapmap[snap->mapofs + snap->nent]); + const BCIns *pc = snap_pc(&as->T->snapmap[snap->mapofs + snap->nent]); int32_t mres; if (bc_op(*pc) == BC_JLOOP) { /* NYI: find a better way to do this. */ BCIns *retpc = &traceref(as->J, bc_d(*pc))->startins; @@ -1922,7 +1922,7 @@ static void asm_tail_link(ASMState *as) } ra_allockreg(as, i32ptr(J2GG(as->J)->dispatch), RID_DISPATCH); ra_allockreg(as, i32ptr(pc), RID_LPC); - mres = (int32_t)(snap->nslots - baseslot); + mres = (int32_t)(snap->nslots - baseslot - LJ_FR2); switch (bc_op(*pc)) { case BC_CALLM: case BC_CALLMT: mres -= (int32_t)(1 + LJ_FR2 + bc_a(*pc) + bc_c(*pc)); break; diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 0361a965..83fe22b2 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -348,7 +348,8 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) && noconflict(as, ref, IR_RETF, 0)) { as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow); - as->mrm.ofs = 8*((int32_t)ir->op1-1) + ((ir->op2&IRSLOAD_FRAME)?4:0); + as->mrm.ofs = 8*((int32_t)ir->op1-1-LJ_FR2) + + (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0); as->mrm.idx = RID_NONE; return RID_MRM; } @@ -655,6 +656,9 @@ static void asm_callx(ASMState *as, IRIns *ir) static void asm_retf(ASMState *as, IRIns *ir) { Reg base = ra_alloc1(as, REF_BASE, RSET_GPR); +#if LJ_FR2 + Reg rpc = ra_scratch(as, rset_exclude(RSET_GPR, base)); +#endif void *pc = ir_kptr(IR(ir->op2)); int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1)); as->topslot -= (BCReg)delta; @@ -663,7 +667,12 @@ static void asm_retf(ASMState *as, IRIns *ir) emit_setgl(as, base, jit_base); emit_addptr(as, base, -8*delta); asm_guardcc(as, CC_NE); +#if LJ_FR2 + emit_rmro(as, XO_CMP, rpc, base, -8); + emit_loadu64(as, rpc, u64ptr(pc)); +#else emit_gmroi(as, XG_ARITHi(XOg_CMP), base, -4, ptr2addr(pc)); +#endif } /* -- Type conversions ---------------------------------------------------- */ @@ -1397,7 +1406,8 @@ static void asm_ahustore(ASMState *as, IRIns *ir) static void asm_sload(ASMState *as, IRIns *ir) { - int32_t ofs = 8*((int32_t)ir->op1-1) + ((ir->op2 & IRSLOAD_FRAME) ? 4 : 0); + int32_t ofs = 8*((int32_t)ir->op1-1-LJ_FR2) + + (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0); IRType1 t = ir->t; Reg base; lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */ @@ -2383,13 +2393,15 @@ static void asm_stack_check(ASMState *as, BCReg topslot, static void asm_stack_restore(ASMState *as, SnapShot *snap) { SnapEntry *map = &as->T->snapmap[snap->mapofs]; - SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1]; +#if !LJ_FR2 || defined(LUA_USE_ASSERT) + SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2]; +#endif MSize n, nent = snap->nent; /* Store the value of all modified slots to the Lua stack. */ for (n = 0; n < nent; n++) { SnapEntry sn = map[n]; BCReg s = snap_slot(sn); - int32_t ofs = 8*((int32_t)s-1); + int32_t ofs = 8*((int32_t)s-1-LJ_FR2); IRRef ref = snap_ref(sn); IRIns *ir = IR(ref); if ((sn & SNAP_NORESTORE)) @@ -2407,8 +2419,10 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap) emit_movmroi(as, RID_BASE, ofs, ir->i); } if ((sn & (SNAP_CONT|SNAP_FRAME))) { +#if !LJ_FR2 if (s != 0) /* Do not overwrite link to previous frame. */ emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*flinks--)); +#endif } else { if (!(LJ_64 && irt_islightud(ir->t))) emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t)); diff --git a/src/lj_crecord.c b/src/lj_crecord.c index c0f7e3d7..d568b20a 100644 --- a/src/lj_crecord.c +++ b/src/lj_crecord.c @@ -712,6 +712,19 @@ static TRef crec_reassoc_ofs(jit_State *J, TRef tr, ptrdiff_t *ofsp, MSize sz) return tr; } +/* Tailcall to function. */ +static void crec_tailcall(jit_State *J, RecordFFData *rd, cTValue *tv) +{ + TRef kfunc = lj_ir_kfunc(J, funcV(tv)); +#if LJ_FR2 + J->base[-2] = kfunc; + J->base[-1] = TREF_FRAME; +#else + J->base[-1] = kfunc | TREF_FRAME; +#endif + rd->nres = -1; /* Pending tailcall. */ +} + /* Record ctype __index/__newindex metamethods. */ static void crec_index_meta(jit_State *J, CTState *cts, CType *ct, RecordFFData *rd) @@ -721,8 +734,7 @@ static void crec_index_meta(jit_State *J, CTState *cts, CType *ct, if (!tv) lj_trace_err(J, LJ_TRERR_BADTYPE); if (tvisfunc(tv)) { - J->base[-1] = lj_ir_kfunc(J, funcV(tv)) | TREF_FRAME; - rd->nres = -1; /* Pending tailcall. */ + crec_tailcall(J, rd, tv); } else if (rd->data == 0 && tvistab(tv) && tref_isstr(J->base[1])) { /* Specialize to result of __index lookup. */ cTValue *o = lj_tab_get(J->L, tabV(tv), &rd->argv[1]); @@ -1119,20 +1131,20 @@ static void crec_snap_caller(jit_State *J) lua_State *L = J->L; TValue *base = L->base, *top = L->top; const BCIns *pc = J->pc; - TRef ftr = J->base[-1]; + TRef ftr = J->base[-1-LJ_FR2]; ptrdiff_t delta; if (!frame_islua(base-1) || J->framedepth <= 0) lj_trace_err(J, LJ_TRERR_NYICALL); J->pc = frame_pc(base-1); delta = 1+LJ_FR2+bc_a(J->pc[-1]); L->top = base; L->base = base - delta; - J->base[-1] = TREF_FALSE; + J->base[-1-LJ_FR2] = TREF_FALSE; J->base -= delta; J->baseslot -= (BCReg)delta; - J->maxslot = (BCReg)delta; J->framedepth--; + J->maxslot = (BCReg)delta-LJ_FR2; J->framedepth--; lj_snap_add(J); L->base = base; L->top = top; J->framedepth++; J->maxslot = 1; J->base += delta; J->baseslot += (BCReg)delta; - J->base[-1] = ftr; J->pc = pc; + J->base[-1-LJ_FR2] = ftr; J->pc = pc; } /* Record function call. */ @@ -1224,8 +1236,7 @@ void LJ_FASTCALL recff_cdata_call(jit_State *J, RecordFFData *rd) tv = lj_ctype_meta(cts, ctype_isptr(ct->info) ? ctype_cid(ct->info) : id, mm); if (tv) { if (tvisfunc(tv)) { - J->base[-1] = lj_ir_kfunc(J, funcV(tv)) | TREF_FRAME; - rd->nres = -1; /* Pending tailcall. */ + crec_tailcall(J, rd, tv); return; } } else if (mm == MM_new) { @@ -1373,8 +1384,7 @@ static TRef crec_arith_meta(jit_State *J, TRef *sp, CType **s, CTState *cts, } if (tv) { if (tvisfunc(tv)) { - J->base[-1] = lj_ir_kfunc(J, funcV(tv)) | TREF_FRAME; - rd->nres = -1; /* Pending tailcall. */ + crec_tailcall(J, rd, tv); return 0; } /* NYI: non-function metamethods. */ } else if ((MMS)rd->data == MM_eq) { /* Fallback cdata pointer comparison. */ diff --git a/src/lj_def.h b/src/lj_def.h index 29d3fdda..9413399d 100644 --- a/src/lj_def.h +++ b/src/lj_def.h @@ -95,6 +95,8 @@ typedef unsigned int uintptr_t; #define U64x(hi, lo) (((uint64_t)0x##hi << 32) + (uint64_t)0x##lo) #define i32ptr(p) ((int32_t)(intptr_t)(void *)(p)) #define u32ptr(p) ((uint32_t)(intptr_t)(void *)(p)) +#define i64ptr(p) ((int64_t)(intptr_t)(void *)(p)) +#define u64ptr(p) ((uint64_t)(intptr_t)(void *)(p)) #define checki8(x) ((x) == (int32_t)(int8_t)(x)) #define checku8(x) ((x) == (int32_t)(uint8_t)(x)) diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index ae567622..64a9a65d 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -102,35 +102,41 @@ static void recff_stitch(jit_State *J) ASMFunction cont = lj_cont_stitch; lua_State *L = J->L; TValue *base = L->base; + BCReg nslot = J->maxslot + 1 + LJ_FR2; + TValue *nframe = base + 1 + LJ_FR2; const BCIns *pc = frame_pc(base-1); TValue *pframe = frame_prevl(base-1); - lua_assert(!LJ_FR2); /* TODO_FR2: handle frame shift. */ /* Move func + args up in Lua stack and insert continuation. */ - memmove(&base[1], &base[-1], sizeof(TValue)*(J->maxslot+1)); - setframe_ftsz(base+1, ((char *)(base+1) - (char *)pframe) + FRAME_CONT); - setcont(base, cont); + memmove(&base[1], &base[-1-LJ_FR2], sizeof(TValue)*nslot); + setframe_ftsz(nframe, ((char *)nframe - (char *)pframe) + FRAME_CONT); + setcont(base-LJ_FR2, cont); setframe_pc(base, pc); - setnilV(base-1); /* Incorrect, but rec_check_slots() won't run anymore. */ - L->base += 2; - L->top += 2; + setnilV(base-1-LJ_FR2); /* Incorrect, but rec_check_slots() won't run anymore. */ + L->base += 2 + LJ_FR2; + L->top += 2 + LJ_FR2; /* Ditto for the IR. */ - memmove(&J->base[1], &J->base[-1], sizeof(TRef)*(J->maxslot+1)); + memmove(&J->base[1], &J->base[-1-LJ_FR2], sizeof(TRef)*nslot); +#if LJ_FR2 + J->base[2] = TREF_FRAME; + J->base[-1] = lj_ir_k64(J, IR_KNUM, u64ptr(contptr(cont))); + J->base[0] = lj_ir_k64(J, IR_KNUM, u64ptr(pc)) | TREF_CONT; +#else J->base[0] = lj_ir_kptr(J, contptr(cont)) | TREF_CONT; - J->base[-1] = lj_ir_ktrace(J); - J->ktrace = tref_ref(J->base[-1]); - J->base += 2; - J->baseslot += 2; +#endif + J->ktrace = tref_ref((J->base[-1-LJ_FR2] = lj_ir_ktrace(J))); + J->base += 2 + LJ_FR2; + J->baseslot += 2 + LJ_FR2; J->framedepth++; lj_record_stop(J, LJ_TRLINK_STITCH, 0); /* Undo Lua stack changes. */ - memmove(&base[-1], &base[1], sizeof(TValue)*(J->maxslot+1)); + memmove(&base[-1-LJ_FR2], &base[1], sizeof(TValue)*nslot); setframe_pc(base-1, pc); - L->base -= 2; - L->top -= 2; + L->base -= 2 + LJ_FR2; + L->top -= 2 + LJ_FR2; } /* Fallback handler for fast functions that are not recorded (yet). */ @@ -373,10 +379,10 @@ static int recff_metacall(jit_State *J, RecordFFData *rd, MMS mm) int errcode; TValue argv0; /* Temporarily insert metamethod below object. */ - J->base[1] = J->base[0]; + J->base[1+LJ_FR2] = J->base[0]; J->base[0] = ix.mobj; copyTV(J->L, &argv0, &rd->argv[0]); - copyTV(J->L, &rd->argv[1], &rd->argv[0]); + copyTV(J->L, &rd->argv[1+LJ_FR2], &rd->argv[0]); copyTV(J->L, &rd->argv[0], &ix.mobjv); /* Need to protect lj_record_tailcall because it may throw. */ errcode = lj_vm_cpcall(J->L, NULL, J, recff_metacall_cp); @@ -443,6 +449,10 @@ static void LJ_FASTCALL recff_xpairs(jit_State *J, RecordFFData *rd) static void LJ_FASTCALL recff_pcall(jit_State *J, RecordFFData *rd) { if (J->maxslot >= 1) { +#if LJ_FR2 + /* Shift function arguments up. */ + memmove(J->base + 1, J->base, sizeof(TRef) * J->maxslot); +#endif lj_record_call(J, 0, J->maxslot - 1); rd->nres = -1; /* Pending call. */ } /* else: Interpreter will throw. */ @@ -462,13 +472,16 @@ static void LJ_FASTCALL recff_xpcall(jit_State *J, RecordFFData *rd) TValue argv0, argv1; TRef tmp; int errcode; - lua_assert(!LJ_FR2); /* TODO_FR2: handle different frame setup. */ /* Swap function and traceback. */ tmp = J->base[0]; J->base[0] = J->base[1]; J->base[1] = tmp; copyTV(J->L, &argv0, &rd->argv[0]); copyTV(J->L, &argv1, &rd->argv[1]); copyTV(J->L, &rd->argv[0], &argv1); copyTV(J->L, &rd->argv[1], &argv0); +#if LJ_FR2 + /* Shift function arguments up. */ + memmove(J->base + 2, J->base + 1, sizeof(TRef) * (J->maxslot-1)); +#endif /* Need to protect lj_record_call because it may throw. */ errcode = lj_vm_cpcall(J->L, NULL, J, recff_xpcall_cp); /* Always undo Lua stack swap to avoid confusing the interpreter. */ diff --git a/src/lj_ir.h b/src/lj_ir.h index 3de57046..4e9c85c7 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -220,7 +220,7 @@ IRFLDEF(FLENUM) /* SLOAD mode bits, stored in op2. */ #define IRSLOAD_PARENT 0x01 /* Coalesce with parent trace. */ -#define IRSLOAD_FRAME 0x02 /* Load hiword of frame. */ +#define IRSLOAD_FRAME 0x02 /* Load 32 bits of ftsz. */ #define IRSLOAD_TYPECHECK 0x04 /* Needs type check. */ #define IRSLOAD_CONVERT 0x08 /* Number to integer conversion. */ #define IRSLOAD_READONLY 0x10 /* Read-only, omit slot store. */ diff --git a/src/lj_jit.h b/src/lj_jit.h index 55fbea8b..f460a0ab 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -179,14 +179,26 @@ LJ_STATIC_ASSERT(SNAP_CONT == TREF_CONT); #define SNAP(slot, flags, ref) (((SnapEntry)(slot) << 24) + (flags) + (ref)) #define SNAP_TR(slot, tr) \ (((SnapEntry)(slot) << 24) + ((tr) & (TREF_CONT|TREF_FRAME|TREF_REFMASK))) +#if !LJ_FR2 #define SNAP_MKPC(pc) ((SnapEntry)u32ptr(pc)) +#endif #define SNAP_MKFTSZ(ftsz) ((SnapEntry)(ftsz)) #define snap_ref(sn) ((sn) & 0xffff) #define snap_slot(sn) ((BCReg)((sn) >> 24)) #define snap_isframe(sn) ((sn) & SNAP_FRAME) -#define snap_pc(sn) ((const BCIns *)(uintptr_t)(sn)) #define snap_setref(sn, ref) (((sn) & (0xffff0000&~SNAP_NORESTORE)) | (ref)) +static LJ_AINLINE const BCIns *snap_pc(SnapEntry *sn) +{ +#if LJ_FR2 + uint64_t pcbase; + memcpy(&pcbase, sn, sizeof(uint64_t)); + return (const BCIns *)(pcbase >> 8); +#else + return (const BCIns *)(uintptr_t)*sn; +#endif +} + /* Snapshot and exit numbers. */ typedef uint32_t SnapNo; typedef uint32_t ExitNo; diff --git a/src/lj_record.c b/src/lj_record.c index 3b754897..f0481050 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -87,30 +87,48 @@ static void rec_check_slots(jit_State *J) BCReg s, nslots = J->baseslot + J->maxslot; int32_t depth = 0; cTValue *base = J->L->base - J->baseslot; - lua_assert(J->baseslot >= 1 && J->baseslot < LJ_MAX_JSLOTS); - lua_assert(J->baseslot == 1 || (J->slot[J->baseslot-1] & TREF_FRAME)); + lua_assert(J->baseslot >= 1+LJ_FR2 && J->baseslot < LJ_MAX_JSLOTS); + lua_assert(J->baseslot == 1+LJ_FR2 || (J->slot[J->baseslot-1] & TREF_FRAME)); lua_assert(nslots < LJ_MAX_JSLOTS); for (s = 0; s < nslots; s++) { TRef tr = J->slot[s]; if (tr) { cTValue *tv = &base[s]; IRRef ref = tref_ref(tr); - IRIns *ir; - lua_assert(ref >= J->cur.nk && ref < J->cur.nins); - ir = IR(ref); - lua_assert(irt_t(ir->t) == tref_t(tr)); + IRIns *ir = NULL; /* Silence compiler. */ + if (!LJ_FR2 || ref || !(tr & (TREF_FRAME | TREF_CONT))) { + lua_assert(ref >= J->cur.nk && ref < J->cur.nins); + ir = IR(ref); + lua_assert(irt_t(ir->t) == tref_t(tr)); + } if (s == 0) { lua_assert(tref_isfunc(tr)); +#if LJ_FR2 + } else if (s == 1) { + lua_assert(0); +#endif } else if ((tr & TREF_FRAME)) { GCfunc *fn = gco2func(frame_gc(tv)); BCReg delta = (BCReg)(tv - frame_prev(tv)); +#if LJ_FR2 + if (ref) + lua_assert(ir_knum(ir)->u64 == tv->u64); + tr = J->slot[s-1]; + ir = IR(tref_ref(tr)); +#endif lua_assert(tref_isfunc(tr)); if (tref_isk(tr)) lua_assert(fn == ir_kfunc(ir)); - lua_assert(s > delta ? (J->slot[s-delta] & TREF_FRAME) : (s == delta)); + lua_assert(s > delta + LJ_FR2 ? (J->slot[s-delta] & TREF_FRAME) + : (s == delta + LJ_FR2)); depth++; } else if ((tr & TREF_CONT)) { +#if LJ_FR2 + if (ref) + lua_assert(ir_knum(ir)->u64 == tv->u64); +#else lua_assert(ir_kptr(ir) == gcrefp(tv->gcr, void)); - lua_assert((J->slot[s+1] & TREF_FRAME)); +#endif + lua_assert((J->slot[s+1+LJ_FR2] & TREF_FRAME)); depth++; } else { if (tvisnumber(tv)) @@ -162,10 +180,10 @@ static TRef sload(jit_State *J, int32_t slot) /* Get TRef for current function. */ static TRef getcurrf(jit_State *J) { - if (J->base[-1]) - return J->base[-1]; - lua_assert(J->baseslot == 1); - return sloadt(J, -1, IRT_FUNC, IRSLOAD_READONLY); + if (J->base[-1-LJ_FR2]) + return J->base[-1-LJ_FR2]; + lua_assert(J->baseslot == 1+LJ_FR2); + return sloadt(J, -1-LJ_FR2, IRT_FUNC, IRSLOAD_READONLY); } /* Compare for raw object equality. @@ -509,7 +527,6 @@ static LoopEvent rec_for(jit_State *J, const BCIns *fori, int isforl) static LoopEvent rec_iterl(jit_State *J, const BCIns iterins) { BCReg ra = bc_a(iterins); - lua_assert(!LJ_FR2); /* TODO_FR2: handle different frame setup. */ if (!tref_isnil(getslot(J, ra))) { /* Looping back? */ J->base[ra-1] = J->base[ra]; /* Copy result of ITERC to control var. */ J->maxslot = ra-1+bc_b(J->pc[-1]); @@ -678,22 +695,31 @@ static void rec_call_setup(jit_State *J, BCReg func, ptrdiff_t nargs) { RecordIndex ix; TValue *functv = &J->L->base[func]; - TRef *fbase = &J->base[func]; + TRef kfunc, *fbase = &J->base[func]; ptrdiff_t i; - lua_assert(!LJ_FR2); /* TODO_FR2: handle different frame setup. */ - for (i = 0; i <= nargs; i++) - (void)getslot(J, func+i); /* Ensure func and all args have a reference. */ + (void)getslot(J, func); /* Ensure func has a reference. */ + for (i = 1; i <= nargs; i++) + (void)getslot(J, func+LJ_FR2+i); /* Ensure all args have a reference. */ if (!tref_isfunc(fbase[0])) { /* Resolve __call metamethod. */ ix.tab = fbase[0]; copyTV(J->L, &ix.tabv, functv); if (!lj_record_mm_lookup(J, &ix, MM_call) || !tref_isfunc(ix.mobj)) lj_trace_err(J, LJ_TRERR_NOMM); - for (i = ++nargs; i > 0; i--) /* Shift arguments up. */ - fbase[i] = fbase[i-1]; + for (i = ++nargs; i > LJ_FR2; i--) /* Shift arguments up. */ + fbase[i+LJ_FR2] = fbase[i+LJ_FR2-1]; +#if LJ_FR2 + fbase[2] = fbase[0]; +#endif fbase[0] = ix.mobj; /* Replace function. */ functv = &ix.mobjv; } - fbase[0] = TREF_FRAME | rec_call_specialize(J, funcV(functv), fbase[0]); + kfunc = rec_call_specialize(J, funcV(functv), fbase[0]); +#if LJ_FR2 + fbase[0] = kfunc; + fbase[1] = TREF_FRAME; +#else + fbase[0] = kfunc | TREF_FRAME; +#endif J->maxslot = (BCReg)nargs; } @@ -703,8 +729,8 @@ void lj_record_call(jit_State *J, BCReg func, ptrdiff_t nargs) rec_call_setup(J, func, nargs); /* Bump frame. */ J->framedepth++; - J->base += func+1; - J->baseslot += func+1; + J->base += func+1+LJ_FR2; + J->baseslot += func+1+LJ_FR2; } /* Record tail call. */ @@ -720,7 +746,9 @@ void lj_record_tailcall(jit_State *J, BCReg func, ptrdiff_t nargs) func += cbase; } /* Move func + args down. */ - memmove(&J->base[-1], &J->base[func], sizeof(TRef)*(J->maxslot+1)); + if (LJ_FR2 && J->baseslot == 2) + J->base[func+1] = 0; + memmove(&J->base[-1-LJ_FR2], &J->base[func], sizeof(TRef)*(J->maxslot+1+LJ_FR2)); /* Note: the new TREF_FRAME is now at J->base[-1] (even for slot #0). */ /* Tailcalls can form a loop, so count towards the loop unroll limit. */ if (++J->tailcalled > J->loopunroll) @@ -763,7 +791,7 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults) BCReg cbase = (BCReg)frame_delta(frame); if (--J->framedepth < 0) lj_trace_err(J, LJ_TRERR_NYIRETL); - lua_assert(J->baseslot > 1); + lua_assert(J->baseslot > 1+LJ_FR2); gotresults++; rbase += cbase; J->baseslot -= (BCReg)cbase; @@ -787,7 +815,7 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults) BCReg cbase = (BCReg)frame_delta(frame); if (--J->framedepth < 0) /* NYI: return of vararg func to lower frame. */ lj_trace_err(J, LJ_TRERR_NYIRETL); - lua_assert(J->baseslot > 1); + lua_assert(J->baseslot > 1+LJ_FR2); rbase += cbase; J->baseslot -= (BCReg)cbase; J->base -= cbase; @@ -797,8 +825,7 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults) BCIns callins = *(frame_pc(frame)-1); ptrdiff_t nresults = bc_b(callins) ? (ptrdiff_t)bc_b(callins)-1 :gotresults; BCReg cbase = bc_a(callins); - GCproto *pt = funcproto(frame_func(frame - (cbase+1-LJ_FR2))); - lua_assert(!LJ_FR2); /* TODO_FR2: handle different frame teardown. */ + GCproto *pt = funcproto(frame_func(frame - (cbase+1+LJ_FR2))); if ((pt->flags & PROTO_NOJIT)) lj_trace_err(J, LJ_TRERR_CJITOFF); if (J->framedepth == 0 && J->pt && frame == J->L->base - 1) { @@ -811,13 +838,13 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults) lj_snap_add(J); } for (i = 0; i < nresults; i++) /* Adjust results. */ - J->base[i-1] = i < gotresults ? J->base[rbase+i] : TREF_NIL; + J->base[i-1-LJ_FR2] = i < gotresults ? J->base[rbase+i] : TREF_NIL; J->maxslot = cbase+(BCReg)nresults; if (J->framedepth > 0) { /* Return to a frame that is part of the trace. */ J->framedepth--; - lua_assert(J->baseslot > cbase+1); - J->baseslot -= cbase+1; - J->base -= cbase+1; + lua_assert(J->baseslot > cbase+1+LJ_FR2); + J->baseslot -= cbase+1+LJ_FR2; + J->base -= cbase+1+LJ_FR2; } else if (J->parent == 0 && J->exitno == 0 && !bc_isret(bc_op(J->cur.startins))) { /* Return to lower frame would leave the loop in a root trace. */ @@ -827,13 +854,13 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults) } else { /* Return to lower frame. Guard for the target we return to. */ TRef trpt = lj_ir_kgc(J, obj2gco(pt), IRT_PROTO); TRef trpc = lj_ir_kptr(J, (void *)frame_pc(frame)); - emitir(IRTG(IR_RETF, IRT_P32), trpt, trpc); + emitir(IRTG(IR_RETF, IRT_PGC), trpt, trpc); J->retdepth++; J->needsnap = 1; - lua_assert(J->baseslot == 1); + lua_assert(J->baseslot == 1+LJ_FR2); /* Shift result slots up and clear the slots of the new frame below. */ - memmove(J->base + cbase, J->base-1, sizeof(TRef)*nresults); - memset(J->base-1, 0, sizeof(TRef)*(cbase+1)); + memmove(J->base + cbase, J->base-1-LJ_FR2, sizeof(TRef)*nresults); + memset(J->base-1-LJ_FR2, 0, sizeof(TRef)*(cbase+1+LJ_FR2)); } } else if (frame_iscont(frame)) { /* Return to continuation frame. */ ASMFunction cont = frame_contf(frame); @@ -842,32 +869,39 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults) lj_trace_err(J, LJ_TRERR_NYIRETL); J->baseslot -= (BCReg)cbase; J->base -= cbase; - J->maxslot = cbase-2; + J->maxslot = cbase-(2<base[dst] = gotresults ? J->base[cbase+rbase] : TREF_NIL; - if (dst >= J->maxslot) J->maxslot = dst+1; + if (dst >= J->maxslot) { + J->maxslot = dst+1; + } } else if (cont == lj_cont_nop) { /* Nothing to do here. */ } else if (cont == lj_cont_cat) { BCReg bslot = bc_b(*(frame_contpc(frame)-1)); TRef tr = gotresults ? J->base[cbase+rbase] : TREF_NIL; - if (bslot != cbase-2) { /* Concatenate the remainder. */ + if (bslot != J->maxslot) { /* Concatenate the remainder. */ TValue *b = J->L->base, save; /* Simulate lower frame and result. */ - J->base[cbase-2] = tr; - copyTV(J->L, &save, b-2); - if (gotresults) copyTV(J->L, b-2, b+rbase); else setnilV(b-2); + J->base[J->maxslot] = tr; + copyTV(J->L, &save, b-(2<L, b-(2<L->base = b - cbase; - tr = rec_cat(J, bslot, cbase-2); + tr = rec_cat(J, bslot, cbase-(2<L->base + cbase; /* Undo. */ J->L->base = b; - copyTV(J->L, b-2, &save); + copyTV(J->L, b-(2<base[dst] = tr; - if (dst >= J->maxslot) J->maxslot = dst+1; + if (dst >= J->maxslot) { + J->maxslot = dst+1; + } } /* Otherwise continue with another __concat call. */ } else { /* Result type already specialized. */ @@ -876,7 +910,7 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults) } else { lj_trace_err(J, LJ_TRERR_NYIRETL); /* NYI: handle return to C frame. */ } - lua_assert(J->baseslot >= 1); + lua_assert(J->baseslot >= 1+LJ_FR2); } /* -- Metamethod handling ------------------------------------------------- */ @@ -885,11 +919,16 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults) static BCReg rec_mm_prep(jit_State *J, ASMFunction cont) { BCReg s, top = cont == lj_cont_cat ? J->maxslot : curr_proto(J->L)->framesize; +#if LJ_FR2 + J->base[top] = lj_ir_k64(J, IR_KNUM, u64ptr(contptr(cont))); + J->base[top+1] = TREF_CONT; +#else J->base[top] = lj_ir_kptr(J, contptr(cont)) | TREF_CONT; +#endif J->framedepth++; for (s = J->maxslot; s < top; s++) J->base[s] = 0; /* Clear frame gap to avoid resurrecting previous refs. */ - return top+1; + return top+1+LJ_FR2; } /* Record metamethod lookup. */ @@ -967,9 +1006,9 @@ static TRef rec_mm_arith(jit_State *J, RecordIndex *ix, MMS mm) BCReg func = rec_mm_prep(J, mm == MM_concat ? lj_cont_cat : lj_cont_ra); TRef *base = J->base + func; TValue *basev = J->L->base + func; - base[1] = ix->tab; base[2] = ix->key; - copyTV(J->L, basev+1, &ix->tabv); - copyTV(J->L, basev+2, &ix->keyv); + base[1+LJ_FR2] = ix->tab; base[2+LJ_FR2] = ix->key; + copyTV(J->L, basev+1+LJ_FR2, &ix->tabv); + copyTV(J->L, basev+2+LJ_FR2, &ix->keyv); if (!lj_record_mm_lookup(J, ix, mm)) { /* Lookup mm on 1st operand. */ if (mm != MM_unm) { ix->tab = ix->key; @@ -980,8 +1019,10 @@ static TRef rec_mm_arith(jit_State *J, RecordIndex *ix, MMS mm) lj_trace_err(J, LJ_TRERR_NOMM); } ok: - lua_assert(!LJ_FR2); /* TODO_FR2: handle different frame setup. */ base[0] = ix->mobj; +#if LJ_FR2 + base[1] = 0; +#endif copyTV(J->L, basev+0, &ix->mobjv); lj_record_call(J, func, 2); return 0; /* No result yet. */ @@ -997,8 +1038,9 @@ static TRef rec_mm_len(jit_State *J, TRef tr, TValue *tv) BCReg func = rec_mm_prep(J, lj_cont_ra); TRef *base = J->base + func; TValue *basev = J->L->base + func; - lua_assert(!LJ_FR2); /* TODO_FR2: handle different frame setup. */ base[0] = ix.mobj; copyTV(J->L, basev+0, &ix.mobjv); + base += LJ_FR2; + basev += LJ_FR2; base[1] = tr; copyTV(J->L, basev+1, tv); #if LJ_52 base[2] = tr; copyTV(J->L, basev+2, tv); @@ -1018,11 +1060,10 @@ static TRef rec_mm_len(jit_State *J, TRef tr, TValue *tv) static void rec_mm_callcomp(jit_State *J, RecordIndex *ix, int op) { BCReg func = rec_mm_prep(J, (op&1) ? lj_cont_condf : lj_cont_condt); - TRef *base = J->base + func; - TValue *tv = J->L->base + func; - lua_assert(!LJ_FR2); /* TODO_FR2: handle different frame setup. */ - base[0] = ix->mobj; base[1] = ix->val; base[2] = ix->key; - copyTV(J->L, tv+0, &ix->mobjv); + TRef *base = J->base + func + LJ_FR2; + TValue *tv = J->L->base + func + LJ_FR2; + base[-LJ_FR2] = ix->mobj; base[1] = ix->val; base[2] = ix->key; + copyTV(J->L, tv-LJ_FR2, &ix->mobjv); copyTV(J->L, tv+1, &ix->valv); copyTV(J->L, tv+2, &ix->keyv); lj_record_call(J, func, 2); @@ -1339,11 +1380,10 @@ TRef lj_record_idx(jit_State *J, RecordIndex *ix) handlemm: if (tref_isfunc(ix->mobj)) { /* Handle metamethod call. */ BCReg func = rec_mm_prep(J, ix->val ? lj_cont_nop : lj_cont_ra); - TRef *base = J->base + func; - TValue *tv = J->L->base + func; - lua_assert(!LJ_FR2); /* TODO_FR2: handle different frame setup. */ - base[0] = ix->mobj; base[1] = ix->tab; base[2] = ix->key; - setfuncV(J->L, tv+0, funcV(&ix->mobjv)); + TRef *base = J->base + func + LJ_FR2; + TValue *tv = J->L->base + func + LJ_FR2; + base[-LJ_FR2] = ix->mobj; base[1] = ix->tab; base[2] = ix->key; + setfuncV(J->L, tv-LJ_FR2, funcV(&ix->mobjv)); copyTV(J->L, tv+1, &ix->tabv); copyTV(J->L, tv+2, &ix->keyv); if (ix->val) { @@ -1533,7 +1573,11 @@ static TRef rec_upvalue(jit_State *J, uint32_t uv, TRef val) goto noconstify; kfunc = lj_ir_kfunc(J, J->fn); emitir(IRTG(IR_EQ, IRT_FUNC), fn, kfunc); - J->base[-1] = TREF_FRAME | kfunc; +#if LJ_FR2 + J->base[-2] = kfunc; +#else + J->base[-1] = kfunc | TREF_FRAME; +#endif fn = kfunc; } tr = lj_record_constify(J, uvval(uvp)); @@ -1644,11 +1688,14 @@ static void rec_func_setup(jit_State *J) static void rec_func_vararg(jit_State *J) { GCproto *pt = J->pt; - BCReg s, fixargs, vframe = J->maxslot+1; + BCReg s, fixargs, vframe = J->maxslot+1+LJ_FR2; lua_assert((pt->flags & PROTO_VARARG)); if (J->baseslot + vframe + pt->framesize >= LJ_MAX_JSLOTS) lj_trace_err(J, LJ_TRERR_STACKOV); - J->base[vframe-1] = J->base[-1]; /* Copy function up. */ + J->base[vframe-1-LJ_FR2] = J->base[-1-LJ_FR2]; /* Copy function up. */ +#if LJ_FR2 + J->base[vframe-1] = TREF_FRAME; +#endif /* Copy fixarg slots up and set their original slots to nil. */ fixargs = pt->numparams < J->maxslot ? pt->numparams : J->maxslot; for (s = 0; s < fixargs; s++) { @@ -1710,7 +1757,7 @@ static int select_detect(jit_State *J) static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults) { int32_t numparams = J->pt->numparams; - ptrdiff_t nvararg = frame_delta(J->L->base-1) - numparams - 1; + ptrdiff_t nvararg = frame_delta(J->L->base-1) - numparams - 1 - LJ_FR2; lua_assert(frame_isvarg(J->L->base-1)); if (J->framedepth > 0) { /* Simple case: varargs defined on-trace. */ ptrdiff_t i; @@ -1722,10 +1769,10 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults) J->maxslot = dst + (BCReg)nresults; } for (i = 0; i < nresults; i++) - J->base[dst+i] = i < nvararg ? getslot(J, i - nvararg - 1) : TREF_NIL; + J->base[dst+i] = i < nvararg ? getslot(J, i - nvararg - 1 - LJ_FR2) : TREF_NIL; } else { /* Unknown number of varargs passed to trace. */ - TRef fr = emitir(IRTI(IR_SLOAD), 0, IRSLOAD_READONLY|IRSLOAD_FRAME); - int32_t frofs = 8*(1+numparams)+FRAME_VARG; + TRef fr = emitir(IRTI(IR_SLOAD), LJ_FR2, IRSLOAD_READONLY|IRSLOAD_FRAME); + int32_t frofs = 8*(1+LJ_FR2+numparams)+FRAME_VARG; if (nresults >= 0) { /* Known fixed number of results. */ ptrdiff_t i; if (nvararg > 0) { @@ -1739,7 +1786,7 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults) vbase = emitir(IRT(IR_SUB, IRT_IGC), REF_BASE, fr); vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase, lj_ir_kint(J, frofs-8)); for (i = 0; i < nload; i++) { - IRType t = itype2irt(&J->L->base[i-1-nvararg]); + IRType t = itype2irt(&J->L->base[i-1-LJ_FR2-nvararg]); TRef aref = emitir(IRT(IR_AREF, IRT_PGC), vbase, lj_ir_kint(J, (int32_t)i)); TRef tr = emitir(IRTG(IR_VLOAD, t), aref, 0); @@ -1787,14 +1834,15 @@ static void rec_varg(jit_State *J, BCReg dst, ptrdiff_t nresults) if (idx != 0 && idx <= nvararg) { IRType t; TRef aref, vbase = emitir(IRT(IR_SUB, IRT_IGC), REF_BASE, fr); - vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase, lj_ir_kint(J, frofs-8)); - t = itype2irt(&J->L->base[idx-2-nvararg]); + vbase = emitir(IRT(IR_ADD, IRT_PGC), vbase, + lj_ir_kint(J, frofs-(8<L->base[idx-2-LJ_FR2-nvararg]); aref = emitir(IRT(IR_AREF, IRT_PGC), vbase, tridx); tr = emitir(IRTG(IR_VLOAD, t), aref, 0); if (irtype_ispri(t)) tr = TREF_PRI(t); /* Canonicalize primitives. */ } - J->base[dst-2] = tr; - J->maxslot = dst-1; + J->base[dst-2-LJ_FR2] = tr; + J->maxslot = dst-1-LJ_FR2; J->bcskip = 2; /* Skip CALLM + select. */ } else { nyivarg: @@ -1887,7 +1935,15 @@ static void rec_comp_fixup(jit_State *J, const BCIns *pc, int cond) const BCIns *npc = pc + 2 + (cond ? bc_j(jmpins) : 0); SnapShot *snap = &J->cur.snap[J->cur.nsnap-1]; /* Set PC to opposite target to avoid re-recording the comp. in side trace. */ +#if LJ_FR2 + SnapEntry *flink = &J->cur.snapmap[snap->mapofs + snap->nent]; + uint64_t pcbase; + memcpy(&pcbase, flink, sizeof(uint64_t)); + pcbase = (pcbase & 0xff) | (u64ptr(npc) << 8); + memcpy(flink, &pcbase, sizeof(uint64_t)); +#else J->cur.snapmap[snap->mapofs + snap->nent] = SNAP_MKPC(npc); +#endif J->needsnap = 1; if (bc_a(jmpins) < J->maxslot) J->maxslot = bc_a(jmpins); lj_snap_shrink(J); /* Shrink last snapshot if possible. */ @@ -2185,7 +2241,13 @@ void lj_record_ins(jit_State *J) case BC_MOV: /* Clear gap of method call to avoid resurrecting previous refs. */ - if (ra > J->maxslot) J->base[ra-1] = 0; + if (ra > J->maxslot) { +#if LJ_FR2 + memset(J->base + J->maxslot, 0, (ra - J->maxslot) * sizeof(TRef)); +#else + J->base[ra-1] = 0; +#endif + } break; case BC_KSTR: case BC_KNUM: case BC_KPRI: break; @@ -2254,14 +2316,14 @@ void lj_record_ins(jit_State *J) /* -- Calls and vararg handling ----------------------------------------- */ case BC_ITERC: - J->base[ra] = getslot(J, ra-3-LJ_FR2); - J->base[ra+1] = getslot(J, ra-2-LJ_FR2); - J->base[ra+2] = getslot(J, ra-1-LJ_FR2); + J->base[ra] = getslot(J, ra-3); + J->base[ra+1+LJ_FR2] = getslot(J, ra-2); + J->base[ra+2+LJ_FR2] = getslot(J, ra-1); { /* Do the actual copy now because lj_record_call needs the values. */ TValue *b = &J->L->base[ra]; - copyTV(J->L, b, b-3-LJ_FR2); - copyTV(J->L, b+1, b-2-LJ_FR2); - copyTV(J->L, b+2, b-1-LJ_FR2); + copyTV(J->L, b, b-3); + copyTV(J->L, b+1+LJ_FR2, b-2); + copyTV(J->L, b+2+LJ_FR2, b-1); } lj_record_call(J, ra, (ptrdiff_t)rc-1); break; @@ -2384,7 +2446,12 @@ void lj_record_ins(jit_State *J) /* rc == 0 if we have no result yet, e.g. pending __index metamethod call. */ if (bcmode_a(op) == BCMdst && rc) { J->base[ra] = rc; - if (ra >= J->maxslot) J->maxslot = ra+1; + if (ra >= J->maxslot) { +#if LJ_FR2 + if (ra > J->maxslot) J->base[ra-1] = 0; +#endif + J->maxslot = ra+1; + } } #undef rav @@ -2469,7 +2536,7 @@ void lj_record_setup(jit_State *J) J->scev.idx = REF_NIL; setmref(J->scev.pc, NULL); - J->baseslot = 1; /* Invoking function is at base[-1]. */ + J->baseslot = 1+LJ_FR2; /* Invoking function is at base[-1-LJ_FR2]. */ J->base = J->slot + J->baseslot; J->maxslot = 0; J->framedepth = 0; diff --git a/src/lj_snap.c b/src/lj_snap.c index 6199b1f0..33c058be 100644 --- a/src/lj_snap.c +++ b/src/lj_snap.c @@ -68,10 +68,18 @@ static MSize snapshot_slots(jit_State *J, SnapEntry *map, BCReg nslots) for (s = 0; s < nslots; s++) { TRef tr = J->slot[s]; IRRef ref = tref_ref(tr); +#if LJ_FR2 + if (s == 1) continue; + if ((tr & (TREF_FRAME | TREF_CONT)) && !ref) { + TValue *base = J->L->base - J->baseslot; + tr = J->slot[s] = (tr & 0xff0000) | lj_ir_k64(J, IR_KNUM, base[s].u64); + ref = tref_ref(tr); + } +#endif if (ref) { SnapEntry sn = SNAP_TR(s, tr); IRIns *ir = &J->cur.ir[ref]; - if (!(sn & (SNAP_CONT|SNAP_FRAME)) && + if ((LJ_FR2 || !(sn & (SNAP_CONT|SNAP_FRAME))) && ir->o == IR_SLOAD && ir->op1 == s && ref > retf) { /* No need to snapshot unmodified non-inherited slots. */ if (!(ir->op2 & IRSLOAD_INHERIT)) @@ -90,34 +98,51 @@ static MSize snapshot_slots(jit_State *J, SnapEntry *map, BCReg nslots) } /* Add frame links at the end of the snapshot. */ -static BCReg snapshot_framelinks(jit_State *J, SnapEntry *map) +static MSize snapshot_framelinks(jit_State *J, SnapEntry *map, uint8_t *topslot) { cTValue *frame = J->L->base - 1; - cTValue *lim = J->L->base - J->baseslot; + cTValue *lim = J->L->base - J->baseslot + LJ_FR2; GCfunc *fn = frame_func(frame); cTValue *ftop = isluafunc(fn) ? (frame+funcproto(fn)->framesize) : J->L->top; +#if LJ_FR2 + uint64_t pcbase = (u64ptr(J->pc) << 8) | (J->baseslot - 2); + lua_assert(2 <= J->baseslot && J->baseslot <= 257); + memcpy(map, &pcbase, sizeof(uint64_t)); +#else MSize f = 0; - lua_assert(!LJ_FR2); /* TODO_FR2: store 64 bit PCs. */ map[f++] = SNAP_MKPC(J->pc); /* The current PC is always the first entry. */ +#endif while (frame > lim) { /* Backwards traversal of all frames above base. */ if (frame_islua(frame)) { +#if !LJ_FR2 map[f++] = SNAP_MKPC(frame_pc(frame)); +#endif frame = frame_prevl(frame); } else if (frame_iscont(frame)) { +#if !LJ_FR2 map[f++] = SNAP_MKFTSZ(frame_ftsz(frame)); map[f++] = SNAP_MKPC(frame_contpc(frame)); +#endif frame = frame_prevd(frame); } else { lua_assert(!frame_isc(frame)); +#if !LJ_FR2 map[f++] = SNAP_MKFTSZ(frame_ftsz(frame)); +#endif frame = frame_prevd(frame); continue; } if (frame + funcproto(frame_func(frame))->framesize > ftop) ftop = frame + funcproto(frame_func(frame))->framesize; } + *topslot = (uint8_t)(ftop - lim); +#if LJ_FR2 + lua_assert(sizeof(SnapEntry) * 2 == sizeof(uint64_t)); + return 2; +#else lua_assert(f == (MSize)(1 + J->framedepth)); - return (BCReg)(ftop - lim); + return f; +#endif } /* Take a snapshot of the current stack. */ @@ -127,16 +152,16 @@ static void snapshot_stack(jit_State *J, SnapShot *snap, MSize nsnapmap) MSize nent; SnapEntry *p; /* Conservative estimate. */ - lj_snap_grow_map(J, nsnapmap + nslots + (MSize)J->framedepth+1); + lj_snap_grow_map(J, nsnapmap + nslots + (MSize)(LJ_FR2?2:J->framedepth+1)); p = &J->cur.snapmap[nsnapmap]; nent = snapshot_slots(J, p, nslots); - snap->topslot = (uint8_t)snapshot_framelinks(J, p + nent); + snap->nent = (uint8_t)nent; + nent += snapshot_framelinks(J, p + nent, &snap->topslot); snap->mapofs = (uint16_t)nsnapmap; snap->ref = (IRRef1)J->cur.nins; - snap->nent = (uint8_t)nent; snap->nslots = (uint8_t)nslots; snap->count = 0; - J->cur.nsnapmap = (uint16_t)(nsnapmap + nent + 1 + J->framedepth); + J->cur.nsnapmap = (uint16_t)(nsnapmap + nent); } /* Add or merge a snapshot. */ @@ -284,8 +309,8 @@ void lj_snap_shrink(jit_State *J) MSize n, m, nlim, nent = snap->nent; uint8_t udf[SNAP_USEDEF_SLOTS]; BCReg maxslot = J->maxslot; - BCReg minslot = snap_usedef(J, udf, snap_pc(map[nent]), maxslot); BCReg baseslot = J->baseslot; + BCReg minslot = snap_usedef(J, udf, snap_pc(&map[nent]), maxslot); maxslot += baseslot; minslot += baseslot; snap->nslots = (uint8_t)maxslot; @@ -794,11 +819,13 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr) SnapShot *snap = &T->snap[snapno]; MSize n, nent = snap->nent; SnapEntry *map = &T->snapmap[snap->mapofs]; - SnapEntry *flinks = &T->snapmap[snap_nextofs(T, snap)-1]; + SnapEntry *flinks = &T->snapmap[snap_nextofs(T, snap)-1-LJ_FR2]; +#if !LJ_FR2 ptrdiff_t ftsz0; +#endif TValue *frame; BloomFilter rfilt = snap_renamefilter(T, snapno); - const BCIns *pc = snap_pc(map[nent]); + const BCIns *pc = snap_pc(&map[nent]); lua_State *L = J->L; /* Set interpreter PC to the next PC to get correct error messages. */ @@ -811,8 +838,10 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr) } /* Fill stack slots with data from the registers and spill slots. */ - frame = L->base-1; + frame = L->base-1-LJ_FR2; +#if !LJ_FR2 ftsz0 = frame_ftsz(frame); /* Preserve link to previous frame in slot #0. */ +#endif for (n = 0; n < nent; n++) { SnapEntry sn = map[n]; if (!(sn & SNAP_NORESTORE)) { @@ -835,14 +864,18 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr) TValue tmp; snap_restoreval(J, T, ex, snapno, rfilt, ref+1, &tmp); o->u32.hi = tmp.u32.lo; +#if !LJ_FR2 } else if ((sn & (SNAP_CONT|SNAP_FRAME))) { - lua_assert(!LJ_FR2); /* TODO_FR2: store 64 bit PCs. */ /* Overwrite tag with frame link. */ setframe_ftsz(o, snap_slot(sn) != 0 ? (int32_t)*flinks-- : ftsz0); L->base = o+1; +#endif } } } +#if LJ_FR2 + L->base += (map[nent+LJ_BE] & 0xff); +#endif lua_assert(map + nent == flinks); /* Compute current stack top. */ From 2868715d80b6ac497a7f08393ec325b60d71df8d Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 23 May 2016 06:01:54 +0200 Subject: [PATCH 29/57] x64/LJ_GC64: Add missing backend support and enable JIT compilation. Contributed by Peter Cawley. --- src/lj_arch.h | 2 +- src/lj_asm.c | 11 ++ src/lj_asm_x86.h | 430 ++++++++++++++++++++++++++++++++++++++------ src/lj_emit_x86.h | 121 ++++++++++--- src/lj_ffrecord.c | 5 + src/lj_ir.h | 8 +- src/lj_record.c | 6 + src/lj_snap.c | 9 +- src/lj_target_x86.h | 12 +- src/vm_x64.dasc | 3 +- 10 files changed, 517 insertions(+), 90 deletions(-) diff --git a/src/lj_arch.h b/src/lj_arch.h index 72622a21..3c3c98b1 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -453,7 +453,7 @@ #endif /* Disable or enable the JIT compiler. */ -#if defined(LUAJIT_DISABLE_JIT) || defined(LJ_ARCH_NOJIT) || defined(LJ_OS_NOJIT) || LJ_GC64 +#if defined(LUAJIT_DISABLE_JIT) || defined(LJ_ARCH_NOJIT) || defined(LJ_OS_NOJIT) #define LJ_HASJIT 0 #else #define LJ_HASJIT 1 diff --git a/src/lj_asm.c b/src/lj_asm.c index 5dd7ca3a..dba5c178 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -346,6 +346,12 @@ static Reg ra_rematk(ASMState *as, IRRef ref) #if LJ_64 } else if (ir->o == IR_KINT64) { emit_loadu64(as, r, ir_kint64(ir)->u64); +#if LJ_GC64 + } else if (ir->o == IR_KGC) { + emit_loadu64(as, r, (uintptr_t)ir_kgc(ir)); + } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) { + emit_loadu64(as, r, (uintptr_t)ir_kptr(ir)); +#endif #endif } else { lua_assert(ir->o == IR_KINT || ir->o == IR_KGC || @@ -1920,8 +1926,12 @@ static void asm_tail_link(ASMState *as) if (bc_isret(bc_op(*retpc))) pc = retpc; } +#if LJ_GC64 + emit_loadu64(as, RID_LPC, u64ptr(pc)); +#else ra_allockreg(as, i32ptr(J2GG(as->J)->dispatch), RID_DISPATCH); ra_allockreg(as, i32ptr(pc), RID_LPC); +#endif mres = (int32_t)(snap->nslots - baseslot - LJ_FR2); switch (bc_op(*pc)) { case BC_CALLM: case BC_CALLMT: @@ -2314,6 +2324,7 @@ void lj_asm_trace(jit_State *J, GCtrace *T) as->curins = as->T->snap[0].ref; asm_snap_prep(as); /* The GC check is a guard. */ asm_gc_check(as); + as->curins = as->stopins; } ra_evictk(as); if (as->parent) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 83fe22b2..7d07336a 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -21,12 +21,14 @@ static MCode *asm_exitstub_gen(ASMState *as, ExitNo group) } /* Push the high byte of the exitno for each exit stub group. */ *mxp++ = XI_PUSHi8; *mxp++ = (MCode)((group*EXITSTUBS_PER_GROUP)>>8); +#if !LJ_GC64 /* Store DISPATCH at original stack slot 0. Account for the two push ops. */ *mxp++ = XI_MOVmi; *mxp++ = MODRM(XM_OFS8, 0, RID_ESP); *mxp++ = MODRM(XM_SCALE1, RID_ESP, RID_ESP); *mxp++ = 2*sizeof(void *); *(int32_t *)mxp = ptr2addr(J2GG(as->J)->dispatch); mxp += 4; +#endif /* Jump to exit handler which fills in the ExitState. */ *mxp++ = XI_JMP; mxp += 4; *((int32_t *)(mxp-4)) = jmprel(mxp, (MCode *)(void *)lj_vm_exit_handler); @@ -62,10 +64,14 @@ static void asm_guardcc(ASMState *as, int cc) target = p; cc ^= 1; if (as->realign) { + if (LJ_GC64 && LJ_UNLIKELY(as->mrm.base == RID_RIP)) + as->mrm.ofs += 2; /* Fixup RIP offset for pending fused load. */ emit_sjcc(as, cc, target); return; } } + if (LJ_GC64 && LJ_UNLIKELY(as->mrm.base == RID_RIP)) + as->mrm.ofs += 6; /* Fixup RIP offset for pending fused load. */ emit_jcc(as, cc, target); } @@ -79,6 +85,15 @@ static int asm_isk32(ASMState *as, IRRef ref, int32_t *k) { if (irref_isk(ref)) { IRIns *ir = IR(ref); +#if LJ_GC64 + if (ir->o == IR_KNULL || !irt_is64(ir->t)) { + *k = ir->i; + return 1; + } else if (checki32((int64_t)ir_k64(ir)->u64)) { + *k = (int32_t)ir_k64(ir)->u64; + return 1; + } +#else if (ir->o != IR_KINT64) { *k = ir->i; return 1; @@ -86,6 +101,7 @@ static int asm_isk32(ASMState *as, IRRef ref, int32_t *k) *k = (int32_t)ir_kint64(ir)->u64; return 1; } +#endif } return 0; } @@ -185,9 +201,19 @@ static void asm_fuseahuref(ASMState *as, IRRef ref, RegSet allow) if (irref_isk(ir->op1)) { GCfunc *fn = ir_kfunc(IR(ir->op1)); GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv; +#if LJ_GC64 + int64_t ofs = dispofs(as, &uv->tv); + if (checki32(ofs) && checki32(ofs+4)) { + as->mrm.ofs = (int32_t)ofs; + as->mrm.base = RID_DISPATCH; + as->mrm.idx = RID_NONE; + return; + } +#else as->mrm.ofs = ptr2addr(&uv->tv); as->mrm.base = as->mrm.idx = RID_NONE; return; +#endif } break; default: @@ -207,17 +233,38 @@ static void asm_fusefref(ASMState *as, IRIns *ir, RegSet allow) lua_assert(ir->o == IR_FLOAD || ir->o == IR_FREF); as->mrm.idx = RID_NONE; if (ir->op1 == REF_NIL) { +#if LJ_GC64 + as->mrm.ofs = (int32_t)ir->op2 - GG_OFS(dispatch); + as->mrm.base = RID_DISPATCH; +#else as->mrm.ofs = (int32_t)ir->op2 + ptr2addr(J2GG(as->J)); as->mrm.base = RID_NONE; +#endif return; } as->mrm.ofs = field_ofs[ir->op2]; if (irref_isk(ir->op1)) { - as->mrm.ofs += IR(ir->op1)->i; + IRIns *op1 = IR(ir->op1); +#if LJ_GC64 + if (ir->op1 == REF_NIL) { + as->mrm.ofs -= GG_OFS(dispatch); + as->mrm.base = RID_DISPATCH; + return; + } else if (op1->o == IR_KPTR || op1->o == IR_KKPTR) { + intptr_t ofs = dispofs(as, ir_kptr(op1)); + if (checki32(as->mrm.ofs + ofs)) { + as->mrm.ofs += (int32_t)ofs; + as->mrm.base = RID_DISPATCH; + return; + } + } +#else + as->mrm.ofs += op1->i; as->mrm.base = RID_NONE; - } else { - as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow); + return; +#endif } + as->mrm.base = (uint8_t)ra_alloc1(as, ir->op1, allow); } /* Fuse string reference into memory operand. */ @@ -228,7 +275,7 @@ static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow) as->mrm.base = as->mrm.idx = RID_NONE; as->mrm.scale = XM_SCALE1; as->mrm.ofs = sizeof(GCstr); - if (irref_isk(ir->op1)) { + if (!LJ_GC64 && irref_isk(ir->op1)) { as->mrm.ofs += IR(ir->op1)->i; } else { Reg r = ra_alloc1(as, ir->op1, allow); @@ -260,10 +307,20 @@ static void asm_fusexref(ASMState *as, IRRef ref, RegSet allow) IRIns *ir = IR(ref); as->mrm.idx = RID_NONE; if (ir->o == IR_KPTR || ir->o == IR_KKPTR) { +#if LJ_GC64 + intptr_t ofs = dispofs(as, ir_kptr(ir)); + if (checki32(ofs)) { + as->mrm.ofs = (int32_t)ofs; + as->mrm.base = RID_DISPATCH; + return; + } + } if (0) { +#else as->mrm.ofs = ir->i; as->mrm.base = RID_NONE; } else if (ir->o == IR_STRREF) { asm_fusestrref(as, ir, allow); +#endif } else { as->mrm.ofs = 0; if (canfuse(as, ir) && ir->o == IR_ADD && ra_noreg(ir->r)) { @@ -310,13 +367,41 @@ static void asm_fusexref(ASMState *as, IRRef ref, RegSet allow) static Reg asm_fuseloadk64(ASMState *as, IRIns *ir) { const uint64_t *k = &ir_k64(ir)->u64; - as->mrm.ofs = ptr2addr(k); - as->mrm.base = RID_NONE; + if (!LJ_GC64 || checki32((intptr_t)k)) { + as->mrm.ofs = ptr2addr(k); + as->mrm.base = RID_NONE; +#if LJ_GC64 + } else if (checki32(dispofs(as, k))) { + as->mrm.ofs = (int32_t)dispofs(as, k); + as->mrm.base = RID_DISPATCH; + } else if (checki32(mcpofs(as, k)) && checki32(mcpofs(as, k+1)) && + checki32(mctopofs(as, k)) && checki32(mctopofs(as, k+1))) { + as->mrm.ofs = (int32_t)mcpofs(as, k); + as->mrm.base = RID_RIP; + } else { + if (ir->i) { + lua_assert(*k == *(uint64_t*)(as->mctop - ir->i)); + } else { + while ((uintptr_t)as->mcbot & 7) *as->mcbot++ = XI_INT3; + *(uint64_t*)as->mcbot = *k; + ir->i = (int32_t)(as->mctop - as->mcbot); + as->mcbot += 8; + as->mclim = as->mcbot + MCLIM_REDZONE; + } + as->mrm.ofs = (int32_t)mcpofs(as, as->mctop - ir->i); + as->mrm.base = RID_RIP; +#endif + } as->mrm.idx = RID_NONE; return RID_MRM; } -/* Fuse load into memory operand. */ +/* Fuse load into memory operand. +** +** Important caveat: this may emit RIP-relative loads! So don't place any +** code emitters between this function and the use of its result. +** The only permitted exception is asm_guardcc(). +*/ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) { IRIns *ir = IR(ref); @@ -346,7 +431,8 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR; if (ir->o == IR_SLOAD) { if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) && - noconflict(as, ref, IR_RETF, 0)) { + noconflict(as, ref, IR_RETF, 0) && + !(LJ_GC64 && irt_isaddr(ir->t))) { as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow); as->mrm.ofs = 8*((int32_t)ir->op1-1-LJ_FR2) + (!LJ_FR2 && (ir->op2 & IRSLOAD_FRAME) ? 4 : 0); @@ -361,7 +447,8 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) return RID_MRM; } } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) { - if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0)) { + if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0) && + !(LJ_GC64 && irt_isaddr(ir->t))) { asm_fuseahuref(as, ir->op1, xallow); return RID_MRM; } @@ -374,7 +461,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) asm_fusexref(as, ir->op1, xallow); return RID_MRM; } - } else if (ir->o == IR_VLOAD) { + } else if (ir->o == IR_VLOAD && !(LJ_GC64 && irt_isaddr(ir->t))) { asm_fuseahuref(as, ir->op1, xallow); return RID_MRM; } @@ -499,8 +586,8 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) if (r) { /* Argument is in a register. */ if (r < RID_MAX_GPR && ref < ASMREF_TMP1) { #if LJ_64 - if (ir->o == IR_KINT64) - emit_loadu64(as, r, ir_kint64(ir)->u64); + if (LJ_GC64 ? ir->o != IR_KINT : ir->o == IR_KINT64) + emit_loadu64(as, r, ir_k64(ir)->u64); else #endif emit_loadi(as, r, ir->i); @@ -668,7 +755,7 @@ static void asm_retf(ASMState *as, IRIns *ir) emit_addptr(as, base, -8*delta); asm_guardcc(as, CC_NE); #if LJ_FR2 - emit_rmro(as, XO_CMP, rpc, base, -8); + emit_rmro(as, XO_CMP, rpc|REX_GC64, base, -8); emit_loadu64(as, rpc, u64ptr(pc)); #else emit_gmroi(as, XG_ARITHi(XOg_CMP), base, -4, ptr2addr(pc)); @@ -696,8 +783,9 @@ static void asm_tobit(ASMState *as, IRIns *ir) Reg tmp = ra_noreg(IR(ir->op1)->r) ? ra_alloc1(as, ir->op1, RSET_FPR) : ra_scratch(as, RSET_FPR); - Reg right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp)); + Reg right; emit_rr(as, XO_MOVDto, tmp, dest); + right = asm_fuseload(as, ir->op2, rset_exclude(RSET_FPR, tmp)); emit_mrm(as, XO_ADDSD, tmp, right); ra_left(as, tmp, ir->op1); } @@ -768,13 +856,12 @@ static void asm_conv(ASMState *as, IRIns *ir) emit_rr(as, op, dest|REX_64, tmp); ra_left(as, tmp, lref); } else { - Reg left = asm_fuseload(as, lref, RSET_FPR); if (LJ_64 && irt_isu32(ir->t)) emit_rr(as, XO_MOV, dest, dest); /* Zero hiword. */ emit_mrm(as, op, dest|((LJ_64 && (irt_is64(ir->t) || irt_isu32(ir->t))) ? REX_64 : 0), - left); + asm_fuseload(as, lref, RSET_FPR)); } } } else if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */ @@ -952,6 +1039,24 @@ static void asm_tvptr(ASMState *as, Reg dest, IRRef ref) emit_rmro(as, XO_LEA, dest|REX_64, RID_ESP, ra_spill(as, ir)); } else { /* Otherwise use g->tmptv to hold the TValue. */ +#if LJ_GC64 + if (irref_isk(ref)) { + TValue k; + lj_ir_kvalue(as->J->L, &k, ir); + emit_movmroi(as, dest, 4, k.u32.hi); + emit_movmroi(as, dest, 0, k.u32.lo); + } else { + /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */ + Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest)); + if (irt_is64(ir->t)) { + emit_u32(as, irt_toitype(ir->t) << 15); + emit_rmro(as, XO_ARITHi, XOg_OR, dest, 4); + } else { + emit_movmroi(as, dest, 4, (irt_toitype(ir->t) << 15) | 0x7fff); + } + emit_movtomro(as, REX_64IR(ir, src), dest, 0); + } +#else if (!irref_isk(ref)) { Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, dest)); emit_movtomro(as, REX_64IR(ir, src), dest, 0); @@ -960,6 +1065,7 @@ static void asm_tvptr(ASMState *as, Reg dest, IRRef ref) } if (!(LJ_64 && irt_islightud(ir->t))) emit_movmroi(as, dest, 4, irt_toitype(ir->t)); +#endif emit_loada(as, dest, &J2G(as->J)->tmptv); } } @@ -969,9 +1075,9 @@ static void asm_aref(ASMState *as, IRIns *ir) Reg dest = ra_dest(as, ir, RSET_GPR); asm_fusearef(as, ir, RSET_GPR); if (!(as->mrm.idx == RID_NONE && as->mrm.ofs == 0)) - emit_mrm(as, XO_LEA, dest, RID_MRM); + emit_mrm(as, XO_LEA, dest|REX_GC64, RID_MRM); else if (as->mrm.base != dest) - emit_rr(as, XO_MOV, dest, as->mrm.base); + emit_rr(as, XO_MOV, dest|REX_GC64, as->mrm.base); } /* Inlined hash lookup. Specialized for key type and for const keys. @@ -998,7 +1104,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) if (!isk) { rset_clear(allow, tab); key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow); - if (!irt_isstr(kt)) + if (LJ_GC64 || !irt_isstr(kt)) tmp = ra_scratch(as, rset_exclude(allow, key)); } @@ -1011,8 +1117,8 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) /* Follow hash chain until the end. */ l_loop = emit_sjcc_label(as, CC_NZ); - emit_rr(as, XO_TEST, dest, dest); - emit_rmro(as, XO_MOV, dest, dest, offsetof(Node, next)); + emit_rr(as, XO_TEST, dest|REX_GC64, dest); + emit_rmro(as, XO_MOV, dest|REX_GC64, dest, offsetof(Node, next)); l_next = emit_label(as); /* Type and value comparison. */ @@ -1033,7 +1139,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_rmro(as, XO_UCOMISD, key, dest, offsetof(Node, key.n)); emit_sjcc(as, CC_AE, l_next); /* The type check avoids NaN penalties and complaints from Valgrind. */ -#if LJ_64 +#if LJ_64 && !LJ_GC64 emit_u32(as, LJ_TISNUM); emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it)); #else @@ -1041,10 +1147,27 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it)); #endif } -#if LJ_64 +#if LJ_64 && !LJ_GC64 } else if (irt_islightud(kt)) { emit_rmro(as, XO_CMP, key|REX_64, dest, offsetof(Node, key.u64)); -#endif +#elif LJ_GC64 + } else if (irt_isaddr(kt)) { + if (isk) { + TValue k; + k.u64 = ((uint64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64; + emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.lo), + k.u32.lo); + emit_sjcc(as, CC_NE, l_next); + emit_gmroi(as, XG_ARITHi(XOg_CMP), dest, offsetof(Node, key.u32.hi), + k.u32.hi); + } else { + emit_rmro(as, XO_CMP, tmp|REX_64, dest, offsetof(Node, key.u64)); + } + } else { + lua_assert(irt_ispri(kt) && !irt_isnil(kt)); + emit_u32(as, (irt_toitype(kt)<<15)|0x7fff); + emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it)); +#else } else { if (!irt_ispri(kt)) { lua_assert(irt_isaddr(kt)); @@ -1058,16 +1181,23 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) lua_assert(!irt_isnil(kt)); emit_i8(as, irt_toitype(kt)); emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it)); +#endif } emit_sfixup(as, l_loop); checkmclim(as); +#if LJ_GC64 + if (!isk && irt_isaddr(kt)) { + emit_rr(as, XO_OR, tmp|REX_64, key); + emit_loadu64(as, tmp, (uint64_t)irt_toitype(kt) << 47); + } +#endif /* Load main position relative to tab->node into dest. */ khash = isk ? ir_khash(irkey) : 1; if (khash == 0) { - emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, node)); + emit_rmro(as, XO_MOV, dest|REX_GC64, tab, offsetof(GCtab, node)); } else { - emit_rmro(as, XO_ARITH(XOg_ADD), dest, tab, offsetof(GCtab, node)); + emit_rmro(as, XO_ARITH(XOg_ADD), dest|REX_GC64, tab, offsetof(GCtab,node)); if ((as->flags & JIT_F_PREFER_IMUL)) { emit_i8(as, sizeof(Node)); emit_rr(as, XO_IMULi8, dest, dest); @@ -1122,11 +1252,11 @@ static void asm_hrefk(ASMState *as, IRIns *ir) if (ra_hasreg(dest)) { if (ofs != 0) { if (dest == node && !(as->flags & JIT_F_LEA_AGU)) - emit_gri(as, XG_ARITHi(XOg_ADD), dest, ofs); + emit_gri(as, XG_ARITHi(XOg_ADD), dest|REX_GC64, ofs); else - emit_rmro(as, XO_LEA, dest, node, ofs); + emit_rmro(as, XO_LEA, dest|REX_GC64, node, ofs); } else if (dest != node) { - emit_rr(as, XO_MOV, dest, node); + emit_rr(as, XO_MOV, dest|REX_GC64, node); } } asm_guardcc(as, CC_NE); @@ -1138,13 +1268,24 @@ static void asm_hrefk(ASMState *as, IRIns *ir) lua_assert(irt_isnum(irkey->t) || irt_isgcv(irkey->t)); /* Assumes -0.0 is already canonicalized to +0.0. */ emit_loadu64(as, key, irt_isnum(irkey->t) ? ir_knum(irkey)->u64 : +#if LJ_GC64 + ((uint64_t)irt_toitype(irkey->t) << 47) | + (uint64_t)ir_kgc(irkey)); +#else ((uint64_t)irt_toitype(irkey->t) << 32) | (uint64_t)(uint32_t)ptr2addr(ir_kgc(irkey))); +#endif } else { lua_assert(!irt_isnil(irkey->t)); +#if LJ_GC64 + emit_i32(as, (irt_toitype(irkey->t)<<15)|0x7fff); + emit_rmro(as, XO_ARITHi, XOg_CMP, node, + ofs + (int32_t)offsetof(Node, key.it)); +#else emit_i8(as, irt_toitype(irkey->t)); emit_rmro(as, XO_ARITHi8, XOg_CMP, node, ofs + (int32_t)offsetof(Node, key.it)); +#endif } #else l_exit = emit_label(as); @@ -1179,20 +1320,21 @@ static void asm_uref(ASMState *as, IRIns *ir) if (irref_isk(ir->op1)) { GCfunc *fn = ir_kfunc(IR(ir->op1)); MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v; - emit_rma(as, XO_MOV, dest, v); + emit_rma(as, XO_MOV, dest|REX_GC64, v); } else { Reg uv = ra_scratch(as, RSET_GPR); Reg func = ra_alloc1(as, ir->op1, RSET_GPR); if (ir->o == IR_UREFC) { - emit_rmro(as, XO_LEA, dest, uv, offsetof(GCupval, tv)); + emit_rmro(as, XO_LEA, dest|REX_GC64, uv, offsetof(GCupval, tv)); asm_guardcc(as, CC_NE); emit_i8(as, 1); emit_rmro(as, XO_ARITHib, XOg_CMP, uv, offsetof(GCupval, closed)); } else { - emit_rmro(as, XO_MOV, dest, uv, offsetof(GCupval, v)); + emit_rmro(as, XO_MOV, dest|REX_GC64, uv, offsetof(GCupval, v)); } - emit_rmro(as, XO_MOV, uv, func, - (int32_t)offsetof(GCfuncL, uvptr) + 4*(int32_t)(ir->op2 >> 8)); + emit_rmro(as, XO_MOV, uv|REX_GC64, func, + (int32_t)offsetof(GCfuncL, uvptr) + + (int32_t)sizeof(MRef) * (int32_t)(ir->op2 >> 8)); } } @@ -1210,9 +1352,9 @@ static void asm_strref(ASMState *as, IRIns *ir) if (as->mrm.base == RID_NONE) emit_loadi(as, dest, as->mrm.ofs); else if (as->mrm.base == dest && as->mrm.idx == RID_NONE) - emit_gri(as, XG_ARITHi(XOg_ADD), dest, as->mrm.ofs); + emit_gri(as, XG_ARITHi(XOg_ADD), dest|REX_GC64, as->mrm.ofs); else - emit_mrm(as, XO_LEA, dest, RID_MRM); + emit_mrm(as, XO_LEA, dest|REX_GC64, RID_MRM); } /* -- Loads and stores ---------------------------------------------------- */ @@ -1281,7 +1423,7 @@ static void asm_fxstore(ASMState *as, IRIns *ir) case IRT_I16: case IRT_U16: xo = XO_MOVtow; break; case IRT_NUM: xo = XO_MOVSDto; break; case IRT_FLOAT: xo = XO_MOVSSto; break; -#if LJ_64 +#if LJ_64 && !LJ_GC64 case IRT_LIGHTUD: lua_assert(0); /* NYI: mask 64 bit lightuserdata. */ #endif default: @@ -1313,7 +1455,7 @@ static void asm_fxstore(ASMState *as, IRIns *ir) #define asm_fstore(as, ir) asm_fxstore(as, ir) #define asm_xstore(as, ir) asm_fxstore(as, ir) -#if LJ_64 +#if LJ_64 && !LJ_GC64 static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck) { if (ra_used(ir) || typecheck) { @@ -1335,9 +1477,12 @@ static Reg asm_load_lightud64(ASMState *as, IRIns *ir, int typecheck) static void asm_ahuvload(ASMState *as, IRIns *ir) { +#if LJ_GC64 + Reg tmp = RID_NONE; +#endif lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) || (LJ_DUALNUM && irt_isint(ir->t))); -#if LJ_64 +#if LJ_64 && !LJ_GC64 if (irt_islightud(ir->t)) { Reg dest = asm_load_lightud64(as, ir, 1); if (ra_hasreg(dest)) { @@ -1351,20 +1496,64 @@ static void asm_ahuvload(ASMState *as, IRIns *ir) RegSet allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR; Reg dest = ra_dest(as, ir, allow); asm_fuseahuref(as, ir->op1, RSET_GPR); +#if LJ_GC64 + if (irt_isaddr(ir->t)) { + emit_shifti(as, XOg_SHR|REX_64, dest, 17); + asm_guardcc(as, CC_NE); + emit_i8(as, irt_toitype(ir->t)); + emit_rr(as, XO_ARITHi8, XOg_CMP, dest); + emit_i8(as, XI_O16); + if ((as->flags & JIT_F_BMI2)) { + emit_i8(as, 47); + emit_mrm(as, XV_RORX|VEX_64, dest, RID_MRM); + } else { + emit_shifti(as, XOg_ROR|REX_64, dest, 47); + emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM); + } + return; + } else +#endif emit_mrm(as, dest < RID_MAX_GPR ? XO_MOV : XO_MOVSD, dest, RID_MRM); } else { - asm_fuseahuref(as, ir->op1, RSET_GPR); + RegSet gpr = RSET_GPR; +#if LJ_GC64 + if (irt_isaddr(ir->t)) { + tmp = ra_scratch(as, RSET_GPR); + gpr = rset_exclude(gpr, tmp); + } +#endif + asm_fuseahuref(as, ir->op1, gpr); } /* Always do the type check, even if the load result is unused. */ as->mrm.ofs += 4; asm_guardcc(as, irt_isnum(ir->t) ? CC_AE : CC_NE); if (LJ_64 && irt_type(ir->t) >= IRT_NUM) { lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t)); +#if LJ_GC64 + emit_u32(as, LJ_TISNUM << 15); +#else emit_u32(as, LJ_TISNUM); +#endif emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM); +#if LJ_GC64 + } else if (irt_isaddr(ir->t)) { + as->mrm.ofs -= 4; + emit_i8(as, irt_toitype(ir->t)); + emit_mrm(as, XO_ARITHi8, XOg_CMP, tmp); + emit_shifti(as, XOg_SAR|REX_64, tmp, 47); + emit_mrm(as, XO_MOV, tmp|REX_64, RID_MRM); + } else if (irt_isnil(ir->t)) { + as->mrm.ofs -= 4; + emit_i8(as, -1); + emit_mrm(as, XO_ARITHi8, XOg_CMP|REX_64, RID_MRM); + } else { + emit_u32(as, (irt_toitype(ir->t) << 15) | 0x7fff); + emit_mrm(as, XO_ARITHi, XOg_CMP, RID_MRM); +#else } else { emit_i8(as, irt_toitype(ir->t)); emit_mrm(as, XO_ARITHi8, XOg_CMP, RID_MRM); +#endif } } @@ -1376,11 +1565,27 @@ static void asm_ahustore(ASMState *as, IRIns *ir) Reg src = ra_alloc1(as, ir->op2, RSET_FPR); asm_fuseahuref(as, ir->op1, RSET_GPR); emit_mrm(as, XO_MOVSDto, src, RID_MRM); -#if LJ_64 +#if LJ_64 && !LJ_GC64 } else if (irt_islightud(ir->t)) { Reg src = ra_alloc1(as, ir->op2, RSET_GPR); asm_fuseahuref(as, ir->op1, rset_exclude(RSET_GPR, src)); emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM); +#endif +#if LJ_GC64 + } else if (irref_isk(ir->op2)) { + TValue k; + lj_ir_kvalue(as->J->L, &k, IR(ir->op2)); + asm_fuseahuref(as, ir->op1, RSET_GPR); + if (tvisnil(&k)) { + emit_i32(as, -1); + emit_mrm(as, XO_MOVmi, REX_64, RID_MRM); + } else { + emit_u32(as, k.u32.lo); + emit_mrm(as, XO_MOVmi, 0, RID_MRM); + as->mrm.ofs += 4; + emit_u32(as, k.u32.hi); + emit_mrm(as, XO_MOVmi, 0, RID_MRM); + } #endif } else { IRIns *irr = IR(ir->op2); @@ -1392,6 +1597,17 @@ static void asm_ahustore(ASMState *as, IRIns *ir) } asm_fuseahuref(as, ir->op1, allow); if (ra_hasreg(src)) { +#if LJ_GC64 + if (!(LJ_DUALNUM && irt_isinteger(ir->t))) { + /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */ + as->mrm.ofs += 4; + emit_u32(as, irt_toitype(ir->t) << 15); + emit_mrm(as, XO_ARITHi, XOg_OR, RID_MRM); + as->mrm.ofs -= 4; + emit_mrm(as, XO_MOVto, src|REX_64, RID_MRM); + return; + } +#endif emit_mrm(as, XO_MOVto, src, RID_MRM); } else if (!irt_ispri(irr->t)) { lua_assert(irt_isaddr(ir->t) || (LJ_DUALNUM && irt_isinteger(ir->t))); @@ -1399,7 +1615,12 @@ static void asm_ahustore(ASMState *as, IRIns *ir) emit_mrm(as, XO_MOVmi, 0, RID_MRM); } as->mrm.ofs += 4; +#if LJ_GC64 + lua_assert(LJ_DUALNUM && irt_isinteger(ir->t)); + emit_i32(as, LJ_TNUMX << 15); +#else emit_i32(as, (int32_t)irt_toitype(ir->t)); +#endif emit_mrm(as, XO_MOVmi, 0, RID_MRM); } } @@ -1420,7 +1641,7 @@ static void asm_sload(ASMState *as, IRIns *ir) base = ra_alloc1(as, REF_BASE, RSET_GPR); emit_rmro(as, XO_MOVSD, left, base, ofs); t.irt = IRT_NUM; /* Continue with a regular number type check. */ -#if LJ_64 +#if LJ_64 && !LJ_GC64 } else if (irt_islightud(t)) { Reg dest = asm_load_lightud64(as, ir, (ir->op2 & IRSLOAD_TYPECHECK)); if (ra_hasreg(dest)) { @@ -1438,6 +1659,36 @@ static void asm_sload(ASMState *as, IRIns *ir) t.irt = irt_isint(t) ? IRT_NUM : IRT_INT; /* Check for original type. */ emit_rmro(as, irt_isint(t) ? XO_CVTSI2SD : XO_CVTTSD2SI, dest, base, ofs); } else { +#if LJ_GC64 + if (irt_isaddr(t)) { + /* LJ_GC64 type check + tag removal without BMI2 and with BMI2: + ** + ** mov r64, [addr] rorx r64, [addr], 47 + ** ror r64, 47 + ** cmp r16, itype cmp r16, itype + ** jne ->exit jne ->exit + ** shr r64, 16 shr r64, 16 + */ + emit_shifti(as, XOg_SHR|REX_64, dest, 17); + if ((ir->op2 & IRSLOAD_TYPECHECK)) { + asm_guardcc(as, CC_NE); + emit_i8(as, irt_toitype(t)); + emit_rr(as, XO_ARITHi8, XOg_CMP, dest); + emit_i8(as, XI_O16); + } + if ((as->flags & JIT_F_BMI2)) { + emit_i8(as, 47); + emit_rmro(as, XV_RORX|VEX_64, dest, base, ofs); + } else { + if ((ir->op2 & IRSLOAD_TYPECHECK)) + emit_shifti(as, XOg_ROR|REX_64, dest, 47); + else + emit_shifti(as, XOg_SHL|REX_64, dest, 17); + emit_rmro(as, XO_MOV, dest|REX_64, base, ofs); + } + return; + } else +#endif emit_rmro(as, irt_isnum(t) ? XO_MOVSD : XO_MOV, dest, base, ofs); } } else { @@ -1450,11 +1701,42 @@ static void asm_sload(ASMState *as, IRIns *ir) asm_guardcc(as, irt_isnum(t) ? CC_AE : CC_NE); if (LJ_64 && irt_type(t) >= IRT_NUM) { lua_assert(irt_isinteger(t) || irt_isnum(t)); +#if LJ_GC64 + emit_u32(as, LJ_TISNUM << 15); +#else emit_u32(as, LJ_TISNUM); +#endif emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4); +#if LJ_GC64 + } else if (irt_isnil(t)) { + /* LJ_GC64 type check for nil: + ** + ** cmp qword [addr], -1 + ** jne ->exit + */ + emit_i8(as, -1); + emit_rmro(as, XO_ARITHi8, XOg_CMP|REX_64, base, ofs); + } else if (irt_ispri(t)) { + emit_u32(as, (irt_toitype(t) << 15) | 0x7fff); + emit_rmro(as, XO_ARITHi, XOg_CMP, base, ofs+4); + } else { + /* LJ_GC64 type check only: + ** + ** mov r64, [addr] + ** sar r64, 47 + ** cmp r32, itype + ** jne ->exit + */ + Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, base)); + emit_i8(as, irt_toitype(t)); + emit_rr(as, XO_ARITHi8, XOg_CMP, tmp); + emit_shifti(as, XOg_SAR|REX_64, tmp, 47); + emit_rmro(as, XO_MOV, tmp|REX_64, base, ofs+4); +#else } else { emit_i8(as, irt_toitype(t)); emit_rmro(as, XO_ARITHi8, XOg_CMP, base, ofs+4); +#endif } } } @@ -1548,7 +1830,7 @@ static void asm_tbar(ASMState *as, IRIns *ir) Reg tab = ra_alloc1(as, ir->op1, RSET_GPR); Reg tmp = ra_scratch(as, rset_exclude(RSET_GPR, tab)); MCLabel l_end = emit_label(as); - emit_movtomro(as, tmp, tab, offsetof(GCtab, gclist)); + emit_movtomro(as, tmp|REX_GC64, tab, offsetof(GCtab, gclist)); emit_setgl(as, tab, gc.grayagain); emit_getgl(as, tmp, gc.grayagain); emit_i8(as, ~LJ_GC_BLACK); @@ -2084,7 +2366,6 @@ static void asm_comp(ASMState *as, IRIns *ir) cc ^= (VCC_PS|(5<<4)); /* A <-> B, AE <-> BE, PS <-> none */ } left = ra_alloc1(as, lref, RSET_FPR); - right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left)); l_around = emit_label(as); asm_guardcc(as, cc >> 4); if (cc & VCC_P) { /* Extra CC_P branch required? */ @@ -2101,6 +2382,7 @@ static void asm_comp(ASMState *as, IRIns *ir) emit_jcc(as, CC_P, as->mcp); } } + right = asm_fuseload(as, rref, rset_exclude(RSET_FPR, left)); emit_mrm(as, XO_UCOMISD, left, right); } else { IRRef lref = ir->op1, rref = ir->op2; @@ -2377,13 +2659,18 @@ static void asm_stack_check(ASMState *as, BCReg topslot, emit_rmro(as, XO_MOV, r|REX_64, RID_ESP, 0); else ra_modified(as, r); - emit_gri(as, XG_ARITHi(XOg_CMP), r, (int32_t)(8*topslot)); + emit_gri(as, XG_ARITHi(XOg_CMP), r|REX_GC64, (int32_t)(8*topslot)); if (ra_hasreg(pbase) && pbase != r) - emit_rr(as, XO_ARITH(XOg_SUB), r, pbase); + emit_rr(as, XO_ARITH(XOg_SUB), r|REX_GC64, pbase); else +#if LJ_GC64 + emit_rmro(as, XO_ARITH(XOg_SUB), r|REX_64, RID_DISPATCH, + (int32_t)dispofs(as, &J2G(as->J)->jit_base)); +#else emit_rmro(as, XO_ARITH(XOg_SUB), r, RID_NONE, ptr2addr(&J2G(as->J)->jit_base)); - emit_rmro(as, XO_MOV, r, r, offsetof(lua_State, maxstack)); +#endif + emit_rmro(as, XO_MOV, r|REX_GC64, r, offsetof(lua_State, maxstack)); emit_getgl(as, r, cur_L); if (allow == RSET_EMPTY) /* Spill temp. register. */ emit_rmro(as, XO_MOVto, r|REX_64, RID_ESP, 0); @@ -2414,18 +2701,44 @@ static void asm_stack_restore(ASMState *as, SnapShot *snap) (LJ_DUALNUM && irt_isinteger(ir->t))); if (!irref_isk(ref)) { Reg src = ra_alloc1(as, ref, rset_exclude(RSET_GPR, RID_BASE)); +#if LJ_GC64 + if (irt_is64(ir->t)) { + /* TODO: 64 bit store + 32 bit load-modify-store is suboptimal. */ + emit_u32(as, irt_toitype(ir->t) << 15); + emit_rmro(as, XO_ARITHi, XOg_OR, RID_BASE, ofs+4); + } else if (LJ_DUALNUM && irt_isinteger(ir->t)) { + emit_movmroi(as, RID_BASE, ofs+4, LJ_TISNUM << 15); + } else { + emit_movmroi(as, RID_BASE, ofs+4, (irt_toitype(ir->t)<<15)|0x7fff); + } +#endif emit_movtomro(as, REX_64IR(ir, src), RID_BASE, ofs); +#if LJ_GC64 + } else { + TValue k; + lj_ir_kvalue(as->J->L, &k, ir); + if (tvisnil(&k)) { + emit_i32(as, -1); + emit_rmro(as, XO_MOVmi, REX_64, RID_BASE, ofs); + } else { + emit_movmroi(as, RID_BASE, ofs+4, k.u32.hi); + emit_movmroi(as, RID_BASE, ofs, k.u32.lo); + } +#else } else if (!irt_ispri(ir->t)) { emit_movmroi(as, RID_BASE, ofs, ir->i); +#endif } if ((sn & (SNAP_CONT|SNAP_FRAME))) { #if !LJ_FR2 if (s != 0) /* Do not overwrite link to previous frame. */ emit_movmroi(as, RID_BASE, ofs+4, (int32_t)(*flinks--)); #endif +#if !LJ_GC64 } else { if (!(LJ_64 && irt_islightud(ir->t))) emit_movmroi(as, RID_BASE, ofs+4, irt_toitype(ir->t)); +#endif } } checkmclim(as); @@ -2451,11 +2764,15 @@ static void asm_gc_check(ASMState *as) args[1] = ASMREF_TMP2; /* MSize steps */ asm_gencall(as, ci, args); tmp = ra_releasetmp(as, ASMREF_TMP1); +#if LJ_GC64 + emit_rmro(as, XO_LEA, tmp|REX_64, RID_DISPATCH, GG_DISP2G); +#else emit_loada(as, tmp, J2G(as->J)); +#endif emit_loadi(as, ra_releasetmp(as, ASMREF_TMP2), as->gcsteps); /* Jump around GC step if GC total < GC threshold. */ emit_sjcc(as, CC_B, l_end); - emit_opgl(as, XO_ARITH(XOg_CMP), tmp, gc.threshold); + emit_opgl(as, XO_ARITH(XOg_CMP), tmp|REX_GC64, gc.threshold); emit_getgl(as, tmp, gc.total); as->gcsteps = 0; checkmclim(as); @@ -2520,7 +2837,7 @@ static void asm_head_root_base(ASMState *as) if (rset_test(as->modset, r) || irt_ismarked(ir->t)) ir->r = RID_INIT; /* No inheritance for modified BASE register. */ if (r != RID_BASE) - emit_rr(as, XO_MOV, r, RID_BASE); + emit_rr(as, XO_MOV, r|REX_GC64, RID_BASE); } } @@ -2536,8 +2853,9 @@ static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow) if (irp->r == r) { rset_clear(allow, r); /* Mark same BASE register as coalesced. */ } else if (ra_hasreg(irp->r) && rset_test(as->freeset, irp->r)) { + /* Move from coalesced parent reg. */ rset_clear(allow, irp->r); - emit_rr(as, XO_MOV, r, irp->r); /* Move from coalesced parent reg. */ + emit_rr(as, XO_MOV, r|REX_GC64, irp->r); } else { emit_getgl(as, r, jit_base); /* Otherwise reload BASE. */ } @@ -2750,13 +3068,19 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) MSize len = T->szmcode; MCode *px = exitstub_addr(J, exitno) - 6; MCode *pe = p+len-6; - uint32_t stateaddr = u32ptr(&J2G(J)->vmstate); +#if LJ_GC64 + uint32_t statei = (uint32_t)(GG_OFS(g.vmstate) - GG_OFS(dispatch)); +#else + uint32_t statei = u32ptr(&J2G(J)->vmstate); +#endif if (len > 5 && p[len-5] == XI_JMP && p+len-6 + *(int32_t *)(p+len-4) == px) *(int32_t *)(p+len-4) = jmprel(p+len, target); /* Do not patch parent exit for a stack check. Skip beyond vmstate update. */ - for (; p < pe; p += asm_x86_inslen(p)) - if (*(uint32_t *)(p+(LJ_64 ? 3 : 2)) == stateaddr && p[0] == XI_MOVmi) + for (; p < pe; p += asm_x86_inslen(p)) { + intptr_t ofs = LJ_GC64 ? (p[0] & 0xf0) == 0x40 : LJ_64; + if (*(uint32_t *)(p+2+ofs) == statei && p[ofs+LJ_GC64-LJ_64] == XI_MOVmi) break; + } lua_assert(p < pe); for (; p < pe; p += asm_x86_inslen(p)) if ((*(uint16_t *)p & 0xf0ff) == 0x800f && p + *(int32_t *)(p+2) == px) diff --git a/src/lj_emit_x86.h b/src/lj_emit_x86.h index 3d6f13f4..f0bca938 100644 --- a/src/lj_emit_x86.h +++ b/src/lj_emit_x86.h @@ -20,6 +20,11 @@ #define REX_64 0 #define VEX_64 0 #endif +#if LJ_GC64 +#define REX_GC64 REX_64 +#else +#define REX_GC64 0 +#endif #define emit_i8(as, i) (*--as->mcp = (MCode)(i)) #define emit_i32(as, i) (*(int32_t *)(as->mcp-4) = (i), as->mcp -= 4) @@ -94,26 +99,17 @@ static int32_t ptr2addr(const void *p) #define ptr2addr(p) (i32ptr((p))) #endif -/* op r, [addr] */ -static void emit_rma(ASMState *as, x86Op xo, Reg rr, const void *addr) -{ - MCode *p = as->mcp; - *(int32_t *)(p-4) = ptr2addr(addr); -#if LJ_64 - p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP); - as->mcp = emit_opm(xo, XM_OFS0, rr, RID_ESP, p, -5); -#else - as->mcp = emit_opm(xo, XM_OFS0, rr, RID_EBP, p, -4); -#endif -} - /* op r, [base+ofs] */ static void emit_rmro(ASMState *as, x86Op xo, Reg rr, Reg rb, int32_t ofs) { MCode *p = as->mcp; x86Mode mode; if (ra_hasreg(rb)) { - if (ofs == 0 && (rb&7) != RID_EBP) { + if (LJ_GC64 && rb == RID_RIP) { + mode = XM_OFS0; + p -= 4; + *(int32_t *)p = ofs; + } else if (ofs == 0 && (rb&7) != RID_EBP) { mode = XM_OFS0; } else if (checki8(ofs)) { *--p = (MCode)ofs; @@ -211,6 +207,11 @@ static void emit_mrm(ASMState *as, x86Op xo, Reg rr, Reg rb) *--p = MODRM(XM_SCALE1, RID_ESP, RID_EBP); rb = RID_ESP; #endif + } else if (LJ_GC64 && rb == RID_RIP) { + lua_assert(as->mrm.idx == RID_NONE); + mode = XM_OFS0; + p -= 4; + *(int32_t *)p = as->mrm.ofs; } else { if (as->mrm.ofs == 0 && (rb&7) != RID_EBP) { mode = XM_OFS0; @@ -264,8 +265,8 @@ static void emit_movmroi(ASMState *as, Reg base, int32_t ofs, int32_t i) /* Get/set global_State fields. */ #define emit_opgl(as, xo, r, field) \ emit_rma(as, (xo), (r), (void *)&J2G(as->J)->field) -#define emit_getgl(as, r, field) emit_opgl(as, XO_MOV, (r), field) -#define emit_setgl(as, r, field) emit_opgl(as, XO_MOVto, (r), field) +#define emit_getgl(as, r, field) emit_opgl(as, XO_MOV, (r)|REX_GC64, field) +#define emit_setgl(as, r, field) emit_opgl(as, XO_MOVto, (r)|REX_GC64, field) #define emit_setvmstate(as, i) \ (emit_i32(as, i), emit_opgl(as, XO_MOVmi, 0, vmstate)) @@ -288,9 +289,21 @@ static void emit_loadi(ASMState *as, Reg r, int32_t i) } } +#if LJ_GC64 +#define dispofs(as, k) \ + ((intptr_t)((uintptr_t)(k) - (uintptr_t)J2GG(as->J)->dispatch)) +#define mcpofs(as, k) \ + ((intptr_t)((uintptr_t)(k) - (uintptr_t)as->mcp)) +#define mctopofs(as, k) \ + ((intptr_t)((uintptr_t)(k) - (uintptr_t)as->mctop)) +/* mov r, addr */ +#define emit_loada(as, r, addr) \ + emit_loadu64(as, (r), (uintptr_t)(addr)) +#else /* mov r, addr */ #define emit_loada(as, r, addr) \ emit_loadi(as, (r), ptr2addr((addr))) +#endif #if LJ_64 /* mov r, imm64 or shorter 32 bit extended load. */ @@ -302,6 +315,15 @@ static void emit_loadu64(ASMState *as, Reg r, uint64_t u64) MCode *p = as->mcp; *(int32_t *)(p-4) = (int32_t)u64; as->mcp = emit_opm(XO_MOVmi, XM_REG, REX_64, r, p, -4); +#if LJ_GC64 + } else if (checki32(dispofs(as, u64))) { + emit_rmro(as, XO_LEA, r|REX_64, RID_DISPATCH, (int32_t)dispofs(as, u64)); + } else if (checki32(mcpofs(as, u64)) && checki32(mctopofs(as, u64))) { + /* Since as->realign assumes the code size doesn't change, check + ** RIP-relative addressing reachability for both as->mcp and as->mctop. + */ + emit_rmro(as, XO_LEA, r|REX_64, RID_RIP, (int32_t)mcpofs(as, u64)); +#endif } else { /* Full-size 64 bit load. */ MCode *p = as->mcp; *(uint64_t *)(p-8) = u64; @@ -313,22 +335,69 @@ static void emit_loadu64(ASMState *as, Reg r, uint64_t u64) } #endif +/* op r, [addr] */ +static void emit_rma(ASMState *as, x86Op xo, Reg rr, const void *addr) +{ +#if LJ_GC64 + if (checki32(dispofs(as, addr))) { + emit_rmro(as, xo, rr, RID_DISPATCH, (int32_t)dispofs(as, addr)); + } else if (checki32(mcpofs(as, addr)) && checki32(mctopofs(as, addr))) { + emit_rmro(as, xo, rr, RID_RIP, (int32_t)mcpofs(as, addr)); + } else if (!checki32((intptr_t)addr) && (xo == XO_MOV || xo == XO_MOVSD)) { + emit_rmro(as, xo, rr, rr, 0); + emit_loadu64(as, rr, (uintptr_t)addr); + } else +#endif + { + MCode *p = as->mcp; + *(int32_t *)(p-4) = ptr2addr(addr); +#if LJ_64 + p[-5] = MODRM(XM_SCALE1, RID_ESP, RID_EBP); + as->mcp = emit_opm(xo, XM_OFS0, rr, RID_ESP, p, -5); +#else + as->mcp = emit_opm(xo, XM_OFS0, rr, RID_EBP, p, -4); +#endif + } +} + /* Load 64 bit IR constant into register. */ static void emit_loadk64(ASMState *as, Reg r, IRIns *ir) { + Reg r64; + x86Op xo; const uint64_t *k = &ir_k64(ir)->u64; if (rset_test(RSET_FPR, r)) { - if (*k == 0) { - emit_rr(as, XO_XORPS, r, r); - } else { - emit_rma(as, XO_MOVSD, r, k); - } + r64 = r; + xo = XO_MOVSD; } else { - if (*k == 0) { - emit_rr(as, XO_ARITH(XOg_XOR), r, r); + r64 = r | REX_64; + xo = XO_MOV; + } + if (*k == 0) { + emit_rr(as, rset_test(RSET_FPR, r) ? XO_XORPS : XO_ARITH(XOg_XOR), r, r); +#if LJ_GC64 + } else if (checki32((intptr_t)k) || checki32(dispofs(as, k)) || + (checki32(mcpofs(as, k)) && checki32(mctopofs(as, k)))) { + emit_rma(as, xo, r64, k); + } else { + if (ir->i) { + lua_assert(*k == *(uint64_t*)(as->mctop - ir->i)); + } else if (as->curins <= as->stopins && rset_test(RSET_GPR, r)) { + emit_loadu64(as, r, *k); + return; } else { - emit_rma(as, XO_MOV, r | REX_64, k); + /* If all else fails, add the FP constant at the MCode area bottom. */ + while ((uintptr_t)as->mcbot & 7) *as->mcbot++ = XI_INT3; + *(uint64_t *)as->mcbot = *k; + ir->i = (int32_t)(as->mctop - as->mcbot); + as->mcbot += 8; + as->mclim = as->mcbot + MCLIM_REDZONE; } + emit_rmro(as, xo, r64, RID_RIP, (int32_t)mcpofs(as, as->mctop - ir->i)); +#else + } else { + emit_rma(as, xo, r64, k); +#endif } } @@ -470,9 +539,9 @@ static void emit_addptr(ASMState *as, Reg r, int32_t ofs) { if (ofs) { if ((as->flags & JIT_F_LEA_AGU)) - emit_rmro(as, XO_LEA, r, r, ofs); + emit_rmro(as, XO_LEA, r|REX_GC64, r, ofs); else - emit_gri(as, XG_ARITHi(XOg_ADD), r, ofs); + emit_gri(as, XG_ARITHi(XOg_ADD), r|REX_GC64, ofs); } } diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index 64a9a65d..6d141a20 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -1114,8 +1114,13 @@ static TRef recff_io_fp(jit_State *J, TRef *udp, int32_t id) { TRef tr, ud, fp; if (id) { /* io.func() */ +#if LJ_GC64 + /* TODO: fix ARM32 asm_fload(), so we can use this for all archs. */ + ud = lj_ir_ggfload(J, IRT_UDATA, GG_OFS(g.gcroot[id])); +#else tr = lj_ir_kptr(J, &J2G(J)->gcroot[id]); ud = emitir(IRT(IR_XLOAD, IRT_UDATA), tr, 0); +#endif } else { /* fp:method() */ ud = J->base[0]; if (!tref_isudata(ud)) diff --git a/src/lj_ir.h b/src/lj_ir.h index 4e9c85c7..e77f7b99 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -412,7 +412,7 @@ static LJ_AINLINE IRType itype2irt(const TValue *tv) static LJ_AINLINE uint32_t irt_toitype_(IRType t) { - lua_assert(!LJ_64 || t != IRT_LIGHTUD); + lua_assert(!LJ_64 || LJ_GC64 || t != IRT_LIGHTUD); if (LJ_DUALNUM && t > IRT_NUM) { return LJ_TISNUM; } else { @@ -568,7 +568,11 @@ typedef union IRIns { #define ir_knum(ir) check_exp((ir)->o == IR_KNUM, &(ir)[1].tv) #define ir_kint64(ir) check_exp((ir)->o == IR_KINT64, &(ir)[1].tv) #define ir_k64(ir) \ - check_exp((ir)->o == IR_KNUM || (ir)->o == IR_KINT64, &(ir)[1].tv) + check_exp((ir)->o == IR_KNUM || (ir)->o == IR_KINT64 || \ + (LJ_GC64 && \ + ((ir)->o == IR_KGC || \ + (ir)->o == IR_KPTR || (ir)->o == IR_KKPTR)), \ + &(ir)[1].tv) #define ir_kptr(ir) \ check_exp((ir)->o == IR_KPTR || (ir)->o == IR_KKPTR, \ mref((ir)[LJ_GC64].ptr, void)) diff --git a/src/lj_record.c b/src/lj_record.c index f0481050..3c67e1a0 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -976,7 +976,13 @@ int lj_record_mm_lookup(jit_State *J, RecordIndex *ix, MMS mm) } /* The cdata metatable is treated as immutable. */ if (LJ_HASFFI && tref_iscdata(ix->tab)) goto immutable_mt; +#if LJ_GC64 + /* TODO: fix ARM32 asm_fload(), so we can use this for all archs. */ + ix->mt = mix.tab = lj_ir_ggfload(J, IRT_TAB, + GG_OFS(g.gcroot[GCROOT_BASEMT+itypemap(&ix->tabv)])); +#else ix->mt = mix.tab = lj_ir_ktab(J, mt); +#endif goto nocheck; } ix->mt = mt ? mix.tab : TREF_NIL; diff --git a/src/lj_snap.c b/src/lj_snap.c index 33c058be..0a08d4d4 100644 --- a/src/lj_snap.c +++ b/src/lj_snap.c @@ -623,7 +623,6 @@ static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex, } if (LJ_UNLIKELY(bloomtest(rfilt, ref))) rs = snap_renameref(T, snapno, ref, rs); - lua_assert(!LJ_GC64); /* TODO_GC64: handle 64 bit references. */ if (ra_hasspill(regsp_spill(rs))) { /* Restore from spill slot. */ int32_t *sps = &ex->spill[regsp_spill(rs)]; if (irt_isinteger(t)) { @@ -632,9 +631,11 @@ static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex, } else if (irt_isnum(t)) { o->u64 = *(uint64_t *)sps; #endif - } else if (LJ_64 && irt_islightud(t)) { +#if LJ_64 && !LJ_GC64 + } else if (irt_islightud(t)) { /* 64 bit lightuserdata which may escape already has the tag bits. */ o->u64 = *(uint64_t *)sps; +#endif } else { lua_assert(!irt_ispri(t)); /* PRI refs never have a spill slot. */ setgcV(J->L, o, (GCobj *)(uintptr_t)*(GCSize *)sps, irt_toitype(t)); @@ -652,9 +653,11 @@ static void snap_restoreval(jit_State *J, GCtrace *T, ExitState *ex, } else if (irt_isnum(t)) { setnumV(o, ex->fpr[r-RID_MIN_FPR]); #endif - } else if (LJ_64 && irt_is64(t)) { +#if LJ_64 && !LJ_GC64 + } else if (irt_is64(t)) { /* 64 bit values that already have the tag bits. */ o->u64 = ex->gpr[r-RID_MIN_GPR]; +#endif } else if (irt_ispri(t)) { setpriV(o, irt_toitype(t)); } else { diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h index e29f4748..d5429597 100644 --- a/src/lj_target_x86.h +++ b/src/lj_target_x86.h @@ -22,7 +22,7 @@ _(XMM0) _(XMM1) _(XMM2) _(XMM3) _(XMM4) _(XMM5) _(XMM6) _(XMM7) #endif #define VRIDDEF(_) \ - _(MRM) + _(MRM) _(RIP) #define RIDENUM(name) RID_##name, @@ -31,6 +31,7 @@ enum { FPRDEF(RIDENUM) /* Floating-point registers (FPRs). */ RID_MAX, RID_MRM = RID_MAX, /* Pseudo-id for ModRM operand. */ + RID_RIP = RID_MAX+1, /* Pseudo-id for RIP (x64 only). */ /* Calling conventions. */ RID_SP = RID_ESP, @@ -63,8 +64,10 @@ enum { /* -- Register sets ------------------------------------------------------- */ -/* Make use of all registers, except the stack pointer. */ -#define RSET_GPR (RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR)-RID2RSET(RID_ESP)) +/* Make use of all registers, except the stack pointer (and maybe DISPATCH). */ +#define RSET_GPR (RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR) \ + - RID2RSET(RID_ESP) \ + - LJ_GC64*RID2RSET(RID_DISPATCH)) #define RSET_FPR (RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR)) #define RSET_ALL (RSET_GPR|RSET_FPR) #define RSET_INIT RSET_ALL @@ -200,6 +203,7 @@ typedef struct { */ typedef enum { /* Fixed length opcodes. XI_* prefix. */ + XI_O16 = 0x66, XI_NOP = 0x90, XI_XCHGa = 0x90, XI_CALL = 0xe8, @@ -217,6 +221,7 @@ typedef enum { XI_PUSHi8 = 0x6a, XI_TESTb = 0x84, XI_TEST = 0x85, + XI_INT3 = 0xcc, XI_MOVmi = 0xc7, XI_GROUP5 = 0xff, @@ -243,6 +248,7 @@ typedef enum { XV_SHRX = XV_f20f38(f7), /* Variable-length opcodes. XO_* prefix. */ + XO_OR = XO_(0b), XO_MOV = XO_(8b), XO_MOVto = XO_(89), XO_MOVtow = XO_66(89), diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc index 759e30ec..d38ac907 100644 --- a/src/vm_x64.dasc +++ b/src/vm_x64.dasc @@ -2401,8 +2401,7 @@ static void build_subroutines(BuildCtx *ctx) | movzx RCd, byte [rbp-8] // Reconstruct exit number. | mov RCH, byte [rbp-16] | mov [rbp-8], r15; mov [rbp-16], r14 - | // Caveat: DISPATCH is rbx. - | mov DISPATCH, [ebp] + | // DISPATCH is preserved on-trace in LJ_GC64 mode. | mov RAd, [DISPATCH+DISPATCH_GL(vmstate)] // Get trace number. | set_vmstate EXIT | mov [DISPATCH+DISPATCH_J(exitno)], RCd From 747feb6e4ef2a8147e18a92e76b9befa8de8a7b5 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 23 May 2016 06:24:19 +0200 Subject: [PATCH 30/57] x86: Don't spill an explicit REF_BASE in the IR. Thanks to Vyacheslav Egorov. --- src/lj_asm_x86.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index db3e49f8..a380094b 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -325,11 +325,11 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) as->mrm.base = as->mrm.idx = RID_NONE; return RID_MRM; } - } else if (ir->o == IR_KINT64) { + } else if (ref == REF_BASE || ir->o == IR_KINT64) { RegSet avail = as->freeset & ~as->modset & RSET_GPR; lua_assert(allow != RSET_EMPTY); if (!(avail & (avail-1))) { /* Fuse if less than two regs available. */ - as->mrm.ofs = ptr2addr(ir_kint64(ir)); + as->mrm.ofs = ptr2addr(ref == REF_BASE ? (void *)&J2G(as->J)->jit_base : (void *)ir_kint64(ir)); as->mrm.base = as->mrm.idx = RID_NONE; return RID_MRM; } @@ -369,7 +369,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) return RID_MRM; } } - if (!(as->freeset & allow) && !irref_isk(ref) && + if (!(as->freeset & allow) && !emit_canremat(ref) && (allow == RSET_EMPTY || ra_hasspill(ir->s) || iscrossref(as, ref))) goto fusespill; return ra_allocref(as, ref, allow); From 3f1031c34ba840e2c1f6000d709ed4a6102b40a9 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 23 May 2016 14:28:28 +0200 Subject: [PATCH 31/57] Use MAP_TRYFIXED for the probing memory allocator, if available. --- src/lj_alloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/lj_alloc.c b/src/lj_alloc.c index 5d0834ec..95d15d04 100644 --- a/src/lj_alloc.c +++ b/src/lj_alloc.c @@ -215,6 +215,12 @@ static int CALL_MUNMAP(void *ptr, size_t size) #if LJ_ALLOC_MMAP_PROBE +#ifdef MAP_TRYFIXED +#define MMAP_FLAGS_PROBE (MMAP_FLAGS|MAP_TRYFIXED) +#else +#define MMAP_FLAGS_PROBE MMAP_FLAGS +#endif + #define LJ_ALLOC_MMAP_PROBE_MAX 30 #define LJ_ALLOC_MMAP_PROBE_LINEAR 5 @@ -247,7 +253,7 @@ static void *mmap_probe(size_t size) int olderr = errno; int retry; for (retry = 0; retry < LJ_ALLOC_MMAP_PROBE_MAX; retry++) { - void *p = mmap((void *)hint_addr, size, MMAP_PROT, MMAP_FLAGS, -1, 0); + void *p = mmap((void *)hint_addr, size, MMAP_PROT, MMAP_FLAGS_PROBE, -1, 0); uintptr_t addr = (uintptr_t)p; if ((addr >> LJ_ALLOC_MBITS) == 0 && addr >= LJ_ALLOC_MMAP_PROBE_LOWER) { /* We got a suitable address. Bump the hint address. */ From 9fa843aefbf4eb1bc91dd6ca4f947365b5393e26 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 26 May 2016 13:54:58 +0200 Subject: [PATCH 32/57] Cleanup install docs. --- doc/install.html | 79 ++++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/doc/install.html b/doc/install.html index 9921258e..9963010d 100644 --- a/doc/install.html +++ b/doc/install.html @@ -122,7 +122,7 @@ operating systems, CPUs and compilers: x64 (64 bit) GCC 4.2+ -ORBIS (PS4) +GCC 4.2+
    ORBIS (PS4) XCode 5.0+
    Clang MSVC + SDK v7.0
    WinSDK v7.0
    Durango (Xbox One) @@ -202,7 +202,7 @@ which is probably the default on your system, anyway. Simply run: make

    -This always builds a native x86, x64 or PPC binary, depending on the host OS +This always builds a native binary, depending on the host OS you're running this command on. Check the section on cross-compilation for more options.

    @@ -333,25 +333,36 @@ directory where luajit.exe is installed

    Cross-compiling LuaJIT

    -The GNU Makefile-based build system allows cross-compiling on any host -for any supported target, as long as both architectures have the same -pointer size. If you want to cross-compile to any 32 bit target on an -x64 OS, you need to install the multilib development package (e.g. -libc6-dev-i386 on Debian/Ubuntu) and build a 32 bit host part -(HOST_CC="gcc -m32"). +First, let's clear up some terminology:

    +
      +
    • Host: This is your development system, usually based on a x64 or x86 CPU.
    • +
    • Target: This is the target system you want LuaJIT to run on, e.g. Android/ARM.
    • +
    • Toolchain: This comprises a C compiler, linker, assembler and a matching C library.
    • +
    • Host (or system) toolchain: This is the toolchain used to build native binaries for your host system.
    • +
    • Cross-compile toolchain: This is the toolchain used to build binaries for the target system. They can only be run on the target system.
    • +
    +

    +The GNU Makefile-based build system allows cross-compiling on any host +for any supported target: +

    +
      +
    • Yes, you need a toolchain for both your host and your target!
    • +
    • Both host and target architectures must have the same pointer size.
    • +
    • E.g. if you want to cross-compile to a 32 bit target on a 64 bit host, you need to install the multilib development package (e.g. libc6-dev-i386 on Debian/Ubuntu) and build a 32 bit host part (HOST_CC="gcc -m32").
    • +
    • 64 bit targets always require compilation on a 64 bit host.
    • +

    You need to specify TARGET_SYS whenever the host OS and the -target OS differ, or you'll get assembler or linker errors. E.g. if -you're compiling on a Windows or OSX host for embedded Linux or Android, -you need to add TARGET_SYS=Linux to the examples below. For a -minimal target OS, you may need to disable the built-in allocator in -src/Makefile and use TARGET_SYS=Other. Don't forget to -specify the same TARGET_SYS for the install step, too. +target OS differ, or you'll get assembler or linker errors:

    +
      +
    • E.g. if you're compiling on a Windows or OSX host for embedded Linux or Android, you need to add TARGET_SYS=Linux to the examples below.
    • +
    • For a minimal target OS, you may need to disable the built-in allocator in src/Makefile and use TARGET_SYS=Other.
    • +
    • Don't forget to specify the same TARGET_SYS for the install step, too.
    • +

    -The examples below only show some popular targets — please check -the comments in src/Makefile for more details. +Here are some examples where host and target have the same CPU:

     # Cross-compile to a 32 bit binary on a multilib x64 OS
    @@ -369,26 +380,30 @@ use the canonical toolchain triplets for Linux.
     

    Since there's often no easy way to detect CPU features at runtime, it's -important to compile with the proper CPU or architecture settings. You -can specify these when building the toolchain yourself. Or add --mcpu=... or -march=... to TARGET_CFLAGS. For -ARM it's important to have the correct -mfloat-abi=... setting, -too. Otherwise LuaJIT may not run at the full performance of your target -CPU. +important to compile with the proper CPU or architecture settings: + +

      +
    • The best way to get consistent results is to specify the correct settings when building the toolchain yourself.
    • +
    • For a pre-built, generic toolchain add -mcpu=... or -march=... and other necessary flags to TARGET_CFLAGS.
    • +
    • For ARM it's important to have the correct -mfloat-abi=... setting, too. Otherwise LuaJIT may not run at the full performance of your target CPU.
    • +
    • For MIPS it's important to select a supported ABI (o32 on MIPS32) and consistently compile your project either with hard-float or soft-float compiler settings. Do not use -mips16.
    • +
    +

    +Here are some examples for targets with a different CPU than the host:

     # ARM soft-float
     make HOST_CC="gcc -m32" CROSS=arm-linux-gnueabi- \
          TARGET_CFLAGS="-mfloat-abi=soft"
     
    -# ARM soft-float ABI with VFP (example for Cortex-A8)
    +# ARM soft-float ABI with VFP (example for Cortex-A9)
     make HOST_CC="gcc -m32" CROSS=arm-linux-gnueabi- \
    -     TARGET_CFLAGS="-mcpu=cortex-a8 -mfloat-abi=softfp"
    +     TARGET_CFLAGS="-mcpu=cortex-a9 -mfloat-abi=softfp"
     
    -# ARM hard-float ABI with VFP (armhf, requires recent toolchain)
    +# ARM hard-float ABI with VFP (armhf, most modern toolchains)
     make HOST_CC="gcc -m32" CROSS=arm-linux-gnueabihf-
     
    -# ARM64 (requires x64 host)
    +# ARM64
     make CROSS=aarch64-linux-
     
     # PPC
    @@ -400,7 +415,7 @@ make HOST_CC="gcc -m32" CROSS=mips-linux-
     make HOST_CC="gcc -m32" CROSS=mipsel-linux-
     

    -You can cross-compile for Android using the » Android NDK. +You can cross-compile for Android using the Android NDK. The environment variables need to match the install locations and the desired target platform. E.g. Android 4.0 corresponds to ABI level 14. For details check the folder docs in the NDK directory. @@ -414,7 +429,7 @@ to build/deploy or which lowest common denominator you want to pick: # Android/ARM, armeabi (ARMv5TE soft-float), Android 2.2+ (Froyo) NDK=/opt/android/ndk NDKABI=8 -NDKVER=$NDK/toolchains/arm-linux-androideabi-4.6 +NDKVER=$NDK/toolchains/arm-linux-androideabi-4.9 NDKP=$NDKVER/prebuilt/linux-x86/bin/arm-linux-androideabi- NDKF="--sysroot $NDK/platforms/android-$NDKABI/arch-arm" make HOST_CC="gcc -m32" CROSS=$NDKP TARGET_FLAGS="$NDKF" @@ -422,16 +437,16 @@ make HOST_CC="gcc -m32" CROSS=$NDKP TARGET_FLAGS="$NDKF" # Android/ARM, armeabi-v7a (ARMv7 VFP), Android 4.0+ (ICS) NDK=/opt/android/ndk NDKABI=14 -NDKVER=$NDK/toolchains/arm-linux-androideabi-4.6 +NDKVER=$NDK/toolchains/arm-linux-androideabi-4.9 NDKP=$NDKVER/prebuilt/linux-x86/bin/arm-linux-androideabi- NDKF="--sysroot $NDK/platforms/android-$NDKABI/arch-arm" NDKARCH="-march=armv7-a -mfloat-abi=softfp -Wl,--fix-cortex-a8" make HOST_CC="gcc -m32" CROSS=$NDKP TARGET_FLAGS="$NDKF $NDKARCH" -# Android/MIPS, mips (MIPS32R1 hard-float), Android 4.0+ (ICS) +# Android/MIPS, mipsel (MIPS32R1 hard-float), Android 4.0+ (ICS) NDK=/opt/android/ndk NDKABI=14 -NDKVER=$NDK/toolchains/mipsel-linux-android-4.6 +NDKVER=$NDK/toolchains/mipsel-linux-android-4.9 NDKP=$NDKVER/prebuilt/linux-x86/bin/mipsel-linux-android- NDKF="--sysroot $NDK/platforms/android-$NDKABI/arch-mips" make HOST_CC="gcc -m32" CROSS=$NDKP TARGET_FLAGS="$NDKF" @@ -439,7 +454,7 @@ make HOST_CC="gcc -m32" CROSS=$NDKP TARGET_FLAGS="$NDKF" # Android/x86, x86 (i686 SSE3), Android 4.0+ (ICS) NDK=/opt/android/ndk NDKABI=14 -NDKVER=$NDK/toolchains/x86-4.6 +NDKVER=$NDK/toolchains/x86-4.9 NDKP=$NDKVER/prebuilt/linux-x86/bin/i686-linux-android- NDKF="--sysroot $NDK/platforms/android-$NDKABI/arch-x86" make HOST_CC="gcc -m32" CROSS=$NDKP TARGET_FLAGS="$NDKF" From 6360f6e10675d4337bbe0c79a67b40d6aa631393 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 28 May 2016 00:35:07 +0200 Subject: [PATCH 33/57] Fix collateral damage from LJ_GC64 changes to asm_href(). --- src/lj_asm_x86.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 26705e2c..f5230a44 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -1180,7 +1180,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) lua_assert(irt_ispri(kt) && !irt_isnil(kt)); emit_u32(as, (irt_toitype(kt)<<15)|0x7fff); emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it)); -#else +#endif } else { if (!irt_ispri(kt)) { lua_assert(irt_isaddr(kt)); @@ -1194,7 +1194,6 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) lua_assert(!irt_isnil(kt)); emit_i8(as, irt_toitype(kt)); emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it)); -#endif } emit_sfixup(as, l_loop); checkmclim(as); From e77638f922f59fddaea81964735dde3cc0de851e Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 28 May 2016 05:01:12 +0200 Subject: [PATCH 34/57] x64/LJ_GC64: Fix __call metamethod for tailcall. --- src/vm_x64.dasc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc index d38ac907..56154495 100644 --- a/src/vm_x64.dasc +++ b/src/vm_x64.dasc @@ -1105,11 +1105,11 @@ static void build_subroutines(BuildCtx *ctx) | mov BASE, L:RB->base | mov NARGS:RDd, TMP1d | mov LFUNC:RB, [RA-16] - | cleartp LFUNC:RB | add NARGS:RDd, 1 | // This is fragile. L->base must not move, KBASE must always be defined. | cmp KBASE, BASE // Continue with CALLT if flag set. | je ->BC_CALLT_Z + | cleartp LFUNC:RB | mov BASE, RA | ins_call // Otherwise call resolved metamethod. | From 5e2b609b3f6145fa9fe0b396b76ff76f732c6fe6 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 28 May 2016 05:02:18 +0200 Subject: [PATCH 35/57] Fix compiler warning. --- src/lj_snap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lj_snap.c b/src/lj_snap.c index 0a08d4d4..48259972 100644 --- a/src/lj_snap.c +++ b/src/lj_snap.c @@ -822,7 +822,9 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr) SnapShot *snap = &T->snap[snapno]; MSize n, nent = snap->nent; SnapEntry *map = &T->snapmap[snap->mapofs]; +#if !LJ_FR2 || defined(LUA_USE_ASSERT) SnapEntry *flinks = &T->snapmap[snap_nextofs(T, snap)-1-LJ_FR2]; +#endif #if !LJ_FR2 ptrdiff_t ftsz0; #endif From e3c4c9af0f07a114fb754fed6ac358a102f49e2f Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 28 May 2016 05:03:18 +0200 Subject: [PATCH 36/57] DynASM/MIPS: Add missing MIPS64 instructions. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. Sponsored by Cisco Systems, Inc. --- dynasm/dasm_mips.h | 11 ++++--- dynasm/dasm_mips.lua | 75 ++++++++++++++++++++++++++++++++++++------ dynasm/dasm_mips64.lua | 12 +++++++ 3 files changed, 84 insertions(+), 14 deletions(-) create mode 100644 dynasm/dasm_mips64.lua diff --git a/dynasm/dasm_mips.h b/dynasm/dasm_mips.h index c10528fa..f3b43211 100644 --- a/dynasm/dasm_mips.h +++ b/dynasm/dasm_mips.h @@ -21,7 +21,7 @@ enum { /* The following actions need a buffer position. */ DASM_ALIGN, DASM_REL_LG, DASM_LABEL_LG, /* The following actions also have an argument. */ - DASM_REL_PC, DASM_LABEL_PC, DASM_IMM, + DASM_REL_PC, DASM_LABEL_PC, DASM_IMM, DASM_IMMS, DASM__MAX }; @@ -231,7 +231,7 @@ void dasm_put(Dst_DECL, int start, ...) *pl = -pos; /* Label exists now. */ b[pos++] = ofs; /* Store pass1 offset estimate. */ break; - case DASM_IMM: + case DASM_IMM: case DASM_IMMS: #ifdef DASM_CHECKS CK((n & ((1<<((ins>>10)&31))-1)) == 0, RANGE_I); #endif @@ -299,7 +299,7 @@ int dasm_link(Dst_DECL, size_t *szp) case DASM_ALIGN: ofs -= (b[pos++] + ofs) & (ins & 255); break; case DASM_REL_LG: case DASM_REL_PC: pos++; break; case DASM_LABEL_LG: case DASM_LABEL_PC: b[pos++] += ofs; break; - case DASM_IMM: pos++; break; + case DASM_IMM: case DASM_IMMS: pos++; break; } } stop: (void)0; @@ -356,7 +356,7 @@ int dasm_encode(Dst_DECL, void *buffer) if (ins & 2048) n = n - (int)((char *)cp - base); else - n = (n + (int)base) & 0x0fffffff; + n = (n + (int)(size_t)base) & 0x0fffffff; patchrel: CK((n & 3) == 0 && ((n + ((ins & 2048) ? 0x00020000 : 0)) >> @@ -367,6 +367,9 @@ int dasm_encode(Dst_DECL, void *buffer) ins &= 2047; if (ins >= 20) D->globals[ins-10] = (void *)(base + n); break; case DASM_LABEL_PC: break; + case DASM_IMMS: + cp[-1] |= ((n>>3) & 4); n &= 0x1f; + /* fallthrough */ case DASM_IMM: cp[-1] |= (n & ((1<<((ins>>5)&31))-1)) << (ins&31); break; diff --git a/dynasm/dasm_mips.lua b/dynasm/dasm_mips.lua index c5a5595c..c8010561 100644 --- a/dynasm/dasm_mips.lua +++ b/dynasm/dasm_mips.lua @@ -1,17 +1,19 @@ ------------------------------------------------------------------------------ --- DynASM MIPS module. +-- DynASM MIPS32/MIPS64 module. -- -- Copyright (C) 2005-2016 Mike Pall. All rights reserved. -- See dynasm.lua for full copyright notice. ------------------------------------------------------------------------------ +local mips64 = mips64 + -- Module information: local _info = { - arch = "mips", - description = "DynASM MIPS module", + arch = mips64 and "mips64" or "mips", + description = "DynASM MIPS32/MIPS64 module", version = "1.4.0", vernum = 10400, - release = "2015-10-18", + release = "2016-05-24", author = "Mike Pall", license = "MIT", } @@ -27,7 +29,8 @@ local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char local match, gmatch = _s.match, _s.gmatch local concat, sort = table.concat, table.sort local bit = bit or require("bit") -local band, shl, sar, tohex = bit.band, bit.lshift, bit.arshift, bit.tohex +local band, shl, shr, sar = bit.band, bit.lshift, bit.rshift, bit.arshift +local tohex = bit.tohex -- Inherited tables and callbacks. local g_opt, g_arch @@ -38,7 +41,7 @@ local wline, werror, wfatal, wwarn local action_names = { "STOP", "SECTION", "ESC", "REL_EXT", "ALIGN", "REL_LG", "LABEL_LG", - "REL_PC", "LABEL_PC", "IMM", + "REL_PC", "LABEL_PC", "IMM", "IMMS", } -- Maximum number of section buffer positions for dasm_put(). @@ -251,6 +254,10 @@ local map_op = { bnel_3 = "54000000STB", blezl_2 = "58000000SB", bgtzl_2 = "5c000000SB", + daddi_3 = mips64 and "60000000TSI", + daddiu_3 = mips64 and "64000000TSI", + ldl_2 = mips64 and "68000000TO", + ldr_2 = mips64 and "6c000000TO", lb_2 = "80000000TO", lh_2 = "84000000TO", lwl_2 = "88000000TO", @@ -258,23 +265,30 @@ local map_op = { lbu_2 = "90000000TO", lhu_2 = "94000000TO", lwr_2 = "98000000TO", + lwu_2 = mips64 and "9c000000TO", sb_2 = "a0000000TO", sh_2 = "a4000000TO", swl_2 = "a8000000TO", sw_2 = "ac000000TO", + sdl_2 = mips64 and "b0000000TO", + sdr_2 = mips64 and "b1000000TO", swr_2 = "b8000000TO", cache_2 = "bc000000NO", ll_2 = "c0000000TO", lwc1_2 = "c4000000HO", pref_2 = "cc000000NO", ldc1_2 = "d4000000HO", + ld_2 = mips64 and "dc000000TO", sc_2 = "e0000000TO", swc1_2 = "e4000000HO", + scd_2 = mips64 and "f0000000TO", sdc1_2 = "f4000000HO", + sd_2 = mips64 and "fc000000TO", -- Opcode SPECIAL. nop_0 = "00000000", sll_3 = "00000000DTA", + sextw_2 = "00000000DT", movf_2 = "00000001DS", movf_3 = "00000001DSC", movt_2 = "00010001DS", @@ -285,6 +299,7 @@ local map_op = { sllv_3 = "00000004DTS", srlv_3 = "00000006DTS", rotrv_3 = "00000046DTS", + drotrv_3 = mips64 and "00000056DTS", srav_3 = "00000007DTS", jr_1 = "00000008S", jalr_1 = "0000f809S", @@ -300,15 +315,22 @@ local map_op = { mthi_1 = "00000011S", mflo_1 = "00000012D", mtlo_1 = "00000013S", + dsllv_3 = mips64 and "00000014DTS", + dsrlv_3 = mips64 and "00000016DTS", + dsrav_3 = mips64 and "00000017DTS", mult_2 = "00000018ST", multu_2 = "00000019ST", div_2 = "0000001aST", divu_2 = "0000001bST", + dmult_2 = mips64 and "0000001cST", + dmultu_2 = mips64 and "0000001dST", + ddiv_2 = mips64 and "0000001eST", + ddivu_2 = mips64 and "0000001fST", add_3 = "00000020DST", - move_2 = "00000021DS", + move_2 = mips64 and "00000025DS" or "00000021DS", addu_3 = "00000021DST", sub_3 = "00000022DST", - negu_2 = "00000023DT", + negu_2 = mips64 and "0000002fDT" or "00000023DT", subu_3 = "00000023DST", and_3 = "00000024DST", or_3 = "00000025DST", @@ -317,6 +339,10 @@ local map_op = { nor_3 = "00000027DST", slt_3 = "0000002aDST", sltu_3 = "0000002bDST", + dadd_3 = mips64 and "0000002cDST", + daddu_3 = mips64 and "0000002dDST", + dsub_3 = mips64 and "0000002eDST", + dsubu_3 = mips64 and "0000002fDST", tge_2 = "00000030ST", tge_3 = "00000030STZ", tgeu_2 = "00000031ST", @@ -329,6 +355,14 @@ local map_op = { teq_3 = "00000034STZ", tne_2 = "00000036ST", tne_3 = "00000036STZ", + dsll_3 = mips64 and "00000038DTa", + dsrl_3 = mips64 and "0000003aDTa", + drotr_3 = mips64 and "0020003aDTa", + dsra_3 = mips64 and "0000003bDTa", + dsll32_3 = mips64 and "0000003cDTA", + dsrl32_3 = mips64 and "0000003eDTA", + drotr32_3 = mips64 and "0020003eDTA", + dsra32_3 = mips64 and "0000003fDTA", -- Opcode REGIMM. bltz_2 = "04000000SB", @@ -356,13 +390,24 @@ local map_op = { msubu_2 = "70000005ST", clz_2 = "70000020DS=", clo_2 = "70000021DS=", + dclz_2 = mips64 and "70000024DS=", + dclo_2 = mips64 and "70000025DS=", sdbbp_0 = "7000003f", sdbbp_1 = "7000003fY", -- Opcode SPECIAL3. ext_4 = "7c000000TSAM", -- Note: last arg is msbd = size-1 + dextm_4 = mips64 and "7c000001TSAM", -- Args: pos | size-1-32 + dextu_4 = mips64 and "7c000002TSAM", -- Args: pos-32 | size-1 + dext_4 = mips64 and "7c000003TSAM", -- Args: pos | size-1 + zextw_2 = mips64 and "7c00f803TS", ins_4 = "7c000004TSAM", -- Note: last arg is msb = pos+size-1 + dinsm_4 = mips64 and "7c000005TSAM", -- Args: pos | pos+size-33 + dinsu_4 = mips64 and "7c000006TSAM", -- Args: pos-32 | pos+size-33 + dins_4 = mips64 and "7c000007TSAM", -- Args: pos | pos+size-1 wsbh_2 = "7c0000a0DT", + dsbh_2 = mips64 and "7c0000a4DT", + dshd_2 = mips64 and "7c000164DT", seb_2 = "7c000420DT", seh_2 = "7c000620DT", rdhwr_2 = "7c00003bTD", @@ -370,8 +415,12 @@ local map_op = { -- Opcode COP0. mfc0_2 = "40000000TD", mfc0_3 = "40000000TDW", + dmfc0_2 = mips64 and "40200000TD", + dmfc0_3 = mips64 and "40200000TDW", mtc0_2 = "40800000TD", mtc0_3 = "40800000TDW", + dmtc0_2 = mips64 and "40a00000TD", + dmtc0_3 = mips64 and "40a00000TDW", rdpgpr_2 = "41400000DT", di_0 = "41606000", di_1 = "41606000T", @@ -388,9 +437,11 @@ local map_op = { -- Opcode COP1. mfc1_2 = "44000000TG", + dmfc1_2 = mips64 and "44200000TG", cfc1_2 = "44400000TG", mfhc1_2 = "44600000TG", mtc1_2 = "44800000TG", + dmtc1_2 = mips64 and "44a00000TG", ctc1_2 = "44c00000TG", mthc1_2 = "44e00000TG", @@ -633,7 +684,7 @@ local function parse_fpr(expr) werror("bad register name `"..expr.."'") end -local function parse_imm(imm, bits, shift, scale, signed) +local function parse_imm(imm, bits, shift, scale, signed, action) local n = tonumber(imm) if n then local m = sar(n, scale) @@ -651,7 +702,8 @@ local function parse_imm(imm, bits, shift, scale, signed) match(imm, "^([%w_]+):([rf][1-3]?[0-9])$") then werror("expected immediate operand, got register") else - waction("IMM", (signed and 32768 or 0)+scale*1024+bits*32+shift, imm) + waction(action or "IMM", + (signed and 32768 or 0)+shl(scale, 10)+shl(bits, 5)+shift, imm) return 0 end end @@ -763,6 +815,9 @@ map_op[".template__"] = function(params, template, nparams) n = n + 1 elseif p == "A" then op = op + parse_imm(params[n], 5, 6, 0, false); n = n + 1 + elseif p == "a" then + local m = parse_imm(params[n], 6, 6, 0, false, "IMMS"); n = n + 1 + op = op + band(m, 0x7c0) + band(shr(m, 9), 4) elseif p == "M" then op = op + parse_imm(params[n], 5, 11, 0, false); n = n + 1 elseif p == "N" then diff --git a/dynasm/dasm_mips64.lua b/dynasm/dasm_mips64.lua new file mode 100644 index 00000000..94f21921 --- /dev/null +++ b/dynasm/dasm_mips64.lua @@ -0,0 +1,12 @@ +------------------------------------------------------------------------------ +-- DynASM MIPS64 module. +-- +-- Copyright (C) 2005-2016 Mike Pall. All rights reserved. +-- See dynasm.lua for full copyright notice. +------------------------------------------------------------------------------ +-- This module just sets 64 bit mode for the combined MIPS/MIPS64 module. +-- All the interesting stuff is there. +------------------------------------------------------------------------------ + +mips64 = true -- Using a global is an ugly, but effective solution. +return require("dasm_mips") From d9986fbadb6c50b826e39e5f690bcf0b4cd6a20b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 28 May 2016 05:10:55 +0200 Subject: [PATCH 37/57] MIPS64, part 1: Add MIPS64 support to interpreter. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. Sponsored by Cisco Systems, Inc. --- doc/ext_jit.html | 2 +- doc/install.html | 13 +- doc/luajit.html | 4 +- src/Makefile | 6 +- src/jit/dis_mips.lua | 2 +- src/lib_jit.c | 6 +- src/lj_arch.h | 48 +- src/lj_asm_mips.h | 8 +- src/lj_ccall.c | 159 +- src/lj_ccall.h | 20 +- src/lj_ccallback.c | 70 +- src/lj_dispatch.c | 2 +- src/lj_emit_mips.h | 2 +- src/lj_frame.h | 25 +- src/lj_ircall.h | 4 +- src/lj_jit.h | 8 +- src/lj_target_mips.h | 45 +- src/vm_mips.dasc | 2 +- src/vm_mips64.dasc | 4849 ++++++++++++++++++++++++++++++++++++++++++ 19 files changed, 5213 insertions(+), 62 deletions(-) create mode 100644 src/vm_mips64.dasc diff --git a/doc/ext_jit.html b/doc/ext_jit.html index 9489c6f5..86098019 100644 --- a/doc/ext_jit.html +++ b/doc/ext_jit.html @@ -153,7 +153,7 @@ Contains the target OS name:

    jit.arch

    Contains the target architecture name: -"x86", "x64", "arm", "ppc", or "mips". +"x86", "x64", "arm", "arm64", "ppc", "mips" or "mips64".

    jit.opt.* — JIT compiler optimization control

    diff --git a/doc/install.html b/doc/install.html index 9963010d..38c9a6bb 100644 --- a/doc/install.html +++ b/doc/install.html @@ -148,7 +148,7 @@ operating systems, CPUs and compilers: XEDK (Xbox 360) -MIPS +MIPS32
    MIPS64
    GCC 4.3+ GCC 4.3+   @@ -386,7 +386,7 @@ important to compile with the proper CPU or architecture settings:
  • The best way to get consistent results is to specify the correct settings when building the toolchain yourself.
  • For a pre-built, generic toolchain add -mcpu=... or -march=... and other necessary flags to TARGET_CFLAGS.
  • For ARM it's important to have the correct -mfloat-abi=... setting, too. Otherwise LuaJIT may not run at the full performance of your target CPU.
  • -
  • For MIPS it's important to select a supported ABI (o32 on MIPS32) and consistently compile your project either with hard-float or soft-float compiler settings. Do not use -mips16.
  • +
  • For MIPS it's important to select a supported ABI (o32 on MIPS32, n64 on MIPS64) and consistently compile your project either with hard-float or soft-float compiler settings. Do not use -mips16.
  • Here are some examples for targets with a different CPU than the host: @@ -409,10 +409,15 @@ make CROSS=aarch64-linux- # PPC make HOST_CC="gcc -m32" CROSS=powerpc-linux-gnu- -# MIPS big-endian +# MIPS32 big-endian make HOST_CC="gcc -m32" CROSS=mips-linux- -# MIPS little-endian +# MIPS32 little-endian make HOST_CC="gcc -m32" CROSS=mipsel-linux- + +# MIPS64 big-endian +make CROSS=mips-linux- TARGET_CFLAGS="-mips64r2 -mabi=64" +# MIPS64 little-endian +make CROSS=mipsel-linux- TARGET_CFLAGS="-mips64r2 -mabi=64"

    You can cross-compile for Android using the Android NDK. diff --git a/doc/luajit.html b/doc/luajit.html index c7a92b86..44a7b8a1 100644 --- a/doc/luajit.html +++ b/doc/luajit.html @@ -169,10 +169,10 @@ LuaJIT is Copyright © 2005-2016 Mike Pall, released under the PS3PS4PS VitaXbox 360Xbox One - +
    GCCCLANG
    LLVM
    MSVC
    GCCClang
    LLVM
    MSVC
    - +
    x86x64ARMARM64PPCMIPS
    x86
    x64
    ARM
    ARM64
    PPCMIPS32
    MIPS64
    diff --git a/src/Makefile b/src/Makefile index 3f25192b..16aeba19 100644 --- a/src/Makefile +++ b/src/Makefile @@ -257,7 +257,11 @@ ifneq (,$(findstring LJ_TARGET_MIPS ,$(TARGET_TESTARCH))) ifneq (,$(findstring MIPSEL ,$(TARGET_TESTARCH))) TARGET_ARCH= -D__MIPSEL__=1 endif - TARGET_LJARCH= mips + ifneq (,$(findstring LJ_TARGET_MIPS64 ,$(TARGET_TESTARCH))) + TARGET_LJARCH= mips64 + else + TARGET_LJARCH= mips + endif else $(error Unsupported target architecture) endif diff --git a/src/jit/dis_mips.lua b/src/jit/dis_mips.lua index 2bf8b389..19327d6f 100644 --- a/src/jit/dis_mips.lua +++ b/src/jit/dis_mips.lua @@ -38,7 +38,7 @@ local map_special = { "multST", "multuST", "divST", "divuST", false, false, false, false, "addDST", "addu|moveDST0", "subDST", "subu|neguDS0T", - "andDST", "orDST", "xorDST", "nor|notDST0", + "andDST", "or|moveDST0", "xorDST", "nor|notDST0", false, false, "sltDST", "sltuDST", false, false, false, false, "tgeSTZ", "tgeuSTZ", "tltSTZ", "tltuSTZ", diff --git a/src/lib_jit.c b/src/lib_jit.c index c6330c49..1655f0c5 100644 --- a/src/lib_jit.c +++ b/src/lib_jit.c @@ -715,15 +715,15 @@ static uint32_t jit_cpudetect(lua_State *L) #if LJ_HASJIT /* Compile-time MIPS CPU detection. */ #if LJ_ARCH_VERSION >= 20 - flags |= JIT_F_MIPS32R2; + flags |= JIT_F_MIPSXXR2; #endif /* Runtime MIPS CPU detection. */ #if defined(__GNUC__) - if (!(flags & JIT_F_MIPS32R2)) { + if (!(flags & JIT_F_MIPSXXR2)) { int x; /* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */ __asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2"); - if (x) flags |= JIT_F_MIPS32R2; /* Either 0x80000000 (R2) or 0 (R1). */ + if (x) flags |= JIT_F_MIPSXXR2; /* Either 0x80000000 (R2) or 0 (R1). */ } #endif #endif diff --git a/src/lj_arch.h b/src/lj_arch.h index 3c3c98b1..bbcdc739 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -25,6 +25,10 @@ #define LUAJIT_ARCH_ppc 5 #define LUAJIT_ARCH_MIPS 6 #define LUAJIT_ARCH_mips 6 +#define LUAJIT_ARCH_MIPS32 6 +#define LUAJIT_ARCH_mips32 6 +#define LUAJIT_ARCH_MIPS64 7 +#define LUAJIT_ARCH_mips64 7 /* Target OS. */ #define LUAJIT_OS_OTHER 0 @@ -47,8 +51,10 @@ #define LUAJIT_TARGET LUAJIT_ARCH_ARM64 #elif defined(__ppc__) || defined(__ppc) || defined(__PPC__) || defined(__PPC) || defined(__powerpc__) || defined(__powerpc) || defined(__POWERPC__) || defined(__POWERPC) || defined(_M_PPC) #define LUAJIT_TARGET LUAJIT_ARCH_PPC +#elif defined(__mips64__) || defined(__mips64) || defined(__MIPS64__) || defined(__MIPS64) +#define LUAJIT_TARGET LUAJIT_ARCH_MIPS64 #elif defined(__mips__) || defined(__mips) || defined(__MIPS__) || defined(__MIPS) -#define LUAJIT_TARGET LUAJIT_ARCH_MIPS +#define LUAJIT_TARGET LUAJIT_ARCH_MIPS32 #else #error "No support for this architecture (yet)" #endif @@ -289,13 +295,21 @@ #define LJ_ARCH_XENON 1 #endif -#elif LUAJIT_TARGET == LUAJIT_ARCH_MIPS +#elif LUAJIT_TARGET == LUAJIT_ARCH_MIPS32 || LUAJIT_TARGET == LUAJIT_ARCH_MIPS64 #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) +#if LUAJIT_TARGET == LUAJIT_ARCH_MIPS32 #define LJ_ARCH_NAME "mipsel" +#else +#define LJ_ARCH_NAME "mips64el" +#endif #define LJ_ARCH_ENDIAN LUAJIT_LE #else +#if LUAJIT_TARGET == LUAJIT_ARCH_MIPS32 #define LJ_ARCH_NAME "mips" +#else +#define LJ_ARCH_NAME "mips64" +#endif #define LJ_ARCH_ENDIAN LUAJIT_BE #endif @@ -307,11 +321,6 @@ #endif #endif -/* Temporarily disable features until the code has been merged. */ -#if !defined(LUAJIT_NO_UNWIND) && __GNU_COMPACT_EH__ -#define LUAJIT_NO_UNWIND 1 -#endif - #if !defined(LJ_ABI_SOFTFP) #ifdef __mips_soft_float #define LJ_ABI_SOFTFP 1 @@ -320,7 +329,15 @@ #endif #endif +#if LUAJIT_TARGET == LUAJIT_ARCH_MIPS32 #define LJ_ARCH_BITS 32 +#define LJ_TARGET_MIPS32 1 +#else +#define LJ_ARCH_BITS 64 +#define LJ_TARGET_MIPS64 1 +#define LJ_TARGET_GC64 1 +#define LJ_ARCH_NOJIT 1 /* NYI */ +#endif #define LJ_TARGET_MIPS 1 #define LJ_TARGET_EHRETREG 4 #define LJ_TARGET_JUMPRANGE 27 /* 2*2^27 = 256MB-aligned region */ @@ -329,7 +346,7 @@ #define LJ_TARGET_UNIFYROT 2 /* Want only IR_BROR. */ #define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL -#if _MIPS_ARCH_MIPS32R2 +#if _MIPS_ARCH_MIPS32R2 || _MIPS_ARCH_MIPS64R2 #define LJ_ARCH_VERSION 20 #else #define LJ_ARCH_VERSION 10 @@ -410,9 +427,13 @@ #ifdef __NO_FPRS__ #error "No support for PPC/e500 anymore (use LuaJIT 2.0)" #endif -#elif LJ_TARGET_MIPS -#if defined(_LP64) -#error "No support for MIPS64" +#elif LJ_TARGET_MIPS32 +#if _MIPS_SIM != _MIPS_SIM_ABI32 +#error "Only o32 ABI supported for MIPS32" +#endif +#elif LJ_TARGET_MIPS64 +#if _MIPS_SIM != _MIPS_SIM_ABI64 +#error "Only n64 ABI supported for MIPS64" #endif #endif #endif @@ -524,6 +545,11 @@ #define LJ_NO_SYSTEM 1 #endif +#if !defined(LUAJIT_NO_UNWIND) && __GNU_COMPACT_EH__ +/* NYI: no support for compact unwind specification, yet. */ +#define LUAJIT_NO_UNWIND 1 +#endif + #if defined(LUAJIT_NO_UNWIND) || defined(__symbian__) || LJ_TARGET_IOS || LJ_TARGET_PS3 || LJ_TARGET_PS4 #define LJ_NO_UNWIND 1 #endif diff --git a/src/lj_asm_mips.h b/src/lj_asm_mips.h index 6094be68..cf446346 100644 --- a/src/lj_asm_mips.h +++ b/src/lj_asm_mips.h @@ -510,7 +510,7 @@ static void asm_conv(ASMState *as, IRIns *ir) Reg left = ra_alloc1(as, ir->op1, RSET_GPR); lua_assert(irt_isint(ir->t) || irt_isu32(ir->t)); if ((ir->op2 & IRCONV_SEXT)) { - if ((as->flags & JIT_F_MIPS32R2)) { + if ((as->flags & JIT_F_MIPSXXR2)) { emit_dst(as, st == IRT_I8 ? MIPSI_SEB : MIPSI_SEH, dest, 0, left); } else { uint32_t shift = st == IRT_I8 ? 24 : 16; @@ -739,7 +739,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) emit_dst(as, MIPSI_SUBU, tmp2, tmp2, dest); if (LJ_SOFTFP ? (irkey[1].o == IR_HIOP) : irt_isnum(kt)) { emit_dst(as, MIPSI_XOR, tmp2, tmp2, tmp1); - if ((as->flags & JIT_F_MIPS32R2)) { + if ((as->flags & JIT_F_MIPSXXR2)) { emit_dta(as, MIPSI_ROTR, dest, tmp1, (-HASH_ROT1)&31); } else { emit_dst(as, MIPSI_OR, dest, dest, tmp1); @@ -1457,7 +1457,7 @@ static void asm_bswap(ASMState *as, IRIns *ir) { Reg dest = ra_dest(as, ir, RSET_GPR); Reg left = ra_alloc1(as, ir->op1, RSET_GPR); - if ((as->flags & JIT_F_MIPS32R2)) { + if ((as->flags & JIT_F_MIPSXXR2)) { emit_dta(as, MIPSI_ROTR, dest, RID_TMP, 16); emit_dst(as, MIPSI_WSBH, RID_TMP, 0, left); } else { @@ -1513,7 +1513,7 @@ static void asm_bitshift(ASMState *as, IRIns *ir, MIPSIns mi, MIPSIns mik) static void asm_bror(ASMState *as, IRIns *ir) { - if ((as->flags & JIT_F_MIPS32R2)) { + if ((as->flags & JIT_F_MIPSXXR2)) { asm_bitshift(as, ir, MIPSI_ROTRV, MIPSI_ROTR); } else { Reg dest = ra_dest(as, ir, RSET_GPR); diff --git a/src/lj_ccall.c b/src/lj_ccall.c index eb7c1ee0..b599be33 100644 --- a/src/lj_ccall.c +++ b/src/lj_ccall.c @@ -407,8 +407,8 @@ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ ctr = ctype_get(cts, CTID_DOUBLE); /* FPRs always hold doubles. */ -#elif LJ_TARGET_MIPS -/* -- MIPS calling conventions -------------------------------------------- */ +#elif LJ_TARGET_MIPS32 +/* -- MIPS o32 calling conventions ---------------------------------------- */ #define CCALL_HANDLE_STRUCTRET \ cc->retref = 1; /* Return all structs by reference. */ \ @@ -483,6 +483,78 @@ sp = (uint8_t *)&cc->fpr[0].f; #endif +#elif LJ_TARGET_MIPS64 +/* -- MIPS n64 calling conventions ---------------------------------------- */ + +#define CCALL_HANDLE_STRUCTRET \ + cc->retref = !(sz <= 16); \ + if (cc->retref) cc->gpr[ngpr++] = (GPRArg)dp; + +#define CCALL_HANDLE_STRUCTRET2 \ + ccall_copy_struct(cc, ctr, dp, sp, ccall_classify_struct(cts, ctr, ct)); + +#define CCALL_HANDLE_COMPLEXRET \ + /* Complex values are returned in 1 or 2 FPRs. */ \ + cc->retref = 0; + +#if LJ_ABI_SOFTFP /* MIPS64 soft-float */ + +#define CCALL_HANDLE_COMPLEXRET2 \ + if (ctr->size == 2*sizeof(float)) { /* Copy complex float from GPRs. */ \ + ((intptr_t *)dp)[0] = cc->gpr[0]; \ + } else { /* Copy complex double from GPRs. */ \ + ((intptr_t *)dp)[0] = cc->gpr[0]; \ + ((intptr_t *)dp)[1] = cc->gpr[1]; \ + } + +#define CCALL_HANDLE_COMPLEXARG \ + /* Pass complex by value in 2 or 4 GPRs. */ + +/* Position of soft-float 'float' return value depends on endianess. */ +#define CCALL_HANDLE_RET \ + if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ + sp = (uint8_t *)cc->gpr + LJ_ENDIAN_SELECT(0, 4); + +#else /* MIPS64 hard-float */ + +#define CCALL_HANDLE_COMPLEXRET2 \ + if (ctr->size == 2*sizeof(float)) { /* Copy complex float from FPRs. */ \ + ((float *)dp)[0] = cc->fpr[0].f; \ + ((float *)dp)[1] = cc->fpr[1].f; \ + } else { /* Copy complex double from FPRs. */ \ + ((double *)dp)[0] = cc->fpr[0].d; \ + ((double *)dp)[1] = cc->fpr[1].d; \ + } + +#define CCALL_HANDLE_COMPLEXARG \ + if (sz == 2*sizeof(float)) { \ + isfp = 2; \ + if (ngpr < maxgpr) \ + sz *= 2; \ + } + +#define CCALL_HANDLE_RET \ + if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ + sp = (uint8_t *)&cc->fpr[0].f; + +#endif + +#define CCALL_HANDLE_STRUCTARG \ + /* Pass all structs by value in registers and/or on the stack. */ + +#define CCALL_HANDLE_REGARG \ + if (ngpr < maxgpr) { \ + dp = &cc->gpr[ngpr]; \ + if (ngpr + n > maxgpr) { \ + nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \ + if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \ + ngpr = maxgpr; \ + } else { \ + ngpr += n; \ + } \ + goto done; \ + } + #else #error "Missing calling convention definitions for this architecture" #endif @@ -722,6 +794,78 @@ noth: /* Not a homogeneous float/double aggregate. */ #endif +/* -- MIPS64 ABI struct classification ---------------------------- */ + +#if LJ_TARGET_MIPS64 + +#define FTYPE_FLOAT 1 +#define FTYPE_DOUBLE 2 + +/* Classify FP fields (max. 2) and their types. */ +static unsigned int ccall_classify_struct(CTState *cts, CType *ct, CType *ctf) +{ + int n = 0, ft = 0; + if ((ctf->info & CTF_VARARG) || (ct->info & CTF_UNION)) + goto noth; + while (ct->sib) { + CType *sct; + ct = ctype_get(cts, ct->sib); + if (n == 2) { + goto noth; + } else if (ctype_isfield(ct->info)) { + sct = ctype_rawchild(cts, ct); + if (ctype_isfp(sct->info)) { + ft |= (sct->size == 4 ? FTYPE_FLOAT : FTYPE_DOUBLE) << 2*n; + n++; + } else { + goto noth; + } + } else if (ctype_isbitfield(ct->info) || + ctype_isxattrib(ct->info, CTA_SUBTYPE)) { + goto noth; + } + } + if (n <= 2) + return ft; +noth: /* Not a homogeneous float/double aggregate. */ + return 0; /* Struct is in GPRs. */ +} + +void ccall_copy_struct(CCallState *cc, CType *ctr, void *dp, void *sp, int ft) +{ + if (LJ_ABI_SOFTFP ? ft : + ((ft & 3) == FTYPE_FLOAT || (ft >> 2) == FTYPE_FLOAT)) { + int i, ofs = 0; + for (i = 0; ft != 0; i++, ft >>= 2) { + if ((ft & 3) == FTYPE_FLOAT) { +#if LJ_ABI_SOFTFP + /* The 2nd FP struct result is in CARG1 (gpr[2]) and not CRET2. */ + memcpy((uint8_t *)dp + ofs, + (uint8_t *)&cc->gpr[2*i] + LJ_ENDIAN_SELECT(0, 4), 4); +#else + *(float *)((uint8_t *)dp + ofs) = cc->fpr[i].f; +#endif + ofs += 4; + } else { + ofs = (ofs + 7) & ~7; /* 64 bit alignment. */ +#if LJ_ABI_SOFTFP + *(intptr_t *)((uint8_t *)dp + ofs) = cc->gpr[2*i]; +#else + *(double *)((uint8_t *)dp + ofs) = cc->fpr[i].d; +#endif + ofs += 8; + } + } + } else { +#if !LJ_ABI_SOFTFP + if (ft) sp = (uint8_t *)&cc->fpr[0]; +#endif + memcpy(dp, sp, ctr->size); + } +} + +#endif + /* -- Common C call handling ---------------------------------------------- */ /* Infer the destination CTypeID for a vararg argument. */ @@ -889,6 +1033,12 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, *(int32_t *)dp = d->size == 1 ? (int32_t)*(int8_t *)dp : (int32_t)*(int16_t *)dp; } +#if LJ_TARGET_MIPS64 + if ((ctype_isinteger_or_bool(d->info) || ctype_isenum(d->info) || + (isfp && nsp == 0)) && d->size <= 4) { + *(int64_t *)dp = (int64_t)*(int32_t *)dp; /* Sign-extend to 64 bit. */ + } +#endif #if LJ_TARGET_X64 && LJ_ABI_WIN if (isva) { /* Windows/x64 mirrors varargs in both register sets. */ if (nfpr == ngpr) @@ -904,7 +1054,7 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, cc->fpr[nfpr-1].d[0] = cc->fpr[nfpr-2].d[1]; /* Split complex double. */ cc->fpr[nfpr-2].d[1] = 0; } -#elif LJ_TARGET_ARM64 +#elif LJ_TARGET_ARM64 || (LJ_TARGET_MIPS64 && !LJ_ABI_SOFTFP) if (isfp == 2 && (uint8_t *)dp < (uint8_t *)cc->stack) { /* Split float HFA or complex float into separate registers. */ CTSize i = (sz >> 2) - 1; @@ -951,7 +1101,8 @@ static int ccall_get_results(lua_State *L, CTState *cts, CType *ct, CCALL_HANDLE_COMPLEXRET2 return 1; /* One GC step. */ } - if (LJ_BE && ctype_isinteger_or_bool(ctr->info) && ctr->size < CTSIZE_PTR) + if (LJ_BE && ctr->size < CTSIZE_PTR && + (ctype_isinteger_or_bool(ctr->info) || ctype_isenum(ctr->info))) sp += (CTSIZE_PTR - ctr->size); #if CCALL_NUM_FPR if (ctype_isfp(ctr->info) || ctype_isvector(ctr->info)) diff --git a/src/lj_ccall.h b/src/lj_ccall.h index a77afae3..d97227a6 100644 --- a/src/lj_ccall.h +++ b/src/lj_ccall.h @@ -95,11 +95,11 @@ typedef union FPRArg { typedef intptr_t GPRArg; typedef double FPRArg; -#elif LJ_TARGET_MIPS +#elif LJ_TARGET_MIPS32 #define CCALL_NARG_GPR 4 #define CCALL_NARG_FPR (LJ_ABI_SOFTFP ? 0 : 2) -#define CCALL_NRET_GPR 2 +#define CCALL_NRET_GPR (LJ_ABI_SOFTFP ? 4 : 2) #define CCALL_NRET_FPR (LJ_ABI_SOFTFP ? 0 : 2) #define CCALL_SPS_EXTRA 7 #define CCALL_SPS_FREE 1 @@ -110,6 +110,22 @@ typedef union FPRArg { struct { LJ_ENDIAN_LOHI(float f; , float g;) }; } FPRArg; +#elif LJ_TARGET_MIPS64 + +/* FP args are positional and overlay the GPR array. */ +#define CCALL_NARG_GPR 8 +#define CCALL_NARG_FPR 0 +#define CCALL_NRET_GPR 2 +#define CCALL_NRET_FPR (LJ_ABI_SOFTFP ? 0 : 2) +#define CCALL_SPS_EXTRA 3 +#define CCALL_SPS_FREE 1 + +typedef intptr_t GPRArg; +typedef union FPRArg { + double d; + struct { LJ_ENDIAN_LOHI(float f; , float g;) }; +} FPRArg; + #else #error "Missing calling convention definitions for this architecture" #endif diff --git a/src/lj_ccallback.c b/src/lj_ccallback.c index 43c29e41..2ca6406c 100644 --- a/src/lj_ccallback.c +++ b/src/lj_ccallback.c @@ -63,9 +63,13 @@ static MSize CALLBACK_OFS2SLOT(MSize ofs) #define CALLBACK_MCODE_HEAD 24 -#elif LJ_TARGET_MIPS +#elif LJ_TARGET_MIPS32 -#define CALLBACK_MCODE_HEAD 24 +#define CALLBACK_MCODE_HEAD 20 + +#elif LJ_TARGET_MIPS64 + +#define CALLBACK_MCODE_HEAD 52 #else @@ -206,14 +210,27 @@ static void callback_mcode_init(global_State *g, uint32_t *page) static void callback_mcode_init(global_State *g, uint32_t *page) { uint32_t *p = page; - void *target = (void *)lj_vm_ffi_callback; + uintptr_t target = (uintptr_t)(void *)lj_vm_ffi_callback; + uintptr_t ug = (uintptr_t)(void *)g; MSize slot; - *p++ = MIPSI_SW | MIPSF_T(RID_R1)|MIPSF_S(RID_SP) | 0; - *p++ = MIPSI_LUI | MIPSF_T(RID_R3) | (u32ptr(target) >> 16); - *p++ = MIPSI_LUI | MIPSF_T(RID_R2) | (u32ptr(g) >> 16); - *p++ = MIPSI_ORI | MIPSF_T(RID_R3)|MIPSF_S(RID_R3) |(u32ptr(target)&0xffff); +#if LJ_TARGET_MIPS32 + *p++ = MIPSI_LUI | MIPSF_T(RID_R3) | (target >> 16); + *p++ = MIPSI_LUI | MIPSF_T(RID_R2) | (ug >> 16); +#else + *p++ = MIPSI_LUI | MIPSF_T(RID_R3) | (target >> 48); + *p++ = MIPSI_LUI | MIPSF_T(RID_R2) | (ug >> 48); + *p++ = MIPSI_ORI | MIPSF_T(RID_R3)|MIPSF_S(RID_R3) | ((target >> 32) & 0xffff); + *p++ = MIPSI_ORI | MIPSF_T(RID_R2)|MIPSF_S(RID_R2) | ((ug >> 32) & 0xffff); + *p++ = MIPSI_DSLL | MIPSF_D(RID_R3)|MIPSF_T(RID_R3) | MIPSF_A(16); + *p++ = MIPSI_DSLL | MIPSF_D(RID_R2)|MIPSF_T(RID_R2) | MIPSF_A(16); + *p++ = MIPSI_ORI | MIPSF_T(RID_R3)|MIPSF_S(RID_R3) | ((target >> 16) & 0xffff); + *p++ = MIPSI_ORI | MIPSF_T(RID_R2)|MIPSF_S(RID_R2) | ((ug >> 16) & 0xffff); + *p++ = MIPSI_DSLL | MIPSF_D(RID_R3)|MIPSF_T(RID_R3) | MIPSF_A(16); + *p++ = MIPSI_DSLL | MIPSF_D(RID_R2)|MIPSF_T(RID_R2) | MIPSF_A(16); +#endif + *p++ = MIPSI_ORI | MIPSF_T(RID_R3)|MIPSF_S(RID_R3) | (target & 0xffff); *p++ = MIPSI_JR | MIPSF_S(RID_R3); - *p++ = MIPSI_ORI | MIPSF_T(RID_R2)|MIPSF_S(RID_R2) | (u32ptr(g)&0xffff); + *p++ = MIPSI_ORI | MIPSF_T(RID_R2)|MIPSF_S(RID_R2) | (ug & 0xffff); for (slot = 0; slot < CALLBACK_MAX_SLOT; slot++) { *p = MIPSI_B | ((page-p-1) & 0x0000ffffu); p++; @@ -425,7 +442,7 @@ void lj_ccallback_mcode_free(CTState *cts) if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ *(double *)dp = *(float *)dp; /* FPRs always hold doubles. */ -#elif LJ_TARGET_MIPS +#elif LJ_TARGET_MIPS32 #define CALLBACK_HANDLE_GPR \ if (n > 1) ngpr = (ngpr + 1u) & ~1u; /* Align to regpair. */ \ @@ -451,6 +468,29 @@ void lj_ccallback_mcode_free(CTState *cts) UNUSED(isfp); #endif +#define CALLBACK_HANDLE_RET \ + if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ + ((float *)dp)[1] = *(float *)dp; + +#elif LJ_TARGET_MIPS64 + +#if !LJ_ABI_SOFTFP /* MIPS64 hard-float */ +#define CALLBACK_HANDLE_REGARG \ + if (ngpr + n <= maxgpr) { \ + sp = isfp ? (void*) &cts->cb.fpr[ngpr] : (void*) &cts->cb.gpr[ngpr]; \ + ngpr += n; \ + goto done; \ + } +#else /* MIPS64 soft-float */ +#define CALLBACK_HANDLE_REGARG \ + if (ngpr + n <= maxgpr) { \ + UNUSED(isfp); \ + sp = (void*) &cts->cb.gpr[ngpr]; \ + ngpr += n; \ + goto done; \ + } +#endif + #define CALLBACK_HANDLE_RET \ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ ((float *)dp)[1] = *(float *)dp; @@ -542,7 +582,11 @@ static void callback_conv_args(CTState *cts, lua_State *L) nsp += n; done: - if (LJ_BE && cta->size < CTSIZE_PTR) + if (LJ_BE && cta->size < CTSIZE_PTR +#if LJ_TARGET_MIPS64 + && !(isfp && nsp) +#endif + ) sp = (void *)((uint8_t *)sp + CTSIZE_PTR-cta->size); gcsteps += lj_cconv_tv_ct(cts, cta, 0, o++, sp); } @@ -593,6 +637,12 @@ static void callback_conv_result(CTState *cts, lua_State *L, TValue *o) *(int32_t *)dp = ctr->size == 1 ? (int32_t)*(int8_t *)dp : (int32_t)*(int16_t *)dp; } +#if LJ_TARGET_MIPS64 + /* Always sign-extend results to 64 bits. Even a soft-fp 'float'. */ + if (ctr->size <= 4 && + (LJ_ABI_SOFTFP || ctype_isinteger_or_bool(ctr->info))) + *(int64_t *)dp = (int64_t)*(int32_t *)dp; +#endif #if LJ_TARGET_X86 if (ctype_isfp(ctr->info)) cts->cb.gpr[2] = ctr->size == sizeof(float) ? 1 : 2; diff --git a/src/lj_dispatch.c b/src/lj_dispatch.c index 36b920ad..e5aa495d 100644 --- a/src/lj_dispatch.c +++ b/src/lj_dispatch.c @@ -75,7 +75,7 @@ void lj_dispatch_init(GG_State *GG) for (i = 0; i < GG_NUM_ASMFF; i++) GG->bcff[i] = BCINS_AD(BC__MAX+i, 0, 0); #if LJ_TARGET_MIPS - memcpy(GG->got, dispatch_got, LJ_GOT__MAX*4); + memcpy(GG->got, dispatch_got, LJ_GOT__MAX*sizeof(ASMFunction *)); #endif } diff --git a/src/lj_emit_mips.h b/src/lj_emit_mips.h index 29079ea3..9df04771 100644 --- a/src/lj_emit_mips.h +++ b/src/lj_emit_mips.h @@ -35,7 +35,7 @@ static void emit_fgh(ASMState *as, MIPSIns mi, Reg rf, Reg rg, Reg rh) static void emit_rotr(ASMState *as, Reg dest, Reg src, Reg tmp, uint32_t shift) { - if ((as->flags & JIT_F_MIPS32R2)) { + if ((as->flags & JIT_F_MIPSXXR2)) { emit_dta(as, MIPSI_ROTR, dest, src, shift); } else { emit_dst(as, MIPSI_OR, dest, dest, tmp); diff --git a/src/lj_frame.h b/src/lj_frame.h index db2e4da1..d8d8cff2 100644 --- a/src/lj_frame.h +++ b/src/lj_frame.h @@ -229,26 +229,41 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK }; /* Special continuations. */ #define CFRAME_SIZE 272 #define CFRAME_SHIFT_MULTRES 3 #endif -#elif LJ_TARGET_MIPS +#elif LJ_TARGET_MIPS32 #if LJ_ARCH_HASFPU #define CFRAME_OFS_ERRF 124 #define CFRAME_OFS_NRES 120 #define CFRAME_OFS_PREV 116 #define CFRAME_OFS_L 112 -#define CFRAME_OFS_PC 20 -#define CFRAME_OFS_MULTRES 16 #define CFRAME_SIZE 112 -#define CFRAME_SHIFT_MULTRES 3 #else #define CFRAME_OFS_ERRF 76 #define CFRAME_OFS_NRES 72 #define CFRAME_OFS_PREV 68 #define CFRAME_OFS_L 64 +#define CFRAME_SIZE 64 +#endif #define CFRAME_OFS_PC 20 #define CFRAME_OFS_MULTRES 16 -#define CFRAME_SIZE 64 #define CFRAME_SHIFT_MULTRES 3 +#elif LJ_TARGET_MIPS64 +#if LJ_ARCH_HASFPU +#define CFRAME_OFS_ERRF 188 +#define CFRAME_OFS_NRES 184 +#define CFRAME_OFS_PREV 176 +#define CFRAME_OFS_L 168 +#define CFRAME_OFS_PC 160 +#define CFRAME_SIZE 192 +#else +#define CFRAME_OFS_ERRF 124 +#define CFRAME_OFS_NRES 120 +#define CFRAME_OFS_PREV 112 +#define CFRAME_OFS_L 104 +#define CFRAME_OFS_PC 96 +#define CFRAME_SIZE 128 #endif +#define CFRAME_OFS_MULTRES 0 +#define CFRAME_SHIFT_MULTRES 3 #else #error "Missing CFRAME_* definitions for this architecture" #endif diff --git a/src/lj_ircall.h b/src/lj_ircall.h index f2a6ecec..c7cd4257 100644 --- a/src/lj_ircall.h +++ b/src/lj_ircall.h @@ -78,13 +78,13 @@ typedef struct CCallInfo { #define IRCALLCOND_SOFTFP_FFI(x) NULL #endif -#if LJ_SOFTFP && LJ_TARGET_MIPS +#if LJ_SOFTFP && LJ_TARGET_MIPS32 #define IRCALLCOND_SOFTFP_MIPS(x) x #else #define IRCALLCOND_SOFTFP_MIPS(x) NULL #endif -#define LJ_NEED_FP64 (LJ_TARGET_ARM || LJ_TARGET_PPC || LJ_TARGET_MIPS) +#define LJ_NEED_FP64 (LJ_TARGET_ARM || LJ_TARGET_PPC || LJ_TARGET_MIPS32) #if LJ_HASFFI && (LJ_SOFTFP || LJ_NEED_FP64) #define IRCALLCOND_FP64_FFI(x) x diff --git a/src/lj_jit.h b/src/lj_jit.h index f460a0ab..3505c63f 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -46,12 +46,16 @@ #define JIT_F_CPU_FIRST JIT_F_SQRT #define JIT_F_CPUSTRING "\4SQRT\5ROUND" #elif LJ_TARGET_MIPS -#define JIT_F_MIPS32R2 0x00000010 +#define JIT_F_MIPSXXR2 0x00000010 /* Names for the CPU-specific flags. Must match the order above. */ -#define JIT_F_CPU_FIRST JIT_F_MIPS32R2 +#define JIT_F_CPU_FIRST JIT_F_MIPSXXR2 +#if LJ_TARGET_MIPS32 #define JIT_F_CPUSTRING "\010MIPS32R2" #else +#define JIT_F_CPUSTRING "\010MIPS64R2" +#endif +#else #define JIT_F_CPU_FIRST 0 #define JIT_F_CPUSTRING "" #endif diff --git a/src/lj_target_mips.h b/src/lj_target_mips.h index bafa817a..ac72528d 100644 --- a/src/lj_target_mips.h +++ b/src/lj_target_mips.h @@ -82,11 +82,15 @@ enum { #if LJ_SOFTFP #define RSET_FPR 0 #else +#if LJ_32 #define RSET_FPR \ (RID2RSET(RID_F0)|RID2RSET(RID_F2)|RID2RSET(RID_F4)|RID2RSET(RID_F6)|\ RID2RSET(RID_F8)|RID2RSET(RID_F10)|RID2RSET(RID_F12)|RID2RSET(RID_F14)|\ RID2RSET(RID_F16)|RID2RSET(RID_F18)|RID2RSET(RID_F20)|RID2RSET(RID_F22)|\ RID2RSET(RID_F24)|RID2RSET(RID_F26)|RID2RSET(RID_F28)|RID2RSET(RID_F30)) +#else +#define RSET_FPR RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR) +#endif #endif #define RSET_ALL (RSET_GPR|RSET_FPR) #define RSET_INIT RSET_ALL @@ -97,23 +101,37 @@ enum { #if LJ_SOFTFP #define RSET_SCRATCH_FPR 0 #else +#if LJ_32 #define RSET_SCRATCH_FPR \ (RID2RSET(RID_F0)|RID2RSET(RID_F2)|RID2RSET(RID_F4)|RID2RSET(RID_F6)|\ RID2RSET(RID_F8)|RID2RSET(RID_F10)|RID2RSET(RID_F12)|RID2RSET(RID_F14)|\ RID2RSET(RID_F16)|RID2RSET(RID_F18)) +#else +#define RSET_SCRATCH_FPR RSET_RANGE(RID_F0, RID_F24) +#endif #endif #define RSET_SCRATCH (RSET_SCRATCH_GPR|RSET_SCRATCH_FPR) #define REGARG_FIRSTGPR RID_R4 +#if LJ_32 #define REGARG_LASTGPR RID_R7 #define REGARG_NUMGPR 4 +#else +#define REGARG_LASTGPR RID_R11 +#define REGARG_NUMGPR 8 +#endif #if LJ_ABI_SOFTFP #define REGARG_FIRSTFPR 0 #define REGARG_LASTFPR 0 #define REGARG_NUMFPR 0 #else #define REGARG_FIRSTFPR RID_F12 +#if LJ_32 #define REGARG_LASTFPR RID_F14 #define REGARG_NUMFPR 2 +#else +#define REGARG_LASTFPR RID_F19 +#define REGARG_NUMFPR 8 +#endif #endif /* -- Spill slots --------------------------------------------------------- */ @@ -125,7 +143,11 @@ enum { ** ** SPS_FIRST: First spill slot for general use. */ +#if LJ_32 #define SPS_FIXED 5 +#else +#define SPS_FIXED 4 +#endif #define SPS_FIRST 4 #define SPOFS_TMP 0 @@ -140,7 +162,7 @@ typedef struct { #if !LJ_SOFTFP lua_Number fpr[RID_NUM_FPR]; /* Floating-point registers. */ #endif - int32_t gpr[RID_NUM_GPR]; /* General-purpose registers. */ + intptr_t gpr[RID_NUM_GPR]; /* General-purpose registers. */ int32_t spill[256]; /* Spill slots. */ } ExitState; @@ -172,7 +194,7 @@ static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p) typedef enum MIPSIns { /* Integer instructions. */ - MIPSI_MOVE = 0x00000021, + MIPSI_MOVE = 0x00000025, MIPSI_NOP = 0x00000000, MIPSI_LI = 0x24000000, @@ -204,15 +226,15 @@ typedef enum MIPSIns { MIPSI_SLL = 0x00000000, MIPSI_SRL = 0x00000002, MIPSI_SRA = 0x00000003, - MIPSI_ROTR = 0x00200002, /* MIPS32R2 */ + MIPSI_ROTR = 0x00200002, /* MIPSXXR2 */ MIPSI_SLLV = 0x00000004, MIPSI_SRLV = 0x00000006, MIPSI_SRAV = 0x00000007, - MIPSI_ROTRV = 0x00000046, /* MIPS32R2 */ + MIPSI_ROTRV = 0x00000046, /* MIPSXXR2 */ - MIPSI_SEB = 0x7c000420, /* MIPS32R2 */ - MIPSI_SEH = 0x7c000620, /* MIPS32R2 */ - MIPSI_WSBH = 0x7c0000a0, /* MIPS32R2 */ + MIPSI_SEB = 0x7c000420, /* MIPSXXR2 */ + MIPSI_SEH = 0x7c000620, /* MIPSXXR2 */ + MIPSI_WSBH = 0x7c0000a0, /* MIPSXXR2 */ MIPSI_B = 0x10000000, MIPSI_J = 0x08000000, @@ -241,6 +263,15 @@ typedef enum MIPSIns { MIPSI_LDC1 = 0xd4000000, MIPSI_SDC1 = 0xf4000000, + /* MIPS64 instructions. */ + MIPSI_DSLL = 0x00000038, + MIPSI_LD = 0xdc000000, + MIPSI_DADDIU = 0x64000000, + MIPSI_SD = 0xfc000000, + MIPSI_DMFC1 = 0x44200000, + MIPSI_DSRA32 = 0x0000003f, + MIPSI_MFHC1 = 0x44600000, + /* FP instructions. */ MIPSI_MOV_S = 0x46000006, MIPSI_MOV_D = 0x46200006, diff --git a/src/vm_mips.dasc b/src/vm_mips.dasc index 0543c7e0..6f5a83dd 100644 --- a/src/vm_mips.dasc +++ b/src/vm_mips.dasc @@ -57,7 +57,7 @@ |.define TMP2, r14 |.define TMP3, r15 | -|// Calling conventions. +|// MIPS o32 calling convention. |.define CFUNCADDR, r25 |.define CARG1, r4 |.define CARG2, r5 diff --git a/src/vm_mips64.dasc b/src/vm_mips64.dasc new file mode 100644 index 00000000..52b56de4 --- /dev/null +++ b/src/vm_mips64.dasc @@ -0,0 +1,4849 @@ +|// Low-level VM code for MIPS64 CPUs. +|// Bytecode interpreter, fast functions and helper functions. +|// Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h +|// +|// Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. +|// Sponsored by Cisco Systems, Inc. +| +|.arch mips64 +|.section code_op, code_sub +| +|.actionlist build_actionlist +|.globals GLOB_ +|.globalnames globnames +|.externnames extnames +| +|// Note: The ragged indentation of the instructions is intentional. +|// The starting columns indicate data dependencies. +| +|//----------------------------------------------------------------------- +| +|// Fixed register assignments for the interpreter. +|// Don't use: r0 = 0, r26/r27 = reserved, r28 = gp, r29 = sp, r31 = ra +| +|.macro .FPU, a, b +|.if FPU +| a, b +|.endif +|.endmacro +| +|// The following must be C callee-save (but BASE is often refetched). +|.define BASE, r16 // Base of current Lua stack frame. +|.define KBASE, r17 // Constants of current Lua function. +|.define PC, r18 // Next PC. +|.define DISPATCH, r19 // Opcode dispatch table. +|.define LREG, r20 // Register holding lua_State (also in SAVE_L). +|.define MULTRES, r21 // Size of multi-result: (nresults+1)*8. +| +|.define JGL, r30 // On-trace: global_State + 32768. +| +|// Constants for type-comparisons, stores and conversions. C callee-save. +|.define TISNIL, r30 +|.define TISNUM, r22 +|.if FPU +|.define TOBIT, f30 // 2^52 + 2^51. +|.endif +| +|// The following temporaries are not saved across C calls, except for RA. +|.define RA, r23 // Callee-save. +|.define RB, r8 +|.define RC, r9 +|.define RD, r10 +|.define INS, r11 +| +|.define AT, r1 // Assembler temporary. +|.define TMP0, r12 +|.define TMP1, r13 +|.define TMP2, r14 +|.define TMP3, r15 +| +|// MIPS n64 calling convention. +|.define CFUNCADDR, r25 +|.define CARG1, r4 +|.define CARG2, r5 +|.define CARG3, r6 +|.define CARG4, r7 +|.define CARG5, r8 +|.define CARG6, r9 +|.define CARG7, r10 +|.define CARG8, r11 +| +|.define CRET1, r2 +|.define CRET2, r3 +| +|.if FPU +|.define FARG1, f12 +|.define FARG2, f13 +|.define FARG3, f14 +|.define FARG4, f15 +|.define FARG5, f16 +|.define FARG6, f17 +|.define FARG7, f18 +|.define FARG8, f19 +| +|.define FRET1, f0 +|.define FRET2, f2 +|.endif +| +|// Stack layout while in interpreter. Must match with lj_frame.h. +|.if FPU // MIPS64 hard-float. +| +|.define CFRAME_SPACE, 192 // Delta for sp. +| +|//----- 16 byte aligned, <-- sp entering interpreter +|.define SAVE_ERRF, 188(sp) // 32 bit values. +|.define SAVE_NRES, 184(sp) +|.define SAVE_CFRAME, 176(sp) // 64 bit values. +|.define SAVE_L, 168(sp) +|.define SAVE_PC, 160(sp) +|//----- 16 byte aligned +|.define SAVE_GPR_, 80 // .. 80+10*8: 64 bit GPR saves. +|.define SAVE_FPR_, 16 // .. 16+8*8: 64 bit FPR saves. +| +|.else // MIPS64 soft-float +| +|.define CFRAME_SPACE, 128 // Delta for sp. +| +|//----- 16 byte aligned, <-- sp entering interpreter +|.define SAVE_ERRF, 124(sp) // 32 bit values. +|.define SAVE_NRES, 120(sp) +|.define SAVE_CFRAME, 112(sp) // 64 bit values. +|.define SAVE_L, 104(sp) +|.define SAVE_PC, 96(sp) +|//----- 16 byte aligned +|.define SAVE_GPR_, 16 // .. 16+10*8: 64 bit GPR saves. +| +|.endif +| +|.define TMPX, 8(sp) // Unused by interpreter, temp for JIT code. +|.define TMPD, 0(sp) +|//----- 16 byte aligned +| +|.define TMPD_OFS, 0 +| +|.define SAVE_MULTRES, TMPD +| +|//----------------------------------------------------------------------- +| +|.macro saveregs +| daddiu sp, sp, -CFRAME_SPACE +| sd ra, SAVE_GPR_+9*8(sp) +| sd r30, SAVE_GPR_+8*8(sp) +| .FPU sdc1 f31, SAVE_FPR_+7*8(sp) +| sd r23, SAVE_GPR_+7*8(sp) +| .FPU sdc1 f30, SAVE_FPR_+6*8(sp) +| sd r22, SAVE_GPR_+6*8(sp) +| .FPU sdc1 f29, SAVE_FPR_+5*8(sp) +| sd r21, SAVE_GPR_+5*8(sp) +| .FPU sdc1 f28, SAVE_FPR_+4*8(sp) +| sd r20, SAVE_GPR_+4*8(sp) +| .FPU sdc1 f27, SAVE_FPR_+3*8(sp) +| sd r19, SAVE_GPR_+3*8(sp) +| .FPU sdc1 f26, SAVE_FPR_+2*8(sp) +| sd r18, SAVE_GPR_+2*8(sp) +| .FPU sdc1 f25, SAVE_FPR_+1*8(sp) +| sd r17, SAVE_GPR_+1*8(sp) +| .FPU sdc1 f24, SAVE_FPR_+0*8(sp) +| sd r16, SAVE_GPR_+0*8(sp) +|.endmacro +| +|.macro restoreregs_ret +| ld ra, SAVE_GPR_+9*8(sp) +| ld r30, SAVE_GPR_+8*8(sp) +| ld r23, SAVE_GPR_+7*8(sp) +| .FPU ldc1 f31, SAVE_FPR_+7*8(sp) +| ld r22, SAVE_GPR_+6*8(sp) +| .FPU ldc1 f30, SAVE_FPR_+6*8(sp) +| ld r21, SAVE_GPR_+5*8(sp) +| .FPU ldc1 f29, SAVE_FPR_+5*8(sp) +| ld r20, SAVE_GPR_+4*8(sp) +| .FPU ldc1 f28, SAVE_FPR_+4*8(sp) +| ld r19, SAVE_GPR_+3*8(sp) +| .FPU ldc1 f27, SAVE_FPR_+3*8(sp) +| ld r18, SAVE_GPR_+2*8(sp) +| .FPU ldc1 f26, SAVE_FPR_+2*8(sp) +| ld r17, SAVE_GPR_+1*8(sp) +| .FPU ldc1 f25, SAVE_FPR_+1*8(sp) +| ld r16, SAVE_GPR_+0*8(sp) +| .FPU ldc1 f24, SAVE_FPR_+0*8(sp) +| jr ra +| daddiu sp, sp, CFRAME_SPACE +|.endmacro +| +|// Type definitions. Some of these are only used for documentation. +|.type L, lua_State, LREG +|.type GL, global_State +|.type TVALUE, TValue +|.type GCOBJ, GCobj +|.type STR, GCstr +|.type TAB, GCtab +|.type LFUNC, GCfuncL +|.type CFUNC, GCfuncC +|.type PROTO, GCproto +|.type UPVAL, GCupval +|.type NODE, Node +|.type NARGS8, int +|.type TRACE, GCtrace +|.type SBUF, SBuf +| +|//----------------------------------------------------------------------- +| +|// Trap for not-yet-implemented parts. +|.macro NYI; .long 0xf0f0f0f0; .endmacro +| +|// Macros to mark delay slots. +|.macro ., a; a; .endmacro +|.macro ., a,b; a,b; .endmacro +|.macro ., a,b,c; a,b,c; .endmacro +|.macro ., a,b,c,d; a,b,c,d; .endmacro +| +|.define FRAME_PC, -8 +|.define FRAME_FUNC, -16 +| +|//----------------------------------------------------------------------- +| +|// Endian-specific defines. +|.if ENDIAN_LE +|.define HI, 4 +|.define LO, 0 +|.define OFS_RD, 2 +|.define OFS_RA, 1 +|.define OFS_OP, 0 +|.else +|.define HI, 0 +|.define LO, 4 +|.define OFS_RD, 0 +|.define OFS_RA, 2 +|.define OFS_OP, 3 +|.endif +| +|// Instruction decode. +|.macro decode_OP1, dst, ins; andi dst, ins, 0xff; .endmacro +|.macro decode_OP8a, dst, ins; andi dst, ins, 0xff; .endmacro +|.macro decode_OP8b, dst; sll dst, dst, 3; .endmacro +|.macro decode_RC8a, dst, ins; srl dst, ins, 13; .endmacro +|.macro decode_RC8b, dst; andi dst, dst, 0x7f8; .endmacro +|.macro decode_RD4b, dst; sll dst, dst, 2; .endmacro +|.macro decode_RA8a, dst, ins; srl dst, ins, 5; .endmacro +|.macro decode_RA8b, dst; andi dst, dst, 0x7f8; .endmacro +|.macro decode_RB8a, dst, ins; srl dst, ins, 21; .endmacro +|.macro decode_RB8b, dst; andi dst, dst, 0x7f8; .endmacro +|.macro decode_RD8a, dst, ins; srl dst, ins, 16; .endmacro +|.macro decode_RD8b, dst; sll dst, dst, 3; .endmacro +|.macro decode_RDtoRC8, dst, src; andi dst, src, 0x7f8; .endmacro +| +|// Instruction fetch. +|.macro ins_NEXT1 +| lw INS, 0(PC) +| daddiu PC, PC, 4 +|.endmacro +|// Instruction decode+dispatch. +|.macro ins_NEXT2 +| decode_OP8a TMP1, INS +| decode_OP8b TMP1 +| daddu TMP0, DISPATCH, TMP1 +| decode_RD8a RD, INS +| ld AT, 0(TMP0) +| decode_RA8a RA, INS +| decode_RD8b RD +| jr AT +| decode_RA8b RA +|.endmacro +|.macro ins_NEXT +| ins_NEXT1 +| ins_NEXT2 +|.endmacro +| +|// Instruction footer. +|.if 1 +| // Replicated dispatch. Less unpredictable branches, but higher I-Cache use. +| .define ins_next, ins_NEXT +| .define ins_next_, ins_NEXT +| .define ins_next1, ins_NEXT1 +| .define ins_next2, ins_NEXT2 +|.else +| // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch. +| // Affects only certain kinds of benchmarks (and only with -j off). +| .macro ins_next +| b ->ins_next +| .endmacro +| .macro ins_next1 +| .endmacro +| .macro ins_next2 +| b ->ins_next +| .endmacro +| .macro ins_next_ +| ->ins_next: +| ins_NEXT +| .endmacro +|.endif +| +|// Call decode and dispatch. +|.macro ins_callt +| // BASE = new base, RB = LFUNC/CFUNC, RC = nargs*8, FRAME_PC(BASE) = PC +| ld PC, LFUNC:RB->pc +| lw INS, 0(PC) +| daddiu PC, PC, 4 +| decode_OP8a TMP1, INS +| decode_RA8a RA, INS +| decode_OP8b TMP1 +| decode_RA8b RA +| daddu TMP0, DISPATCH, TMP1 +| ld TMP0, 0(TMP0) +| jr TMP0 +| daddu RA, RA, BASE +|.endmacro +| +|.macro ins_call +| // BASE = new base, RB = LFUNC/CFUNC, RC = nargs*8, PC = caller PC +| sd PC, FRAME_PC(BASE) +| ins_callt +|.endmacro +| +|//----------------------------------------------------------------------- +| +|.macro branch_RD +| srl TMP0, RD, 1 +| lui AT, (-(BCBIAS_J*4 >> 16) & 65535) +| addu TMP0, TMP0, AT +| daddu PC, PC, TMP0 +|.endmacro +| +|// Assumes DISPATCH is relative to GL. +#define DISPATCH_GL(field) (GG_DISP2G + (int)offsetof(global_State, field)) +#define DISPATCH_J(field) (GG_DISP2J + (int)offsetof(jit_State, field)) +#define GG_DISP2GOT (GG_OFS(got) - GG_OFS(dispatch)) +#define DISPATCH_GOT(name) (GG_DISP2GOT + sizeof(void*)*LJ_GOT_##name) +| +#define PC2PROTO(field) ((int)offsetof(GCproto, field)-(int)sizeof(GCproto)) +| +|.macro load_got, func +| ld CFUNCADDR, DISPATCH_GOT(func)(DISPATCH) +|.endmacro +|// Much faster. Sadly, there's no easy way to force the required code layout. +|// .macro call_intern, func; bal extern func; .endmacro +|.macro call_intern, func; jalr CFUNCADDR; .endmacro +|.macro call_extern; jalr CFUNCADDR; .endmacro +|.macro jmp_extern; jr CFUNCADDR; .endmacro +| +|.macro hotcheck, delta, target +| NYI +|.endmacro +| +|.macro hotloop +| hotcheck HOTCOUNT_LOOP, ->vm_hotloop +|.endmacro +| +|.macro hotcall +| hotcheck HOTCOUNT_CALL, ->vm_hotcall +|.endmacro +| +|// Set current VM state. Uses TMP0. +|.macro li_vmstate, st; li TMP0, ~LJ_VMST_..st; .endmacro +|.macro st_vmstate; sw TMP0, DISPATCH_GL(vmstate)(DISPATCH); .endmacro +| +|// Move table write barrier back. Overwrites mark and tmp. +|.macro barrierback, tab, mark, tmp, target +| ld tmp, DISPATCH_GL(gc.grayagain)(DISPATCH) +| andi mark, mark, ~LJ_GC_BLACK & 255 // black2gray(tab) +| sd tab, DISPATCH_GL(gc.grayagain)(DISPATCH) +| sb mark, tab->marked +| b target +|. sd tmp, tab->gclist +|.endmacro +| +|// Clear type tag. Isolate lowest 14+32+1=47 bits of reg. +|.macro cleartp, reg; dextm reg, reg, 0, 14; .endmacro +|.macro cleartp, dst, reg; dextm dst, reg, 0, 14; .endmacro +| +|// Set type tag: Merge 17 type bits into bits [15+32=47, 31+32+1=64) of dst. +|.macro settp, dst, tp; dinsu dst, tp, 15, 31; .endmacro +| +|// Extract (negative) type tag. +|.macro gettp, dst, src; dsra dst, src, 47; .endmacro +| +|// Macros to check the TValue type and extract the GCobj. Branch on failure. +|.macro checktp, reg, tp, target +| gettp AT, reg +| daddiu AT, AT, tp +| bnez AT, target +|. cleartp reg +|.endmacro +|.macro checktp, dst, reg, tp, target +| gettp AT, reg +| daddiu AT, AT, tp +| bnez AT, target +|. cleartp dst, reg +|.endmacro +|.macro checkstr, reg, target; checktp reg, -LJ_TSTR, target; .endmacro +|.macro checktab, reg, target; checktp reg, -LJ_TTAB, target; .endmacro +|.macro checkfunc, reg, target; checktp reg, -LJ_TFUNC, target; .endmacro +|.macro checkint, reg, target // Caveat: has delay slot! +| gettp AT, reg +| bne AT, TISNUM, target +|.endmacro +|.macro checknum, reg, target // Caveat: has delay slot! +| gettp AT, reg +| sltiu AT, AT, LJ_TISNUM +| beqz AT, target +|.endmacro +| +|.macro mov_false, reg +| lu reg, 0x8000 +| dsll reg, reg, 32 +| not reg, reg +|.endmacro +|.macro mov_true, reg +| li reg, 0x0001 +| dsll reg, reg, 48 +| not reg, reg +|.endmacro +| +|//----------------------------------------------------------------------- + +/* Generate subroutines used by opcodes and other parts of the VM. */ +/* The .code_sub section should be last to help static branch prediction. */ +static void build_subroutines(BuildCtx *ctx) +{ + |.code_sub + | + |//----------------------------------------------------------------------- + |//-- Return handling ---------------------------------------------------- + |//----------------------------------------------------------------------- + | + |->vm_returnp: + | // See vm_return. Also: TMP2 = previous base. + | andi AT, PC, FRAME_P + | beqz AT, ->cont_dispatch + | + | // Return from pcall or xpcall fast func. + |. mov_true TMP1 + | ld PC, FRAME_PC(TMP2) // Fetch PC of previous frame. + | move BASE, TMP2 // Restore caller base. + | // Prepending may overwrite the pcall frame, so do it at the end. + | sd TMP1, -8(RA) // Prepend true to results. + | daddiu RA, RA, -8 + | + |->vm_returnc: + | addiu RD, RD, 8 // RD = (nresults+1)*8. + | andi TMP0, PC, FRAME_TYPE + | beqz RD, ->vm_unwind_c_eh + |. li CRET1, LUA_YIELD + | beqz TMP0, ->BC_RET_Z // Handle regular return to Lua. + |. move MULTRES, RD + | + |->vm_return: + | // BASE = base, RA = resultptr, RD/MULTRES = (nresults+1)*8, PC = return + | // TMP0 = PC & FRAME_TYPE + | li TMP2, -8 + | xori AT, TMP0, FRAME_C + | and TMP2, PC, TMP2 + | bnez AT, ->vm_returnp + | dsubu TMP2, BASE, TMP2 // TMP2 = previous base. + | + | addiu TMP1, RD, -8 + | sd TMP2, L->base + | li_vmstate C + | lw TMP2, SAVE_NRES + | daddiu BASE, BASE, -16 + | st_vmstate + | beqz TMP1, >2 + |. sll TMP2, TMP2, 3 + |1: + | addiu TMP1, TMP1, -8 + | ld CRET1, 0(RA) + | daddiu RA, RA, 8 + | sd CRET1, 0(BASE) + | bnez TMP1, <1 + |. daddiu BASE, BASE, 8 + | + |2: + | bne TMP2, RD, >6 + |3: + |. sd BASE, L->top // Store new top. + | + |->vm_leave_cp: + | ld TMP0, SAVE_CFRAME // Restore previous C frame. + | move CRET1, r0 // Ok return status for vm_pcall. + | sd TMP0, L->cframe + | + |->vm_leave_unw: + | restoreregs_ret + | + |6: + | ld TMP1, L->maxstack + | slt AT, TMP2, RD + | bnez AT, >7 // Less results wanted? + | // More results wanted. Check stack size and fill up results with nil. + |. slt AT, BASE, TMP1 + | beqz AT, >8 + |. nop + | sd TISNIL, 0(BASE) + | addiu RD, RD, 8 + | b <2 + |. daddiu BASE, BASE, 8 + | + |7: // Less results wanted. + | subu TMP0, RD, TMP2 + | dsubu TMP0, BASE, TMP0 // Either keep top or shrink it. + | b <3 + |. movn BASE, TMP0, TMP2 // LUA_MULTRET+1 case? + | + |8: // Corner case: need to grow stack for filling up results. + | // This can happen if: + | // - A C function grows the stack (a lot). + | // - The GC shrinks the stack in between. + | // - A return back from a lua_call() with (high) nresults adjustment. + | load_got lj_state_growstack + | move MULTRES, RD + | srl CARG2, TMP2, 3 + | call_intern lj_state_growstack // (lua_State *L, int n) + |. move CARG1, L + | lw TMP2, SAVE_NRES + | ld BASE, L->top // Need the (realloced) L->top in BASE. + | move RD, MULTRES + | b <2 + |. sll TMP2, TMP2, 3 + | + |->vm_unwind_c: // Unwind C stack, return from vm_pcall. + | // (void *cframe, int errcode) + | move sp, CARG1 + | move CRET1, CARG2 + |->vm_unwind_c_eh: // Landing pad for external unwinder. + | ld L, SAVE_L + | li TMP0, ~LJ_VMST_C + | ld GL:TMP1, L->glref + | b ->vm_leave_unw + |. sw TMP0, GL:TMP1->vmstate + | + |->vm_unwind_ff: // Unwind C stack, return from ff pcall. + | // (void *cframe) + | li AT, -4 + | and sp, CARG1, AT + |->vm_unwind_ff_eh: // Landing pad for external unwinder. + | ld L, SAVE_L + | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | li TISNIL, LJ_TNIL + | li TISNUM, LJ_TISNUM + | ld BASE, L->base + | ld DISPATCH, L->glref // Setup pointer to dispatch table. + | .FPU mtc1 TMP3, TOBIT + | mov_false TMP1 + | li_vmstate INTERP + | ld PC, FRAME_PC(BASE) // Fetch PC of previous frame. + | .FPU cvt.d.s TOBIT, TOBIT + | daddiu RA, BASE, -8 // Results start at BASE-8. + | daddiu DISPATCH, DISPATCH, GG_G2DISP + | sd TMP1, 0(RA) // Prepend false to error message. + | st_vmstate + | b ->vm_returnc + |. li RD, 16 // 2 results: false + error message. + | + |//----------------------------------------------------------------------- + |//-- Grow stack for calls ----------------------------------------------- + |//----------------------------------------------------------------------- + | + |->vm_growstack_c: // Grow stack for C function. + | b >2 + |. li CARG2, LUA_MINSTACK + | + |->vm_growstack_l: // Grow stack for Lua function. + | // BASE = new base, RA = BASE+framesize*8, RC = nargs*8, PC = first PC + | daddu RC, BASE, RC + | dsubu RA, RA, BASE + | sd BASE, L->base + | daddiu PC, PC, 4 // Must point after first instruction. + | sd RC, L->top + | srl CARG2, RA, 3 + |2: + | // L->base = new base, L->top = top + | load_got lj_state_growstack + | sd PC, SAVE_PC + | call_intern lj_state_growstack // (lua_State *L, int n) + |. move CARG1, L + | ld BASE, L->base + | ld RC, L->top + | ld LFUNC:RB, FRAME_FUNC(BASE) + | dsubu RC, RC, BASE + | cleartp LFUNC:RB + | // BASE = new base, RB = LFUNC/CFUNC, RC = nargs*8, FRAME_PC(BASE) = PC + | ins_callt // Just retry the call. + | + |//----------------------------------------------------------------------- + |//-- Entry points into the assembler VM --------------------------------- + |//----------------------------------------------------------------------- + | + |->vm_resume: // Setup C frame and resume thread. + | // (lua_State *L, TValue *base, int nres1 = 0, ptrdiff_t ef = 0) + | saveregs + | move L, CARG1 + | ld DISPATCH, L->glref // Setup pointer to dispatch table. + | move BASE, CARG2 + | lbu TMP1, L->status + | sd L, SAVE_L + | li PC, FRAME_CP + | daddiu TMP0, sp, CFRAME_RESUME + | daddiu DISPATCH, DISPATCH, GG_G2DISP + | sw r0, SAVE_NRES + | sw r0, SAVE_ERRF + | sd CARG1, SAVE_PC // Any value outside of bytecode is ok. + | sd r0, SAVE_CFRAME + | beqz TMP1, >3 + |. sd TMP0, L->cframe + | + | // Resume after yield (like a return). + | sd L, DISPATCH_GL(cur_L)(DISPATCH) + | move RA, BASE + | ld BASE, L->base + | ld TMP1, L->top + | ld PC, FRAME_PC(BASE) + | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | dsubu RD, TMP1, BASE + | .FPU mtc1 TMP3, TOBIT + | sb r0, L->status + | .FPU cvt.d.s TOBIT, TOBIT + | li_vmstate INTERP + | daddiu RD, RD, 8 + | st_vmstate + | move MULTRES, RD + | andi TMP0, PC, FRAME_TYPE + | li TISNIL, LJ_TNIL + | beqz TMP0, ->BC_RET_Z + |. li TISNUM, LJ_TISNUM + | b ->vm_return + |. nop + | + |->vm_pcall: // Setup protected C frame and enter VM. + | // (lua_State *L, TValue *base, int nres1, ptrdiff_t ef) + | saveregs + | sw CARG4, SAVE_ERRF + | b >1 + |. li PC, FRAME_CP + | + |->vm_call: // Setup C frame and enter VM. + | // (lua_State *L, TValue *base, int nres1) + | saveregs + | li PC, FRAME_C + | + |1: // Entry point for vm_pcall above (PC = ftype). + | ld TMP1, L:CARG1->cframe + | move L, CARG1 + | sw CARG3, SAVE_NRES + | ld DISPATCH, L->glref // Setup pointer to dispatch table. + | sd CARG1, SAVE_L + | move BASE, CARG2 + | daddiu DISPATCH, DISPATCH, GG_G2DISP + | sd CARG1, SAVE_PC // Any value outside of bytecode is ok. + | sd TMP1, SAVE_CFRAME + | sd sp, L->cframe // Add our C frame to cframe chain. + | + |3: // Entry point for vm_cpcall/vm_resume (BASE = base, PC = ftype). + | sd L, DISPATCH_GL(cur_L)(DISPATCH) + | ld TMP2, L->base // TMP2 = old base (used in vmeta_call). + | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | ld TMP1, L->top + | .FPU mtc1 TMP3, TOBIT + | daddu PC, PC, BASE + | dsubu NARGS8:RC, TMP1, BASE + | li TISNUM, LJ_TISNUM + | dsubu PC, PC, TMP2 // PC = frame delta + frame type + | .FPU cvt.d.s TOBIT, TOBIT + | li_vmstate INTERP + | li TISNIL, LJ_TNIL + | st_vmstate + | + |->vm_call_dispatch: + | // TMP2 = old base, BASE = new base, RC = nargs*8, PC = caller PC + | ld LFUNC:RB, FRAME_FUNC(BASE) + | checkfunc LFUNC:RB, ->vmeta_call + | + |->vm_call_dispatch_f: + | ins_call + | // BASE = new base, RB = func, RC = nargs*8, PC = caller PC + | + |->vm_cpcall: // Setup protected C frame, call C. + | // (lua_State *L, lua_CFunction func, void *ud, lua_CPFunction cp) + | saveregs + | move L, CARG1 + | ld TMP0, L:CARG1->stack + | sd CARG1, SAVE_L + | ld TMP1, L->top + | ld DISPATCH, L->glref // Setup pointer to dispatch table. + | sd CARG1, SAVE_PC // Any value outside of bytecode is ok. + | dsubu TMP0, TMP0, TMP1 // Compute -savestack(L, L->top). + | ld TMP1, L->cframe + | daddiu DISPATCH, DISPATCH, GG_G2DISP + | sw TMP0, SAVE_NRES // Neg. delta means cframe w/o frame. + | sw r0, SAVE_ERRF // No error function. + | sd TMP1, SAVE_CFRAME + | sd sp, L->cframe // Add our C frame to cframe chain. + | sd L, DISPATCH_GL(cur_L)(DISPATCH) + | jalr CARG4 // (lua_State *L, lua_CFunction func, void *ud) + |. move CFUNCADDR, CARG4 + | move BASE, CRET1 + | bnez CRET1, <3 // Else continue with the call. + |. li PC, FRAME_CP + | b ->vm_leave_cp // No base? Just remove C frame. + |. nop + | + |//----------------------------------------------------------------------- + |//-- Metamethod handling ------------------------------------------------ + |//----------------------------------------------------------------------- + | + |// The lj_meta_* functions (except for lj_meta_cat) don't reallocate the + |// stack, so BASE doesn't need to be reloaded across these calls. + | + |//-- Continuation dispatch ---------------------------------------------- + | + |->cont_dispatch: + | // BASE = meta base, RA = resultptr, RD = (nresults+1)*8 + | ld TMP0, -32(BASE) // Continuation. + | move RB, BASE + | move BASE, TMP2 // Restore caller BASE. + | ld LFUNC:TMP1, FRAME_FUNC(TMP2) + |.if FFI + | sltiu AT, TMP0, 2 + |.endif + | ld PC, -24(RB) // Restore PC from [cont|PC]. + | cleartp LFUNC:TMP1 + | daddu TMP2, RA, RD + | ld TMP1, LFUNC:TMP1->pc + |.if FFI + | bnez AT, >1 + |.endif + |. sd TISNIL, -8(TMP2) // Ensure one valid arg. + | // BASE = base, RA = resultptr, RB = meta base + | jr TMP0 // Jump to continuation. + |. ld KBASE, PC2PROTO(k)(TMP1) + | + |.if FFI + |1: + | bnez TMP0, ->cont_ffi_callback // cont = 1: return from FFI callback. + | // cont = 0: tailcall from C function. + |. daddiu TMP1, RB, -32 + | b ->vm_call_tail + |. dsubu RC, TMP1, BASE + |.endif + | + |->cont_cat: // RA = resultptr, RB = meta base + | lw INS, -4(PC) + | daddiu CARG2, RB, -32 + | ld CRET1, 0(RA) + | decode_RB8a MULTRES, INS + | decode_RA8a RA, INS + | decode_RB8b MULTRES + | decode_RA8b RA + | daddu TMP1, BASE, MULTRES + | sd BASE, L->base + | dsubu CARG3, CARG2, TMP1 + | bne TMP1, CARG2, ->BC_CAT_Z + |. sd CRET1, 0(CARG2) + | daddu RA, BASE, RA + | b ->cont_nop + |. sd CRET1, 0(RA) + | + |//-- Table indexing metamethods ----------------------------------------- + | + |->vmeta_tgets1: + | daddiu CARG3, DISPATCH, DISPATCH_GL(tmptv) + | li TMP0, LJ_TSTR + | settp STR:RC, TMP0 + | b >1 + |. sd STR:RC, 0(CARG3) + | + |->vmeta_tgets: + | daddiu CARG2, DISPATCH, DISPATCH_GL(tmptv) + | li TMP0, LJ_TTAB + | li TMP1, LJ_TSTR + | settp TAB:RB, TMP0 + | daddiu CARG3, DISPATCH, DISPATCH_GL(tmptv2) + | sd TAB:RB, 0(CARG2) + | settp STR:RC, TMP1 + | b >1 + |. sd STR:RC, 0(CARG3) + | + |->vmeta_tgetb: // TMP0 = index + | daddiu CARG3, DISPATCH, DISPATCH_GL(tmptv) + | settp TMP0, TISNUM + | sd TMP0, 0(CARG3) + | + |->vmeta_tgetv: + |1: + | load_got lj_meta_tget + | sd BASE, L->base + | sd PC, SAVE_PC + | call_intern lj_meta_tget // (lua_State *L, TValue *o, TValue *k) + |. move CARG1, L + | // Returns TValue * (finished) or NULL (metamethod). + | beqz CRET1, >3 + |. daddiu TMP1, BASE, -FRAME_CONT + | ld CARG1, 0(CRET1) + | ins_next1 + | sd CARG1, 0(RA) + | ins_next2 + | + |3: // Call __index metamethod. + | // BASE = base, L->top = new base, stack = cont/func/t/k + | ld BASE, L->top + | sd PC, -24(BASE) // [cont|PC] + | dsubu PC, BASE, TMP1 + | ld LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here. + | cleartp LFUNC:RB + | b ->vm_call_dispatch_f + |. li NARGS8:RC, 16 // 2 args for func(t, k). + | + |->vmeta_tgetr: + | load_got lj_tab_getinth + | call_intern lj_tab_getinth // (GCtab *t, int32_t key) + |. nop + | // Returns cTValue * or NULL. + | beqz CRET1, ->BC_TGETR_Z + |. move CARG2, TISNIL + | b ->BC_TGETR_Z + |. ld CARG2, 0(CRET1) + | + |//----------------------------------------------------------------------- + | + |->vmeta_tsets1: + | daddiu CARG3, DISPATCH, DISPATCH_GL(tmptv) + | li TMP0, LJ_TSTR + | settp STR:RC, TMP0 + | b >1 + |. sd STR:RC, 0(CARG3) + | + |->vmeta_tsets: + | daddiu CARG2, DISPATCH, DISPATCH_GL(tmptv) + | li TMP0, LJ_TTAB + | li TMP1, LJ_TSTR + | settp TAB:RB, TMP0 + | daddiu CARG3, DISPATCH, DISPATCH_GL(tmptv2) + | sd TAB:RB, 0(CARG2) + | settp STR:RC, TMP1 + | b >1 + |. sd STR:RC, 0(CARG3) + | + |->vmeta_tsetb: // TMP0 = index + | daddiu CARG3, DISPATCH, DISPATCH_GL(tmptv) + | settp TMP0, TISNUM + | sd TMP0, 0(CARG3) + | + |->vmeta_tsetv: + |1: + | load_got lj_meta_tset + | sd BASE, L->base + | sd PC, SAVE_PC + | call_intern lj_meta_tset // (lua_State *L, TValue *o, TValue *k) + |. move CARG1, L + | // Returns TValue * (finished) or NULL (metamethod). + | beqz CRET1, >3 + |. ld CARG1, 0(RA) + | // NOBARRIER: lj_meta_tset ensures the table is not black. + | ins_next1 + | sd CARG1, 0(CRET1) + | ins_next2 + | + |3: // Call __newindex metamethod. + | // BASE = base, L->top = new base, stack = cont/func/t/k/(v) + | daddiu TMP1, BASE, -FRAME_CONT + | ld BASE, L->top + | sd PC, -24(BASE) // [cont|PC] + | dsubu PC, BASE, TMP1 + | ld LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here. + | cleartp LFUNC:RB + | sd CARG1, 16(BASE) // Copy value to third argument. + | b ->vm_call_dispatch_f + |. li NARGS8:RC, 24 // 3 args for func(t, k, v) + | + |->vmeta_tsetr: + | load_got lj_tab_setinth + | sd BASE, L->base + | sd PC, SAVE_PC + | call_intern lj_tab_setinth // (lua_State *L, GCtab *t, int32_t key) + |. move CARG1, L + | // Returns TValue *. + | b ->BC_TSETR_Z + |. nop + | + |//-- Comparison metamethods --------------------------------------------- + | + |->vmeta_comp: + | // RA/RD point to o1/o2. + | move CARG2, RA + | move CARG3, RD + | load_got lj_meta_comp + | daddiu PC, PC, -4 + | sd BASE, L->base + | sd PC, SAVE_PC + | decode_OP1 CARG4, INS + | call_intern lj_meta_comp // (lua_State *L, TValue *o1, *o2, int op) + |. move CARG1, L + | // Returns 0/1 or TValue * (metamethod). + |3: + | sltiu AT, CRET1, 2 + | beqz AT, ->vmeta_binop + | negu TMP2, CRET1 + |4: + | lhu RD, OFS_RD(PC) + | daddiu PC, PC, 4 + | lui TMP1, (-(BCBIAS_J*4 >> 16) & 65535) + | sll RD, RD, 2 + | addu RD, RD, TMP1 + | and RD, RD, TMP2 + | daddu PC, PC, RD + |->cont_nop: + | ins_next + | + |->cont_ra: // RA = resultptr + | lbu TMP1, -4+OFS_RA(PC) + | ld CRET1, 0(RA) + | sll TMP1, TMP1, 3 + | daddu TMP1, BASE, TMP1 + | b ->cont_nop + |. sd CRET1, 0(TMP1) + | + |->cont_condt: // RA = resultptr + | ld TMP0, 0(RA) + | gettp TMP0, TMP0 + | sltiu AT, TMP0, LJ_TISTRUECOND + | b <4 + |. negu TMP2, AT // Branch if result is true. + | + |->cont_condf: // RA = resultptr + | ld TMP0, 0(RA) + | gettp TMP0, TMP0 + | sltiu AT, TMP0, LJ_TISTRUECOND + | b <4 + |. addiu TMP2, AT, -1 // Branch if result is false. + | + |->vmeta_equal: + | // CARG1/CARG2 point to o1/o2. TMP0 is set to 0/1. + | load_got lj_meta_equal + | cleartp LFUNC:CARG3, CARG2 + | cleartp LFUNC:CARG2, CARG1 + | move CARG4, TMP0 + | daddiu PC, PC, -4 + | sd BASE, L->base + | sd PC, SAVE_PC + | call_intern lj_meta_equal // (lua_State *L, GCobj *o1, *o2, int ne) + |. move CARG1, L + | // Returns 0/1 or TValue * (metamethod). + | b <3 + |. nop + | + |->vmeta_equal_cd: + |.if FFI + | load_got lj_meta_equal_cd + | move CARG2, INS + | daddiu PC, PC, -4 + | sd BASE, L->base + | sd PC, SAVE_PC + | call_intern lj_meta_equal_cd // (lua_State *L, BCIns op) + |. move CARG1, L + | // Returns 0/1 or TValue * (metamethod). + | b <3 + |. nop + |.endif + | + |->vmeta_istype: + | load_got lj_meta_istype + | daddiu PC, PC, -4 + | sd BASE, L->base + | srl CARG2, RA, 3 + | srl CARG3, RD, 3 + | sd PC, SAVE_PC + | call_intern lj_meta_istype // (lua_State *L, BCReg ra, BCReg tp) + |. move CARG1, L + | b ->cont_nop + |. nop + | + |//-- Arithmetic metamethods --------------------------------------------- + | + |->vmeta_unm: + | move RC, RB + | + |->vmeta_arith: + | load_got lj_meta_arith + | sd BASE, L->base + | move CARG2, RA + | sd PC, SAVE_PC + | move CARG3, RB + | move CARG4, RC + | decode_OP1 CARG5, INS // CARG5 == RB. + | call_intern lj_meta_arith // (lua_State *L, TValue *ra,*rb,*rc, BCReg op) + |. move CARG1, L + | // Returns NULL (finished) or TValue * (metamethod). + | beqz CRET1, ->cont_nop + |. nop + | + | // Call metamethod for binary op. + |->vmeta_binop: + | // BASE = old base, CRET1 = new base, stack = cont/func/o1/o2 + | dsubu TMP1, CRET1, BASE + | sd PC, -24(CRET1) // [cont|PC] + | move TMP2, BASE + | daddiu PC, TMP1, FRAME_CONT + | move BASE, CRET1 + | b ->vm_call_dispatch + |. li NARGS8:RC, 16 // 2 args for func(o1, o2). + | + |->vmeta_len: + | // CARG2 already set by BC_LEN. +#if LJ_52 + | move MULTRES, CARG1 +#endif + | load_got lj_meta_len + | sd BASE, L->base + | sd PC, SAVE_PC + | call_intern lj_meta_len // (lua_State *L, TValue *o) + |. move CARG1, L + | // Returns NULL (retry) or TValue * (metamethod base). +#if LJ_52 + | bnez CRET1, ->vmeta_binop // Binop call for compatibility. + |. nop + | b ->BC_LEN_Z + |. move CARG1, MULTRES +#else + | b ->vmeta_binop // Binop call for compatibility. + |. nop +#endif + | + |//-- Call metamethod ---------------------------------------------------- + | + |->vmeta_call: // Resolve and call __call metamethod. + | // TMP2 = old base, BASE = new base, RC = nargs*8 + | load_got lj_meta_call + | sd TMP2, L->base // This is the callers base! + | daddiu CARG2, BASE, -16 + | sd PC, SAVE_PC + | daddu CARG3, BASE, RC + | move MULTRES, NARGS8:RC + | call_intern lj_meta_call // (lua_State *L, TValue *func, TValue *top) + |. move CARG1, L + | ld LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here. + | daddiu NARGS8:RC, MULTRES, 8 // Got one more argument now. + | cleartp LFUNC:RB + | ins_call + | + |->vmeta_callt: // Resolve __call for BC_CALLT. + | // BASE = old base, RA = new base, RC = nargs*8 + | load_got lj_meta_call + | sd BASE, L->base + | daddiu CARG2, RA, -16 + | sd PC, SAVE_PC + | daddu CARG3, RA, RC + | move MULTRES, NARGS8:RC + | call_intern lj_meta_call // (lua_State *L, TValue *func, TValue *top) + |. move CARG1, L + | ld RB, FRAME_FUNC(RA) // Guaranteed to be a function here. + | ld TMP1, FRAME_PC(BASE) + | daddiu NARGS8:RC, MULTRES, 8 // Got one more argument now. + | b ->BC_CALLT_Z + |. cleartp LFUNC:CARG3, RB + | + |//-- Argument coercion for 'for' statement ------------------------------ + | + |->vmeta_for: + | load_got lj_meta_for + | sd BASE, L->base + | move CARG2, RA + | sd PC, SAVE_PC + | move MULTRES, INS + | call_intern lj_meta_for // (lua_State *L, TValue *base) + |. move CARG1, L + |.if JIT + | decode_OP1 TMP0, MULTRES + | li AT, BC_JFORI + |.endif + | decode_RA8a RA, MULTRES + | decode_RD8a RD, MULTRES + | decode_RA8b RA + |.if JIT + | beq TMP0, AT, =>BC_JFORI + |. decode_RD8b RD + | b =>BC_FORI + |. nop + |.else + | b =>BC_FORI + |. decode_RD8b RD + |.endif + | + |//----------------------------------------------------------------------- + |//-- Fast functions ----------------------------------------------------- + |//----------------------------------------------------------------------- + | + |.macro .ffunc, name + |->ff_ .. name: + |.endmacro + | + |.macro .ffunc_1, name + |->ff_ .. name: + | beqz NARGS8:RC, ->fff_fallback + |. ld CARG1, 0(BASE) + |.endmacro + | + |.macro .ffunc_2, name + |->ff_ .. name: + | sltiu AT, NARGS8:RC, 16 + | ld CARG1, 0(BASE) + | bnez AT, ->fff_fallback + |. ld CARG2, 8(BASE) + |.endmacro + | + |.macro .ffunc_n, name // Caveat: has delay slot! + |->ff_ .. name: + | ld CARG1, 0(BASE) + | beqz NARGS8:RC, ->fff_fallback + | // Either ldc1 or the 1st instruction of checknum is in the delay slot. + | .FPU ldc1 FARG1, 0(BASE) + | checknum CARG1, ->fff_fallback + |.endmacro + | + |.macro .ffunc_nn, name // Caveat: has delay slot! + |->ff_ .. name: + | ld CARG1, 0(BASE) + | sltiu AT, NARGS8:RC, 16 + | ld CARG2, 8(BASE) + | bnez AT, ->fff_fallback + |. gettp TMP0, CARG1 + | gettp TMP1, CARG2 + | sltiu TMP0, TMP0, LJ_TISNUM + | sltiu TMP1, TMP1, LJ_TISNUM + | .FPU ldc1 FARG1, 0(BASE) + | and TMP0, TMP0, TMP1 + | .FPU ldc1 FARG2, 8(BASE) + | beqz TMP0, ->fff_fallback + |.endmacro + | + |// Inlined GC threshold check. Caveat: uses TMP0 and TMP1 and has delay slot! + |.macro ffgccheck + | ld TMP0, DISPATCH_GL(gc.total)(DISPATCH) + | ld TMP1, DISPATCH_GL(gc.threshold)(DISPATCH) + | dsubu AT, TMP0, TMP1 + | bgezal AT, ->fff_gcstep + |.endmacro + | + |//-- Base library: checks ----------------------------------------------- + |.ffunc_1 assert + | gettp AT, CARG1 + | sltiu AT, AT, LJ_TISTRUECOND + | beqz AT, ->fff_fallback + |. daddiu RA, BASE, -16 + | ld PC, FRAME_PC(BASE) + | addiu RD, NARGS8:RC, 8 // Compute (nresults+1)*8. + | daddu TMP2, RA, RD + | daddiu TMP1, BASE, 8 + | beq BASE, TMP2, ->fff_res // Done if exactly 1 argument. + |. sd CARG1, 0(RA) + |1: + | ld CRET1, 0(TMP1) + | sd CRET1, -16(TMP1) + | bne TMP1, TMP2, <1 + |. daddiu TMP1, TMP1, 8 + | b ->fff_res + |. nop + | + |.ffunc_1 type + | gettp TMP0, CARG1 + | sltu TMP1, TISNUM, TMP0 + | not TMP2, TMP0 + | li TMP3, ~LJ_TISNUM + | movz TMP2, TMP3, TMP1 + | dsll TMP2, TMP2, 3 + | daddu TMP2, CFUNC:RB, TMP2 + | b ->fff_restv + |. ld CARG1, CFUNC:TMP2->upvalue + | + |//-- Base library: getters and setters --------------------------------- + | + |.ffunc_1 getmetatable + | gettp TMP2, CARG1 + | daddiu TMP0, TMP2, -LJ_TTAB + | daddiu TMP1, TMP2, -LJ_TUDATA + | movn TMP0, TMP1, TMP0 + | bnez TMP0, >6 + |. cleartp TAB:CARG1 + |1: // Field metatable must be at same offset for GCtab and GCudata! + | ld TAB:RB, TAB:CARG1->metatable + |2: + | ld STR:RC, DISPATCH_GL(gcroot[GCROOT_MMNAME+MM_metatable])(DISPATCH) + | beqz TAB:RB, ->fff_restv + |. li CARG1, LJ_TNIL + | lw TMP0, TAB:RB->hmask + | lw TMP1, STR:RC->hash + | ld NODE:TMP2, TAB:RB->node + | and TMP1, TMP1, TMP0 // idx = str->hash & tab->hmask + | dsll TMP0, TMP1, 5 + | dsll TMP1, TMP1, 3 + | dsubu TMP1, TMP0, TMP1 + | daddu NODE:TMP2, NODE:TMP2, TMP1 // node = tab->node + (idx*32-idx*8) + | li CARG4, LJ_TSTR + | settp STR:RC, CARG4 // Tagged key to look for. + |3: // Rearranged logic, because we expect _not_ to find the key. + | ld TMP0, NODE:TMP2->key + | ld CARG1, NODE:TMP2->val + | ld NODE:TMP2, NODE:TMP2->next + | beq RC, TMP0, >5 + |. li AT, LJ_TTAB + | bnez NODE:TMP2, <3 + |. nop + |4: + | move CARG1, RB + | b ->fff_restv // Not found, keep default result. + |. settp CARG1, AT + |5: + | bne CARG1, TISNIL, ->fff_restv + |. nop + | b <4 // Ditto for nil value. + |. nop + | + |6: + | sltiu AT, TMP2, LJ_TISNUM + | movn TMP2, TISNUM, AT + | dsll TMP2, TMP2, 3 + | dsubu TMP0, DISPATCH, TMP2 + | b <2 + |. ld TAB:RB, DISPATCH_GL(gcroot[GCROOT_BASEMT])-8(TMP0) + | + |.ffunc_2 setmetatable + | // Fast path: no mt for table yet and not clearing the mt. + | checktp TMP1, CARG1, -LJ_TTAB, ->fff_fallback + | gettp TMP3, CARG2 + | ld TAB:TMP0, TAB:TMP1->metatable + | lbu TMP2, TAB:TMP1->marked + | daddiu AT, TMP3, -LJ_TTAB + | cleartp TAB:CARG2 + | or AT, AT, TAB:TMP0 + | bnez AT, ->fff_fallback + |. andi AT, TMP2, LJ_GC_BLACK // isblack(table) + | beqz AT, ->fff_restv + |. sd TAB:CARG2, TAB:TMP1->metatable + | barrierback TAB:TMP1, TMP2, TMP0, ->fff_restv + | + |.ffunc rawget + | ld CARG2, 0(BASE) + | sltiu AT, NARGS8:RC, 16 + | load_got lj_tab_get + | gettp TMP0, CARG2 + | cleartp CARG2 + | daddiu TMP0, TMP0, -LJ_TTAB + | or AT, AT, TMP0 + | bnez AT, ->fff_fallback + |. daddiu CARG3, BASE, 8 + | call_intern lj_tab_get // (lua_State *L, GCtab *t, cTValue *key) + |. move CARG1, L + | b ->fff_restv + |. ld CARG1, 0(CRET1) + | + |//-- Base library: conversions ------------------------------------------ + | + |.ffunc tonumber + | // Only handles the number case inline (without a base argument). + | ld CARG1, 0(BASE) + | xori AT, NARGS8:RC, 8 // Exactly one number argument. + | gettp TMP1, CARG1 + | sltu TMP0, TISNUM, TMP1 + | or AT, AT, TMP0 + | bnez AT, ->fff_fallback + |. nop + | b ->fff_restv + |. nop + | + |.ffunc_1 tostring + | // Only handles the string or number case inline. + | gettp TMP0, CARG1 + | daddiu AT, TMP0, -LJ_TSTR + | // A __tostring method in the string base metatable is ignored. + | beqz AT, ->fff_restv // String key? + | // Handle numbers inline, unless a number base metatable is present. + |. ld TMP1, DISPATCH_GL(gcroot[GCROOT_BASEMT_NUM])(DISPATCH) + | sltu TMP0, TISNUM, TMP0 + | or TMP0, TMP0, TMP1 + | bnez TMP0, ->fff_fallback + |. sd BASE, L->base // Add frame since C call can throw. + | ffgccheck + |. sd PC, SAVE_PC // Redundant (but a defined value). + | load_got lj_strfmt_number + | move CARG1, L + | call_intern lj_strfmt_number // (lua_State *L, cTValue *o) + |. move CARG2, BASE + | // Returns GCstr *. + | li AT, LJ_TSTR + | settp CRET1, AT + | b ->fff_restv + |. move CARG1, CRET1 + | + |//-- Base library: iterators ------------------------------------------- + | + |.ffunc_1 next + | checktp CARG2, CARG1, -LJ_TTAB, ->fff_fallback + | daddu TMP2, BASE, NARGS8:RC + | sd TISNIL, 0(TMP2) // Set missing 2nd arg to nil. + | ld PC, FRAME_PC(BASE) + | load_got lj_tab_next + | sd BASE, L->base // Add frame since C call can throw. + | sd BASE, L->top // Dummy frame length is ok. + | daddiu CARG3, BASE, 8 + | sd PC, SAVE_PC + | call_intern lj_tab_next // (lua_State *L, GCtab *t, TValue *key) + |. move CARG1, L + | // Returns 0 at end of traversal. + | beqz CRET1, ->fff_restv // End of traversal: return nil. + |. move CARG1, TISNIL + | ld TMP0, 8(BASE) + | daddiu RA, BASE, -16 + | ld TMP2, 16(BASE) + | sd TMP0, 0(RA) + | sd TMP2, 8(RA) + | b ->fff_res + |. li RD, (2+1)*8 + | + |.ffunc_1 pairs + | checktp TAB:TMP1, CARG1, -LJ_TTAB, ->fff_fallback + | ld PC, FRAME_PC(BASE) +#if LJ_52 + | ld TAB:TMP2, TAB:TMP1->metatable + | ld TMP0, CFUNC:RB->upvalue[0] + | bnez TAB:TMP2, ->fff_fallback +#else + | ld TMP0, CFUNC:RB->upvalue[0] +#endif + |. daddiu RA, BASE, -16 + | sd TISNIL, 0(BASE) + | sd CARG1, -8(BASE) + | sd TMP0, 0(RA) + | b ->fff_res + |. li RD, (3+1)*8 + | + |.ffunc_2 ipairs_aux + | checktab CARG1, ->fff_fallback + | checkint CARG2, ->fff_fallback + |. lw TMP0, TAB:CARG1->asize + | ld TMP1, TAB:CARG1->array + | ld PC, FRAME_PC(BASE) + | sextw TMP2, CARG2 + | addiu TMP2, TMP2, 1 + | sltu AT, TMP2, TMP0 + | daddiu RA, BASE, -16 + | zextw TMP0, TMP2 + | settp TMP0, TISNUM + | beqz AT, >2 // Not in array part? + |. sd TMP0, 0(RA) + | dsll TMP3, TMP2, 3 + | daddu TMP3, TMP1, TMP3 + | ld TMP1, 0(TMP3) + |1: + | beq TMP1, TISNIL, ->fff_res // End of iteration, return 0 results. + |. li RD, (0+1)*8 + | sd TMP1, -8(BASE) + | b ->fff_res + |. li RD, (2+1)*8 + |2: // Check for empty hash part first. Otherwise call C function. + | lw TMP0, TAB:CARG1->hmask + | load_got lj_tab_getinth + | beqz TMP0, ->fff_res + |. li RD, (0+1)*8 + | call_intern lj_tab_getinth // (GCtab *t, int32_t key) + |. move CARG2, TMP2 + | // Returns cTValue * or NULL. + | beqz CRET1, ->fff_res + |. li RD, (0+1)*8 + | b <1 + |. ld TMP1, 0(CRET1) + | + |.ffunc_1 ipairs + | checktp TAB:TMP1, CARG1, -LJ_TTAB, ->fff_fallback + | ld PC, FRAME_PC(BASE) +#if LJ_52 + | ld TAB:TMP2, TAB:TMP1->metatable + | ld CFUNC:TMP0, CFUNC:RB->upvalue[0] + | bnez TAB:TMP2, ->fff_fallback +#else + | ld TMP0, CFUNC:RB->upvalue[0] +#endif + | daddiu RA, BASE, -16 + | dsll AT, TISNUM, 47 + | sd CARG1, -8(BASE) + | sd AT, 0(BASE) + | sd CFUNC:TMP0, 0(RA) + | b ->fff_res + |. li RD, (3+1)*8 + | + |//-- Base library: catch errors ---------------------------------------- + | + |.ffunc pcall + | daddiu NARGS8:RC, NARGS8:RC, -8 + | lbu TMP3, DISPATCH_GL(hookmask)(DISPATCH) + | bltz NARGS8:RC, ->fff_fallback + |. move TMP2, BASE + | daddiu BASE, BASE, 16 + | // Remember active hook before pcall. + | srl TMP3, TMP3, HOOK_ACTIVE_SHIFT + | andi TMP3, TMP3, 1 + | daddiu PC, TMP3, 16+FRAME_PCALL + | beqz NARGS8:RC, ->vm_call_dispatch + |1: + |. daddu TMP0, BASE, NARGS8:RC + |2: + | ld TMP1, -16(TMP0) + | sd TMP1, -8(TMP0) + | daddiu TMP0, TMP0, -8 + | bne TMP0, BASE, <2 + |. nop + | b ->vm_call_dispatch + |. nop + | + |.ffunc xpcall + | daddiu NARGS8:RC, NARGS8:RC, -16 + | ld CARG1, 0(BASE) + | ld CARG2, 8(BASE) + | bltz NARGS8:RC, ->fff_fallback + |. lbu TMP1, DISPATCH_GL(hookmask)(DISPATCH) + | gettp AT, CARG2 + | daddiu AT, AT, -LJ_TFUNC + | bnez AT, ->fff_fallback // Traceback must be a function. + |. move TMP2, BASE + | daddiu BASE, BASE, 24 + | // Remember active hook before pcall. + | srl TMP3, TMP3, HOOK_ACTIVE_SHIFT + | sd CARG2, 0(TMP2) // Swap function and traceback. + | andi TMP3, TMP3, 1 + | sd CARG1, 8(TMP2) + | beqz NARGS8:RC, ->vm_call_dispatch + |. daddiu PC, TMP3, 24+FRAME_PCALL + | b <1 + |. nop + | + |//-- Coroutine library -------------------------------------------------- + | + |.macro coroutine_resume_wrap, resume + |.if resume + |.ffunc_1 coroutine_resume + | checktp CARG1, CARG1, -LJ_TTHREAD, ->fff_fallback + |.else + |.ffunc coroutine_wrap_aux + | ld L:CARG1, CFUNC:RB->upvalue[0].gcr + | cleartp L:CARG1 + |.endif + | lbu TMP0, L:CARG1->status + | ld TMP1, L:CARG1->cframe + | ld CARG2, L:CARG1->top + | ld TMP2, L:CARG1->base + | addiu AT, TMP0, -LUA_YIELD + | daddu CARG3, CARG2, TMP0 + | daddiu TMP3, CARG2, 8 + | bgtz AT, ->fff_fallback // st > LUA_YIELD? + |. movn CARG2, TMP3, AT + | xor TMP2, TMP2, CARG3 + | bnez TMP1, ->fff_fallback // cframe != 0? + |. or AT, TMP2, TMP0 + | ld TMP0, L:CARG1->maxstack + | beqz AT, ->fff_fallback // base == top && st == 0? + |. ld PC, FRAME_PC(BASE) + | daddu TMP2, CARG2, NARGS8:RC + | sltu AT, TMP0, TMP2 + | bnez AT, ->fff_fallback // Stack overflow? + |. sd PC, SAVE_PC + | sd BASE, L->base + |1: + |.if resume + | daddiu BASE, BASE, 8 // Keep resumed thread in stack for GC. + | daddiu NARGS8:RC, NARGS8:RC, -8 + | daddiu TMP2, TMP2, -8 + |.endif + | sd TMP2, L:CARG1->top + | daddu TMP1, BASE, NARGS8:RC + | move CARG3, CARG2 + | sd BASE, L->top + |2: // Move args to coroutine. + | ld CRET1, 0(BASE) + | sltu AT, BASE, TMP1 + | beqz AT, >3 + |. daddiu BASE, BASE, 8 + | sd CRET1, 0(CARG3) + | b <2 + |. daddiu CARG3, CARG3, 8 + |3: + | bal ->vm_resume // (lua_State *L, TValue *base, 0, 0) + |. move L:RA, L:CARG1 + | // Returns thread status. + |4: + | ld TMP2, L:RA->base + | sltiu AT, CRET1, LUA_YIELD+1 + | ld TMP3, L:RA->top + | li_vmstate INTERP + | ld BASE, L->base + | sd L, DISPATCH_GL(cur_L)(DISPATCH) + | st_vmstate + | beqz AT, >8 + |. dsubu RD, TMP3, TMP2 + | ld TMP0, L->maxstack + | beqz RD, >6 // No results? + |. daddu TMP1, BASE, RD + | sltu AT, TMP0, TMP1 + | bnez AT, >9 // Need to grow stack? + |. daddu TMP3, TMP2, RD + | sd TMP2, L:RA->top // Clear coroutine stack. + | move TMP1, BASE + |5: // Move results from coroutine. + | ld CRET1, 0(TMP2) + | daddiu TMP2, TMP2, 8 + | sltu AT, TMP2, TMP3 + | sd CRET1, 0(TMP1) + | bnez AT, <5 + |. daddiu TMP1, TMP1, 8 + |6: + | andi TMP0, PC, FRAME_TYPE + |.if resume + | mov_true TMP1 + | daddiu RA, BASE, -8 + | sd TMP1, -8(BASE) // Prepend true to results. + | daddiu RD, RD, 16 + |.else + | move RA, BASE + | daddiu RD, RD, 8 + |.endif + |7: + | sd PC, SAVE_PC + | beqz TMP0, ->BC_RET_Z + |. move MULTRES, RD + | b ->vm_return + |. nop + | + |8: // Coroutine returned with error (at co->top-1). + |.if resume + | daddiu TMP3, TMP3, -8 + | mov_false TMP1 + | ld CRET1, 0(TMP3) + | sd TMP3, L:RA->top // Remove error from coroutine stack. + | li RD, (2+1)*8 + | sd TMP1, -8(BASE) // Prepend false to results. + | daddiu RA, BASE, -8 + | sd CRET1, 0(BASE) // Copy error message. + | b <7 + |. andi TMP0, PC, FRAME_TYPE + |.else + | load_got lj_ffh_coroutine_wrap_err + | move CARG2, L:RA + | call_intern lj_ffh_coroutine_wrap_err // (lua_State *L, lua_State *co) + |. move CARG1, L + |.endif + | + |9: // Handle stack expansion on return from yield. + | load_got lj_state_growstack + | srl CARG2, RD, 3 + | call_intern lj_state_growstack // (lua_State *L, int n) + |. move CARG1, L + | b <4 + |. li CRET1, 0 + |.endmacro + | + | coroutine_resume_wrap 1 // coroutine.resume + | coroutine_resume_wrap 0 // coroutine.wrap + | + |.ffunc coroutine_yield + | ld TMP0, L->cframe + | daddu TMP1, BASE, NARGS8:RC + | sd BASE, L->base + | andi TMP0, TMP0, CFRAME_RESUME + | sd TMP1, L->top + | beqz TMP0, ->fff_fallback + |. li CRET1, LUA_YIELD + | sd r0, L->cframe + | b ->vm_leave_unw + |. sb CRET1, L->status + | + |//-- Math library ------------------------------------------------------- + | + |.ffunc_1 math_abs + | gettp CARG2, CARG1 + | daddiu AT, CARG2, -LJ_TISNUM + | bnez AT, >1 + |. sextw TMP1, CARG1 + | sra TMP0, TMP1, 31 // Extract sign. + | xor TMP1, TMP1, TMP0 + | dsubu CARG1, TMP1, TMP0 + | dsll TMP3, CARG1, 32 + | bgez TMP3, ->fff_restv + |. settp CARG1, TISNUM + | li CARG1, 0x41e0 // 2^31 as a double. + | b ->fff_restv + |. dsll CARG1, CARG1, 48 + |1: + | sltiu AT, CARG2, LJ_TISNUM + | beqz AT, ->fff_fallback + |. dextm CARG1, CARG1, 0, 30 + |// fallthrough + | + |->fff_restv: + | // CARG1 = TValue result. + | ld PC, FRAME_PC(BASE) + | daddiu RA, BASE, -16 + | sd CARG1, -16(BASE) + |->fff_res1: + | // RA = results, PC = return. + | li RD, (1+1)*8 + |->fff_res: + | // RA = results, RD = (nresults+1)*8, PC = return. + | andi TMP0, PC, FRAME_TYPE + | bnez TMP0, ->vm_return + |. move MULTRES, RD + | lw INS, -4(PC) + | decode_RB8a RB, INS + | decode_RB8b RB + |5: + | sltu AT, RD, RB + | bnez AT, >6 // More results expected? + |. decode_RA8a TMP0, INS + | decode_RA8b TMP0 + | ins_next1 + | // Adjust BASE. KBASE is assumed to be set for the calling frame. + | dsubu BASE, RA, TMP0 + | ins_next2 + | + |6: // Fill up results with nil. + | daddu TMP1, RA, RD + | daddiu RD, RD, 8 + | b <5 + |. sd TISNIL, -8(TMP1) + | + |.macro math_extern, func + | .ffunc_n math_ .. func + | load_got func + | call_extern + |. nop + | b ->fff_resn + |. nop + |.endmacro + | + |.macro math_extern2, func + | .ffunc_nn math_ .. func + |. load_got func + | call_extern + |. nop + | b ->fff_resn + |. nop + |.endmacro + | + |// TODO: Return integer type if result is integer (own sf implementation). + |.macro math_round, func + |->ff_math_ .. func: + | ld CARG1, 0(BASE) + | beqz NARGS8:RC, ->fff_fallback + |. gettp TMP0, CARG1 + | beq TMP0, TISNUM, ->fff_restv + |. sltu AT, TMP0, TISNUM + | beqz AT, ->fff_fallback + |.if FPU + |. ldc1 FARG1, 0(BASE) + | bal ->vm_ .. func + |. nop + |.else + |. load_got func + | call_extern + |. nop + |.endif + | b ->fff_resn + |. nop + |.endmacro + | + | math_round floor + | math_round ceil + | + |.ffunc math_log + | li AT, 8 + | bne NARGS8:RC, AT, ->fff_fallback // Exactly 1 argument. + |. ld CARG1, 0(BASE) + | checknum CARG1, ->fff_fallback + |. load_got log + |.if FPU + | call_extern + |. ldc1 FARG1, 0(BASE) + |.else + | call_extern + |. nop + |.endif + | b ->fff_resn + |. nop + | + | math_extern log10 + | math_extern exp + | math_extern sin + | math_extern cos + | math_extern tan + | math_extern asin + | math_extern acos + | math_extern atan + | math_extern sinh + | math_extern cosh + | math_extern tanh + | math_extern2 pow + | math_extern2 atan2 + | math_extern2 fmod + | + |.if FPU + |.ffunc_n math_sqrt + |. sqrt.d FRET1, FARG1 + |// fallthrough to ->fff_resn + |.else + | math_extern sqrt + |.endif + | + |->fff_resn: + | ld PC, FRAME_PC(BASE) + | daddiu RA, BASE, -16 + | b ->fff_res1 + |.if FPU + |. sdc1 FRET1, 0(RA) + |.else + |. sd CRET1, 0(RA) + |.endif + | + | + |.ffunc_2 math_ldexp + | checknum CARG1, ->fff_fallback + | checkint CARG2, ->fff_fallback + |. load_got ldexp + | .FPU ldc1 FARG1, 0(BASE) + | call_extern + |. lw CARG2, 8+LO(BASE) + | b ->fff_resn + |. nop + | + |.ffunc_n math_frexp + | load_got frexp + | ld PC, FRAME_PC(BASE) + | call_extern + |. daddiu CARG2, DISPATCH, DISPATCH_GL(tmptv) + | lw TMP1, DISPATCH_GL(tmptv)(DISPATCH) + | daddiu RA, BASE, -16 + |.if FPU + | mtc1 TMP1, FARG2 + | sdc1 FRET1, 0(RA) + | cvt.d.w FARG2, FARG2 + | sdc1 FARG2, 8(RA) + |.else + | sd CRET1, 0(RA) + | zextw TMP1, TMP1 + | settp TMP1, TISNUM + | sd TMP1, 8(RA) + |.endif + | b ->fff_res + |. li RD, (2+1)*8 + | + |.ffunc_n math_modf + | load_got modf + | ld PC, FRAME_PC(BASE) + | call_extern + |. daddiu CARG2, BASE, -16 + | daddiu RA, BASE, -16 + |.if FPU + | sdc1 FRET1, -8(BASE) + |.else + | sd CRET1, -8(BASE) + |.endif + | b ->fff_res + |. li RD, (2+1)*8 + | + |.macro math_minmax, name, intins, fpins + | .ffunc_1 name + | daddu TMP3, BASE, NARGS8:RC + | checkint CARG1, >5 + |. daddiu TMP2, BASE, 8 + |1: // Handle integers. + | beq TMP2, TMP3, ->fff_restv + |. ld CARG2, 0(TMP2) + | checkint CARG2, >3 + |. sextw CARG1, CARG1 + | lw CARG2, LO(TMP2) + |. slt AT, CARG1, CARG2 + | intins CARG1, CARG2, AT + | daddiu TMP2, TMP2, 8 + | zextw CARG1, CARG1 + | b <1 + |. settp CARG1, TISNUM + | + |3: // Convert intermediate result to number and continue with number loop. + | checknum CARG2, ->fff_fallback + |.if FPU + |. mtc1 CARG1, FRET1 + | cvt.d.w FRET1, FRET1 + | b >7 + |. ldc1 FARG1, 0(TMP2) + |.else + |. nop + | bal ->vm_sfi2d_1 + |. nop + | b >7 + |. nop + |.endif + | + |5: + | .FPU ldc1 FRET1, 0(BASE) + | checknum CARG1, ->fff_fallback + |6: // Handle numbers. + |. ld CARG2, 0(TMP2) + | beq TMP2, TMP3, ->fff_resn + |.if FPU + | ldc1 FARG1, 0(TMP2) + |.else + | move CRET1, CARG1 + |.endif + | checknum CARG2, >8 + |. nop + |7: + |.if FPU + | c.olt.d FRET1, FARG1 + | fpins FRET1, FARG1 + |.else + | bal ->vm_sfcmpolt + |. nop + | intins CARG1, CARG2, CRET1 + |.endif + | b <6 + |. daddiu TMP2, TMP2, 8 + | + |8: // Convert integer to number and continue with number loop. + | checkint CARG2, ->fff_fallback + |.if FPU + |. lwc1 FARG1, LO(TMP2) + | b <7 + |. cvt.d.w FARG1, FARG1 + |.else + |. lw CARG2, LO(TMP2) + | bal ->vm_sfi2d_2 + |. nop + | b <7 + |. nop + |.endif + | + |.endmacro + | + | math_minmax math_min, movz, movf.d + | math_minmax math_max, movn, movt.d + | + |//-- String library ----------------------------------------------------- + | + |.ffunc string_byte // Only handle the 1-arg case here. + | ld CARG1, 0(BASE) + | gettp TMP0, CARG1 + | xori AT, NARGS8:RC, 8 + | daddiu TMP0, TMP0, -LJ_TSTR + | or AT, AT, TMP0 + | bnez AT, ->fff_fallback // Need exactly 1 string argument. + |. cleartp STR:CARG1 + | lw TMP0, STR:CARG1->len + | daddiu RA, BASE, -16 + | ld PC, FRAME_PC(BASE) + | sltu RD, r0, TMP0 + | lbu TMP1, STR:CARG1[1] // Access is always ok (NUL at end). + | addiu RD, RD, 1 + | sll RD, RD, 3 // RD = ((str->len != 0)+1)*8 + | settp TMP1, TISNUM + | b ->fff_res + |. sd TMP1, 0(RA) + | + |.ffunc string_char // Only handle the 1-arg case here. + | ffgccheck + |. nop + | ld CARG1, 0(BASE) + | gettp TMP0, CARG1 + | xori AT, NARGS8:RC, 8 // Exactly 1 argument. + | daddiu TMP0, TMP0, -LJ_TISNUM // Integer. + | li TMP1, 255 + | sextw CARG1, CARG1 + | or AT, AT, TMP0 + | sltu TMP1, TMP1, CARG1 // !(255 < n). + | or AT, AT, TMP1 + | bnez AT, ->fff_fallback + |. li CARG3, 1 + | daddiu CARG2, sp, TMPD_OFS + | sb CARG1, TMPD + |->fff_newstr: + | load_got lj_str_new + | sd BASE, L->base + | sd PC, SAVE_PC + | call_intern lj_str_new // (lua_State *L, char *str, size_t l) + |. move CARG1, L + | // Returns GCstr *. + | ld BASE, L->base + |->fff_resstr: + | li AT, LJ_TSTR + | settp CRET1, AT + | b ->fff_restv + |. move CARG1, CRET1 + | + |.ffunc string_sub + | ffgccheck + |. nop + | addiu AT, NARGS8:RC, -16 + | ld TMP0, 0(BASE) + | bltz AT, ->fff_fallback + |. gettp TMP3, TMP0 + | cleartp STR:CARG1, TMP0 + | ld CARG2, 8(BASE) + | beqz AT, >1 + |. li CARG4, -1 + | ld CARG3, 16(BASE) + | checkint CARG3, ->fff_fallback + |. sextw CARG4, CARG3 + |1: + | checkint CARG2, ->fff_fallback + |. li AT, LJ_TSTR + | bne TMP3, AT, ->fff_fallback + |. sextw CARG3, CARG2 + | lw CARG2, STR:CARG1->len + | // STR:CARG1 = str, CARG2 = str->len, CARG3 = start, CARG4 = end + | slt AT, CARG4, r0 + | addiu TMP0, CARG2, 1 + | addu TMP1, CARG4, TMP0 + | slt TMP3, CARG3, r0 + | movn CARG4, TMP1, AT // if (end < 0) end += len+1 + | addu TMP1, CARG3, TMP0 + | movn CARG3, TMP1, TMP3 // if (start < 0) start += len+1 + | li TMP2, 1 + | slt AT, CARG4, r0 + | slt TMP3, r0, CARG3 + | movn CARG4, r0, AT // if (end < 0) end = 0 + | movz CARG3, TMP2, TMP3 // if (start < 1) start = 1 + | slt AT, CARG2, CARG4 + | movn CARG4, CARG2, AT // if (end > len) end = len + | daddu CARG2, STR:CARG1, CARG3 + | subu CARG3, CARG4, CARG3 // len = end - start + | daddiu CARG2, CARG2, sizeof(GCstr)-1 + | bgez CARG3, ->fff_newstr + |. addiu CARG3, CARG3, 1 // len++ + |->fff_emptystr: // Return empty string. + | li AT, LJ_TSTR + | daddiu STR:CARG1, DISPATCH, DISPATCH_GL(strempty) + | b ->fff_restv + |. settp CARG1, AT + | + |.macro ffstring_op, name + | .ffunc string_ .. name + | ffgccheck + |. nop + | beqz NARGS8:RC, ->fff_fallback + |. ld CARG2, 0(BASE) + | checkstr STR:CARG2, ->fff_fallback + | daddiu SBUF:CARG1, DISPATCH, DISPATCH_GL(tmpbuf) + | load_got lj_buf_putstr_ .. name + | ld TMP0, SBUF:CARG1->b + | sd L, SBUF:CARG1->L + | sd BASE, L->base + | sd TMP0, SBUF:CARG1->p + | call_intern extern lj_buf_putstr_ .. name + |. sd PC, SAVE_PC + | load_got lj_buf_tostr + | call_intern lj_buf_tostr + |. move SBUF:CARG1, SBUF:CRET1 + | b ->fff_resstr + |. ld BASE, L->base + |.endmacro + | + |ffstring_op reverse + |ffstring_op lower + |ffstring_op upper + | + |//-- Bit library -------------------------------------------------------- + | + |->vm_tobit_fb: + | beqz TMP1, ->fff_fallback + |.if FPU + |. ldc1 FARG1, 0(BASE) + | add.d FARG1, FARG1, TOBIT + | mfc1 CRET1, FARG1 + | jr ra + |. zextw CRET1, CRET1 + |.else + |// FP number to bit conversion for soft-float. + |->vm_tobit: + | dsll TMP0, CARG1, 1 + | li CARG3, 1076 + | dsrl AT, TMP0, 53 + | dsubu CARG3, CARG3, AT + | sltiu AT, CARG3, 54 + | beqz AT, >1 + |. dextm TMP0, TMP0, 0, 20 + | dinsu TMP0, AT, 21, 21 + | slt AT, CARG1, r0 + | dsrlv CRET1, TMP0, CARG3 + | dsubu TMP0, r0, CRET1 + | movn CRET1, TMP0, AT + | jr ra + |. zextw CRET1, CRET1 + |1: + | jr ra + |. move CRET1, r0 + |.endif + | + |.macro .ffunc_bit, name + | .ffunc_1 bit_..name + | gettp TMP0, CARG1 + | beq TMP0, TISNUM, >6 + |. zextw CRET1, CARG1 + | bal ->vm_tobit_fb + |. sltiu TMP1, TMP0, LJ_TISNUM + |6: + |.endmacro + | + |.macro .ffunc_bit_op, name, bins + | .ffunc_bit name + | daddiu TMP2, BASE, 8 + | daddu TMP3, BASE, NARGS8:RC + |1: + | beq TMP2, TMP3, ->fff_resi + |. ld CARG1, 0(TMP2) + | gettp TMP0, CARG1 + |.if FPU + | bne TMP0, TISNUM, >2 + |. daddiu TMP2, TMP2, 8 + | zextw CARG1, CARG1 + | b <1 + |. bins CRET1, CRET1, CARG1 + |2: + | ldc1 FARG1, -8(TMP2) + | sltiu AT, TMP0, LJ_TISNUM + | beqz AT, ->fff_fallback + |. add.d FARG1, FARG1, TOBIT + | mfc1 CARG1, FARG1 + | zextw CARG1, CARG1 + | b <1 + |. bins CRET1, CRET1, CARG1 + |.else + | beq TMP0, TISNUM, >2 + |. move CRET2, CRET1 + | bal ->vm_tobit_fb + |. sltiu TMP1, TMP0, LJ_TISNUM + | move CARG1, CRET2 + |2: + | zextw CARG1, CARG1 + | bins CRET1, CRET1, CARG1 + | b <1 + |. daddiu TMP2, TMP2, 8 + |.endif + |.endmacro + | + |.ffunc_bit_op band, and + |.ffunc_bit_op bor, or + |.ffunc_bit_op bxor, xor + | + |.ffunc_bit bswap + | dsrl TMP0, CRET1, 8 + | dsrl TMP1, CRET1, 24 + | andi TMP2, TMP0, 0xff00 + | dins TMP1, CRET1, 24, 31 + | dins TMP2, TMP0, 16, 23 + | b ->fff_resi + |. or CRET1, TMP1, TMP2 + | + |.ffunc_bit bnot + | not CRET1, CRET1 + | b ->fff_resi + |. zextw CRET1, CRET1 + | + |.macro .ffunc_bit_sh, name, shins, shmod + | .ffunc_2 bit_..name + | gettp TMP0, CARG1 + | beq TMP0, TISNUM, >1 + |. nop + | bal ->vm_tobit_fb + |. sltiu TMP1, TMP0, LJ_TISNUM + | move CARG1, CRET1 + |1: + | gettp TMP0, CARG2 + | bne TMP0, TISNUM, ->fff_fallback + |. zextw CARG2, CARG2 + | sextw CARG1, CARG1 + |.if shmod == 1 + | negu CARG2, CARG2 + |.endif + | shins CRET1, CARG1, CARG2 + | b ->fff_resi + |. zextw CRET1, CRET1 + |.endmacro + | + |.ffunc_bit_sh lshift, sllv, 0 + |.ffunc_bit_sh rshift, srlv, 0 + |.ffunc_bit_sh arshift, srav, 0 + |.ffunc_bit_sh rol, rotrv, 1 + |.ffunc_bit_sh ror, rotrv, 0 + | + |.ffunc_bit tobit + |->fff_resi: + | ld PC, FRAME_PC(BASE) + | daddiu RA, BASE, -16 + | settp CRET1, TISNUM + | b ->fff_res1 + |. sd CRET1, -16(BASE) + | + |//----------------------------------------------------------------------- + |->fff_fallback: // Call fast function fallback handler. + | // BASE = new base, RB = CFUNC, RC = nargs*8 + | ld TMP3, CFUNC:RB->f + | daddu TMP1, BASE, NARGS8:RC + | ld PC, FRAME_PC(BASE) // Fallback may overwrite PC. + | daddiu TMP0, TMP1, 8*LUA_MINSTACK + | ld TMP2, L->maxstack + | sd PC, SAVE_PC // Redundant (but a defined value). + | sltu AT, TMP2, TMP0 + | sd BASE, L->base + | sd TMP1, L->top + | bnez AT, >5 // Need to grow stack. + |. move CFUNCADDR, TMP3 + | jalr TMP3 // (lua_State *L) + |. move CARG1, L + | // Either throws an error, or recovers and returns -1, 0 or nresults+1. + | ld BASE, L->base + | sll RD, CRET1, 3 + | bgtz CRET1, ->fff_res // Returned nresults+1? + |. daddiu RA, BASE, -16 + |1: // Returned 0 or -1: retry fast path. + | ld LFUNC:RB, FRAME_FUNC(BASE) + | ld TMP0, L->top + | cleartp LFUNC:RB + | bnez CRET1, ->vm_call_tail // Returned -1? + |. dsubu NARGS8:RC, TMP0, BASE + | ins_callt // Returned 0: retry fast path. + | + |// Reconstruct previous base for vmeta_call during tailcall. + |->vm_call_tail: + | andi TMP0, PC, FRAME_TYPE + | li AT, -4 + | bnez TMP0, >3 + |. and TMP1, PC, AT + | lbu TMP1, OFS_RA(PC) + | sll TMP1, TMP1, 3 + | addiu TMP1, TMP1, 16 + |3: + | b ->vm_call_dispatch // Resolve again for tailcall. + |. dsubu TMP2, BASE, TMP1 + | + |5: // Grow stack for fallback handler. + | load_got lj_state_growstack + | li CARG2, LUA_MINSTACK + | call_intern lj_state_growstack // (lua_State *L, int n) + |. move CARG1, L + | ld BASE, L->base + | b <1 + |. li CRET1, 0 // Force retry. + | + |->fff_gcstep: // Call GC step function. + | // BASE = new base, RC = nargs*8 + | move MULTRES, ra + | load_got lj_gc_step + | sd BASE, L->base + | daddu TMP0, BASE, NARGS8:RC + | sd PC, SAVE_PC // Redundant (but a defined value). + | sd TMP0, L->top + | call_intern lj_gc_step // (lua_State *L) + |. move CARG1, L + | ld BASE, L->base + | move ra, MULTRES + | ld TMP0, L->top + | ld CFUNC:RB, FRAME_FUNC(BASE) + | cleartp CFUNC:RB + | jr ra + |. dsubu NARGS8:RC, TMP0, BASE + | + |//----------------------------------------------------------------------- + |//-- Special dispatch targets ------------------------------------------- + |//----------------------------------------------------------------------- + | + |->vm_record: // Dispatch target for recording phase. + | NYI + | + |->vm_rethook: // Dispatch target for return hooks. + | lbu TMP3, DISPATCH_GL(hookmask)(DISPATCH) + | andi AT, TMP3, HOOK_ACTIVE // Hook already active? + | beqz AT, >1 + |5: // Re-dispatch to static ins. + |. ld AT, GG_DISP2STATIC(TMP0) // Assumes TMP0 holds DISPATCH+OP*4. + | jr AT + |. nop + | + |->vm_inshook: // Dispatch target for instr/line hooks. + | lbu TMP3, DISPATCH_GL(hookmask)(DISPATCH) + | lw TMP2, DISPATCH_GL(hookcount)(DISPATCH) + | andi AT, TMP3, HOOK_ACTIVE // Hook already active? + | bnez AT, <5 + |. andi AT, TMP3, LUA_MASKLINE|LUA_MASKCOUNT + | beqz AT, <5 + |. addiu TMP2, TMP2, -1 + | beqz TMP2, >1 + |. sw TMP2, DISPATCH_GL(hookcount)(DISPATCH) + | andi AT, TMP3, LUA_MASKLINE + | beqz AT, <5 + |1: + |. load_got lj_dispatch_ins + | sw MULTRES, SAVE_MULTRES + | move CARG2, PC + | sd BASE, L->base + | // SAVE_PC must hold the _previous_ PC. The callee updates it with PC. + | call_intern lj_dispatch_ins // (lua_State *L, const BCIns *pc) + |. move CARG1, L + |3: + | ld BASE, L->base + |4: // Re-dispatch to static ins. + | lw INS, -4(PC) + | decode_OP8a TMP1, INS + | decode_OP8b TMP1 + | daddu TMP0, DISPATCH, TMP1 + | decode_RD8a RD, INS + | ld AT, GG_DISP2STATIC(TMP0) + | decode_RA8a RA, INS + | decode_RD8b RD + | jr AT + | decode_RA8b RA + | + |->cont_hook: // Continue from hook yield. + | daddiu PC, PC, 4 + | b <4 + |. lw MULTRES, -24+LO(RB) // Restore MULTRES for *M ins. + | + |->vm_hotloop: // Hot loop counter underflow. + | NYI + | + |->vm_callhook: // Dispatch target for call hooks. + |.if JIT + | b >1 + |.endif + |. move CARG2, PC + | + |->vm_hotcall: // Hot call counter underflow. + |.if JIT + | ori CARG2, PC, 1 + |1: + |.endif + | load_got lj_dispatch_call + | daddu TMP0, BASE, RC + | sd PC, SAVE_PC + | sd BASE, L->base + | dsubu RA, RA, BASE + | sd TMP0, L->top + | call_intern lj_dispatch_call // (lua_State *L, const BCIns *pc) + |. move CARG1, L + | // Returns ASMFunction. + | ld BASE, L->base + | ld TMP0, L->top + | sd r0, SAVE_PC // Invalidate for subsequent line hook. + | dsubu NARGS8:RC, TMP0, BASE + | daddu RA, BASE, RA + | ld LFUNC:RB, FRAME_FUNC(BASE) + | cleartp LFUNC:RB + | jr CRET1 + |. lw INS, -4(PC) + | + |->cont_stitch: // Trace stitching. + |.if JIT + | NYI + |.endif + | + |->vm_profhook: // Dispatch target for profiler hook. +#if LJ_HASPROFILE + | load_got lj_dispatch_profile + | sw MULTRES, SAVE_MULTRES + | move CARG2, PC + | sw BASE, L->base + | call_intern lj_dispatch_profile // (lua_State *L, const BCIns *pc) + |. move CARG1, L + | // HOOK_PROFILE is off again, so re-dispatch to dynamic instruction. + | daddiu PC, PC, -4 + | b ->cont_nop + |. lw BASE, L->base +#endif + | + |//----------------------------------------------------------------------- + |//-- Trace exit handler ------------------------------------------------- + |//----------------------------------------------------------------------- + | + |.macro savex_, a, b + |.if FPU + | sdc1 f..a, a*8(sp) + | sd r..a, 32*8+a*8(sp) + | sd r..b, 32*8+b*8(sp) + |.else + | sd r..a, a*8(sp) + | sd r..b, b*8(sp) + |.endif + |.endmacro + | + |->vm_exit_handler: + |.if JIT + | NYI + |.endif + |->vm_exit_interp: + |.if JIT + | NYI + |.endif + | + |//----------------------------------------------------------------------- + |//-- Math helper functions ---------------------------------------------- + |//----------------------------------------------------------------------- + | + |// Hard-float round to integer. + |// Modifies AT, TMP0, FRET1, FRET2, f4. Keeps all others incl. FARG1. + |.macro vm_round_hf, func + | lui TMP0, 0x4330 // Hiword of 2^52 (double). + | dsll TMP0, TMP0, 32 + | dmtc1 TMP0, f4 + | abs.d FRET2, FARG1 // |x| + | dmfc1 AT, FARG1 + | c.olt.d 0, FRET2, f4 + | add.d FRET1, FRET2, f4 // (|x| + 2^52) - 2^52 + | bc1f 0, >1 // Truncate only if |x| < 2^52. + |. sub.d FRET1, FRET1, f4 + | slt AT, AT, r0 + |.if "func" == "ceil" + | lui TMP0, 0xbff0 // Hiword of -1 (double). Preserves -0. + |.else + | lui TMP0, 0x3ff0 // Hiword of +1 (double). + |.endif + |.if "func" == "trunc" + | dsll TMP0, TMP0, 32 + | dmtc1 TMP0, f4 + | c.olt.d 0, FRET2, FRET1 // |x| < result? + | sub.d FRET2, FRET1, f4 + | movt.d FRET1, FRET2, 0 // If yes, subtract +1. + | neg.d FRET2, FRET1 + | jr ra + |. movn.d FRET1, FRET2, AT // Merge sign bit back in. + |.else + | neg.d FRET2, FRET1 + | dsll TMP0, TMP0, 32 + | dmtc1 TMP0, f4 + | movn.d FRET1, FRET2, AT // Merge sign bit back in. + |.if "func" == "ceil" + | c.olt.d 0, FRET1, FARG1 // x > result? + |.else + | c.olt.d 0, FARG1, FRET1 // x < result? + |.endif + | sub.d FRET2, FRET1, f4 // If yes, subtract +-1. + | jr ra + |. movt.d FRET1, FRET2, 0 + |.endif + |1: + | jr ra + |. mov.d FRET1, FARG1 + |.endmacro + | + |.macro vm_round, func + |.if FPU + | vm_round_hf, func + |.endif + |.endmacro + | + |->vm_floor: + | vm_round floor + |->vm_ceil: + | vm_round ceil + |->vm_trunc: + |.if JIT + | vm_round trunc + |.endif + | + |// Soft-float integer to number conversion. + |.macro sfi2d, ARG + |.if not FPU + | beqz ARG, >9 // Handle zero first. + |. sra TMP0, ARG, 31 + | xor TMP1, ARG, TMP0 + | dsubu TMP1, TMP1, TMP0 // Absolute value in TMP1. + | dclz ARG, TMP1 + | addiu ARG, ARG, -11 + | li AT, 0x3ff+63-11-1 + | dsllv TMP1, TMP1, ARG // Align mantissa left with leading 1. + | subu ARG, AT, ARG // Exponent - 1. + | ins ARG, TMP0, 11, 11 // Sign | Exponent. + | dsll ARG, ARG, 52 // Align left. + | jr ra + |. daddu ARG, ARG, TMP1 // Add mantissa, increment exponent. + |9: + | jr ra + |. nop + |.endif + |.endmacro + | + |// Input CARG1. Output: CARG1. Temporaries: AT, TMP0, TMP1. + |->vm_sfi2d_1: + | sfi2d CARG1 + | + |// Input CARG2. Output: CARG2. Temporaries: AT, TMP0, TMP1. + |->vm_sfi2d_2: + | sfi2d CARG2 + | + |// Soft-float comparison. Equivalent to c.eq.d. + |// Input: CARG*. Output: CRET1. Temporaries: AT, TMP0, TMP1. + |->vm_sfcmpeq: + |.if not FPU + | dsll AT, CARG1, 1 + | dsll TMP0, CARG2, 1 + | or TMP1, AT, TMP0 + | beqz TMP1, >8 // Both args +-0: return 1. + |. lui TMP1, 0xffe0 + | dsll TMP1, TMP1, 32 + | sltu AT, TMP1, AT + | sltu TMP0, TMP1, TMP0 + | or TMP1, AT, TMP0 + | bnez TMP1, >9 // Either arg is NaN: return 0; + |. xor AT, CARG1, CARG2 + | jr ra + |. sltiu CRET1, AT, 1 // Same values: return 1. + |8: + | jr ra + |. li CRET1, 1 + |9: + | jr ra + |. li CRET1, 0 + |.endif + | + |// Soft-float comparison. Equivalent to c.ult.d and c.olt.d. + |// Input: CARG1, CARG2. Output: CRET1. Temporaries: AT, TMP0, TMP1, CRET2. + |->vm_sfcmpult: + |.if not FPU + | b >1 + |. li CRET2, 1 + |.endif + | + |->vm_sfcmpolt: + |.if not FPU + | li CRET2, 0 + |1: + | dsll AT, CARG1, 1 + | dsll TMP0, CARG2, 1 + | or TMP1, AT, TMP0 + | beqz TMP1, >8 // Both args +-0: return 0. + |. lui TMP1, 0xffe0 + | dsll TMP1, TMP1, 32 + | sltu AT, TMP1, AT + | sltu TMP0, TMP1, TMP0 + | or TMP1, AT, TMP0 + | bnez TMP1, >9 // Either arg is NaN: return 0 or 1; + |. and AT, CARG1, CARG2 + | bltz AT, >5 // Both args negative? + |. nop + | jr ra + |. slt CRET1, CARG1, CARG2 + |5: // Swap conditions if both operands are negative. + | jr ra + |. slt CRET1, CARG2, CARG1 + |8: + | jr ra + |. nop + |9: + | jr ra + |. move CRET1, CRET2 + |.endif + | + |// Soft-float comparison. Equivalent to c.ole.d a, b or c.ole.d b, a. + |// Input: CARG1, CARG2, TMP3. Output: CRET1. Temporaries: AT, TMP0, TMP1. + |->vm_sfcmpolex: + |.if not FPU + | dsll AT, CARG1, 1 + | dsll TMP0, CARG2, 1 + | or TMP1, AT, TMP0 + | beqz TMP1, >8 // Both args +-0: return 1. + |. lui TMP1, 0xffe0 + | dsll TMP1, TMP1, 32 + | sltu AT, TMP1, AT + | sltu TMP0, TMP1, TMP0 + | or TMP1, AT, TMP0 + | bnez TMP1, >9 // Either arg is NaN: return 0; + |. and AT, CARG1, CARG2 + | xor AT, AT, TMP3 + | bltz AT, >5 // Both args negative? + |. nop + | jr ra + |. slt CRET1, CARG2, CARG1 + |5: // Swap conditions if both operands are negative. + | jr ra + |. slt CRET1, CARG1, CARG2 + |8: + | jr ra + |. li CRET1, 1 + |9: + | jr ra + |. li CRET1, 0 + |.endif + | + |//----------------------------------------------------------------------- + |//-- Miscellaneous functions -------------------------------------------- + |//----------------------------------------------------------------------- + | + |//----------------------------------------------------------------------- + |//-- FFI helper functions ----------------------------------------------- + |//----------------------------------------------------------------------- + | + |// Handler for callback functions. Callback slot number in r1, g in r2. + |->vm_ffi_callback: + |.if FFI + |.type CTSTATE, CTState, PC + | saveregs + | ld CTSTATE, GL:r2->ctype_state + | daddiu DISPATCH, r2, GG_G2DISP + | load_got lj_ccallback_enter + | sw r1, CTSTATE->cb.slot + | sd CARG1, CTSTATE->cb.gpr[0] + | .FPU sdc1 FARG1, CTSTATE->cb.fpr[0] + | sd CARG2, CTSTATE->cb.gpr[1] + | .FPU sdc1 FARG2, CTSTATE->cb.fpr[1] + | sd CARG3, CTSTATE->cb.gpr[2] + | .FPU sdc1 FARG3, CTSTATE->cb.fpr[2] + | sd CARG4, CTSTATE->cb.gpr[3] + | .FPU sdc1 FARG4, CTSTATE->cb.fpr[3] + | sd CARG5, CTSTATE->cb.gpr[4] + | .FPU sdc1 FARG5, CTSTATE->cb.fpr[4] + | sd CARG6, CTSTATE->cb.gpr[5] + | .FPU sdc1 FARG6, CTSTATE->cb.fpr[5] + | sd CARG7, CTSTATE->cb.gpr[6] + | .FPU sdc1 FARG7, CTSTATE->cb.fpr[6] + | sd CARG8, CTSTATE->cb.gpr[7] + | .FPU sdc1 FARG8, CTSTATE->cb.fpr[7] + | daddiu TMP0, sp, CFRAME_SPACE + | sd TMP0, CTSTATE->cb.stack + | sd r0, SAVE_PC // Any value outside of bytecode is ok. + | move CARG2, sp + | call_intern lj_ccallback_enter // (CTState *cts, void *cf) + |. move CARG1, CTSTATE + | // Returns lua_State *. + | ld BASE, L:CRET1->base + | ld RC, L:CRET1->top + | move L, CRET1 + | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | ld LFUNC:RB, FRAME_FUNC(BASE) + | .FPU mtc1 TMP3, TOBIT + | li TISNIL, LJ_TNIL + | li TISNUM, LJ_TISNUM + | li_vmstate INTERP + | subu RC, RC, BASE + | cleartp LFUNC:RB + | st_vmstate + | .FPU cvt.d.s TOBIT, TOBIT + | ins_callt + |.endif + | + |->cont_ffi_callback: // Return from FFI callback. + |.if FFI + | load_got lj_ccallback_leave + | ld CTSTATE, DISPATCH_GL(ctype_state)(DISPATCH) + | sd BASE, L->base + | sd RB, L->top + | sd L, CTSTATE->L + | move CARG2, RA + | call_intern lj_ccallback_leave // (CTState *cts, TValue *o) + |. move CARG1, CTSTATE + | .FPU ldc1 FRET1, CTSTATE->cb.fpr[0] + | ld CRET1, CTSTATE->cb.gpr[0] + | .FPU ldc1 FRET2, CTSTATE->cb.fpr[1] + | b ->vm_leave_unw + |. ld CRET2, CTSTATE->cb.gpr[1] + |.endif + | + |->vm_ffi_call: // Call C function via FFI. + | // Caveat: needs special frame unwinding, see below. + |.if FFI + | .type CCSTATE, CCallState, CARG1 + | lw TMP1, CCSTATE->spadj + | lbu CARG2, CCSTATE->nsp + | move TMP2, sp + | dsubu sp, sp, TMP1 + | sd ra, -8(TMP2) + | sll CARG2, CARG2, 3 + | sd r16, -16(TMP2) + | sd CCSTATE, -24(TMP2) + | move r16, TMP2 + | daddiu TMP1, CCSTATE, offsetof(CCallState, stack) + | move TMP2, sp + | beqz CARG2, >2 + |. daddu TMP3, TMP1, CARG2 + |1: + | ld TMP0, 0(TMP1) + | daddiu TMP1, TMP1, 8 + | sltu AT, TMP1, TMP3 + | sd TMP0, 0(TMP2) + | bnez AT, <1 + |. daddiu TMP2, TMP2, 8 + |2: + | ld CFUNCADDR, CCSTATE->func + | .FPU ldc1 FARG1, CCSTATE->gpr[0] + | ld CARG2, CCSTATE->gpr[1] + | .FPU ldc1 FARG2, CCSTATE->gpr[1] + | ld CARG3, CCSTATE->gpr[2] + | .FPU ldc1 FARG3, CCSTATE->gpr[2] + | ld CARG4, CCSTATE->gpr[3] + | .FPU ldc1 FARG4, CCSTATE->gpr[3] + | ld CARG5, CCSTATE->gpr[4] + | .FPU ldc1 FARG5, CCSTATE->gpr[4] + | ld CARG6, CCSTATE->gpr[5] + | .FPU ldc1 FARG6, CCSTATE->gpr[5] + | ld CARG7, CCSTATE->gpr[6] + | .FPU ldc1 FARG7, CCSTATE->gpr[6] + | ld CARG8, CCSTATE->gpr[7] + | .FPU ldc1 FARG8, CCSTATE->gpr[7] + | jalr CFUNCADDR + |. ld CARG1, CCSTATE->gpr[0] // Do this last, since CCSTATE is CARG1. + | ld CCSTATE:TMP1, -24(r16) + | ld TMP2, -16(r16) + | ld ra, -8(r16) + | sd CRET1, CCSTATE:TMP1->gpr[0] + | sd CRET2, CCSTATE:TMP1->gpr[1] + |.if FPU + | sdc1 FRET1, CCSTATE:TMP1->fpr[0] + | sdc1 FRET2, CCSTATE:TMP1->fpr[1] + |.else + | sd CARG1, CCSTATE:TMP1->gpr[2] // 2nd FP struct field for soft-float. + |.endif + | move sp, r16 + | jr ra + |. move r16, TMP2 + |.endif + |// Note: vm_ffi_call must be the last function in this object file! + | + |//----------------------------------------------------------------------- +} + +/* Generate the code for a single instruction. */ +static void build_ins(BuildCtx *ctx, BCOp op, int defop) +{ + int vk = 0; + |=>defop: + + switch (op) { + + /* -- Comparison ops ---------------------------------------------------- */ + + /* Remember: all ops branch for a true comparison, fall through otherwise. */ + + case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT: + | // RA = src1*8, RD = src2*8, JMP with RD = target + |.macro bc_comp, FRA, FRD, ARGRA, ARGRD, movop, fmovop, fcomp, sfcomp + | daddu RA, BASE, RA + | daddu RD, BASE, RD + | ld ARGRA, 0(RA) + | ld ARGRD, 0(RD) + | lhu TMP2, OFS_RD(PC) + | gettp CARG3, ARGRA + | gettp CARG4, ARGRD + | bne CARG3, TISNUM, >2 + |. daddiu PC, PC, 4 + | bne CARG4, TISNUM, >5 + |. decode_RD4b TMP2 + | sextw ARGRA, ARGRA + | sextw ARGRD, ARGRD + | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + | slt AT, CARG1, CARG2 + | addu TMP2, TMP2, TMP3 + | movop TMP2, r0, AT + |1: + | daddu PC, PC, TMP2 + | ins_next + | + |2: // RA is not an integer. + | sltiu AT, CARG3, LJ_TISNUM + | beqz AT, ->vmeta_comp + |. lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + | sltiu AT, CARG4, LJ_TISNUM + | beqz AT, >4 + |. decode_RD4b TMP2 + |.if FPU + | ldc1 FRA, 0(RA) + | ldc1 FRD, 0(RD) + |.endif + |3: // RA and RD are both numbers. + |.if FPU + | fcomp f20, f22 + | addu TMP2, TMP2, TMP3 + | b <1 + |. fmovop TMP2, r0 + |.else + | bal sfcomp + |. addu TMP2, TMP2, TMP3 + | b <1 + |. movop TMP2, r0, CRET1 + |.endif + | + |4: // RA is a number, RD is not a number. + | bne CARG4, TISNUM, ->vmeta_comp + | // RA is a number, RD is an integer. Convert RD to a number. + |.if FPU + |. lwc1 FRD, LO(RD) + | ldc1 FRA, 0(RA) + | b <3 + |. cvt.d.w FRD, FRD + |.else + |.if "ARGRD" == "CARG1" + |. sextw CARG1, CARG1 + | bal ->vm_sfi2d_1 + |. nop + |.else + |. sextw CARG2, CARG2 + | bal ->vm_sfi2d_2 + |. nop + |.endif + | b <3 + |. nop + |.endif + | + |5: // RA is an integer, RD is not an integer + | sltiu AT, CARG4, LJ_TISNUM + | beqz AT, ->vmeta_comp + |. lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + | // RA is an integer, RD is a number. Convert RA to a number. + |.if FPU + | lwc1 FRA, LO(RA) + | ldc1 FRD, 0(RD) + | b <3 + | cvt.d.w FRA, FRA + |.else + |.if "ARGRA" == "CARG1" + | bal ->vm_sfi2d_1 + |. sextw CARG1, CARG1 + |.else + | bal ->vm_sfi2d_2 + |. sextw CARG2, CARG2 + |.endif + | b <3 + |. nop + |.endif + |.endmacro + | + if (op == BC_ISLT) { + | bc_comp f20, f22, CARG1, CARG2, movz, movf, c.olt.d, ->vm_sfcmpolt + } else if (op == BC_ISGE) { + | bc_comp f20, f22, CARG1, CARG2, movn, movt, c.olt.d, ->vm_sfcmpolt + } else if (op == BC_ISLE) { + | bc_comp f22, f20, CARG2, CARG1, movn, movt, c.ult.d, ->vm_sfcmpult + } else { + | bc_comp f22, f20, CARG2, CARG1, movz, movf, c.ult.d, ->vm_sfcmpult + } + break; + + case BC_ISEQV: case BC_ISNEV: + vk = op == BC_ISEQV; + | // RA = src1*8, RD = src2*8, JMP with RD = target + | daddu RA, BASE, RA + | daddiu PC, PC, 4 + | daddu RD, BASE, RD + | ld CARG1, 0(RA) + | lhu TMP2, -4+OFS_RD(PC) + | ld CARG2, 0(RD) + | gettp CARG3, CARG1 + | gettp CARG4, CARG2 + | sltu AT, TISNUM, CARG3 + | sltu TMP1, TISNUM, CARG4 + | or AT, AT, TMP1 + if (vk) { + | beqz AT, ->BC_ISEQN_Z + } else { + | beqz AT, ->BC_ISNEN_Z + } + | // Either or both types are not numbers. + | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + |.if FFI + |. li AT, LJ_TCDATA + | beq CARG3, AT, ->vmeta_equal_cd + |.endif + | decode_RD4b TMP2 + |.if FFI + | beq CARG4, AT, ->vmeta_equal_cd + |. nop + |.endif + | bne CARG1, CARG2, >2 + |. addu TMP2, TMP2, TMP3 + | // Tag and value are equal. + if (vk) { + |->BC_ISEQV_Z: + | daddu PC, PC, TMP2 + } + |1: + | ins_next + | + |2: // Check if the tags are the same and it's a table or userdata. + | xor AT, CARG3, CARG4 // Same type? + | sltiu TMP0, CARG3, LJ_TISTABUD+1 // Table or userdata? + | movn TMP0, r0, AT + if (vk) { + | beqz TMP0, <1 + } else { + | beqz TMP0, ->BC_ISEQV_Z // Reuse code from opposite instruction. + } + | // Different tables or userdatas. Need to check __eq metamethod. + | // Field metatable must be at same offset for GCtab and GCudata! + |. cleartp TAB:TMP1, CARG1 + | ld TAB:TMP3, TAB:TMP1->metatable + if (vk) { + | beqz TAB:TMP3, <1 // No metatable? + |. nop + | lbu TMP3, TAB:TMP3->nomm + | andi TMP3, TMP3, 1<1 // Or 'no __eq' flag set? + } else { + | beqz TAB:TMP3,->BC_ISEQV_Z // No metatable? + |. nop + | lbu TMP3, TAB:TMP3->nomm + | andi TMP3, TMP3, 1<BC_ISEQV_Z // Or 'no __eq' flag set? + } + |. nop + | b ->vmeta_equal // Handle __eq metamethod. + |. li TMP0, 1-vk // ne = 0 or 1. + break; + + case BC_ISEQS: case BC_ISNES: + vk = op == BC_ISEQS; + | // RA = src*8, RD = str_const*8 (~), JMP with RD = target + | daddu RA, BASE, RA + | daddiu PC, PC, 4 + | ld CARG1, 0(RA) + | dsubu RD, KBASE, RD + | lhu TMP2, -4+OFS_RD(PC) + | ld CARG2, -8(RD) // KBASE-8-str_const*8 + |.if FFI + | gettp TMP0, CARG1 + | li AT, LJ_TCDATA + |.endif + | li TMP1, LJ_TSTR + | decode_RD4b TMP2 + |.if FFI + | beq TMP0, AT, ->vmeta_equal_cd + |.endif + |. settp CARG2, TMP1 + | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + | xor TMP1, CARG1, CARG2 + | addu TMP2, TMP2, TMP3 + if (vk) { + | movn TMP2, r0, TMP1 + } else { + | movz TMP2, r0, TMP1 + } + | daddu PC, PC, TMP2 + | ins_next + break; + + case BC_ISEQN: case BC_ISNEN: + vk = op == BC_ISEQN; + | // RA = src*8, RD = num_const*8, JMP with RD = target + | daddu RA, BASE, RA + | daddu RD, KBASE, RD + | ld CARG1, 0(RA) + | ld CARG2, 0(RD) + | lhu TMP2, OFS_RD(PC) + | gettp CARG3, CARG1 + | gettp CARG4, CARG2 + | daddiu PC, PC, 4 + | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + if (vk) { + |->BC_ISEQN_Z: + } else { + |->BC_ISNEN_Z: + } + | bne CARG3, TISNUM, >3 + |. decode_RD4b TMP2 + | bne CARG4, TISNUM, >6 + |. addu TMP2, TMP2, TMP3 + | xor AT, CARG1, CARG2 + if (vk) { + | movn TMP2, r0, AT + |1: + | daddu PC, PC, TMP2 + |2: + } else { + | movz TMP2, r0, AT + |1: + |2: + | daddu PC, PC, TMP2 + } + | ins_next + | + |3: // RA is not an integer. + | sltu AT, CARG3, TISNUM + |.if FFI + | beqz AT, >8 + |.else + | beqz AT, <2 + |.endif + |. addu TMP2, TMP2, TMP3 + | sltu AT, CARG4, TISNUM + |.if FPU + | ldc1 f20, 0(RA) + | ldc1 f22, 0(RD) + |.endif + | beqz AT, >5 + |. nop + |4: // RA and RD are both numbers. + |.if FPU + | c.eq.d f20, f22 + | b <1 + if (vk) { + |. movf TMP2, r0 + } else { + |. movt TMP2, r0 + } + |.else + | bal ->vm_sfcmpeq + |. nop + | b <1 + if (vk) { + |. movz TMP2, r0, CRET1 + } else { + |. movn TMP2, r0, CRET1 + } + |.endif + | + |5: // RA is a number, RD is not a number. + |.if FFI + | bne CARG4, TISNUM, >9 + |.else + | bne CARG4, TISNUM, <2 + |.endif + | // RA is a number, RD is an integer. Convert RD to a number. + |.if FPU + |. lwc1 f22, LO(RD) + | b <4 + |. cvt.d.w f22, f22 + |.else + |. sextw CARG2, CARG2 + | bal ->vm_sfi2d_2 + |. nop + | b <4 + |. nop + |.endif + | + |6: // RA is an integer, RD is not an integer + | sltu AT, CARG4, TISNUM + |.if FFI + | beqz AT, >9 + |.else + | beqz AT, <2 + |.endif + | // RA is an integer, RD is a number. Convert RA to a number. + |.if FPU + |. lwc1 f20, LO(RA) + | ldc1 f22, 0(RD) + | b <4 + | cvt.d.w f20, f20 + |.else + |. sextw CARG1, CARG1 + | bal ->vm_sfi2d_1 + |. nop + | b <4 + |. nop + |.endif + | + |.if FFI + |8: + | li AT, LJ_TCDATA + | bne CARG3, AT, <2 + |. nop + | b ->vmeta_equal_cd + |. nop + |9: + | li AT, LJ_TCDATA + | bne CARG4, AT, <2 + |. nop + | b ->vmeta_equal_cd + |. nop + |.endif + break; + + case BC_ISEQP: case BC_ISNEP: + vk = op == BC_ISEQP; + | // RA = src*8, RD = primitive_type*8 (~), JMP with RD = target + | daddu RA, BASE, RA + | srl TMP1, RD, 3 + | ld TMP0, 0(RA) + | lhu TMP2, OFS_RD(PC) + | not TMP1, TMP1 + | gettp TMP0, TMP0 + | daddiu PC, PC, 4 + |.if FFI + | li AT, LJ_TCDATA + | beq TMP0, AT, ->vmeta_equal_cd + |.endif + |. xor TMP0, TMP0, TMP1 + | decode_RD4b TMP2 + | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + | addu TMP2, TMP2, TMP3 + if (vk) { + | movn TMP2, r0, TMP0 + } else { + | movz TMP2, r0, TMP0 + } + | daddu PC, PC, TMP2 + | ins_next + break; + + /* -- Unary test and copy ops ------------------------------------------- */ + + case BC_ISTC: case BC_ISFC: case BC_IST: case BC_ISF: + | // RA = dst*8 or unused, RD = src*8, JMP with RD = target + | daddu RD, BASE, RD + | lhu TMP2, OFS_RD(PC) + | ld TMP0, 0(RD) + | daddiu PC, PC, 4 + | gettp TMP0, TMP0 + | sltiu TMP0, TMP0, LJ_TISTRUECOND + if (op == BC_IST || op == BC_ISF) { + | decode_RD4b TMP2 + | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + | addu TMP2, TMP2, TMP3 + if (op == BC_IST) { + | movz TMP2, r0, TMP0 + } else { + | movn TMP2, r0, TMP0 + } + | daddu PC, PC, TMP2 + } else { + | ld CRET1, 0(RD) + if (op == BC_ISTC) { + | beqz TMP0, >1 + } else { + | bnez TMP0, >1 + } + |. daddu RA, BASE, RA + | decode_RD4b TMP2 + | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + | addu TMP2, TMP2, TMP3 + | sd CRET1, 0(RA) + | daddu PC, PC, TMP2 + |1: + } + | ins_next + break; + + case BC_ISTYPE: + | // RA = src*8, RD = -type*8 + | daddu TMP2, BASE, RA + | srl TMP1, RD, 3 + | ld TMP0, 0(TMP2) + | ins_next1 + | gettp TMP0, TMP0 + | daddu AT, TMP0, TMP1 + | bnez AT, ->vmeta_istype + |. ins_next2 + break; + case BC_ISNUM: + | // RA = src*8, RD = -(TISNUM-1)*8 + | daddu TMP2, BASE, RA + | ld TMP0, 0(TMP2) + | ins_next1 + | checknum TMP0, ->vmeta_istype + |. ins_next2 + break; + + /* -- Unary ops --------------------------------------------------------- */ + + case BC_MOV: + | // RA = dst*8, RD = src*8 + | daddu RD, BASE, RD + | daddu RA, BASE, RA + | ld CRET1, 0(RD) + | ins_next1 + | sd CRET1, 0(RA) + | ins_next2 + break; + case BC_NOT: + | // RA = dst*8, RD = src*8 + | daddu RD, BASE, RD + | daddu RA, BASE, RA + | ld TMP0, 0(RD) + | li AT, LJ_TTRUE + | gettp TMP0, TMP0 + | sltu TMP0, AT, TMP0 + | addiu TMP0, TMP0, 1 + | dsll TMP0, TMP0, 47 + | not TMP0, TMP0 + | ins_next1 + | sd TMP0, 0(RA) + | ins_next2 + break; + case BC_UNM: + | // RA = dst*8, RD = src*8 + | daddu RB, BASE, RD + | ld CARG1, 0(RB) + | daddu RA, BASE, RA + | gettp CARG3, CARG1 + | bne CARG3, TISNUM, >2 + |. lui TMP1, 0x8000 + | sextw CARG1, CARG1 + | beq CARG1, TMP1, ->vmeta_unm // Meta handler deals with -2^31. + |. negu CARG1, CARG1 + | zextw CARG1, CARG1 + | settp CARG1, TISNUM + |1: + | ins_next1 + | sd CARG1, 0(RA) + | ins_next2 + |2: + | sltiu AT, CARG3, LJ_TISNUM + | beqz AT, ->vmeta_unm + |. dsll TMP1, TMP1, 32 + | b <1 + |. xor CARG1, CARG1, TMP1 + break; + case BC_LEN: + | // RA = dst*8, RD = src*8 + | daddu CARG2, BASE, RD + | daddu RA, BASE, RA + | ld TMP0, 0(CARG2) + | gettp TMP1, TMP0 + | daddiu AT, TMP1, -LJ_TSTR + | bnez AT, >2 + |. cleartp STR:CARG1, TMP0 + | lw CRET1, STR:CARG1->len + |1: + | settp CRET1, TISNUM + | ins_next1 + | sd CRET1, 0(RA) + | ins_next2 + |2: + | daddiu AT, TMP1, -LJ_TTAB + | bnez AT, ->vmeta_len + |. nop +#if LJ_52 + | ld TAB:TMP2, TAB:CARG1->metatable + | bnez TAB:TMP2, >9 + |. nop + |3: +#endif + |->BC_LEN_Z: + | load_got lj_tab_len + | call_intern lj_tab_len // (GCtab *t) + |. nop + | // Returns uint32_t (but less than 2^31). + | b <1 + |. nop +#if LJ_52 + |9: + | lbu TMP0, TAB:TMP2->nomm + | andi TMP0, TMP0, 1<vmeta_len + |. nop +#endif + break; + + /* -- Binary ops -------------------------------------------------------- */ + + |.macro fpmod, a, b, c + | bal ->vm_floor // floor(b/c) + |. div.d FARG1, b, c + | mul.d a, FRET1, c + | sub.d a, b, a // b - floor(b/c)*c + |.endmacro + + |.macro sfpmod + | daddiu sp, sp, -16 + | + | load_got __divdf3 + | sd CARG1, 0(sp) + | call_extern + |. sd CARG2, 8(sp) + | + | load_got floor + | call_extern + |. move CARG1, CRET1 + | + | load_got __muldf3 + | move CARG1, CRET1 + | call_extern + |. ld CARG2, 8(sp) + | + | load_got __subdf3 + | ld CARG1, 0(sp) + | call_extern + |. move CARG2, CRET1 + | + | daddiu sp, sp, 16 + |.endmacro + + |.macro ins_arithpre, label + ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); + | // RA = dst*8, RB = src1*8, RC = src2*8 | num_const*8 + ||switch (vk) { + ||case 0: + | decode_RB8a RB, INS + | decode_RB8b RB + | decode_RDtoRC8 RC, RD + | // RA = dst*8, RB = src1*8, RC = num_const*8 + | daddu RB, BASE, RB + |.if "label" ~= "none" + | b label + |.endif + |. daddu RC, KBASE, RC + || break; + ||case 1: + | decode_RB8a RC, INS + | decode_RB8b RC + | decode_RDtoRC8 RB, RD + | // RA = dst*8, RB = num_const*8, RC = src1*8 + | daddu RC, BASE, RC + |.if "label" ~= "none" + | b label + |.endif + |. daddu RB, KBASE, RB + || break; + ||default: + | decode_RB8a RB, INS + | decode_RB8b RB + | decode_RDtoRC8 RC, RD + | // RA = dst*8, RB = src1*8, RC = src2*8 + | daddu RB, BASE, RB + |.if "label" ~= "none" + | b label + |.endif + |. daddu RC, BASE, RC + || break; + ||} + |.endmacro + | + |.macro ins_arith, intins, fpins, fpcall, label + | ins_arithpre none + | + |.if "label" ~= "none" + |label: + |.endif + | + |// Used in 5. + | ld CARG1, 0(RB) + | ld CARG2, 0(RC) + | gettp TMP0, CARG1 + | gettp TMP1, CARG2 + | + |.if "intins" ~= "div" + | + | // Check for two integers. + | sextw CARG3, CARG1 + | bne TMP0, TISNUM, >5 + |. sextw CARG4, CARG2 + | bne TMP1, TISNUM, >5 + | + |.if "intins" == "addu" + |. intins CRET1, CARG3, CARG4 + | xor TMP1, CRET1, CARG3 // ((y^a) & (y^b)) < 0: overflow. + | xor TMP2, CRET1, CARG4 + | and TMP1, TMP1, TMP2 + | bltz TMP1, ->vmeta_arith + |. daddu RA, BASE, RA + |.elif "intins" == "subu" + |. intins CRET1, CARG3, CARG4 + | xor TMP1, CRET1, CARG3 // ((y^a) & (a^b)) < 0: overflow. + | xor TMP2, CARG3, CARG4 + | and TMP1, TMP1, TMP2 + | bltz TMP1, ->vmeta_arith + |. daddu RA, BASE, RA + |.elif "intins" == "mult" + |. intins CARG3, CARG4 + | mflo CRET1 + | mfhi TMP2 + | sra TMP1, CRET1, 31 + | bne TMP1, TMP2, ->vmeta_arith + |. daddu RA, BASE, RA + |.else + |. load_got lj_vm_modi + | beqz CARG4, ->vmeta_arith + |. daddu RA, BASE, RA + | move CARG1, CARG3 + | call_extern + |. move CARG2, CARG4 + |.endif + | + | zextw CRET1, CRET1 + | settp CRET1, TISNUM + | ins_next1 + | sd CRET1, 0(RA) + |3: + | ins_next2 + | + |.endif + | + |5: // Check for two numbers. + | .FPU ldc1 f20, 0(RB) + | sltu AT, TMP0, TISNUM + | sltu TMP0, TMP1, TISNUM + | .FPU ldc1 f22, 0(RC) + | and AT, AT, TMP0 + | beqz AT, ->vmeta_arith + |. daddu RA, BASE, RA + | + |.if FPU + | fpins FRET1, f20, f22 + |.elif "fpcall" == "sfpmod" + | sfpmod + |.else + | load_got fpcall + | call_extern + |. nop + |.endif + | + | ins_next1 + |.if "intins" ~= "div" + | b <3 + |.endif + |.if FPU + |. sdc1 FRET1, 0(RA) + |.else + |. sd CRET1, 0(RA) + |.endif + |.if "intins" == "div" + | ins_next2 + |.endif + | + |.endmacro + + case BC_ADDVN: case BC_ADDNV: case BC_ADDVV: + | ins_arith addu, add.d, __adddf3, none + break; + case BC_SUBVN: case BC_SUBNV: case BC_SUBVV: + | ins_arith subu, sub.d, __subdf3, none + break; + case BC_MULVN: case BC_MULNV: case BC_MULVV: + | ins_arith mult, mul.d, __muldf3, none + break; + case BC_DIVVN: + | ins_arith div, div.d, __divdf3, ->BC_DIVVN_Z + break; + case BC_DIVNV: case BC_DIVVV: + | ins_arithpre ->BC_DIVVN_Z + break; + case BC_MODVN: + | ins_arith modi, fpmod, sfpmod, ->BC_MODVN_Z + break; + case BC_MODNV: case BC_MODVV: + | ins_arithpre ->BC_MODVN_Z + break; + case BC_POW: + | ins_arithpre none + | ld CARG1, 0(RB) + | ld CARG2, 0(RC) + | gettp TMP0, CARG1 + | gettp TMP1, CARG2 + | sltiu TMP0, TMP0, LJ_TISNUM + | sltiu TMP1, TMP1, LJ_TISNUM + | and AT, TMP0, TMP1 + | load_got pow + | beqz AT, ->vmeta_arith + |. daddu RA, BASE, RA + |.if FPU + | ldc1 FARG1, 0(RB) + | ldc1 FARG2, 0(RC) + |.endif + | call_extern + |. nop + | ins_next1 + |.if FPU + | sdc1 FRET1, 0(RA) + |.else + | sd CRET1, 0(RA) + |.endif + | ins_next2 + break; + + case BC_CAT: + | // RA = dst*8, RB = src_start*8, RC = src_end*8 + | decode_RB8a RB, INS + | decode_RB8b RB + | decode_RDtoRC8 RC, RD + | dsubu CARG3, RC, RB + | sd BASE, L->base + | daddu CARG2, BASE, RC + | move MULTRES, RB + |->BC_CAT_Z: + | load_got lj_meta_cat + | srl CARG3, CARG3, 3 + | sd PC, SAVE_PC + | call_intern lj_meta_cat // (lua_State *L, TValue *top, int left) + |. move CARG1, L + | // Returns NULL (finished) or TValue * (metamethod). + | bnez CRET1, ->vmeta_binop + |. ld BASE, L->base + | daddu RB, BASE, MULTRES + | ld CRET1, 0(RB) + | daddu RA, BASE, RA + | ins_next1 + | sd CRET1, 0(RA) + | ins_next2 + break; + + /* -- Constant ops ------------------------------------------------------ */ + + case BC_KSTR: + | // RA = dst*8, RD = str_const*8 (~) + | dsubu TMP1, KBASE, RD + | ins_next1 + | li TMP2, LJ_TSTR + | ld TMP0, -8(TMP1) // KBASE-8-str_const*8 + | daddu RA, BASE, RA + | settp TMP0, TMP2 + | sd TMP0, 0(RA) + | ins_next2 + break; + case BC_KCDATA: + |.if FFI + | // RA = dst*8, RD = cdata_const*8 (~) + | dsubu TMP1, KBASE, RD + | ins_next1 + | ld TMP0, -8(TMP1) // KBASE-8-cdata_const*8 + | li TMP2, LJ_TCDATA + | daddu RA, BASE, RA + | settp TMP0, TMP2 + | sd TMP0, 0(RA) + | ins_next2 + |.endif + break; + case BC_KSHORT: + | // RA = dst*8, RD = int16_literal*8 + | sra RD, INS, 16 + | daddu RA, BASE, RA + | zextw RD, RD + | ins_next1 + | settp RD, TISNUM + | sd RD, 0(RA) + | ins_next2 + break; + case BC_KNUM: + | // RA = dst*8, RD = num_const*8 + | daddu RD, KBASE, RD + | daddu RA, BASE, RA + | ld CRET1, 0(RD) + | ins_next1 + | sd CRET1, 0(RA) + | ins_next2 + break; + case BC_KPRI: + | // RA = dst*8, RD = primitive_type*8 (~) + | daddu RA, BASE, RA + | dsll TMP0, RD, 44 + | not TMP0, TMP0 + | ins_next1 + | sd TMP0, 0(RA) + | ins_next2 + break; + case BC_KNIL: + | // RA = base*8, RD = end*8 + | daddu RA, BASE, RA + | sd TISNIL, 0(RA) + | daddiu RA, RA, 8 + | daddu RD, BASE, RD + |1: + | sd TISNIL, 0(RA) + | slt AT, RA, RD + | bnez AT, <1 + |. daddiu RA, RA, 8 + | ins_next_ + break; + + /* -- Upvalue and function ops ------------------------------------------ */ + + case BC_UGET: + | // RA = dst*8, RD = uvnum*8 + | ld LFUNC:RB, FRAME_FUNC(BASE) + | daddu RA, BASE, RA + | cleartp LFUNC:RB + | daddu RD, RD, LFUNC:RB + | ld UPVAL:RB, LFUNC:RD->uvptr + | ins_next1 + | ld TMP1, UPVAL:RB->v + | ld CRET1, 0(TMP1) + | sd CRET1, 0(RA) + | ins_next2 + break; + case BC_USETV: + | // RA = uvnum*8, RD = src*8 + | ld LFUNC:RB, FRAME_FUNC(BASE) + | daddu RD, BASE, RD + | cleartp LFUNC:RB + | daddu RA, RA, LFUNC:RB + | ld UPVAL:RB, LFUNC:RA->uvptr + | ld CRET1, 0(RD) + | lbu TMP3, UPVAL:RB->marked + | ld CARG2, UPVAL:RB->v + | andi TMP3, TMP3, LJ_GC_BLACK // isblack(uv) + | lbu TMP0, UPVAL:RB->closed + | gettp TMP2, RD + | sd CRET1, 0(CARG2) + | li AT, LJ_GC_BLACK|1 + | or TMP3, TMP3, TMP0 + | beq TMP3, AT, >2 // Upvalue is closed and black? + |. daddiu TMP2, TMP2, -(LJ_TNUMX+1) + |1: + | ins_next + | + |2: // Check if new value is collectable. + | sltiu AT, TMP2, LJ_TISGCV - (LJ_TNUMX+1) + | beqz AT, <1 // tvisgcv(v) + |. cleartp GCOBJ:TMP1, RB + | lbu TMP3, GCOBJ:TMP1->gch.marked + | andi TMP3, TMP3, LJ_GC_WHITES // iswhite(v) + | beqz TMP3, <1 + |. load_got lj_gc_barrieruv + | // Crossed a write barrier. Move the barrier forward. + | call_intern lj_gc_barrieruv // (global_State *g, TValue *tv) + |. daddiu CARG1, DISPATCH, GG_DISP2G + | b <1 + |. nop + break; + case BC_USETS: + | // RA = uvnum*8, RD = str_const*8 (~) + | ld LFUNC:RB, FRAME_FUNC(BASE) + | dsubu TMP1, KBASE, RD + | cleartp LFUNC:RB + | daddu RA, RA, LFUNC:RB + | ld UPVAL:RB, LFUNC:RA->uvptr + | ld STR:TMP1, -8(TMP1) // KBASE-8-str_const*8 + | lbu TMP2, UPVAL:RB->marked + | ld CARG2, UPVAL:RB->v + | lbu TMP3, STR:TMP1->marked + | andi AT, TMP2, LJ_GC_BLACK // isblack(uv) + | lbu TMP2, UPVAL:RB->closed + | li TMP0, LJ_TSTR + | settp TMP1, TMP0 + | bnez AT, >2 + |. sd TMP1, 0(CARG2) + |1: + | ins_next + | + |2: // Check if string is white and ensure upvalue is closed. + | beqz TMP2, <1 + |. andi AT, TMP3, LJ_GC_WHITES // iswhite(str) + | beqz AT, <1 + |. load_got lj_gc_barrieruv + | // Crossed a write barrier. Move the barrier forward. + | call_intern lj_gc_barrieruv // (global_State *g, TValue *tv) + |. daddiu CARG1, DISPATCH, GG_DISP2G + | b <1 + |. nop + break; + case BC_USETN: + | // RA = uvnum*8, RD = num_const*8 + | ld LFUNC:RB, FRAME_FUNC(BASE) + | daddu RD, KBASE, RD + | cleartp LFUNC:RB + | daddu RA, RA, LFUNC:RB + | ld UPVAL:RB, LFUNC:RA->uvptr + | ld CRET1, 0(RD) + | ld TMP1, UPVAL:RB->v + | ins_next1 + | sd CRET1, 0(TMP1) + | ins_next2 + break; + case BC_USETP: + | // RA = uvnum*8, RD = primitive_type*8 (~) + | ld LFUNC:RB, FRAME_FUNC(BASE) + | dsll TMP0, RD, 44 + | cleartp LFUNC:RB + | daddu RA, RA, LFUNC:RB + | not TMP0, TMP0 + | ld UPVAL:RB, LFUNC:RA->uvptr + | ins_next1 + | ld TMP1, UPVAL:RB->v + | sd TMP0, 0(TMP1) + | ins_next2 + break; + + case BC_UCLO: + | // RA = level*8, RD = target + | ld TMP2, L->openupval + | branch_RD // Do this first since RD is not saved. + | load_got lj_func_closeuv + | sd BASE, L->base + | beqz TMP2, >1 + |. move CARG1, L + | call_intern lj_func_closeuv // (lua_State *L, TValue *level) + |. daddu CARG2, BASE, RA + | ld BASE, L->base + |1: + | ins_next + break; + + case BC_FNEW: + | // RA = dst*8, RD = proto_const*8 (~) (holding function prototype) + | load_got lj_func_newL_gc + | dsubu TMP1, KBASE, RD + | ld CARG3, FRAME_FUNC(BASE) + | ld CARG2, -8(TMP1) // KBASE-8-tab_const*8 + | sd BASE, L->base + | sd PC, SAVE_PC + | cleartp CARG3 + | // (lua_State *L, GCproto *pt, GCfuncL *parent) + | call_intern lj_func_newL_gc + |. move CARG1, L + | // Returns GCfuncL *. + | li TMP0, LJ_TFUNC + | ld BASE, L->base + | ins_next1 + | settp CRET1, TMP0 + | daddu RA, BASE, RA + | sd CRET1, 0(RA) + | ins_next2 + break; + + /* -- Table ops --------------------------------------------------------- */ + + case BC_TNEW: + case BC_TDUP: + | // RA = dst*8, RD = (hbits|asize)*8 | tab_const*8 (~) + | ld TMP0, DISPATCH_GL(gc.total)(DISPATCH) + | ld TMP1, DISPATCH_GL(gc.threshold)(DISPATCH) + | sd BASE, L->base + | sd PC, SAVE_PC + | sltu AT, TMP0, TMP1 + | beqz AT, >5 + |1: + if (op == BC_TNEW) { + | load_got lj_tab_new + | srl CARG2, RD, 3 + | andi CARG2, CARG2, 0x7ff + | li TMP0, 0x801 + | addiu AT, CARG2, -0x7ff + | srl CARG3, RD, 14 + | movz CARG2, TMP0, AT + | // (lua_State *L, int32_t asize, uint32_t hbits) + | call_intern lj_tab_new + |. move CARG1, L + | // Returns Table *. + } else { + | load_got lj_tab_dup + | dsubu TMP1, KBASE, RD + | move CARG1, L + | call_intern lj_tab_dup // (lua_State *L, Table *kt) + |. ld CARG2, -8(TMP1) // KBASE-8-str_const*8 + | // Returns Table *. + } + | li TMP0, LJ_TTAB + | ld BASE, L->base + | ins_next1 + | daddu RA, BASE, RA + | settp CRET1, TMP0 + | sd CRET1, 0(RA) + | ins_next2 + |5: + | load_got lj_gc_step_fixtop + | move MULTRES, RD + | call_intern lj_gc_step_fixtop // (lua_State *L) + |. move CARG1, L + | b <1 + |. move RD, MULTRES + break; + + case BC_GGET: + | // RA = dst*8, RD = str_const*8 (~) + case BC_GSET: + | // RA = src*8, RD = str_const*8 (~) + | ld LFUNC:TMP2, FRAME_FUNC(BASE) + | dsubu TMP1, KBASE, RD + | ld STR:RC, -8(TMP1) // KBASE-8-str_const*8 + | cleartp LFUNC:TMP2 + | ld TAB:RB, LFUNC:TMP2->env + if (op == BC_GGET) { + | b ->BC_TGETS_Z + } else { + | b ->BC_TSETS_Z + } + |. daddu RA, BASE, RA + break; + + case BC_TGETV: + | // RA = dst*8, RB = table*8, RC = key*8 + | decode_RB8a RB, INS + | decode_RB8b RB + | decode_RDtoRC8 RC, RD + | daddu CARG2, BASE, RB + | daddu CARG3, BASE, RC + | ld TAB:RB, 0(CARG2) + | ld TMP2, 0(CARG3) + | daddu RA, BASE, RA + | checktab TAB:RB, ->vmeta_tgetv + | gettp TMP3, TMP2 + | bne TMP3, TISNUM, >5 // Integer key? + |. lw TMP0, TAB:RB->asize + | sextw TMP2, TMP2 + | ld TMP1, TAB:RB->array + | sltu AT, TMP2, TMP0 + | sll TMP2, TMP2, 3 + | beqz AT, ->vmeta_tgetv // Integer key and in array part? + |. daddu TMP2, TMP1, TMP2 + | ld AT, 0(TMP2) + | beq AT, TISNIL, >2 + |. ld CRET1, 0(TMP2) + |1: + | ins_next1 + | sd CRET1, 0(RA) + | ins_next2 + | + |2: // Check for __index if table value is nil. + | ld TAB:TMP2, TAB:RB->metatable + | beqz TAB:TMP2, <1 // No metatable: done. + |. nop + | lbu TMP0, TAB:TMP2->nomm + | andi TMP0, TMP0, 1<vmeta_tgetv + |. nop + | + |5: + | li AT, LJ_TSTR + | bne TMP3, AT, ->vmeta_tgetv + |. cleartp RC, TMP2 + | b ->BC_TGETS_Z // String key? + |. nop + break; + case BC_TGETS: + | // RA = dst*8, RB = table*8, RC = str_const*8 (~) + | decode_RB8a RB, INS + | decode_RB8b RB + | decode_RC8a RC, INS + | daddu CARG2, BASE, RB + | decode_RC8b RC + | ld TAB:RB, 0(CARG2) + | dsubu CARG3, KBASE, RC + | daddu RA, BASE, RA + | ld STR:RC, -8(CARG3) // KBASE-8-str_const*8 + | checktab TAB:RB, ->vmeta_tgets1 + |->BC_TGETS_Z: + | // TAB:RB = GCtab *, STR:RC = GCstr *, RA = dst*8 + | lw TMP0, TAB:RB->hmask + | lw TMP1, STR:RC->hash + | ld NODE:TMP2, TAB:RB->node + | and TMP1, TMP1, TMP0 // idx = str->hash & tab->hmask + | sll TMP0, TMP1, 5 + | sll TMP1, TMP1, 3 + | subu TMP1, TMP0, TMP1 + | li TMP3, LJ_TSTR + | daddu NODE:TMP2, NODE:TMP2, TMP1 // node = tab->node + (idx*32-idx*8) + | settp STR:RC, TMP3 // Tagged key to look for. + |1: + | ld CARG1, NODE:TMP2->key + | ld CRET1, NODE:TMP2->val + | ld NODE:TMP1, NODE:TMP2->next + | bne CARG1, RC, >4 + |. ld TAB:TMP3, TAB:RB->metatable + | beq CRET1, TISNIL, >5 // Key found, but nil value? + |. nop + |3: + | ins_next1 + | sd CRET1, 0(RA) + | ins_next2 + | + |4: // Follow hash chain. + | bnez NODE:TMP1, <1 + |. move NODE:TMP2, NODE:TMP1 + | // End of hash chain: key not found, nil result. + | + |5: // Check for __index if table value is nil. + | beqz TAB:TMP3, <3 // No metatable: done. + |. move CRET1, TISNIL + | lbu TMP0, TAB:TMP3->nomm + | andi TMP0, TMP0, 1<vmeta_tgets + |. nop + break; + case BC_TGETB: + | // RA = dst*8, RB = table*8, RC = index*8 + | decode_RB8a RB, INS + | decode_RB8b RB + | daddu CARG2, BASE, RB + | decode_RDtoRC8 RC, RD + | ld TAB:RB, 0(CARG2) + | daddu RA, BASE, RA + | srl TMP0, RC, 3 + | checktab TAB:RB, ->vmeta_tgetb + | lw TMP1, TAB:RB->asize + | ld TMP2, TAB:RB->array + | sltu AT, TMP0, TMP1 + | beqz AT, ->vmeta_tgetb + |. daddu RC, TMP2, RC + | ld AT, 0(RC) + | beq AT, TISNIL, >5 + |. ld CRET1, 0(RC) + |1: + | ins_next1 + | sd CRET1, 0(RA) + | ins_next2 + | + |5: // Check for __index if table value is nil. + | ld TAB:TMP2, TAB:RB->metatable + | beqz TAB:TMP2, <1 // No metatable: done. + |. nop + | lbu TMP1, TAB:TMP2->nomm + | andi TMP1, TMP1, 1<vmeta_tgetb // Caveat: preserve TMP0 and CARG2! + |. nop + break; + case BC_TGETR: + | // RA = dst*8, RB = table*8, RC = key*8 + | decode_RB8a RB, INS + | decode_RB8b RB + | decode_RDtoRC8 RC, RD + | daddu RB, BASE, RB + | daddu RC, BASE, RC + | ld TAB:CARG1, 0(RB) + | lw CARG2, LO(RC) + | daddu RA, BASE, RA + | cleartp TAB:CARG1 + | lw TMP0, TAB:CARG1->asize + | ld TMP1, TAB:CARG1->array + | sltu AT, CARG2, TMP0 + | sll TMP2, CARG2, 3 + | beqz AT, ->vmeta_tgetr // In array part? + |. daddu CRET1, TMP1, TMP2 + | ld CARG2, 0(CRET1) + |->BC_TGETR_Z: + | ins_next1 + | sd CARG2, 0(RA) + | ins_next2 + break; + + case BC_TSETV: + | // RA = src*8, RB = table*8, RC = key*8 + | decode_RB8a RB, INS + | decode_RB8b RB + | decode_RDtoRC8 RC, RD + | daddu CARG2, BASE, RB + | daddu CARG3, BASE, RC + | ld RB, 0(CARG2) + | ld TMP2, 0(CARG3) + | daddu RA, BASE, RA + | checktab RB, ->vmeta_tsetv + | checkint TMP2, >5 + |. sextw RC, TMP2 + | lw TMP0, TAB:RB->asize + | ld TMP1, TAB:RB->array + | sltu AT, RC, TMP0 + | sll TMP2, RC, 3 + | beqz AT, ->vmeta_tsetv // Integer key and in array part? + |. daddu TMP1, TMP1, TMP2 + | ld TMP0, 0(TMP1) + | lbu TMP3, TAB:RB->marked + | beq TMP0, TISNIL, >3 + |. ld CRET1, 0(RA) + |1: + | andi AT, TMP3, LJ_GC_BLACK // isblack(table) + | bnez AT, >7 + |. sd CRET1, 0(TMP1) + |2: + | ins_next + | + |3: // Check for __newindex if previous value is nil. + | ld TAB:TMP2, TAB:RB->metatable + | beqz TAB:TMP2, <1 // No metatable: done. + |. nop + | lbu TMP2, TAB:TMP2->nomm + | andi TMP2, TMP2, 1<vmeta_tsetv + |. nop + | + |5: + | gettp AT, TMP2 + | daddiu AT, AT, -LJ_TSTR + | bnez AT, ->vmeta_tsetv + |. nop + | b ->BC_TSETS_Z // String key? + |. cleartp STR:RC, TMP2 + | + |7: // Possible table write barrier for the value. Skip valiswhite check. + | barrierback TAB:RB, TMP3, TMP0, <2 + break; + case BC_TSETS: + | // RA = src*8, RB = table*8, RC = str_const*8 (~) + | decode_RB8a RB, INS + | decode_RB8b RB + | daddu CARG2, BASE, RB + | decode_RC8a RC, INS + | ld TAB:RB, 0(CARG2) + | decode_RC8b RC + | dsubu CARG3, KBASE, RC + | ld RC, -8(CARG3) // KBASE-8-str_const*8 + | daddu RA, BASE, RA + | cleartp STR:RC + | checktab TAB:RB, ->vmeta_tsets1 + |->BC_TSETS_Z: + | // TAB:RB = GCtab *, STR:RC = GCstr *, RA = BASE+src*8 + | lw TMP0, TAB:RB->hmask + | lw TMP1, STR:RC->hash + | ld NODE:TMP2, TAB:RB->node + | sb r0, TAB:RB->nomm // Clear metamethod cache. + | and TMP1, TMP1, TMP0 // idx = str->hash & tab->hmask + | sll TMP0, TMP1, 5 + | sll TMP1, TMP1, 3 + | subu TMP1, TMP0, TMP1 + | li TMP3, LJ_TSTR + | daddu NODE:TMP2, NODE:TMP2, TMP1 // node = tab->node + (idx*32-idx*8) + | settp STR:RC, TMP3 // Tagged key to look for. + |.if FPU + | ldc1 f20, 0(RA) + |.else + | ld CRET1, 0(RA) + |.endif + |1: + | ld TMP0, NODE:TMP2->key + | ld CARG2, NODE:TMP2->val + | ld NODE:TMP1, NODE:TMP2->next + | bne TMP0, RC, >5 + |. lbu TMP3, TAB:RB->marked + | beq CARG2, TISNIL, >4 // Key found, but nil value? + |. ld TAB:TMP0, TAB:RB->metatable + |2: + | andi AT, TMP3, LJ_GC_BLACK // isblack(table) + | bnez AT, >7 + |.if FPU + |. sdc1 f20, NODE:TMP2->val + |.else + |. sd CRET1, NODE:TMP2->val + |.endif + |3: + | ins_next + | + |4: // Check for __newindex if previous value is nil. + | beqz TAB:TMP0, <2 // No metatable: done. + |. nop + | lbu TMP0, TAB:TMP0->nomm + | andi TMP0, TMP0, 1<vmeta_tsets + |. nop + | + |5: // Follow hash chain. + | bnez NODE:TMP1, <1 + |. move NODE:TMP2, NODE:TMP1 + | // End of hash chain: key not found, add a new one + | + | // But check for __newindex first. + | ld TAB:TMP2, TAB:RB->metatable + | beqz TAB:TMP2, >6 // No metatable: continue. + |. daddiu CARG3, DISPATCH, DISPATCH_GL(tmptv) + | lbu TMP0, TAB:TMP2->nomm + | andi TMP0, TMP0, 1<vmeta_tsets // 'no __newindex' flag NOT set: check. + |6: + | load_got lj_tab_newkey + | sd RC, 0(CARG3) + | sd BASE, L->base + | move CARG2, TAB:RB + | sd PC, SAVE_PC + | call_intern lj_tab_newkey // (lua_State *L, GCtab *t, TValue *k + |. move CARG1, L + | // Returns TValue *. + | ld BASE, L->base + |.if FPU + | b <3 // No 2nd write barrier needed. + |. sdc1 f20, 0(CRET1) + |.else + | ld CARG1, 0(RA) + | b <3 // No 2nd write barrier needed. + |. sd CARG1, 0(CRET1) + |.endif + | + |7: // Possible table write barrier for the value. Skip valiswhite check. + | barrierback TAB:RB, TMP3, TMP0, <3 + break; + case BC_TSETB: + | // RA = src*8, RB = table*8, RC = index*8 + | decode_RB8a RB, INS + | decode_RB8b RB + | daddu CARG2, BASE, RB + | decode_RDtoRC8 RC, RD + | ld TAB:RB, 0(CARG2) + | daddu RA, BASE, RA + | srl TMP0, RC, 3 + | checktab RB, ->vmeta_tsetb + | lw TMP1, TAB:RB->asize + | ld TMP2, TAB:RB->array + | sltu AT, TMP0, TMP1 + | beqz AT, ->vmeta_tsetb + |. daddu RC, TMP2, RC + | ld TMP1, 0(RC) + | lbu TMP3, TAB:RB->marked + | beq TMP1, TISNIL, >5 + |1: + |. ld CRET1, 0(RA) + | andi AT, TMP3, LJ_GC_BLACK // isblack(table) + | bnez AT, >7 + |. sd CRET1, 0(RC) + |2: + | ins_next + | + |5: // Check for __newindex if previous value is nil. + | ld TAB:TMP2, TAB:RB->metatable + | beqz TAB:TMP2, <1 // No metatable: done. + |. nop + | lbu TMP1, TAB:TMP2->nomm + | andi TMP1, TMP1, 1<vmeta_tsetb // Caveat: preserve TMP0 and CARG2! + |. nop + | + |7: // Possible table write barrier for the value. Skip valiswhite check. + | barrierback TAB:RB, TMP3, TMP0, <2 + break; + case BC_TSETR: + | // RA = dst*8, RB = table*8, RC = key*8 + | decode_RB8a RB, INS + | decode_RB8b RB + | decode_RDtoRC8 RC, RD + | daddu CARG1, BASE, RB + | daddu CARG3, BASE, RC + | ld TAB:CARG2, 0(CARG1) + | lw CARG3, LO(CARG3) + | cleartp TAB:CARG2 + | lbu TMP3, TAB:CARG2->marked + | lw TMP0, TAB:CARG2->asize + | ld TMP1, TAB:CARG2->array + | andi AT, TMP3, LJ_GC_BLACK // isblack(table) + | bnez AT, >7 + |. daddu RA, BASE, RA + |2: + | sltu AT, CARG3, TMP0 + | sll TMP2, CARG3, 3 + | beqz AT, ->vmeta_tsetr // In array part? + |. daddu CRET1, TMP1, TMP2 + |->BC_TSETR_Z: + | ld CARG1, 0(RA) + | ins_next1 + | sd CARG1, 0(CRET1) + | ins_next2 + | + |7: // Possible table write barrier for the value. Skip valiswhite check. + | barrierback TAB:RB, TMP3, TMP0, <2 + break; + + case BC_TSETM: + | // RA = base*8 (table at base-1), RD = num_const*8 (start index) + | daddu RA, BASE, RA + |1: + | daddu TMP3, KBASE, RD + | ld TAB:CARG2, -8(RA) // Guaranteed to be a table. + | addiu TMP0, MULTRES, -8 + | lw TMP3, LO(TMP3) // Integer constant is in lo-word. + | beqz TMP0, >4 // Nothing to copy? + |. srl CARG3, TMP0, 3 + | cleartp CARG2 + | addu CARG3, CARG3, TMP3 + | lw TMP2, TAB:CARG2->asize + | sll TMP1, TMP3, 3 + | lbu TMP3, TAB:CARG2->marked + | ld CARG1, TAB:CARG2->array + | sltu AT, TMP2, CARG3 + | bnez AT, >5 + |. daddu TMP2, RA, TMP0 + | daddu TMP1, TMP1, CARG1 + | andi TMP0, TMP3, LJ_GC_BLACK // isblack(table) + |3: // Copy result slots to table. + | ld CRET1, 0(RA) + | daddiu RA, RA, 8 + | sltu AT, RA, TMP2 + | sd CRET1, 0(TMP1) + | bnez AT, <3 + |. daddiu TMP1, TMP1, 8 + | bnez TMP0, >7 + |. nop + |4: + | ins_next + | + |5: // Need to resize array part. + | load_got lj_tab_reasize + | sd BASE, L->base + | sd PC, SAVE_PC + | move BASE, RD + | call_intern lj_tab_reasize // (lua_State *L, GCtab *t, int nasize) + |. move CARG1, L + | // Must not reallocate the stack. + | move RD, BASE + | b <1 + |. ld BASE, L->base // Reload BASE for lack of a saved register. + | + |7: // Possible table write barrier for any value. Skip valiswhite check. + | barrierback TAB:CARG2, TMP3, TMP0, <4 + break; + + /* -- Calls and vararg handling ----------------------------------------- */ + + case BC_CALLM: + | // RA = base*8, (RB = (nresults+1)*8,) RC = extra_nargs*8 + | decode_RDtoRC8 NARGS8:RC, RD + | b ->BC_CALL_Z + |. addu NARGS8:RC, NARGS8:RC, MULTRES + break; + case BC_CALL: + | // RA = base*8, (RB = (nresults+1)*8,) RC = (nargs+1)*8 + | decode_RDtoRC8 NARGS8:RC, RD + |->BC_CALL_Z: + | move TMP2, BASE + | daddu BASE, BASE, RA + | ld LFUNC:RB, 0(BASE) + | daddiu BASE, BASE, 16 + | addiu NARGS8:RC, NARGS8:RC, -8 + | checkfunc RB, ->vmeta_call + | ins_call + break; + + case BC_CALLMT: + | // RA = base*8, (RB = 0,) RC = extra_nargs*8 + | addu NARGS8:RD, NARGS8:RD, MULTRES // BC_CALLT gets RC from RD. + | // Fall through. Assumes BC_CALLT follows. + break; + case BC_CALLT: + | // RA = base*8, (RB = 0,) RC = (nargs+1)*8 + | daddu RA, BASE, RA + | ld RB, 0(RA) + | move NARGS8:RC, RD + | ld TMP1, FRAME_PC(BASE) + | daddiu RA, RA, 16 + | addiu NARGS8:RC, NARGS8:RC, -8 + | checktp CARG3, RB, -LJ_TFUNC, ->vmeta_callt + |->BC_CALLT_Z: + | andi TMP0, TMP1, FRAME_TYPE // Caveat: preserve TMP0 until the 'or'. + | lbu TMP3, LFUNC:CARG3->ffid + | bnez TMP0, >7 + |. xori TMP2, TMP1, FRAME_VARG + |1: + | sd RB, FRAME_FUNC(BASE) // Copy function down, but keep PC. + | sltiu AT, TMP3, 2 // (> FF_C) Calling a fast function? + | move TMP2, BASE + | move RB, CARG3 + | beqz NARGS8:RC, >3 + |. move TMP3, NARGS8:RC + |2: + | ld CRET1, 0(RA) + | daddiu RA, RA, 8 + | addiu TMP3, TMP3, -8 + | sd CRET1, 0(TMP2) + | bnez TMP3, <2 + |. daddiu TMP2, TMP2, 8 + |3: + | or TMP0, TMP0, AT + | beqz TMP0, >5 + |. nop + |4: + | ins_callt + | + |5: // Tailcall to a fast function with a Lua frame below. + | lw INS, -4(TMP1) + | decode_RA8a RA, INS + | decode_RA8b RA + | dsubu TMP1, BASE, RA + | ld TMP1, -32(TMP1) + | cleartp LFUNC:TMP1 + | ld TMP1, LFUNC:TMP1->pc + | b <4 + |. ld KBASE, PC2PROTO(k)(TMP1) // Need to prepare KBASE. + | + |7: // Tailcall from a vararg function. + | andi AT, TMP2, FRAME_TYPEP + | bnez AT, <1 // Vararg frame below? + |. dsubu TMP2, BASE, TMP2 // Relocate BASE down. + | move BASE, TMP2 + | ld TMP1, FRAME_PC(TMP2) + | b <1 + |. andi TMP0, TMP1, FRAME_TYPE + break; + + case BC_ITERC: + | // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 ((2+1)*8)) + | move TMP2, BASE // Save old BASE fir vmeta_call. + | daddu BASE, BASE, RA + | ld RB, -24(BASE) + | ld CARG1, -16(BASE) + | ld CARG2, -8(BASE) + | li NARGS8:RC, 16 // Iterators get 2 arguments. + | sd RB, 0(BASE) // Copy callable. + | sd CARG1, 16(BASE) // Copy state. + | sd CARG2, 24(BASE) // Copy control var. + | daddiu BASE, BASE, 16 + | checkfunc RB, ->vmeta_call + | ins_call + break; + + case BC_ITERN: + | // RA = base*8, (RB = (nresults+1)*8, RC = (nargs+1)*8 (2+1)*8) + |.if JIT + | // NYI: add hotloop, record BC_ITERN. + |.endif + | daddu RA, BASE, RA + | ld TAB:RB, -16(RA) + | lw RC, -8+LO(RA) // Get index from control var. + | cleartp TAB:RB + | daddiu PC, PC, 4 + | lw TMP0, TAB:RB->asize + | ld TMP1, TAB:RB->array + | dsll CARG3, TISNUM, 47 + |1: // Traverse array part. + | sltu AT, RC, TMP0 + | beqz AT, >5 // Index points after array part? + |. sll TMP3, RC, 3 + | daddu TMP3, TMP1, TMP3 + | ld CARG1, 0(TMP3) + | lhu RD, -4+OFS_RD(PC) + | or TMP2, RC, CARG3 + | beq CARG1, TISNIL, <1 // Skip holes in array part. + |. addiu RC, RC, 1 + | sd TMP2, 0(RA) + | sd CARG1, 8(RA) + | or TMP0, RC, CARG3 + | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + | decode_RD4b RD + | daddu RD, RD, TMP3 + | sw TMP0, -8+LO(RA) // Update control var. + | daddu PC, PC, RD + |3: + | ins_next + | + |5: // Traverse hash part. + | lw TMP1, TAB:RB->hmask + | subu RC, RC, TMP0 + | ld TMP2, TAB:RB->node + |6: + | sltu AT, TMP1, RC // End of iteration? Branch to ITERL+1. + | bnez AT, <3 + |. sll TMP3, RC, 5 + | sll RB, RC, 3 + | subu TMP3, TMP3, RB + | daddu NODE:TMP3, TMP3, TMP2 + | ld CARG1, 0(NODE:TMP3) + | lhu RD, -4+OFS_RD(PC) + | beq CARG1, TISNIL, <6 // Skip holes in hash part. + |. addiu RC, RC, 1 + | ld CARG2, NODE:TMP3->key + | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + | sd CARG1, 8(RA) + | addu RC, RC, TMP0 + | decode_RD4b RD + | addu RD, RD, TMP3 + | sd CARG2, 0(RA) + | daddu PC, PC, RD + | b <3 + |. sw RC, -8+LO(RA) // Update control var. + break; + + case BC_ISNEXT: + | // RA = base*8, RD = target (points to ITERN) + | daddu RA, BASE, RA + | srl TMP0, RD, 1 + | ld CFUNC:CARG1, -24(RA) + | daddu TMP0, PC, TMP0 + | ld CARG2, -16(RA) + | ld CARG3, -8(RA) + | lui TMP2, (-(BCBIAS_J*4 >> 16) & 65535) + | checkfunc CFUNC:CARG1, >5 + | gettp CARG2, CARG2 + | daddiu CARG2, CARG2, -LJ_TTAB + | lbu TMP1, CFUNC:CARG1->ffid + | daddiu CARG3, CARG3, -LJ_TNIL + | or AT, CARG2, CARG3 + | daddiu TMP1, TMP1, -FF_next_N + | or AT, AT, TMP1 + | bnez AT, >5 + |. lui TMP1, 0xfffe + | daddu PC, TMP0, TMP2 + | ori TMP1, TMP1, 0x7fff + | dsll TMP1, TMP1, 32 + | sd TMP1, -8(RA) + |1: + | ins_next + |5: // Despecialize bytecode if any of the checks fail. + | li TMP3, BC_JMP + | li TMP1, BC_ITERC + | sb TMP3, -4+OFS_OP(PC) + | daddu PC, TMP0, TMP2 + | b <1 + |. sb TMP1, OFS_OP(PC) + break; + + case BC_VARG: + | // RA = base*8, RB = (nresults+1)*8, RC = numparams*8 + | ld TMP0, FRAME_PC(BASE) + | decode_RDtoRC8 RC, RD + | decode_RB8a RB, INS + | daddu RC, BASE, RC + | decode_RB8b RB + | daddu RA, BASE, RA + | daddiu RC, RC, FRAME_VARG + | daddu TMP2, RA, RB + | daddiu TMP3, BASE, -16 // TMP3 = vtop + | dsubu RC, RC, TMP0 // RC = vbase + | // Note: RC may now be even _above_ BASE if nargs was < numparams. + | beqz RB, >5 // Copy all varargs? + |. dsubu TMP1, TMP3, RC + | daddiu TMP2, TMP2, -16 + |1: // Copy vararg slots to destination slots. + | ld CARG1, 0(RC) + | sltu AT, RC, TMP3 + | daddiu RC, RC, 8 + | movz CARG1, TISNIL, AT + | sd CARG1, 0(RA) + | sltu AT, RA, TMP2 + | bnez AT, <1 + |. daddiu RA, RA, 8 + |3: + | ins_next + | + |5: // Copy all varargs. + | ld TMP0, L->maxstack + | blez TMP1, <3 // No vararg slots? + |. li MULTRES, 8 // MULTRES = (0+1)*8 + | daddu TMP2, RA, TMP1 + | sltu AT, TMP0, TMP2 + | bnez AT, >7 + |. daddiu MULTRES, TMP1, 8 + |6: + | ld CRET1, 0(RC) + | daddiu RC, RC, 8 + | sd CRET1, 0(RA) + | sltu AT, RC, TMP3 + | bnez AT, <6 // More vararg slots? + |. daddiu RA, RA, 8 + | b <3 + |. nop + | + |7: // Grow stack for varargs. + | load_got lj_state_growstack + | sd RA, L->top + | dsubu RA, RA, BASE + | sd BASE, L->base + | dsubu BASE, RC, BASE // Need delta, because BASE may change. + | sd PC, SAVE_PC + | srl CARG2, TMP1, 3 + | call_intern lj_state_growstack // (lua_State *L, int n) + |. move CARG1, L + | move RC, BASE + | ld BASE, L->base + | daddu RA, BASE, RA + | daddu RC, BASE, RC + | b <6 + |. daddiu TMP3, BASE, -16 + break; + + /* -- Returns ----------------------------------------------------------- */ + + case BC_RETM: + | // RA = results*8, RD = extra_nresults*8 + | addu RD, RD, MULTRES // MULTRES >= 8, so RD >= 8. + | // Fall through. Assumes BC_RET follows. + break; + + case BC_RET: + | // RA = results*8, RD = (nresults+1)*8 + | ld PC, FRAME_PC(BASE) + | daddu RA, BASE, RA + | move MULTRES, RD + |1: + | andi TMP0, PC, FRAME_TYPE + | bnez TMP0, ->BC_RETV_Z + |. xori TMP1, PC, FRAME_VARG + | + |->BC_RET_Z: + | // BASE = base, RA = resultptr, RD = (nresults+1)*8, PC = return + | lw INS, -4(PC) + | daddiu TMP2, BASE, -16 + | daddiu RC, RD, -8 + | decode_RA8a TMP0, INS + | decode_RB8a RB, INS + | decode_RA8b TMP0 + | decode_RB8b RB + | daddu TMP3, TMP2, RB + | beqz RC, >3 + |. dsubu BASE, TMP2, TMP0 + |2: + | ld CRET1, 0(RA) + | daddiu RA, RA, 8 + | daddiu RC, RC, -8 + | sd CRET1, 0(TMP2) + | bnez RC, <2 + |. daddiu TMP2, TMP2, 8 + |3: + | daddiu TMP3, TMP3, -8 + |5: + | sltu AT, TMP2, TMP3 + | bnez AT, >6 + |. ld LFUNC:TMP1, FRAME_FUNC(BASE) + | ins_next1 + | cleartp LFUNC:TMP1 + | ld TMP1, LFUNC:TMP1->pc + | ld KBASE, PC2PROTO(k)(TMP1) + | ins_next2 + | + |6: // Fill up results with nil. + | sd TISNIL, 0(TMP2) + | b <5 + |. daddiu TMP2, TMP2, 8 + | + |->BC_RETV_Z: // Non-standard return case. + | andi TMP2, TMP1, FRAME_TYPEP + | bnez TMP2, ->vm_return + |. nop + | // Return from vararg function: relocate BASE down. + | dsubu BASE, BASE, TMP1 + | b <1 + |. ld PC, FRAME_PC(BASE) + break; + + case BC_RET0: case BC_RET1: + | // RA = results*8, RD = (nresults+1)*8 + | ld PC, FRAME_PC(BASE) + | daddu RA, BASE, RA + | move MULTRES, RD + | andi TMP0, PC, FRAME_TYPE + | bnez TMP0, ->BC_RETV_Z + |. xori TMP1, PC, FRAME_VARG + | lw INS, -4(PC) + | daddiu TMP2, BASE, -16 + if (op == BC_RET1) { + | ld CRET1, 0(RA) + } + | decode_RB8a RB, INS + | decode_RA8a RA, INS + | decode_RB8b RB + | decode_RA8b RA + | dsubu BASE, TMP2, RA + if (op == BC_RET1) { + | sd CRET1, 0(TMP2) + } + |5: + | sltu AT, RD, RB + | bnez AT, >6 + |. ld TMP1, FRAME_FUNC(BASE) + | ins_next1 + | cleartp LFUNC:TMP1 + | ld TMP1, LFUNC:TMP1->pc + | ld KBASE, PC2PROTO(k)(TMP1) + | ins_next2 + | + |6: // Fill up results with nil. + | daddiu TMP2, TMP2, 8 + | daddiu RD, RD, 8 + | b <5 + if (op == BC_RET1) { + |. sd TISNIL, 0(TMP2) + } else { + |. sd TISNIL, -8(TMP2) + } + break; + + /* -- Loops and branches ------------------------------------------------ */ + + case BC_FORL: + |.if JIT + | hotloop + |.endif + | // Fall through. Assumes BC_IFORL follows. + break; + + case BC_JFORI: + case BC_JFORL: +#if !LJ_HASJIT + break; +#endif + case BC_FORI: + case BC_IFORL: + | // RA = base*8, RD = target (after end of loop or start of loop) + vk = (op == BC_IFORL || op == BC_JFORL); + | daddu RA, BASE, RA + | ld CARG1, FORL_IDX*8(RA) // IDX CARG1 - CARG3 type + | gettp CARG3, CARG1 + if (op != BC_JFORL) { + | srl RD, RD, 1 + | lui TMP2, (-(BCBIAS_J*4 >> 16) & 65535) + | daddu TMP2, RD, TMP2 + } + if (!vk) { + | ld CARG2, FORL_STOP*8(RA) // STOP CARG2 - CARG4 type + | ld CRET1, FORL_STEP*8(RA) // STEP CRET1 - CRET2 type + | gettp CARG4, CARG2 + | bne CARG3, TISNUM, >5 + |. gettp CRET2, CRET1 + | bne CARG4, TISNUM, ->vmeta_for + |. sextw CARG3, CARG1 + | bne CRET2, TISNUM, ->vmeta_for + |. sextw CARG2, CARG2 + | dext AT, CRET1, 31, 0 + | slt CRET1, CARG2, CARG3 + | slt TMP1, CARG3, CARG2 + | movn CRET1, TMP1, AT + } else { + | bne CARG3, TISNUM, >5 + |. ld CARG2, FORL_STEP*8(RA) // STEP CARG2 - CARG4 type + | ld CRET1, FORL_STOP*8(RA) // STOP CRET1 - CRET2 type + | sextw TMP3, CARG1 + | sextw CARG2, CARG2 + | sextw CRET1, CRET1 + | addu CARG1, TMP3, CARG2 + | xor TMP0, CARG1, TMP3 + | xor TMP1, CARG1, CARG2 + | and TMP0, TMP0, TMP1 + | slt TMP1, CARG1, CRET1 + | slt CRET1, CRET1, CARG1 + | slt AT, CARG2, r0 + | slt TMP0, TMP0, r0 // ((y^a) & (y^b)) < 0: overflow. + | movn CRET1, TMP1, AT + | or CRET1, CRET1, TMP0 + | zextw CARG1, CARG1 + | settp CARG1, TISNUM + } + |1: + if (op == BC_FORI) { + | movz TMP2, r0, CRET1 + | daddu PC, PC, TMP2 + } else if (op == BC_JFORI) { + | daddu PC, PC, TMP2 + | lhu RD, -4+OFS_RD(PC) + } else if (op == BC_IFORL) { + | movn TMP2, r0, CRET1 + | daddu PC, PC, TMP2 + } + if (vk) { + | sd CARG1, FORL_IDX*8(RA) + } + | ins_next1 + | sd CARG1, FORL_EXT*8(RA) + |2: + if (op == BC_JFORI) { + | beqz CRET1, =>BC_JLOOP + |. decode_RD8b RD + } else if (op == BC_JFORL) { + | beqz CRET1, =>BC_JLOOP + } + | ins_next2 + | + |5: // FP loop. + |.if FPU + if (!vk) { + | ldc1 f0, FORL_IDX*8(RA) + | ldc1 f2, FORL_STOP*8(RA) + | sltiu TMP0, CARG3, LJ_TISNUM + | sltiu TMP1, CARG4, LJ_TISNUM + | sltiu AT, CRET2, LJ_TISNUM + | ld TMP3, FORL_STEP*8(RA) + | and TMP0, TMP0, TMP1 + | and AT, AT, TMP0 + | beqz AT, ->vmeta_for + |. slt TMP3, TMP3, r0 + | c.ole.d 0, f0, f2 + | c.ole.d 1, f2, f0 + | li CRET1, 1 + | movt CRET1, r0, 0 + | movt AT, r0, 1 + | b <1 + |. movn CRET1, AT, TMP3 + } else { + | ldc1 f0, FORL_IDX*8(RA) + | ldc1 f4, FORL_STEP*8(RA) + | ldc1 f2, FORL_STOP*8(RA) + | ld TMP3, FORL_STEP*8(RA) + | add.d f0, f0, f4 + | c.ole.d 0, f0, f2 + | c.ole.d 1, f2, f0 + | slt TMP3, TMP3, r0 + | li CRET1, 1 + | li AT, 1 + | movt CRET1, r0, 0 + | movt AT, r0, 1 + | movn CRET1, AT, TMP3 + if (op == BC_IFORL) { + | movn TMP2, r0, CRET1 + | daddu PC, PC, TMP2 + } + | sdc1 f0, FORL_IDX*8(RA) + | ins_next1 + | b <2 + |. sdc1 f0, FORL_EXT*8(RA) + } + |.else + if (!vk) { + | sltiu TMP0, CARG3, LJ_TISNUM + | sltiu TMP1, CARG4, LJ_TISNUM + | sltiu AT, CRET2, LJ_TISNUM + | and TMP0, TMP0, TMP1 + | and AT, AT, TMP0 + | beqz AT, ->vmeta_for + |. nop + | bal ->vm_sfcmpolex + |. lw TMP3, FORL_STEP*8+HI(RA) + | b <1 + |. nop + } else { + | load_got __adddf3 + | call_extern + |. sw TMP2, TMPD + | ld CARG2, FORL_STOP*8(RA) + | move CARG1, CRET1 + if ( op == BC_JFORL ) { + | lhu RD, -4+OFS_RD(PC) + | decode_RD8b RD + } + | bal ->vm_sfcmpolex + |. lw TMP3, FORL_STEP*8+HI(RA) + | b <1 + |. lw TMP2, TMPD + } + |.endif + break; + + case BC_ITERL: + |.if JIT + | hotloop + |.endif + | // Fall through. Assumes BC_IITERL follows. + break; + + case BC_JITERL: +#if !LJ_HASJIT + break; +#endif + case BC_IITERL: + | // RA = base*8, RD = target + | daddu RA, BASE, RA + | ld TMP1, 0(RA) + | beq TMP1, TISNIL, >1 // Stop if iterator returned nil. + |. nop + if (op == BC_JITERL) { + | b =>BC_JLOOP + |. sd TMP1, -8(RA) + } else { + | branch_RD // Otherwise save control var + branch. + | sd TMP1, -8(RA) + } + |1: + | ins_next + break; + + case BC_LOOP: + | // RA = base*8, RD = target (loop extent) + | // Note: RA/RD is only used by trace recorder to determine scope/extent + | // This opcode does NOT jump, it's only purpose is to detect a hot loop. + |.if JIT + | hotloop + |.endif + | // Fall through. Assumes BC_ILOOP follows. + break; + + case BC_ILOOP: + | // RA = base*8, RD = target (loop extent) + | ins_next + break; + + case BC_JLOOP: + |.if JIT + | NYI + |.endif + break; + + case BC_JMP: + | // RA = base*8 (only used by trace recorder), RD = target + | branch_RD + | ins_next + break; + + /* -- Function headers -------------------------------------------------- */ + + case BC_FUNCF: + |.if JIT + | hotcall + |.endif + case BC_FUNCV: /* NYI: compiled vararg functions. */ + | // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow. + break; + + case BC_JFUNCF: +#if !LJ_HASJIT + break; +#endif + case BC_IFUNCF: + | // BASE = new base, RA = BASE+framesize*8, RB = LFUNC, RC = nargs*8 + | ld TMP2, L->maxstack + | lbu TMP1, -4+PC2PROTO(numparams)(PC) + | ld KBASE, -4+PC2PROTO(k)(PC) + | sltu AT, TMP2, RA + | bnez AT, ->vm_growstack_l + |. sll TMP1, TMP1, 3 + if (op != BC_JFUNCF) { + | ins_next1 + } + |2: + | sltu AT, NARGS8:RC, TMP1 // Check for missing parameters. + | bnez AT, >3 + |. daddu AT, BASE, NARGS8:RC + if (op == BC_JFUNCF) { + | decode_RD8a RD, INS + | b =>BC_JLOOP + |. decode_RD8b RD + } else { + | ins_next2 + } + | + |3: // Clear missing parameters. + | sd TISNIL, 0(AT) + | b <2 + |. addiu NARGS8:RC, NARGS8:RC, 8 + break; + + case BC_JFUNCV: +#if !LJ_HASJIT + break; +#endif + | NYI // NYI: compiled vararg functions + break; /* NYI: compiled vararg functions. */ + + case BC_IFUNCV: + | // BASE = new base, RA = BASE+framesize*8, RB = LFUNC, RC = nargs*8 + | daddu TMP1, BASE, RC + | ld TMP2, L->maxstack + | daddu TMP0, RA, RC + | sd LFUNC:RB, 0(TMP1) // Store (untagged) copy of LFUNC. + | daddiu TMP3, RC, 16+FRAME_VARG + | sltu AT, TMP0, TMP2 + | ld KBASE, -4+PC2PROTO(k)(PC) + | beqz AT, ->vm_growstack_l + |. sd TMP3, 8(TMP1) // Store delta + FRAME_VARG. + | lbu TMP2, -4+PC2PROTO(numparams)(PC) + | move RA, BASE + | move RC, TMP1 + | ins_next1 + | beqz TMP2, >3 + |. daddiu BASE, TMP1, 16 + |1: + | ld TMP0, 0(RA) + | sltu AT, RA, RC // Less args than parameters? + | move CARG1, TMP0 + | movz TMP0, TISNIL, AT // Clear missing parameters. + | movn CARG1, TISNIL, AT // Clear old fixarg slot (help the GC). + | addiu TMP2, TMP2, -1 + | sd TMP0, 16(TMP1) + | daddiu TMP1, TMP1, 8 + | sd CARG1, 0(RA) + | bnez TMP2, <1 + |. daddiu RA, RA, 8 + |3: + | ins_next2 + break; + + case BC_FUNCC: + case BC_FUNCCW: + | // BASE = new base, RA = BASE+framesize*8, RB = CFUNC, RC = nargs*8 + if (op == BC_FUNCC) { + | ld CFUNCADDR, CFUNC:RB->f + } else { + | ld CFUNCADDR, DISPATCH_GL(wrapf)(DISPATCH) + } + | daddu TMP1, RA, NARGS8:RC + | ld TMP2, L->maxstack + | daddu RC, BASE, NARGS8:RC + | sd BASE, L->base + | sltu AT, TMP2, TMP1 + | sd RC, L->top + | li_vmstate C + if (op == BC_FUNCCW) { + | ld CARG2, CFUNC:RB->f + } + | bnez AT, ->vm_growstack_c // Need to grow stack. + |. move CARG1, L + | jalr CFUNCADDR // (lua_State *L [, lua_CFunction f]) + |. st_vmstate + | // Returns nresults. + | ld BASE, L->base + | sll RD, CRET1, 3 + | ld TMP1, L->top + | li_vmstate INTERP + | ld PC, FRAME_PC(BASE) // Fetch PC of caller. + | dsubu RA, TMP1, RD // RA = L->top - nresults*8 + | sd L, DISPATCH_GL(cur_L)(DISPATCH) + | b ->vm_returnc + |. st_vmstate + break; + + /* ---------------------------------------------------------------------- */ + + default: + fprintf(stderr, "Error: undefined opcode BC_%s\n", bc_names[op]); + exit(2); + break; + } +} + +static int build_backend(BuildCtx *ctx) +{ + int op; + + dasm_growpc(Dst, BC__MAX); + + build_subroutines(ctx); + + |.code_op + for (op = 0; op < BC__MAX; op++) + build_ins(ctx, (BCOp)op, op); + + return BC__MAX; +} + +/* Emit pseudo frame-info for all assembler functions. */ +static void emit_asm_debug(BuildCtx *ctx) +{ + int fcofs = (int)((uint8_t *)ctx->glob[GLOB_vm_ffi_call] - ctx->code); + int i; + switch (ctx->mode) { + case BUILD_elfasm: + fprintf(ctx->fp, "\t.section .debug_frame,\"\",@progbits\n"); + fprintf(ctx->fp, + ".Lframe0:\n" + "\t.4byte .LECIE0-.LSCIE0\n" + ".LSCIE0:\n" + "\t.4byte 0xffffffff\n" + "\t.byte 0x1\n" + "\t.string \"\"\n" + "\t.uleb128 0x1\n" + "\t.sleb128 -4\n" + "\t.byte 31\n" + "\t.byte 0xc\n\t.uleb128 29\n\t.uleb128 0\n" + "\t.align 2\n" + ".LECIE0:\n\n"); + fprintf(ctx->fp, + ".LSFDE0:\n" + "\t.4byte .LEFDE0-.LASFDE0\n" + ".LASFDE0:\n" + "\t.4byte .Lframe0\n" + "\t.8byte .Lbegin\n" + "\t.8byte %d\n" + "\t.byte 0xe\n\t.uleb128 %d\n" + "\t.byte 0x9f\n\t.sleb128 2*5\n" + "\t.byte 0x9e\n\t.sleb128 2*6\n", + fcofs, CFRAME_SIZE); + for (i = 23; i >= 16; i--) + fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+i, 2*(30-i)); +#if !LJ_SOFTFP + for (i = 31; i >= 24; i--) + fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+32+i, 2*(46-i)); +#endif + fprintf(ctx->fp, + "\t.align 2\n" + ".LEFDE0:\n\n"); +#if LJ_HASFFI + fprintf(ctx->fp, + ".LSFDE1:\n" + "\t.4byte .LEFDE1-.LASFDE1\n" + ".LASFDE1:\n" + "\t.4byte .Lframe0\n" + "\t.4byte lj_vm_ffi_call\n" + "\t.4byte %d\n" + "\t.byte 0x9f\n\t.uleb128 2*1\n" + "\t.byte 0x90\n\t.uleb128 2*2\n" + "\t.byte 0xd\n\t.uleb128 0x10\n" + "\t.align 2\n" + ".LEFDE1:\n\n", (int)ctx->codesz - fcofs); +#endif +#if !LJ_NO_UNWIND + /* NYI */ +#endif + break; + default: + break; + } +} + From 56fe899a06c5a05e8d4ceb8972b3c45dab1c2979 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sat, 28 May 2016 13:36:14 +0200 Subject: [PATCH 38/57] Proper fix for LJ_GC64 changes to asm_href(). --- src/lj_asm_x86.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index f5230a44..dcdc727a 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -1163,7 +1163,8 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) #if LJ_64 && !LJ_GC64 } else if (irt_islightud(kt)) { emit_rmro(as, XO_CMP, key|REX_64, dest, offsetof(Node, key.u64)); -#elif LJ_GC64 +#endif +#if LJ_GC64 } else if (irt_isaddr(kt)) { if (isk) { TValue k; @@ -1180,7 +1181,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) lua_assert(irt_ispri(kt) && !irt_isnil(kt)); emit_u32(as, (irt_toitype(kt)<<15)|0x7fff); emit_rmro(as, XO_ARITHi, XOg_CMP, dest, offsetof(Node, key.it)); -#endif +#else } else { if (!irt_ispri(kt)) { lua_assert(irt_isaddr(kt)); @@ -1194,6 +1195,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge) lua_assert(!irt_isnil(kt)); emit_i8(as, irt_toitype(kt)); emit_rmro(as, XO_ARITHi8, XOg_CMP, dest, offsetof(Node, key.it)); +#endif } emit_sfixup(as, l_loop); checkmclim(as); From 384ce2f9eff0322eca76a26fdaadd5840043710a Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 29 May 2016 18:12:58 +0200 Subject: [PATCH 39/57] MIPS: Fix build failures and warnings. --- src/Makefile.dep | 4 ++-- src/lj_arch.h | 4 ++-- src/lj_opt_split.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Makefile.dep b/src/Makefile.dep index 1df1ce6c..4ef002e9 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -162,8 +162,8 @@ lj_opt_narrow.o: lj_opt_narrow.c lj_obj.h lua.h luaconf.h lj_def.h \ lj_opt_sink.o: lj_opt_sink.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_ir.h lj_jit.h lj_iropt.h lj_target.h lj_target_*.h lj_opt_split.o: lj_opt_split.c lj_obj.h lua.h luaconf.h lj_def.h \ - lj_arch.h lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h lj_dispatch.h \ - lj_bc.h lj_jit.h lj_ir.h lj_ircall.h lj_iropt.h lj_vm.h + lj_arch.h lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h lj_ir.h \ + lj_jit.h lj_ircall.h lj_iropt.h lj_dispatch.h lj_bc.h lj_vm.h lj_parse.o: lj_parse.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h \ lj_gc.h lj_err.h lj_errmsg.h lj_debug.h lj_buf.h lj_str.h lj_tab.h \ lj_func.h lj_state.h lj_bc.h lj_ctype.h lj_strfmt.h lj_lex.h lj_parse.h \ diff --git a/src/lj_arch.h b/src/lj_arch.h index bbcdc739..cc5a0a66 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -428,11 +428,11 @@ #error "No support for PPC/e500 anymore (use LuaJIT 2.0)" #endif #elif LJ_TARGET_MIPS32 -#if _MIPS_SIM != _MIPS_SIM_ABI32 +#if !((defined(_MIPS_SIM_ABI32) && _MIPS_SIM == _MIPS_SIM_ABI32) || (defined(_ABIO32) && _MIPS_SIM == _ABIO32)) #error "Only o32 ABI supported for MIPS32" #endif #elif LJ_TARGET_MIPS64 -#if _MIPS_SIM != _MIPS_SIM_ABI64 +#if !((defined(_MIPS_SIM_ABI64) && _MIPS_SIM == _MIPS_SIM_ABI64) || (defined(_ABI64) && _MIPS_SIM == _ABI64)) #error "Only n64 ABI supported for MIPS64" #endif #endif diff --git a/src/lj_opt_split.c b/src/lj_opt_split.c index 19818660..884285d2 100644 --- a/src/lj_opt_split.c +++ b/src/lj_opt_split.c @@ -12,11 +12,11 @@ #include "lj_err.h" #include "lj_buf.h" -#include "lj_dispatch.h" #include "lj_ir.h" #include "lj_jit.h" #include "lj_ircall.h" #include "lj_iropt.h" +#include "lj_dispatch.h" #include "lj_vm.h" /* SPLIT pass: From a5f8a4819f21ddb9aa0fb4ddd3c221d4512968e9 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 3 Jun 2016 04:26:08 +0200 Subject: [PATCH 40/57] Don't try to record outermost pcall() return to lower frame. --- src/lj_record.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_record.c b/src/lj_record.c index 44b3667f..3f76c92a 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -687,7 +687,7 @@ void lj_record_ret(jit_State *J, BCReg rbase, ptrdiff_t gotresults) (void)getslot(J, rbase+i); /* Ensure all results have a reference. */ while (frame_ispcall(frame)) { /* Immediately resolve pcall() returns. */ BCReg cbase = (BCReg)frame_delta(frame); - if (--J->framedepth < 0) + if (--J->framedepth <= 0) lj_trace_err(J, LJ_TRERR_NYIRETL); lua_assert(J->baseslot > 1); gotresults++; From 32063075636d4f4017f318fc0a367dd49c730895 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 3 Jun 2016 06:39:52 +0200 Subject: [PATCH 41/57] Fix Valgrind suppressions. --- src/lj.supp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lj.supp b/src/lj.supp index acb9e789..217f7c89 100644 --- a/src/lj.supp +++ b/src/lj.supp @@ -27,15 +27,15 @@ { Optimized string compare Memcheck:Addr4 - fun:lj_str_fastcmp + fun:str_fastcmp } { Optimized string compare Memcheck:Addr1 - fun:lj_str_fastcmp + fun:str_fastcmp } { Optimized string compare Memcheck:Cond - fun:lj_str_fastcmp + fun:str_fastcmp } From 58ca165737363af4800683577f2a6cc92e640dcc Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 3 Jun 2016 06:42:35 +0200 Subject: [PATCH 42/57] LJ_GC64: Allow optional use of the system memory allocator. --- src/Makefile | 4 ++-- src/lib_aux.c | 6 +++--- src/lj_state.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Makefile b/src/Makefile index 16aeba19..fb5fdcb7 100644 --- a/src/Makefile +++ b/src/Makefile @@ -121,8 +121,8 @@ XCFLAGS= # # Use the system provided memory allocator (realloc) instead of the # bundled memory allocator. This is slower, but sometimes helpful for -# debugging. This option cannot be enabled on x64, since realloc usually -# doesn't return addresses in the right address range. +# debugging. This option cannot be enabled on x64 without GC64, since +# realloc usually doesn't return addresses in the right address range. # OTOH this option is mandatory for Valgrind's memcheck tool on x64 and # the only way to get useful results from it for all other architectures. #XCFLAGS+= -DLUAJIT_USE_SYSMALLOC diff --git a/src/lib_aux.c b/src/lib_aux.c index 0e61d951..d6f56e30 100644 --- a/src/lib_aux.c +++ b/src/lib_aux.c @@ -302,7 +302,7 @@ static int panic(lua_State *L) #ifdef LUAJIT_USE_SYSMALLOC -#if LJ_64 && !defined(LUAJIT_USE_VALGRIND) +#if LJ_64 && !LJ_GC64 && !defined(LUAJIT_USE_VALGRIND) #error "Must use builtin allocator for 64 bit target" #endif @@ -334,7 +334,7 @@ LUALIB_API lua_State *luaL_newstate(void) lua_State *L; void *ud = lj_alloc_create(); if (ud == NULL) return NULL; -#if LJ_64 +#if LJ_64 && !LJ_GC64 L = lj_state_newstate(lj_alloc_f, ud); #else L = lua_newstate(lj_alloc_f, ud); @@ -343,7 +343,7 @@ LUALIB_API lua_State *luaL_newstate(void) return L; } -#if LJ_64 +#if LJ_64 && !LJ_GC64 LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud) { UNUSED(f); UNUSED(ud); diff --git a/src/lj_state.c b/src/lj_state.c index 66bf439f..a3bfc45e 100644 --- a/src/lj_state.c +++ b/src/lj_state.c @@ -180,7 +180,7 @@ static void close_state(lua_State *L) g->allocf(g->allocd, G2GG(g), sizeof(GG_State), 0); } -#if LJ_64 && !(defined(LUAJIT_USE_VALGRIND) && defined(LUAJIT_USE_SYSMALLOC)) +#if LJ_64 && !LJ_GC64 && !(defined(LUAJIT_USE_VALGRIND) && defined(LUAJIT_USE_SYSMALLOC)) lua_State *lj_state_newstate(lua_Alloc f, void *ud) #else LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud) From cc05e79181992d0d595b44f13538486315667491 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 3 Jun 2016 06:53:37 +0200 Subject: [PATCH 43/57] LJ_GC64: Ensure all IR slot fields are initialized. --- src/lj_asm_x86.h | 1 + src/lj_ir.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index dcdc727a..35f48727 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -2970,6 +2970,7 @@ static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci) static void asm_setup_target(ASMState *as) { asm_exitstub_setup(as, as->T->nsnap); + as->mrm.base = 0; } /* -- Trace patching ------------------------------------------------------ */ diff --git a/src/lj_ir.c b/src/lj_ir.c index 0a206ebb..87fd0f4d 100644 --- a/src/lj_ir.c +++ b/src/lj_ir.c @@ -224,6 +224,7 @@ TRef lj_ir_k64(jit_State *J, IROp op, uint64_t u64) ir[1].tv.u64 = u64; ir->t.irt = t; ir->o = op; + ir->op12 = 0; ir->prev = J->chain[op]; J->chain[op] = (IRRef1)ref; found: @@ -281,6 +282,7 @@ TRef lj_ir_kgc(jit_State *J, GCobj *o, IRType t) ref = ir_nextkgc(J); ir = IR(ref); /* NOBARRIER: Current trace is a GC root. */ + ir->op12 = 0; setgcref(ir[LJ_GC64].gcr, o); ir->t.irt = (uint8_t)t; ir->o = IR_KGC; @@ -298,6 +300,7 @@ TRef lj_ir_ktrace(jit_State *J) lua_assert(irt_toitype_(IRT_P64) == LJ_TTRACE); ir->t.irt = IRT_P64; ir->o = LJ_GC64 ? IR_KNUM : IR_KNULL; /* Not IR_KGC yet, but same size. */ + ir->op12 = 0; ir->prev = 0; return TREF(ref, IRT_P64); } @@ -319,6 +322,7 @@ TRef lj_ir_kptr_(jit_State *J, IROp op, void *ptr) ref = ir_nextk(J); #endif ir = IR(ref); + ir->op12 = 0; setmref(ir[LJ_GC64].ptr, ptr); ir->t.irt = IRT_PGC; ir->o = op; From ce1ad870c3d9536c759ecada89717c3317d56a3d Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 3 Jun 2016 06:54:06 +0200 Subject: [PATCH 44/57] LJ_GC64: Set correct nil value when clearing a cdata finalizer. Thanks to Stefan Pejic. --- src/lj_cdata.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/lj_cdata.c b/src/lj_cdata.c index 05e27dad..e8ffdbcb 100644 --- a/src/lj_cdata.c +++ b/src/lj_cdata.c @@ -93,11 +93,13 @@ void lj_cdata_setfin(lua_State *L, GCcdata *cd, GCobj *obj, uint32_t it) setcdataV(L, &tmp, cd); lj_gc_anybarriert(L, t); tv = lj_tab_set(L, t, &tmp); - setgcV(L, tv, obj, it); - if (!tvisnil(tv)) - cd->marked |= LJ_GC_CDATA_FIN; - else + if (it == LJ_TNIL) { + setnilV(tv); cd->marked &= ~LJ_GC_CDATA_FIN; + } else { + setgcV(L, tv, obj, it); + cd->marked |= LJ_GC_CDATA_FIN; + } } } From a7bec69a7593b2887a6d1d8dd13272aa005c36fa Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 5 Jun 2016 12:53:37 +0200 Subject: [PATCH 45/57] Fix PHI remarking in SINK pass. Thanks to Vyacheslav Egorov. --- src/lj_opt_sink.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lj_opt_sink.c b/src/lj_opt_sink.c index 975ee831..c988cdfb 100644 --- a/src/lj_opt_sink.c +++ b/src/lj_opt_sink.c @@ -153,10 +153,9 @@ static void sink_remark_phi(jit_State *J) remark = 0; for (ir = IR(J->cur.nins-1); ir->o == IR_PHI; ir--) { IRIns *irl = IR(ir->op1), *irr = IR(ir->op2); - if (((irl->t.irt ^ irr->t.irt) & IRT_MARK)) - remark = 1; - else if (irl->prev == irr->prev) + if (!((irl->t.irt ^ irr->t.irt) & IRT_MARK) && irl->prev == irr->prev) continue; + remark |= (~(irl->t.irt & irr->t.irt) & IRT_MARK); irt_setmark(IR(ir->op1)->t); irt_setmark(IR(ir->op2)->t); } From f5983437a6b08c140bdeb2fc15fa30d7f3b0daad Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 5 Jun 2016 13:07:43 +0200 Subject: [PATCH 46/57] x64/LJ_GC64: Fix code generation for IR_KNULL call argument. --- src/lj_asm_x86.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 35f48727..50784daa 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -599,7 +599,7 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) if (r) { /* Argument is in a register. */ if (r < RID_MAX_GPR && ref < ASMREF_TMP1) { #if LJ_64 - if (LJ_GC64 ? ir->o != IR_KINT : ir->o == IR_KINT64) + if (LJ_GC64 ? !(ir->o == IR_KINT || ir->o == IR_KNULL) : ir->o == IR_KINT64) emit_loadu64(as, r, ir_k64(ir)->u64); else #endif From 287a5347cfe452d44748327fb7c27f6ce57f5dc2 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 8 Jun 2016 10:24:00 +0200 Subject: [PATCH 47/57] MIPS: Support MIPS16 interlinking. --- doc/install.html | 2 +- src/jit/dis_mips.lua | 2 +- src/lib_jit.c | 4 ++++ src/lj_emit_mips.h | 3 ++- src/lj_target_mips.h | 1 + 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/install.html b/doc/install.html index 38c9a6bb..bbc71c98 100644 --- a/doc/install.html +++ b/doc/install.html @@ -386,7 +386,7 @@ important to compile with the proper CPU or architecture settings:
  • The best way to get consistent results is to specify the correct settings when building the toolchain yourself.
  • For a pre-built, generic toolchain add -mcpu=... or -march=... and other necessary flags to TARGET_CFLAGS.
  • For ARM it's important to have the correct -mfloat-abi=... setting, too. Otherwise LuaJIT may not run at the full performance of your target CPU.
  • -
  • For MIPS it's important to select a supported ABI (o32 on MIPS32, n64 on MIPS64) and consistently compile your project either with hard-float or soft-float compiler settings. Do not use -mips16.
  • +
  • For MIPS it's important to select a supported ABI (o32 on MIPS32, n64 on MIPS64) and consistently compile your project either with hard-float or soft-float compiler settings.
  • Here are some examples for targets with a different CPU than the host: diff --git a/src/jit/dis_mips.lua b/src/jit/dis_mips.lua index 19327d6f..6776f0cb 100644 --- a/src/jit/dis_mips.lua +++ b/src/jit/dis_mips.lua @@ -214,7 +214,7 @@ local map_pri = { map_cop0, map_cop1, false, map_cop1x, "beql|beqzlST0B", "bnel|bnezlST0B", "blezlSB", "bgtzlSB", false, false, false, false, - map_special2, false, false, map_special3, + map_special2, "jalxJ", false, map_special3, "lbTSO", "lhTSO", "lwlTSO", "lwTSO", "lbuTSO", "lhuTSO", "lwrTSO", false, "sbTSO", "shTSO", "swlTSO", "swTSO", diff --git a/src/lib_jit.c b/src/lib_jit.c index 1655f0c5..592538bd 100644 --- a/src/lib_jit.c +++ b/src/lib_jit.c @@ -721,8 +721,12 @@ static uint32_t jit_cpudetect(lua_State *L) #if defined(__GNUC__) if (!(flags & JIT_F_MIPSXXR2)) { int x; +#ifdef __mips16 + x = 0; /* Runtime detection is difficult. Ensure optimal -march flags. */ +#else /* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */ __asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2"); +#endif if (x) flags |= JIT_F_MIPSXXR2; /* Either 0x80000000 (R2) or 0 (R1). */ } #endif diff --git a/src/lj_emit_mips.h b/src/lj_emit_mips.h index 9df04771..d35f830b 100644 --- a/src/lj_emit_mips.h +++ b/src/lj_emit_mips.h @@ -157,7 +157,8 @@ static void emit_call(ASMState *as, void *target, int needcfa) MCode *p = as->mcp; *--p = MIPSI_NOP; if ((((uintptr_t)target ^ (uintptr_t)p) >> 28) == 0) { - *--p = MIPSI_JAL | (((uintptr_t)target >>2) & 0x03ffffffu); + *--p = (((uintptr_t)target & 1) ? MIPSI_JALX : MIPSI_JAL) | + (((uintptr_t)target >>2) & 0x03ffffffu); } else { /* Target out of range: need indirect call. */ *--p = MIPSI_JALR | MIPSF_S(RID_CFUNCADDR); needcfa = 1; diff --git a/src/lj_target_mips.h b/src/lj_target_mips.h index ac72528d..6a7d4b50 100644 --- a/src/lj_target_mips.h +++ b/src/lj_target_mips.h @@ -239,6 +239,7 @@ typedef enum MIPSIns { MIPSI_B = 0x10000000, MIPSI_J = 0x08000000, MIPSI_JAL = 0x0c000000, + MIPSI_JALX = 0x74000000, MIPSI_JR = 0x00000008, MIPSI_JALR = 0x0000f809, From aef4edddbabb9b510541abd131f9298a25a52e7d Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 27 Jun 2016 14:09:36 +0200 Subject: [PATCH 48/57] Drop leftover regs in 'for' iterator assignment, too. --- src/lj_parse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lj_parse.c b/src/lj_parse.c index 2a601f1d..6b5f12fb 100644 --- a/src/lj_parse.c +++ b/src/lj_parse.c @@ -2207,6 +2207,8 @@ static void assign_adjust(LexState *ls, BCReg nvars, BCReg nexps, ExpDesc *e) bcemit_nil(fs, reg, (BCReg)extra); } } + if (nexps > nvars) + ls->fs->freereg -= nexps - nvars; /* Drop leftover regs. */ } /* Recursively parse assignment statement. */ @@ -2240,8 +2242,6 @@ static void parse_assignment(LexState *ls, LHSVarList *lh, BCReg nvars) return; } assign_adjust(ls, nvars, nexps, &e); - if (nexps > nvars) - ls->fs->freereg -= nexps - nvars; /* Drop leftover regs. */ } /* Assign RHS to LHS and recurse downwards. */ expr_init(&e, VNONRELOC, ls->fs->freereg-1); From 01e4754962130dc85802a9738707d7fdb76c879c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 27 Jun 2016 15:46:09 +0200 Subject: [PATCH 49/57] Properly clean up state before restart of trace assembly. --- src/lj_asm.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index dba5c178..7ce58924 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -1993,12 +1993,6 @@ static void asm_setup_regsp(ASMState *as) /* REF_BASE is used for implicit references to the BASE register. */ lastir->prev = REGSP_HINT(RID_BASE); - ir = IR(nins-1); - if (ir->o == IR_RENAME) { - /* Remove any renames left over from ASM restart due to LJ_TRERR_MCODELM. */ - do { ir--; nins--; } while (ir->o == IR_RENAME); - T->nins = nins; - } as->snaprename = nins; as->snapref = nins; as->snapno = T->nsnap; @@ -2230,6 +2224,16 @@ void lj_asm_trace(jit_State *J, GCtrace *T) ASMState *as = &as_; MCode *origtop; + /* Remove nops/renames left over from ASM restart due to LJ_TRERR_MCODELM. */ + { + IRRef nins = T->nins; + IRIns *ir = &T->ir[nins-1]; + if (ir->o == IR_NOP || ir->o == IR_RENAME) { + do { ir--; nins--; } while (ir->o == IR_NOP || ir->o == IR_RENAME); + T->nins = nins; + } + } + /* Ensure an initialized instruction beyond the last one for HIOP checks. */ /* This also allows one RENAME to be added without reallocating curfinal. */ as->orignins = lj_ir_nextins(J); From 1914de71c7dc10c502ecf033c63665eb6d3e6433 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 17 Jul 2016 14:29:03 +0200 Subject: [PATCH 50/57] Fix unused vars etc. in internal Lua files. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thanks to François Perrad. --- src/host/genminilua.lua | 8 ++++---- src/jit/dis_arm.lua | 2 +- src/jit/dis_mips.lua | 4 ++-- src/jit/dis_ppc.lua | 2 +- src/jit/dump.lua | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/host/genminilua.lua b/src/host/genminilua.lua index e9a68297..16a81a23 100644 --- a/src/host/genminilua.lua +++ b/src/host/genminilua.lua @@ -157,11 +157,11 @@ local function merge_includes(src) if includes[name] then return "" end includes[name] = true local fp = assert(io.open(LUA_SOURCE..name, "r")) - local src = fp:read("*a") + local inc = fp:read("*a") assert(fp:close()) - src = gsub(src, "#ifndef%s+%w+_h\n#define%s+%w+_h\n", "") - src = gsub(src, "#endif%s*$", "") - return merge_includes(src) + inc = gsub(inc, "#ifndef%s+%w+_h\n#define%s+%w+_h\n", "") + inc = gsub(inc, "#endif%s*$", "") + return merge_includes(inc) end) end diff --git a/src/jit/dis_arm.lua b/src/jit/dis_arm.lua index 661f661a..8ad902cd 100644 --- a/src/jit/dis_arm.lua +++ b/src/jit/dis_arm.lua @@ -12,7 +12,7 @@ local type = type local sub, byte, format = string.sub, string.byte, string.format -local match, gmatch, gsub = string.match, string.gmatch, string.gsub +local match, gmatch = string.match, string.gmatch local concat = table.concat local bit = require("bit") local band, bor, ror, tohex = bit.band, bit.bor, bit.ror, bit.tohex diff --git a/src/jit/dis_mips.lua b/src/jit/dis_mips.lua index 5b68b069..fe9cadb8 100644 --- a/src/jit/dis_mips.lua +++ b/src/jit/dis_mips.lua @@ -11,8 +11,8 @@ ------------------------------------------------------------------------------ local type = type -local sub, byte, format = string.sub, string.byte, string.format -local match, gmatch, gsub = string.match, string.gmatch, string.gsub +local byte, format = string.byte, string.format +local match, gmatch = string.match, string.gmatch local concat = table.concat local bit = require("bit") local band, bor, tohex = bit.band, bit.bor, bit.tohex diff --git a/src/jit/dis_ppc.lua b/src/jit/dis_ppc.lua index 8afecbe6..1a98c7ec 100644 --- a/src/jit/dis_ppc.lua +++ b/src/jit/dis_ppc.lua @@ -13,7 +13,7 @@ ------------------------------------------------------------------------------ local type = type -local sub, byte, format = string.sub, string.byte, string.format +local byte, format = string.byte, string.format local match, gmatch, gsub = string.match, string.gmatch, string.gsub local concat = table.concat local bit = require("bit") diff --git a/src/jit/dump.lua b/src/jit/dump.lua index ec5f8276..7b776422 100644 --- a/src/jit/dump.lua +++ b/src/jit/dump.lua @@ -63,9 +63,9 @@ local traceinfo, traceir, tracek = jutil.traceinfo, jutil.traceir, jutil.tracek local tracemc, tracesnap = jutil.tracemc, jutil.tracesnap local traceexitstub, ircalladdr = jutil.traceexitstub, jutil.ircalladdr local bit = require("bit") -local band, shl, shr = bit.band, bit.lshift, bit.rshift +local band, shr = bit.band, bit.rshift local sub, gsub, format = string.sub, string.gsub, string.format -local byte, char, rep = string.byte, string.char, string.rep +local byte, rep = string.byte, string.rep local type, tostring = type, tostring local stdout, stderr = io.stdout, io.stderr @@ -207,7 +207,7 @@ local colortype_ansi = { "\027[35m%s\027[m", } -local function colorize_text(s, t) +local function colorize_text(s) return s end From b74ddaf174ce087cf9d6087766afcb7180656661 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 17 Jul 2016 16:01:10 +0200 Subject: [PATCH 51/57] Fix for cdata vs. non-cdata arithmetics/comparisons. Thanks to Vyacheslav Egorov. --- src/lj_crecord.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lj_crecord.c b/src/lj_crecord.c index a53a2ce9..0565e5fd 100644 --- a/src/lj_crecord.c +++ b/src/lj_crecord.c @@ -1205,7 +1205,7 @@ void LJ_FASTCALL recff_cdata_call(jit_State *J, RecordFFData *rd) static TRef crec_arith_int64(jit_State *J, TRef *sp, CType **s, MMS mm) { - if (ctype_isnum(s[0]->info) && ctype_isnum(s[1]->info)) { + if (sp[0] && sp[1] && ctype_isnum(s[0]->info) && ctype_isnum(s[1]->info)) { IRType dt; CTypeID id; TRef tr; @@ -1263,6 +1263,7 @@ static TRef crec_arith_ptr(jit_State *J, TRef *sp, CType **s, MMS mm) { CTState *cts = ctype_ctsG(J2G(J)); CType *ctp = s[0]; + if (!(sp[0] && sp[1])) return 0; if (ctype_isptr(ctp->info) || ctype_isrefarray(ctp->info)) { if ((mm == MM_sub || mm == MM_eq || mm == MM_lt || mm == MM_le) && (ctype_isptr(s[1]->info) || ctype_isrefarray(s[1]->info))) { From 6be5ffdf2b20b870392ee265f6ff7792f072592f Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 17 Jul 2016 16:05:25 +0200 Subject: [PATCH 52/57] Adjust comment with defines. --- src/lj_ctype.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/lj_ctype.h b/src/lj_ctype.h index 6866736e..39759057 100644 --- a/src/lj_ctype.h +++ b/src/lj_ctype.h @@ -42,18 +42,18 @@ LJ_STATIC_ASSERT(((int)CT_STRUCT & (int)CT_ARRAY) == CT_STRUCT); ** ---------- info ------------ ** |type flags... A cid | size | sib | next | name | ** +----------------------------+--------+-------+-------+-------+-- -** |NUM BFvcUL.. A | size | | type | | -** |STRUCT ..vcU..V A | size | field | name? | name? | -** |PTR ..vcR... A cid | size | | type | | -** |ARRAY VCvc...V A cid | size | | type | | -** |VOID ..vc.... A | size | | type | | +** |NUM BFcvUL.. A | size | | type | | +** |STRUCT ..cvU..V A | size | field | name? | name? | +** |PTR ..cvR... A cid | size | | type | | +** |ARRAY VCcv...V A cid | size | | type | | +** |VOID ..cv.... A | size | | type | | ** |ENUM A cid | size | const | name? | name? | ** |FUNC ....VS.. cc cid | nargs | field | name? | name? | ** |TYPEDEF cid | | | name | name | ** |ATTRIB attrnum cid | attr | sib? | type? | | ** |FIELD cid | offset | field | | name? | -** |BITFIELD B.vcU csz bsz pos | offset | field | | name? | -** |CONSTVAL c cid | value | const | name | name | +** |BITFIELD B.cvU csz bsz pos | offset | field | | name? | +** |CONSTVAL c cid | value | const | name | name | ** |EXTERN cid | | sib? | name | name | ** |KW tok | size | | name | name | ** +----------------------------+--------+-------+-------+-------+-- From 92d9ff211ae864777a8580b5a7326d5f408161ce Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 17 Jul 2016 16:23:49 +0200 Subject: [PATCH 53/57] Set arg table before evaluating LUA_INIT and -e chunks. --- doc/extensions.html | 1 + src/luajit.c | 103 +++++++++++++++++++++++++------------------- 2 files changed, 60 insertions(+), 44 deletions(-) diff --git a/doc/extensions.html b/doc/extensions.html index 368b527c..9d666293 100644 --- a/doc/extensions.html +++ b/doc/extensions.html @@ -349,6 +349,7 @@ break the Lua/C API and ABI (e.g. _ENV). LuaJIT supports some extensions from Lua 5.3:

    • Unicode escape '\u{XX...}' embeds the UTF-8 encoding in string literals.
    • +
    • The argument table arg can be read (and modified) by LUA_INIT and -e chunks.

    C++ Exception Interoperability

    diff --git a/src/luajit.c b/src/luajit.c index 00e12bd3..4de2d43c 100644 --- a/src/luajit.c +++ b/src/luajit.c @@ -152,22 +152,15 @@ static void print_jit_status(lua_State *L) putc('\n', stdout); } -static int getargs(lua_State *L, char **argv, int n) +static void createargtable(lua_State *L, char **argv, int argc, int argf) { - int narg; int i; - int argc = 0; - while (argv[argc]) argc++; /* count total number of arguments */ - narg = argc - (n + 1); /* number of arguments to the script */ - luaL_checkstack(L, narg + 3, "too many arguments to script"); - for (i = n+1; i < argc; i++) - lua_pushstring(L, argv[i]); - lua_createtable(L, narg, n + 1); + lua_createtable(L, argc - argf, argf); for (i = 0; i < argc; i++) { lua_pushstring(L, argv[i]); - lua_rawseti(L, -2, i - n); + lua_rawseti(L, -2, i - argf); } - return narg; + lua_setglobal(L, "arg"); } static int dofile(lua_State *L, const char *name) @@ -273,21 +266,30 @@ static void dotty(lua_State *L) progname = oldprogname; } -static int handle_script(lua_State *L, char **argv, int n) +static int handle_script(lua_State *L, char **argx) { int status; - const char *fname; - int narg = getargs(L, argv, n); /* collect arguments */ - lua_setglobal(L, "arg"); - fname = argv[n]; - if (strcmp(fname, "-") == 0 && strcmp(argv[n-1], "--") != 0) + const char *fname = argx[0]; + if (strcmp(fname, "-") == 0 && strcmp(argx[-1], "--") != 0) fname = NULL; /* stdin */ status = luaL_loadfile(L, fname); - lua_insert(L, -(narg+1)); - if (status == 0) + if (status == 0) { + /* Fetch args from arg table. LUA_INIT or -e might have changed them. */ + int narg = 0; + lua_getglobal(L, "arg"); + if (lua_istable(L, -1)) { + do { + narg++; + lua_rawgeti(L, -narg, narg); + } while (!lua_isnil(L, -1)); + lua_pop(L, 1); + lua_remove(L, -narg); + narg--; + } else { + lua_pop(L, 1); + } status = docall(L, narg, 0); - else - lua_pop(L, narg); + } return report(L, status); } @@ -384,7 +386,8 @@ static int dobytecode(lua_State *L, char **argv) } for (argv++; *argv != NULL; narg++, argv++) lua_pushstring(L, *argv); - return report(L, lua_pcall(L, narg, 0, 0)); + report(L, lua_pcall(L, narg, 0, 0)); + return 1; } /* check that argument has no extra characters at the end */ @@ -405,7 +408,7 @@ static int collectargs(char **argv, int *flags) switch (argv[i][1]) { /* Check option. */ case '-': notail(argv[i]); - return (argv[i+1] != NULL ? i+1 : 0); + return i+1; case '\0': return i; case 'i': @@ -430,23 +433,23 @@ static int collectargs(char **argv, int *flags) case 'b': /* LuaJIT extension */ if (*flags) return -1; *flags |= FLAGS_EXEC; - return 0; + return i+1; case 'E': *flags |= FLAGS_NOENV; break; default: return -1; /* invalid option */ } } - return 0; + return i; } -static int runargs(lua_State *L, char **argv, int n) +static int runargs(lua_State *L, char **argv, int argn) { int i; - for (i = 1; i < n; i++) { + for (i = 1; i < argn; i++) { if (argv[i] == NULL) continue; lua_assert(argv[i][0] == '-'); - switch (argv[i][1]) { /* option */ + switch (argv[i][1]) { case 'e': { const char *chunk = argv[i] + 2; if (*chunk == '\0') chunk = argv[++i]; @@ -460,10 +463,10 @@ static int runargs(lua_State *L, char **argv, int n) if (*filename == '\0') filename = argv[++i]; lua_assert(filename != NULL); if (dolibrary(L, filename)) - return 1; /* stop if file fails */ + return 1; break; } - case 'j': { /* LuaJIT extension */ + case 'j': { /* LuaJIT extension. */ const char *cmd = argv[i] + 2; if (*cmd == '\0') cmd = argv[++i]; lua_assert(cmd != NULL); @@ -471,11 +474,11 @@ static int runargs(lua_State *L, char **argv, int n) return 1; break; } - case 'O': /* LuaJIT extension */ + case 'O': /* LuaJIT extension. */ if (dojitopt(L, argv[i] + 2)) return 1; break; - case 'b': /* LuaJIT extension */ + case 'b': /* LuaJIT extension. */ return dobytecode(L, argv+i); default: break; } @@ -508,45 +511,57 @@ static int pmain(lua_State *L) { struct Smain *s = &smain; char **argv = s->argv; - int script; + int argn; int flags = 0; globalL = L; if (argv[0] && argv[0][0]) progname = argv[0]; - LUAJIT_VERSION_SYM(); /* linker-enforced version check */ - script = collectargs(argv, &flags); - if (script < 0) { /* invalid args? */ + + LUAJIT_VERSION_SYM(); /* Linker-enforced version check. */ + + argn = collectargs(argv, &flags); + if (argn < 0) { /* Invalid args? */ print_usage(); s->status = 1; return 0; } + if ((flags & FLAGS_NOENV)) { lua_pushboolean(L, 1); lua_setfield(L, LUA_REGISTRYINDEX, "LUA_NOENV"); } - lua_gc(L, LUA_GCSTOP, 0); /* stop collector during initialization */ - luaL_openlibs(L); /* open libraries */ + + /* Stop collector during library initialization. */ + lua_gc(L, LUA_GCSTOP, 0); + luaL_openlibs(L); lua_gc(L, LUA_GCRESTART, -1); + + createargtable(L, argv, s->argc, argn); + if (!(flags & FLAGS_NOENV)) { s->status = handle_luainit(L); if (s->status != 0) return 0; } + if ((flags & FLAGS_VERSION)) print_version(); - s->status = runargs(L, argv, (script > 0) ? script : s->argc); + + s->status = runargs(L, argv, argn); if (s->status != 0) return 0; - if (script) { - s->status = handle_script(L, argv, script); + + if (s->argc > argn) { + s->status = handle_script(L, argv + argn); if (s->status != 0) return 0; } + if ((flags & FLAGS_INTERACTIVE)) { print_jit_status(L); dotty(L); - } else if (script == 0 && !(flags & (FLAGS_EXEC|FLAGS_VERSION))) { + } else if (s->argc == argn && !(flags & (FLAGS_EXEC|FLAGS_VERSION))) { if (lua_stdin_is_tty()) { print_version(); print_jit_status(L); dotty(L); } else { - dofile(L, NULL); /* executes stdin as a file */ + dofile(L, NULL); /* Executes stdin as a file. */ } } return 0; @@ -555,7 +570,7 @@ static int pmain(lua_State *L) int main(int argc, char **argv) { int status; - lua_State *L = lua_open(); /* create state */ + lua_State *L = lua_open(); if (L == NULL) { l_message(argv[0], "cannot create state: not enough memory"); return EXIT_FAILURE; From d41469c12461da00db73d548d0fbf018fb28494a Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 17 Jul 2016 16:24:30 +0200 Subject: [PATCH 54/57] Emit bytecode in .c/.h files with unsigned char type. --- src/jit/bcsave.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/jit/bcsave.lua b/src/jit/bcsave.lua index d0968b18..5117f714 100644 --- a/src/jit/bcsave.lua +++ b/src/jit/bcsave.lua @@ -125,12 +125,12 @@ extern "C" #ifdef _WIN32 __declspec(dllexport) #endif -const char %s%s[] = { +const unsigned char %s%s[] = { ]], LJBC_PREFIX, ctx.modname)) else fp:write(string.format([[ #define %s%s_SIZE %d -static const char %s%s[] = { +static const unsigned char %s%s[] = { ]], LJBC_PREFIX, ctx.modname, #s, LJBC_PREFIX, ctx.modname)) end local t, n, m = {}, 0, 0 From c98660c8c3921e43029625e51166c9d273ad09df Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 22 Jul 2016 16:35:46 +0200 Subject: [PATCH 55/57] Must preserve J->fold.ins (fins) around call to lj_ir_ksimd(). --- src/lj_opt_fold.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 73a368ed..5f4b8810 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -998,8 +998,10 @@ LJFOLDF(simplify_nummuldiv_k) if (n == 1.0) { /* x o 1 ==> x */ return LEFTFOLD; } else if (n == -1.0) { /* x o -1 ==> -x */ + IRRef op1 = fins->op1; + fins->op2 = (IRRef1)lj_ir_ksimd(J, LJ_KSIMD_NEG); /* Modifies fins. */ + fins->op1 = op1; fins->o = IR_NEG; - fins->op2 = (IRRef1)lj_ir_ksimd(J, LJ_KSIMD_NEG); return RETRYFOLD; } else if (fins->o == IR_MUL && n == 2.0) { /* x * 2 ==> x + x */ fins->o = IR_ADD; From 972a1a4cc6791b4595941d32f27ce2242012164c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Fri, 22 Jul 2016 16:42:09 +0200 Subject: [PATCH 56/57] Fix exit status for 'luajit -b'. --- src/luajit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/luajit.c b/src/luajit.c index 4de2d43c..e582f469 100644 --- a/src/luajit.c +++ b/src/luajit.c @@ -387,7 +387,7 @@ static int dobytecode(lua_State *L, char **argv) for (argv++; *argv != NULL; narg++, argv++) lua_pushstring(L, *argv); report(L, lua_pcall(L, narg, 0, 0)); - return 1; + return -1; } /* check that argument has no extra characters at the end */ @@ -580,6 +580,6 @@ int main(int argc, char **argv) status = lua_cpcall(L, pmain, NULL); report(L, status); lua_close(L); - return (status || smain.status) ? EXIT_FAILURE : EXIT_SUCCESS; + return (status || smain.status > 0) ? EXIT_FAILURE : EXIT_SUCCESS; } From 02b9b5597679658bbd8716cdeaaa33f04b1f6a3c Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 31 Jul 2016 13:26:58 +0200 Subject: [PATCH 57/57] Revert "OSX: Switch to Clang as the default compiler." It breaks cross-compilation to Android. And host "gcc" aliases to "clang", anyway. --- doc/install.html | 4 ++-- src/Makefile | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/doc/install.html b/doc/install.html index a34f39f0..e3139583 100644 --- a/doc/install.html +++ b/doc/install.html @@ -455,8 +455,8 @@ Or use Android. :-p ISDKP=$(xcrun --sdk iphoneos --show-sdk-path) ICC=$(xcrun --sdk iphoneos --find clang) ISDKF="-arch armv7 -isysroot $ISDKP" -make HOST_CC="clang -m32 -arch i386" CROSS="$(dirname $ICC)/" \ - TARGET_FLAGS="$ISDKF" TARGET_SYS=iOS +make DEFAULT_CC=clang HOST_CC="clang -m32 -arch i386" \ + CROSS="$(dirname $ICC)/" TARGET_FLAGS="$ISDKF" TARGET_SYS=iOS

    Cross-compiling for consoles

    diff --git a/src/Makefile b/src/Makefile index 1cddf188..9285c11f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -169,10 +169,6 @@ else HOST_SYS= Windows HOST_MSYS= cygwin endif - # Use Clang for OSX host. - ifeq (Darwin,$(HOST_SYS)) - DEFAULT_CC= clang - endif endif ##############################################################################
    Lua 5.1
    API+ABI
    + JIT+ BitOp+ FFIDrop-in
    DLL/.so