diff --git a/src/host/buildvm_peobj.c b/src/host/buildvm_peobj.c index 5bca6df8..e3e1026e 100644 --- a/src/host/buildvm_peobj.c +++ b/src/host/buildvm_peobj.c @@ -9,7 +9,7 @@ #include "buildvm.h" #include "lj_bc.h" -#if LJ_TARGET_X86ORX64 +#if LJ_TARGET_WINDOWS /* Context for PE object emitter. */ static char *strtab; @@ -93,6 +93,17 @@ typedef struct PEsymaux { #define PEOBJ_RELOC_ADDR32NB 0x03 #define PEOBJ_RELOC_OFS 0 #define PEOBJ_TEXT_FLAGS 0x60500020 /* 60=r+x, 50=align16, 20=code. */ +#define PEOBJ_PDATA_NRELOC 6 +#define PEOBJ_XDATA_SIZE (8*2+4+6*2) +#elif LJ_TARGET_ARM64 +#define PEOBJ_ARCH_TARGET 0xaa64 +#define PEOBJ_RELOC_REL32 0x03 /* MS: BRANCH26. */ +#define PEOBJ_RELOC_DIR32 0x01 +#define PEOBJ_RELOC_ADDR32NB 0x02 +#define PEOBJ_RELOC_OFS (-4) +#define PEOBJ_TEXT_FLAGS 0x60500020 /* 60=r+x, 50=align16, 20=code. */ +#define PEOBJ_PDATA_NRELOC 4 +#define PEOBJ_XDATA_SIZE (4+24+4 +4+8) #endif /* Section numbers (0-based). */ @@ -100,7 +111,7 @@ enum { PEOBJ_SECT_ABS = -2, PEOBJ_SECT_UNDEF = -1, PEOBJ_SECT_TEXT, -#if LJ_TARGET_X64 +#ifdef PEOBJ_PDATA_NRELOC PEOBJ_SECT_PDATA, PEOBJ_SECT_XDATA, #elif LJ_TARGET_X86 @@ -175,6 +186,9 @@ void emit_peobj(BuildCtx *ctx) uint32_t sofs; int i, nrsym; union { uint8_t b; uint32_t u; } host_endian; +#ifdef PEOBJ_PDATA_NRELOC + uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs; +#endif sofs = sizeof(PEheader) + PEOBJ_NSECTIONS*sizeof(PEsection); @@ -188,18 +202,18 @@ void emit_peobj(BuildCtx *ctx) /* Flags: 60 = read+execute, 50 = align16, 20 = code. 
*/ pesect[PEOBJ_SECT_TEXT].flags = PEOBJ_TEXT_FLAGS; -#if LJ_TARGET_X64 +#ifdef PEOBJ_PDATA_NRELOC memcpy(pesect[PEOBJ_SECT_PDATA].name, ".pdata", sizeof(".pdata")-1); pesect[PEOBJ_SECT_PDATA].ofs = sofs; - sofs += (pesect[PEOBJ_SECT_PDATA].size = 6*4); + sofs += (pesect[PEOBJ_SECT_PDATA].size = PEOBJ_PDATA_NRELOC*4); pesect[PEOBJ_SECT_PDATA].relocofs = sofs; - sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = 6) * PEOBJ_RELOC_SIZE; + sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = PEOBJ_PDATA_NRELOC) * PEOBJ_RELOC_SIZE; /* Flags: 40 = read, 30 = align4, 40 = initialized data. */ pesect[PEOBJ_SECT_PDATA].flags = 0x40300040; memcpy(pesect[PEOBJ_SECT_XDATA].name, ".xdata", sizeof(".xdata")-1); pesect[PEOBJ_SECT_XDATA].ofs = sofs; - sofs += (pesect[PEOBJ_SECT_XDATA].size = 8*2+4+6*2); /* See below. */ + sofs += (pesect[PEOBJ_SECT_XDATA].size = PEOBJ_XDATA_SIZE); /* See below. */ pesect[PEOBJ_SECT_XDATA].relocofs = sofs; sofs += (pesect[PEOBJ_SECT_XDATA].nreloc = 1) * PEOBJ_RELOC_SIZE; /* Flags: 40 = read, 30 = align4, 40 = initialized data. */ @@ -234,7 +248,7 @@ void emit_peobj(BuildCtx *ctx) */ nrsym = ctx->nrelocsym; pehdr.nsyms = 1+PEOBJ_NSECTIONS*2 + 1+ctx->nsym + nrsym; -#if LJ_TARGET_X64 +#ifdef PEOBJ_PDATA_NRELOC pehdr.nsyms += 1; /* Symbol for lj_err_unwind_win. */ #endif @@ -259,7 +273,6 @@ void emit_peobj(BuildCtx *ctx) #if LJ_TARGET_X64 { /* Write .pdata section. */ - uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs; uint32_t pdata[3]; /* Start of .text, end of .text and .xdata. */ PEreloc reloc; pdata[0] = 0; pdata[1] = fcofs; pdata[2] = 0; @@ -308,6 +321,88 @@ void emit_peobj(BuildCtx *ctx) reloc.type = PEOBJ_RELOC_ADDR32NB; owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); } +#elif LJ_TARGET_ARM64 + /* https://learn.microsoft.com/en-us/cpp/build/arm64-exception-handling */ + { /* Write .pdata section. 
*/ + uint32_t pdata[4]; + PEreloc reloc; + pdata[0] = 0; + pdata[1] = 0; + pdata[2] = fcofs; + pdata[3] = 4+24+4; + owrite(ctx, &pdata, sizeof(pdata)); + /* Start of .text and start of .xdata. */ + reloc.vaddr = 0; reloc.symidx = 1+2+nrsym+2+2+1; + reloc.type = PEOBJ_RELOC_ADDR32NB; + owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); + reloc.vaddr = 4; reloc.symidx = 1+2+nrsym+2; + reloc.type = PEOBJ_RELOC_ADDR32NB; + owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); + /* Start of vm_ffi_call and start of second part of .xdata. */ + reloc.vaddr = 8; reloc.symidx = 1+2+nrsym+2+2+1; + reloc.type = PEOBJ_RELOC_ADDR32NB; + owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); + reloc.vaddr = 12; reloc.symidx = 1+2+nrsym+2; + reloc.type = PEOBJ_RELOC_ADDR32NB; + owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); + } + { /* Write .xdata section. */ + uint32_t u32; + uint8_t *p, uwc[24]; + PEreloc reloc; + +#define CBE16(x) (*p = ((x) >> 8) & 0xff, p[1] = (x) & 0xff, p += 2) +#define CALLOC_S(s) (*p++ = ((s) >> 4)) /* s < 512 */ +#define CSAVE_FPLR(o) (*p++ = 0x40 | ((o) >> 3)) /* o <= 504 */ +#define CSAVE_REGP(r,o) CBE16(0xc800 | (((r)-19)<< 6) | ((o) >> 3)) +#define CSAVE_REGS(r1,r2,o1) do { \ + int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_REGP(r, o); \ +} while (0) +#define CSAVE_FREGP(r,o) CBE16(0xd800 | (((r) - 8) << 6) | ((o) >> 3)) +#define CSAVE_FREGS(r1,r2,o1) do { \ + int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_FREGP(r, o); \ +} while (0) +#define CSAVE_REGX(r,o) CBE16(0xd400 | (((r) - 19) << 5) | (~(o) >> 3)) +#define CADD_FP(s) CBE16(0xe200 | ((s) >> 3)) /* s < 8*256 */ +#define CODE_NOP 0xe3 +#define CODE_END 0xe4 +#define CEND_ALIGN do { \ + *p++ = CODE_END; \ + while ((p - uwc) & 3) *p++ = CODE_NOP; \ +} while (0) + + /* Unwind codes for .text section with handler. 
*/ + p = uwc; + CALLOC_S(208); /* +1 */ + CSAVE_FPLR(192); /* +1 */ + CADD_FP(192); /* +2 */ + CSAVE_REGS(19, 28, 184); /* +5*2 */ + CSAVE_FREGS(8, 15, 104); /* +4*2 */ + CEND_ALIGN; /* +1 +1 -> 24 */ + + u32 = ((24u >> 2) << 27) | (1u << 20) | (fcofs >> 2); + owrite(ctx, &u32, 4); + owrite(ctx, &uwc, 24); + + u32 = 0; /* Handler RVA to be relocated at 4 + 24. */ + owrite(ctx, &u32, 4); + + /* Unwind codes for vm_ffi_call without handler. */ + p = uwc; + CSAVE_FPLR(16); /* +1 */ + CADD_FP(16); /* +2 */ + CSAVE_REGX(19, -24); /* +2 */ + CSAVE_REGX(20, -32); /* +2 */ + CEND_ALIGN; /* +1 +0 -> 8 */ + + u32 = ((8u >> 2) << 27) | (((uint32_t)ctx->codesz - fcofs) >> 2); + owrite(ctx, &u32, 4); + owrite(ctx, &uwc, 8); + + reloc.vaddr = 4 + 24; reloc.symidx = 1+2+nrsym+2+2; + reloc.type = PEOBJ_RELOC_ADDR32NB; + owrite(ctx, &reloc, PEOBJ_RELOC_SIZE); + } #elif LJ_TARGET_X86 /* Write .sxdata section. */ for (i = 0; i < nrsym; i++) { @@ -339,7 +434,7 @@ void emit_peobj(BuildCtx *ctx) emit_peobj_sym(ctx, ctx->relocsym[i], 0, PEOBJ_SECT_UNDEF, PEOBJ_TYPE_FUNC, PEOBJ_SCL_EXTERN); -#if LJ_TARGET_X64 +#ifdef PEOBJ_PDATA_NRELOC emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_PDATA); emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_XDATA); emit_peobj_sym(ctx, "lj_err_unwind_win", 0, diff --git a/src/lj_arch.h b/src/lj_arch.h index 3e920f2a..026e741f 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -57,7 +57,7 @@ #define LUAJIT_TARGET LUAJIT_ARCH_X64 #elif defined(__arm__) || defined(__arm) || defined(__ARM__) || defined(__ARM) #define LUAJIT_TARGET LUAJIT_ARCH_ARM -#elif defined(__aarch64__) +#elif defined(__aarch64__) || defined(_M_ARM64) #define LUAJIT_TARGET LUAJIT_ARCH_ARM64 #elif defined(__ppc__) || defined(__ppc) || defined(__PPC__) || defined(__PPC) || defined(__powerpc__) || defined(__powerpc) || defined(__POWERPC__) || defined(__POWERPC) || defined(_M_PPC) #define LUAJIT_TARGET LUAJIT_ARCH_PPC diff --git a/src/lj_def.h b/src/lj_def.h index 88bc6336..1461d3d7 100644 --- 
a/src/lj_def.h +++ b/src/lj_def.h @@ -146,15 +146,9 @@ typedef uintptr_t BloomFilter; #define LJ_UNLIKELY(x) __builtin_expect(!!(x), 0) #define lj_ffs(x) ((uint32_t)__builtin_ctz(x)) -/* Don't ask ... */ -#if defined(__INTEL_COMPILER) && (defined(__i386__) || defined(__x86_64__)) -static LJ_AINLINE uint32_t lj_fls(uint32_t x) -{ - uint32_t r; __asm__("bsrl %1, %0" : "=r" (r) : "rm" (x) : "cc"); return r; -} -#else #define lj_fls(x) ((uint32_t)(__builtin_clz(x)^31)) -#endif +#define lj_ffs64(x) ((uint32_t)__builtin_ctzll(x)) +#define lj_fls64(x) ((uint32_t)(__builtin_clzll(x)^63)) #if defined(__arm__) static LJ_AINLINE uint32_t lj_bswap(uint32_t x) @@ -277,6 +271,24 @@ static LJ_AINLINE uint32_t lj_fls(uint32_t x) { unsigned long r; _BitScanReverse(&r, x); return (uint32_t)r; } + +/* The 64 bit bit-scan intrinsics only exist for 64 bit MSVC targets. */ +#if defined(_M_X64) || defined(_M_ARM64) +unsigned char _BitScanForward64(unsigned long *, uint64_t); +unsigned char _BitScanReverse64(unsigned long *, uint64_t); +#pragma intrinsic(_BitScanForward64) +#pragma intrinsic(_BitScanReverse64) + +static LJ_AINLINE uint32_t lj_ffs64(uint64_t x) +{ + unsigned long r; _BitScanForward64(&r, x); return (uint32_t)r; +} + +static LJ_AINLINE uint32_t lj_fls64(uint64_t x) +{ + unsigned long r; _BitScanReverse64(&r, x); return (uint32_t)r; +} +#endif #endif unsigned long _byteswap_ulong(unsigned long); diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h index 9161c958..fef5d973 100644 --- a/src/lj_emit_arm64.h +++ b/src/lj_emit_arm64.h @@ -30,15 +30,15 @@ static uint32_t emit_isk12(int64_t n) uint64_t k = n < 0 ? ~(uint64_t)n+1u : (uint64_t)n; uint32_t m = n < 0 ? 
0x40000000 : 0; if (k < 0x1000) { - return A64I_K12|m|A64F_U12(k); + return (uint32_t)(A64I_K12|m|A64F_U12(k)); } else if ((k & 0xfff000) == k) { - return A64I_K12|m|0x400000|A64F_U12(k>>12); + return (uint32_t)(A64I_K12|m|0x400000|A64F_U12(k>>12)); } return 0; } -#define emit_clz64(n) __builtin_clzll(n) -#define emit_ctz64(n) __builtin_ctzll(n) +#define emit_clz64(n) (lj_fls64(n)^63) +#define emit_ctz64(n) lj_ffs64(n) /* Encode constant in K13 format for logical data processing instructions. */ static uint32_t emit_isk13(uint64_t n, int is64) diff --git a/src/lj_mcode.c b/src/lj_mcode.c index c8ed95e1..8a4851dd 100644 --- a/src/lj_mcode.c +++ b/src/lj_mcode.c @@ -29,6 +29,11 @@ #include #endif +#if LJ_TARGET_WINDOWS +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#endif + #if LJ_TARGET_IOS void sys_icache_invalidate(void *start, size_t len); #endif @@ -41,6 +46,8 @@ void lj_mcode_sync(void *start, void *end) #endif #if LJ_TARGET_X86ORX64 UNUSED(start); UNUSED(end); +#elif LJ_TARGET_WINDOWS + FlushInstructionCache(GetCurrentProcess(), start, (char *)end-(char *)start); #elif LJ_TARGET_IOS sys_icache_invalidate(start, (char *)end-(char *)start); #elif LJ_TARGET_PPC @@ -58,9 +65,6 @@ void lj_mcode_sync(void *start, void *end) #if LJ_TARGET_WINDOWS -#define WIN32_LEAN_AND_MEAN -#include <windows.h> - #define MCPROT_RW PAGE_READWRITE #define MCPROT_RX PAGE_EXECUTE_READ #define MCPROT_RWX PAGE_EXECUTE_READWRITE diff --git a/src/lj_target.h b/src/lj_target.h index 09d19bd9..e7322c07 100644 --- a/src/lj_target.h +++ b/src/lj_target.h @@ -58,9 +58,13 @@ typedef uint32_t RegSP; #if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 typedef uint64_t RegSet; #define RSET_BITS 6 +#define rset_picktop_(rs) ((Reg)lj_fls64(rs)) +#define rset_pickbot_(rs) ((Reg)lj_ffs64(rs)) #else typedef uint32_t RegSet; #define RSET_BITS 5 +#define rset_picktop_(rs) ((Reg)lj_fls(rs)) +#define rset_pickbot_(rs) ((Reg)lj_ffs(rs)) #endif #define RID2RSET(r) (((RegSet)1) << (r)) @@ -71,13 +75,6 @@ typedef uint32_t 
RegSet; #define rset_set(rs, r) (rs |= RID2RSET(r)) #define rset_clear(rs, r) (rs &= ~RID2RSET(r)) #define rset_exclude(rs, r) (rs & ~RID2RSET(r)) -#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 -#define rset_picktop_(rs) ((Reg)(__builtin_clzll(rs)^63)) -#define rset_pickbot_(rs) ((Reg)__builtin_ctzll(rs)) -#else -#define rset_picktop_(rs) ((Reg)lj_fls(rs)) -#define rset_pickbot_(rs) ((Reg)lj_ffs(rs)) -#endif /* -- Register allocation cost -------------------------------------------- */ diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat index f9bf2528..2cfcf26e 100644 --- a/src/msvcbuild.bat +++ b/src/msvcbuild.bat @@ -34,20 +34,26 @@ if exist minilua.exe.manifest^ %LJMT% -manifest minilua.exe.manifest -outputresource:minilua.exe -@set DASMFLAGS=-D WIN -D JIT -D FFI -D P64 +@set DASMFLAGS=-D WIN -D JIT -D FFI -D ENDIAN_LE -D FPU -D P64 @set LJARCH=x64 @minilua -@if errorlevel 8 goto :X64 +@if errorlevel 8 goto :NO32 @set DASC=vm_x86.dasc -@set DASMFLAGS=-D WIN -D JIT -D FFI +@set DASMFLAGS=-D WIN -D JIT -D FFI -D ENDIAN_LE -D FPU @set LJARCH=x86 @set LJCOMPILE=%LJCOMPILE% /arch:SSE2 +@goto :DA +:NO32 +@if "%VSCMD_ARG_TGT_ARCH%" neq "arm64" goto :X64 +@set DASC=vm_arm64.dasc +@set LJARCH=arm64 +@goto :DA :X64 -@if "%1" neq "nogc64" goto :GC64 +@if "%1" neq "nogc64" goto :DA @shift @set DASC=vm_x86.dasc @set LJCOMPILE=%LJCOMPILE% /DLUAJIT_DISABLE_GC64 -:GC64 +:DA minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h %DASC% @if errorlevel 1 goto :BAD