Windows/ARM64: Add initial support.

Only builds with native ARM64 Visual Studio for now.
Thanks to vanc and Stephen Just. #593 #964
This commit is contained in:
Mike Pall 2023-09-10 05:20:22 +02:00
parent 566532b807
commit cb413bf8f4
7 changed files with 147 additions and 37 deletions

View File

@ -9,7 +9,7 @@
#include "buildvm.h"
#include "lj_bc.h"
#if LJ_TARGET_X86ORX64
#if LJ_TARGET_WINDOWS
/* Context for PE object emitter. */
static char *strtab;
@ -93,6 +93,17 @@ typedef struct PEsymaux {
#define PEOBJ_RELOC_ADDR32NB 0x03
#define PEOBJ_RELOC_OFS 0
#define PEOBJ_TEXT_FLAGS 0x60500020 /* 60=r+x, 50=align16, 20=code. */
#define PEOBJ_PDATA_NRELOC 6
#define PEOBJ_XDATA_SIZE (8*2+4+6*2)
#elif LJ_TARGET_ARM64
/* Machine type written to the object header for ARM64 targets. */
#define PEOBJ_ARCH_TARGET 0xaa64
/* Relocation type codes used for this target. */
#define PEOBJ_RELOC_REL32 0x03 /* MS: BRANCH26. */
#define PEOBJ_RELOC_DIR32 0x01
#define PEOBJ_RELOC_ADDR32NB 0x02
#define PEOBJ_RELOC_OFS (-4)
#define PEOBJ_TEXT_FLAGS 0x60500020 /* 60=r+x, 50=align16, 20=code. */
/* .pdata holds this many 32 bit words and each one gets its own relocation. */
#define PEOBJ_PDATA_NRELOC 4
/* .xdata layout: header word + 24 bytes of unwind codes + handler RVA for the
** first record, followed by header word + 8 bytes of unwind codes for the
** second record.
*/
#define PEOBJ_XDATA_SIZE (4+24+4 +4+8)
#endif
/* Section numbers (0-based). */
@ -100,7 +111,7 @@ enum {
PEOBJ_SECT_ABS = -2,
PEOBJ_SECT_UNDEF = -1,
PEOBJ_SECT_TEXT,
#if LJ_TARGET_X64
#ifdef PEOBJ_PDATA_NRELOC
PEOBJ_SECT_PDATA,
PEOBJ_SECT_XDATA,
#elif LJ_TARGET_X86
@ -175,6 +186,9 @@ void emit_peobj(BuildCtx *ctx)
uint32_t sofs;
int i, nrsym;
union { uint8_t b; uint32_t u; } host_endian;
#ifdef PEOBJ_PDATA_NRELOC
uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs;
#endif
sofs = sizeof(PEheader) + PEOBJ_NSECTIONS*sizeof(PEsection);
@ -188,18 +202,18 @@ void emit_peobj(BuildCtx *ctx)
/* Flags: 60 = read+execute, 50 = align16, 20 = code. */
pesect[PEOBJ_SECT_TEXT].flags = PEOBJ_TEXT_FLAGS;
#if LJ_TARGET_X64
#ifdef PEOBJ_PDATA_NRELOC
memcpy(pesect[PEOBJ_SECT_PDATA].name, ".pdata", sizeof(".pdata")-1);
pesect[PEOBJ_SECT_PDATA].ofs = sofs;
sofs += (pesect[PEOBJ_SECT_PDATA].size = 6*4);
sofs += (pesect[PEOBJ_SECT_PDATA].size = PEOBJ_PDATA_NRELOC*4);
pesect[PEOBJ_SECT_PDATA].relocofs = sofs;
sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = 6) * PEOBJ_RELOC_SIZE;
sofs += (pesect[PEOBJ_SECT_PDATA].nreloc = PEOBJ_PDATA_NRELOC) * PEOBJ_RELOC_SIZE;
/* Flags: 40 = read, 30 = align4, 40 = initialized data. */
pesect[PEOBJ_SECT_PDATA].flags = 0x40300040;
memcpy(pesect[PEOBJ_SECT_XDATA].name, ".xdata", sizeof(".xdata")-1);
pesect[PEOBJ_SECT_XDATA].ofs = sofs;
sofs += (pesect[PEOBJ_SECT_XDATA].size = 8*2+4+6*2); /* See below. */
sofs += (pesect[PEOBJ_SECT_XDATA].size = PEOBJ_XDATA_SIZE); /* See below. */
pesect[PEOBJ_SECT_XDATA].relocofs = sofs;
sofs += (pesect[PEOBJ_SECT_XDATA].nreloc = 1) * PEOBJ_RELOC_SIZE;
/* Flags: 40 = read, 30 = align4, 40 = initialized data. */
@ -234,7 +248,7 @@ void emit_peobj(BuildCtx *ctx)
*/
nrsym = ctx->nrelocsym;
pehdr.nsyms = 1+PEOBJ_NSECTIONS*2 + 1+ctx->nsym + nrsym;
#if LJ_TARGET_X64
#ifdef PEOBJ_PDATA_NRELOC
pehdr.nsyms += 1; /* Symbol for lj_err_unwind_win. */
#endif
@ -259,7 +273,6 @@ void emit_peobj(BuildCtx *ctx)
#if LJ_TARGET_X64
{ /* Write .pdata section. */
uint32_t fcofs = (uint32_t)ctx->sym[ctx->nsym-1].ofs;
uint32_t pdata[3]; /* Start of .text, end of .text and .xdata. */
PEreloc reloc;
pdata[0] = 0; pdata[1] = fcofs; pdata[2] = 0;
@ -308,6 +321,88 @@ void emit_peobj(BuildCtx *ctx)
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
}
#elif LJ_TARGET_ARM64
/* https://learn.microsoft.com/en-us/cpp/build/arm64-exception-handling */
{ /* Write .pdata section. */
/* Two entries of two words each: {code start, unwind record}. All four
** words are stored as offsets and turned into RVAs by the ADDR32NB
** relocations written right after the section data.
*/
uint32_t pdata[4];
PEreloc reloc;
/* Entry 0: offset 0 in .text, first record at offset 0 in .xdata. */
pdata[0] = 0;
pdata[1] = 0;
/* Entry 1: code starting at fcofs, second .xdata record. The record offset
** skips the first record: header word (4) + unwind codes (24) + handler (4).
*/
pdata[2] = fcofs;
pdata[3] = 4+24+4;
owrite(ctx, &pdata, sizeof(pdata));
/* Start of .text and start of .xdata. */
reloc.vaddr = 0; reloc.symidx = 1+2+nrsym+2+2+1;
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
reloc.vaddr = 4; reloc.symidx = 1+2+nrsym+2;
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
/* Start of vm_ffi_call and start of second part of .xdata. */
/* Same two symbols as above; the non-zero addends are the values already
** stored in pdata[2] and pdata[3].
*/
reloc.vaddr = 8; reloc.symidx = 1+2+nrsym+2+2+1;
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
reloc.vaddr = 12; reloc.symidx = 1+2+nrsym+2;
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
}
{ /* Write .xdata section. */
/* Emits two unwind records back to back: one with an exception handler,
** one without. Each record is a 32 bit header word followed by unwind code
** bytes that are assembled into uwc via the cursor p.
** The code encodings follow Microsoft's ARM64 exception handling docs.
*/
uint32_t u32;
uint8_t *p, uwc[24];
PEreloc reloc;
/* Store a 16 bit code at p, high byte first, and advance p by two bytes. */
#define CBE16(x) (*p = ((x) >> 8) & 0xff, p[1] = (x) & 0xff, p += 2)
/* One-byte code: stack allocation, size encoded in units of 16 bytes. */
#define CALLOC_S(s) (*p++ = ((s) >> 4)) /* s < 512 */
/* One-byte code: save of the FP/LR pair, offset encoded in units of 8 bytes. */
#define CSAVE_FPLR(o) (*p++ = 0x40 | ((o) >> 3)) /* o <= 504 */
/* Two-byte code: save of an integer register pair starting at register r
** (biased by 19), offset encoded in units of 8 bytes.
*/
#define CSAVE_REGP(r,o) CBE16(0xc800 | (((r)-19)<< 6) | ((o) >> 3))
/* Emit CSAVE_REGP for every second register in r1..r2. The offset starts at
** o1 and decreases by 16 for each pair.
*/
#define CSAVE_REGS(r1,r2,o1) do { \
int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_REGP(r, o); \
} while (0)
/* Two-byte code: save of an FP register pair starting at register r
** (biased by 8), offset encoded in units of 8 bytes.
*/
#define CSAVE_FREGP(r,o) CBE16(0xd800 | (((r) - 8) << 6) | ((o) >> 3))
/* Emit CSAVE_FREGP for every second register in r1..r2, offsets as above. */
#define CSAVE_FREGS(r1,r2,o1) do { \
int r, o; for (r = r1, o = o1; r <= r2; r += 2, o -= 16) CSAVE_FREGP(r, o); \
} while (0)
/* Two-byte code: save of a single integer register (biased by 19). It is
** used with negative offsets below; ~(o) >> 3 maps -24 to 2 and -32 to 3.
*/
#define CSAVE_REGX(r,o) CBE16(0xd400 | (((r) - 19) << 5) | (~(o) >> 3))
/* Two-byte code: frame pointer setup, offset encoded in units of 8 bytes. */
#define CADD_FP(s) CBE16(0xe200 | ((s) >> 3)) /* s < 8*256 */
#define CODE_NOP 0xe3
#define CODE_END 0xe4
/* Terminate the code sequence and pad it with NOP codes up to a multiple of
** 4 bytes, since the header counts the codes in 32 bit words.
*/
#define CEND_ALIGN do { \
*p++ = CODE_END; \
while ((p - uwc) & 3) *p++ = CODE_NOP; \
} while (0)
/* Unwind codes for .text section with handler. */
p = uwc;
CALLOC_S(208); /* +1 */
CSAVE_FPLR(192); /* +1 */
CADD_FP(192); /* +2 */
CSAVE_REGS(19, 28, 184); /* +5*2 */
CSAVE_FREGS(8, 15, 104); /* +4*2 */
CEND_ALIGN; /* +1 +1 -> 24 */
/* Header word: number of code words (24/4) in the top 5 bits, bit 20 set,
** code length up to fcofs in units of 4 bytes in the low bits.
** NOTE(review): bit 20 is presumably the "exception data present" flag that
** makes the trailing handler RVA valid -- confirm against the MS docs.
*/
u32 = ((24u >> 2) << 27) | (1u << 20) | (fcofs >> 2);
owrite(ctx, &u32, 4);
owrite(ctx, &uwc, 24);
u32 = 0; /* Handler RVA to be relocated at 4 + 24. */
owrite(ctx, &u32, 4);
/* Unwind codes for vm_ffi_call without handler. */
p = uwc;
CSAVE_FPLR(16); /* +1 */
CADD_FP(16); /* +2 */
CSAVE_REGX(19, -24); /* +2 */
CSAVE_REGX(20, -32); /* +2 */
CEND_ALIGN; /* +1 +0 -> 8 */
/* Header word: 8/4 code words, bit 20 clear, length of the remaining code
** from fcofs to the end of the code in units of 4 bytes.
*/
u32 = ((8u >> 2) << 27) | (((uint32_t)ctx->codesz - fcofs) >> 2);
owrite(ctx, &u32, 4);
owrite(ctx, &uwc, 8);
/* The single .xdata relocation patches the handler RVA slot of the first
** record (after its header word and 24 code bytes).
** NOTE(review): the symbol index presumably selects the handler symbol that
** is emitted after the .pdata/.xdata section symbols -- confirm against the
** symbol table order.
*/
reloc.vaddr = 4 + 24; reloc.symidx = 1+2+nrsym+2+2;
reloc.type = PEOBJ_RELOC_ADDR32NB;
owrite(ctx, &reloc, PEOBJ_RELOC_SIZE);
}
#elif LJ_TARGET_X86
/* Write .sxdata section. */
for (i = 0; i < nrsym; i++) {
@ -339,7 +434,7 @@ void emit_peobj(BuildCtx *ctx)
emit_peobj_sym(ctx, ctx->relocsym[i], 0,
PEOBJ_SECT_UNDEF, PEOBJ_TYPE_FUNC, PEOBJ_SCL_EXTERN);
#if LJ_TARGET_X64
#ifdef PEOBJ_PDATA_NRELOC
emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_PDATA);
emit_peobj_sym_sect(ctx, pesect, PEOBJ_SECT_XDATA);
emit_peobj_sym(ctx, "lj_err_unwind_win", 0,

View File

@ -57,7 +57,7 @@
#define LUAJIT_TARGET LUAJIT_ARCH_X64
#elif defined(__arm__) || defined(__arm) || defined(__ARM__) || defined(__ARM)
#define LUAJIT_TARGET LUAJIT_ARCH_ARM
#elif defined(__aarch64__)
#elif defined(__aarch64__) || defined(_M_ARM64)
#define LUAJIT_TARGET LUAJIT_ARCH_ARM64
#elif defined(__ppc__) || defined(__ppc) || defined(__PPC__) || defined(__PPC) || defined(__powerpc__) || defined(__powerpc) || defined(__POWERPC__) || defined(__POWERPC) || defined(_M_PPC)
#define LUAJIT_TARGET LUAJIT_ARCH_PPC

View File

@ -146,15 +146,9 @@ typedef uintptr_t BloomFilter;
#define LJ_UNLIKELY(x) __builtin_expect(!!(x), 0)
#define lj_ffs(x) ((uint32_t)__builtin_ctz(x))
/* Don't ask ... */
/* lj_fls(x) yields the bit index of the highest set bit of a 32 bit value. */
#if defined(__INTEL_COMPILER) && (defined(__i386__) || defined(__x86_64__))
/* Intel compiler on x86/x64: issue the BSR instruction directly. */
static LJ_AINLINE uint32_t lj_fls(uint32_t x)
{
/* The input may come from a register or memory; flags are clobbered. */
uint32_t r; __asm__("bsrl %1, %0" : "=r" (r) : "rm" (x) : "cc"); return r;
}
#else
/* Count of leading zeros xor 31 equals 31 minus that count for 0..31. */
#define lj_fls(x) ((uint32_t)(__builtin_clz(x)^31))
#endif
#define lj_ffs64(x) ((uint32_t)__builtin_ctzll(x))
#define lj_fls64(x) ((uint32_t)(__builtin_clzll(x)^63))
#if defined(__arm__)
static LJ_AINLINE uint32_t lj_bswap(uint32_t x)
@ -265,8 +259,12 @@ static LJ_AINLINE uint32_t lj_fls(uint32_t x)
#else
unsigned char _BitScanForward(unsigned long *, unsigned long);
unsigned char _BitScanReverse(unsigned long *, unsigned long);
unsigned char _BitScanForward64(unsigned long *, uint64_t);
unsigned char _BitScanReverse64(unsigned long *, uint64_t);
#pragma intrinsic(_BitScanForward)
#pragma intrinsic(_BitScanReverse)
#pragma intrinsic(_BitScanForward64)
#pragma intrinsic(_BitScanReverse64)
static LJ_AINLINE uint32_t lj_ffs(uint32_t x)
{
@ -277,6 +275,16 @@ static LJ_AINLINE uint32_t lj_fls(uint32_t x)
{
unsigned long r; _BitScanReverse(&r, x); return (uint32_t)r;
}
/* Bit index of the lowest set bit of a 64 bit value, found by a forward bit
** scan. The index is only written by the intrinsic, so x should be non-zero.
*/
static LJ_AINLINE uint32_t lj_ffs64(uint64_t x)
{
  unsigned long bitpos;
  _BitScanForward64(&bitpos, x);
  return (uint32_t)bitpos;
}
/* Bit index of the highest set bit of a 64 bit value, found by a reverse bit
** scan. The index is only written by the intrinsic, so x should be non-zero.
*/
static LJ_AINLINE uint32_t lj_fls64(uint64_t x)
{
  unsigned long bitpos;
  _BitScanReverse64(&bitpos, x);
  return (uint32_t)bitpos;
}
#endif
unsigned long _byteswap_ulong(unsigned long);

View File

@ -30,15 +30,15 @@ static uint32_t emit_isk12(int64_t n)
uint64_t k = n < 0 ? ~(uint64_t)n+1u : (uint64_t)n;
uint32_t m = n < 0 ? 0x40000000 : 0;
if (k < 0x1000) {
return A64I_K12|m|A64F_U12(k);
return (uint32_t)(A64I_K12|m|A64F_U12(k));
} else if ((k & 0xfff000) == k) {
return A64I_K12|m|0x400000|A64F_U12(k>>12);
return (uint32_t)(A64I_K12|m|0x400000|A64F_U12(k>>12));
}
return 0;
}
#define emit_clz64(n) __builtin_clzll(n)
#define emit_ctz64(n) __builtin_ctzll(n)
#define emit_clz64(n) (lj_fls64(n)^63)
#define emit_ctz64(n) lj_ffs64(n)
/* Encode constant in K13 format for logical data processing instructions. */
static uint32_t emit_isk13(uint64_t n, int is64)

View File

@ -29,6 +29,11 @@
#include <valgrind/valgrind.h>
#endif
#if LJ_TARGET_WINDOWS
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#if LJ_TARGET_IOS
void sys_icache_invalidate(void *start, size_t len);
#endif
@ -41,6 +46,8 @@ void lj_mcode_sync(void *start, void *end)
#endif
#if LJ_TARGET_X86ORX64
UNUSED(start); UNUSED(end);
#elif LJ_TARGET_WINDOWS
FlushInstructionCache(GetCurrentProcess(), start, (char *)end-(char *)start);
#elif LJ_TARGET_IOS
sys_icache_invalidate(start, (char *)end-(char *)start);
#elif LJ_TARGET_PPC
@ -58,9 +65,6 @@ void lj_mcode_sync(void *start, void *end)
#if LJ_TARGET_WINDOWS
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#define MCPROT_RW PAGE_READWRITE
#define MCPROT_RX PAGE_EXECUTE_READ
#define MCPROT_RWX PAGE_EXECUTE_READWRITE

View File

@ -58,9 +58,13 @@ typedef uint32_t RegSP;
/* Register set type and the helpers to pick the highest/lowest member. */
#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
/* These targets use a 64 bit wide set, so picking needs the 64 bit scans. */
typedef uint64_t RegSet;
#define RSET_BITS 6
#define rset_picktop_(rs) ((Reg)lj_fls64(rs))
#define rset_pickbot_(rs) ((Reg)lj_ffs64(rs))
#else
/* All other targets get by with a 32 bit wide set and the 32 bit scans. */
typedef uint32_t RegSet;
#define RSET_BITS 5
#define rset_picktop_(rs) ((Reg)lj_fls(rs))
#define rset_pickbot_(rs) ((Reg)lj_ffs(rs))
#endif
#define RID2RSET(r) (((RegSet)1) << (r))
@ -71,13 +75,6 @@ typedef uint32_t RegSet;
#define rset_set(rs, r) (rs |= RID2RSET(r))
#define rset_clear(rs, r) (rs &= ~RID2RSET(r))
#define rset_exclude(rs, r) (rs & ~RID2RSET(r))
#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64
#define rset_picktop_(rs) ((Reg)(__builtin_clzll(rs)^63))
#define rset_pickbot_(rs) ((Reg)__builtin_ctzll(rs))
#else
#define rset_picktop_(rs) ((Reg)lj_fls(rs))
#define rset_pickbot_(rs) ((Reg)lj_ffs(rs))
#endif
/* -- Register allocation cost -------------------------------------------- */

View File

@ -34,20 +34,26 @@
if exist minilua.exe.manifest^
%LJMT% -manifest minilua.exe.manifest -outputresource:minilua.exe
@set DASMFLAGS=-D WIN -D JIT -D FFI -D P64
@set DASMFLAGS=-D WIN -D JIT -D FFI -D ENDIAN_LE -D FPU -D P64
@set LJARCH=x64
@minilua
@if errorlevel 8 goto :X64
@if errorlevel 8 goto :NO32
@set DASC=vm_x86.dasc
@set DASMFLAGS=-D WIN -D JIT -D FFI
@set DASMFLAGS=-D WIN -D JIT -D FFI -D ENDIAN_LE -D FPU
@set LJARCH=x86
@set LJCOMPILE=%LJCOMPILE% /arch:SSE2
@goto :DA
:NO32
@if "%VSCMD_ARG_TGT_ARCH%" neq "arm64" goto :X64
@set DASC=vm_arm64.dasc
@set LJARCH=arm64
@goto :DA
:X64
@if "%1" neq "nogc64" goto :GC64
@if "%1" neq "nogc64" goto :DA
@shift
@set DASC=vm_x86.dasc
@set LJCOMPILE=%LJCOMPILE% /DLUAJIT_DISABLE_GC64
:GC64
:DA
minilua %DASM% -LN %DASMFLAGS% -o host\buildvm_arch.h %DASC%
@if errorlevel 1 goto :BAD