From 04b60707d7d117da22b40736a353e2a10179108a Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Sun, 20 Nov 2016 22:16:08 +0100 Subject: [PATCH] ARM64: Add JIT compiler backend. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. Sponsored by Cisco Systems, Inc. --- Makefile | 4 +- src/jit/dis_arm64.lua | 1215 +++++++++++++++++++++++++++ src/lj_arch.h | 1 - src/lj_asm.c | 4 + src/lj_asm_arm64.h | 1823 +++++++++++++++++++++++++++++++++++++++++ src/lj_ccall.c | 2 +- src/lj_dispatch.h | 1 + src/lj_emit_arm64.h | 397 +++++++++ src/lj_gdbjit.c | 12 + src/lj_target.h | 4 +- src/lj_target_arm64.h | 221 ++++- src/vm_arm64.dasc | 227 ++++- 12 files changed, 3887 insertions(+), 24 deletions(-) create mode 100644 src/jit/dis_arm64.lua create mode 100644 src/lj_asm_arm64.h create mode 100644 src/lj_emit_arm64.h diff --git a/Makefile b/Makefile index 6dfbbde4..5e640d94 100644 --- a/Makefile +++ b/Makefile @@ -86,8 +86,8 @@ FILE_MAN= luajit.1 FILE_PC= luajit.pc FILES_INC= lua.h lualib.h lauxlib.h luaconf.h lua.hpp luajit.h FILES_JITLIB= bc.lua bcsave.lua dump.lua p.lua v.lua zone.lua \ - dis_x86.lua dis_x64.lua dis_arm.lua dis_ppc.lua \ - dis_mips.lua dis_mipsel.lua vmdef.lua + dis_x86.lua dis_x64.lua dis_arm.lua dis_arm64.lua \ + dis_ppc.lua dis_mips.lua dis_mipsel.lua vmdef.lua ifeq (,$(findstring Windows,$(OS))) HOST_SYS:= $(shell uname -s) diff --git a/src/jit/dis_arm64.lua b/src/jit/dis_arm64.lua new file mode 100644 index 00000000..909b33bc --- /dev/null +++ b/src/jit/dis_arm64.lua @@ -0,0 +1,1215 @@ +---------------------------------------------------------------------------- +-- LuaJIT ARM64 disassembler module. +-- +-- Copyright (C) 2005-2016 Mike Pall. All rights reserved. +-- Released under the MIT license. See Copyright Notice in luajit.h +-- +-- Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. +-- Sponsored by Cisco Systems, Inc. +---------------------------------------------------------------------------- +-- This is a helper module used by the LuaJIT machine code dumper module. +-- +-- It disassembles most user-mode AArch64 instructions. +-- NYI: Advanced SIMD and VFP instructions. +------------------------------------------------------------------------------ + +local type, tonumber = type, tonumber +local sub, byte, format = string.sub, string.byte, string.format +local match, gmatch, gsub = string.match, string.gmatch, string.gsub +local rep = string.rep +local concat = table.concat +local bit = require("bit") +local band, bor, bxor, tohex = bit.band, bit.bor, bit.bxor, bit.tohex +local lshift, rshift, arshift = bit.lshift, bit.rshift, bit.arshift +local ror = bit.ror + +------------------------------------------------------------------------------ +-- Opcode maps +------------------------------------------------------------------------------ + +local map_adr = { -- PC-relative addressing. + shift = 31, mask = 1, + [0] = "adrDBx", "adrpDBx" +} + +local map_addsubi = { -- Add/subtract immediate. + shift = 29, mask = 3, + [0] = "add|movDNIg", "adds|cmnD0NIg", "subDNIg", "subs|cmpD0NIg", +} + +local map_logi = { -- Logical immediate. + shift = 31, mask = 1, + [0] = { + shift = 22, mask = 1, + [0] = { + shift = 29, mask = 3, + [0] = "andDNig", "orr|movDN0ig", "eorDNig", "ands|tstD0Nig" + }, + false -- unallocated + }, + { + shift = 29, mask = 3, + [0] = "andDNig", "orr|movDN0ig", "eorDNig", "ands|tstD0Nig" + } +} + +local map_movwi = { -- Move wide immediate. + shift = 31, mask = 1, + [0] = { + shift = 22, mask = 1, + [0] = { + shift = 29, mask = 3, + [0] = "movnDWRg", false, "movz|movDYRg", "movkDWRg" + }, false -- unallocated + }, + { + shift = 29, mask = 3, + [0] = "movnDWRg", false, "movz|movDYRg", "movkDWRg" + }, +} + +local map_bitf = { -- Bitfield. + shift = 31, mask = 1, + [0] = { + shift = 22, mask = 1, + [0] = { + shift = 29, mask = 3, + [0] = "sbfm|sbfiz|sbfx|asr|sxtw|sxth|sxtbDN12w", + "bfm|bfi|bfxilDN13w", + "ubfm|ubfiz|ubfx|lsr|lsl|uxth|uxtbDN12w" + } + }, + { + shift = 22, mask = 1, + { + shift = 29, mask = 3, + [0] = "sbfm|sbfiz|sbfx|asr|sxtw|sxth|sxtbDN12x", + "bfm|bfi|bfxilDN13x", + "ubfm|ubfiz|ubfx|lsr|lsl|uxth|uxtbDN12x" + } + } +} + +local map_datai = { -- Data processing - immediate. + shift = 23, mask = 7, + [0] = map_adr, map_adr, map_addsubi, false, + map_logi, map_movwi, map_bitf, + { + shift = 15, mask = 0x1c0c1, + [0] = "extr|rorDNM4w", [0x10080] = "extr|rorDNM4x", + [0x10081] = "extr|rorDNM4x" + } +} + +local map_logsr = { -- Logical, shifted register. + shift = 31, mask = 1, + [0] = { + shift = 15, mask = 1, + [0] = { + shift = 29, mask = 3, + [0] = { + shift = 21, mask = 7, + [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg", + "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg" + }, + { + shift = 21, mask = 7, + [0] ="orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg", + "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg" + }, + { + shift = 21, mask = 7, + [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg", + "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg" + }, + { + shift = 21, mask = 7, + [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg", + "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg" + } + }, + false -- unallocated + }, + { + shift = 29, mask = 3, + [0] = { + shift = 21, mask = 7, + [0] = "andDNMSg", "bicDNMSg", "andDNMSg", "bicDNMSg", + "andDNMSg", "bicDNMSg", "andDNMg", "bicDNMg" + }, + { + shift = 21, mask = 7, + [0] = "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0MSg", "orn|mvnDN0MSg", + "orr|movDN0MSg", "orn|mvnDN0MSg", "orr|movDN0Mg", "orn|mvnDN0Mg" + }, + { + shift = 21, mask = 7, + [0] = "eorDNMSg", "eonDNMSg", "eorDNMSg", "eonDNMSg", + "eorDNMSg", "eonDNMSg", "eorDNMg", "eonDNMg" + }, + { + shift = 21, mask = 7, + [0] = "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMSg", "bicsDNMSg", + "ands|tstD0NMSg", "bicsDNMSg", "ands|tstD0NMg", "bicsDNMg" + } + } +} + +local map_assh = { + shift = 31, mask = 1, + [0] = { + shift = 15, mask = 1, + [0] = { + shift = 29, mask = 3, + [0] = { + shift = 22, mask = 3, + [0] = "addDNMSg", "addDNMSg", "addDNMSg", "addDNMg" + }, + { + shift = 22, mask = 3, + [0] = "adds|cmnD0NMSg", "adds|cmnD0NMSg", + "adds|cmnD0NMSg", "adds|cmnD0NMg" + }, + { + shift = 22, mask = 3, + [0] = "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0Mg" + }, + { + shift = 22, mask = 3, + [0] = "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0MzSg", + "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0Mzg" + }, + }, + false -- unallocated + }, + { + shift = 29, mask = 3, + [0] = { + shift = 22, mask = 3, + [0] = "addDNMSg", "addDNMSg", "addDNMSg", "addDNMg" + }, + { + shift = 22, mask = 3, + [0] = "adds|cmnD0NMSg", "adds|cmnD0NMSg", "adds|cmnD0NMSg", + "adds|cmnD0NMg" + }, + { + shift = 22, mask = 3, + [0] = "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0MSg", "sub|negDN0Mg" + }, + { + shift = 22, mask = 3, + [0] = "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0MzSg", + "subs|cmp|negsD0N0MzSg", "subs|cmp|negsD0N0Mzg" + } + } +} + +local map_addsubsh = { -- Add/subtract, shifted register. + shift = 22, mask = 3, + [0] = map_assh, map_assh, map_assh +} + +local map_addsubex = { -- Add/subtract, extended register. + shift = 22, mask = 3, + [0] = { + shift = 29, mask = 3, + [0] = "addDNMXg", "adds|cmnD0NMXg", "subDNMXg", "subs|cmpD0NMzXg", + } +} + +local map_addsubc = { -- Add/subtract, with carry. + shift = 10, mask = 63, + [0] = { + shift = 29, mask = 3, + [0] = "adcDNMg", "adcsDNMg", "sbc|ngcDN0Mg", "sbcs|ngcsDN0Mg", + } +} + +local map_ccomp = { + shift = 4, mask = 1, + [0] = { + shift = 10, mask = 3, + [0] = { -- Conditional compare register. + shift = 29, mask = 3, + "ccmnNMVCg", false, "ccmpNMVCg", + }, + [2] = { -- Conditional compare immediate. + shift = 29, mask = 3, + "ccmnN5VCg", false, "ccmpN5VCg", + } + } +} + +local map_csel = { -- Conditional select. + shift = 11, mask = 1, + [0] = { + shift = 10, mask = 1, + [0] = { + shift = 29, mask = 3, + [0] = "cselDNMzCg", false, "csinv|cinv|csetmDNMcg", false, + }, + { + shift = 29, mask = 3, + [0] = "csinc|cinc|csetDNMcg", false, "csneg|cnegDNMcg", false, + } + } +} + +local map_data1s = { -- Data processing, 1 source. + shift = 29, mask = 1, + [0] = { + shift = 31, mask = 1, + [0] = { + shift = 10, mask = 0x7ff, + [0] = "rbitDNg", "rev16DNg", "revDNw", false, "clzDNg", "clsDNg" + }, + { + shift = 10, mask = 0x7ff, + [0] = "rbitDNg", "rev16DNg", "rev32DNx", "revDNx", "clzDNg", "clsDNg" + } + } +} + +local map_data2s = { -- Data processing, 2 sources. + shift = 29, mask = 1, + [0] = { + shift = 10, mask = 63, + false, "udivDNMg", "sdivDNMg", false, false, false, false, "lslDNMg", + "lsrDNMg", "asrDNMg", "rorDNMg" + } +} + +local map_data3s = { -- Data processing, 3 sources. + shift = 29, mask = 7, + [0] = { + shift = 21, mask = 7, + [0] = { + shift = 15, mask = 1, + [0] = "madd|mulDNMA0g", "msub|mnegDNMA0g" + } + }, false, false, false, + { + shift = 15, mask = 1, + [0] = { + shift = 21, mask = 7, + [0] = "madd|mulDNMA0g", "smaddl|smullDxNMwA0x", "smulhDNMx", false, + false, "umaddl|umullDxNMwA0x", "umulhDNMx" + }, + { + shift = 21, mask = 7, + [0] = "msub|mnegDNMA0g", "smsubl|smneglDxNMwA0x", false, false, + false, "umsubl|umneglDxNMwA0x" + } + } +} + +local map_datar = { -- Data processing, register. + shift = 28, mask = 1, + [0] = { + shift = 24, mask = 1, + [0] = map_logsr, + { + shift = 21, mask = 1, + [0] = map_addsubsh, map_addsubex + } + }, + { + shift = 21, mask = 15, + [0] = map_addsubc, false, map_ccomp, false, map_csel, false, + { + shift = 30, mask = 1, + [0] = map_data2s, map_data1s + }, + false, map_data3s, map_data3s, map_data3s, map_data3s, map_data3s, + map_data3s, map_data3s, map_data3s + } +} + +local map_lrl = { -- Load register, literal. + shift = 26, mask = 1, + [0] = { + shift = 30, mask = 3, + [0] = "ldrDwB", "ldrDxB", "ldrswDxB" + }, + { + shift = 30, mask = 3, + [0] = "ldrDsB", "ldrDdB" + } +} + +local map_lsriind = { -- Load/store register, immediate pre/post-indexed. + shift = 30, mask = 3, + [0] = { + shift = 26, mask = 1, + [0] = { + shift = 22, mask = 3, + [0] = "strbDwzL", "ldrbDwzL", "ldrsbDxzL", "ldrsbDwzL" + } + }, + { + shift = 26, mask = 1, + [0] = { + shift = 22, mask = 3, + [0] = "strhDwzL", "ldrhDwzL", "ldrshDxzL", "ldrshDwzL" + } + }, + { + shift = 26, mask = 1, + [0] = { + shift = 22, mask = 3, + [0] = "strDwzL", "ldrDwzL", "ldrswDxzL" + }, + { + shift = 22, mask = 3, + [0] = "strDszL", "ldrDszL" + } + }, + { + shift = 26, mask = 1, + [0] = { + shift = 22, mask = 3, + [0] = "strDxzL", "ldrDxzL" + }, + { + shift = 22, mask = 3, + [0] = "strDdzL", "ldrDdzL" + } + } +} + +local map_lsriro = { + shift = 21, mask = 1, + [0] = { -- Load/store register immediate. + shift = 10, mask = 3, + [0] = { -- Unscaled immediate. + shift = 26, mask = 1, + [0] = { + shift = 30, mask = 3, + [0] = { + shift = 22, mask = 3, + [0] = "sturbDwK", "ldurbDwK" + }, + { + shift = 22, mask = 3, + [0] = "sturhDwK", "ldurhDwK" + }, + { + shift = 22, mask = 3, + [0] = "sturDwK", "ldurDwK" + }, + { + shift = 22, mask = 3, + [0] = "sturDxK", "ldurDxK" + } + } + }, map_lsriind, false, map_lsriind + }, + { -- Load/store register, register offset. + shift = 10, mask = 3, + [2] = { + shift = 26, mask = 1, + [0] = { + shift = 30, mask = 3, + [1] = { + shift = 22, mask = 3, + [0] = "strhDwO", "ldrhDwO", "ldrshDwO", "ldrshDxO" + }, + [2] = { + shift = 22, mask = 3, + [0] = "strDwO", "ldrDwO", "ldrswDxO" + }, + [3] = { + shift = 22, mask = 3, + [0] = "strDxO", "ldrDxO" + } + }, + { + shift = 30, mask = 3, + [2] = { + shift = 22, mask = 3, + [0] = "strDsO", "ldrDsO" + }, + [3] = { + shift = 22, mask = 3, + [0] = "strDdO", "ldrDdO" + } + } + } + } +} + +local map_lsp = { -- Load/store register pair, offset. + shift = 22, mask = 1, + [0] = { + shift = 30, mask = 3, + [0] = { + shift = 26, mask = 1, + [0] = "stpDzAzwP", "stpDzAzsP", + }, + { + shift = 26, mask = 1, + "stpDzAzdP" + }, + { + shift = 26, mask = 1, + [0] = "stpDzAzxP" + } + }, + { + shift = 30, mask = 3, + [0] = { + shift = 26, mask = 1, + [0] = "ldpDzAzwP", "ldpDzAzsP", + }, + { + shift = 26, mask = 1, + [0] = "ldpswDAxP", "ldpDzAzdP" + }, + { + shift = 26, mask = 1, + [0] = "ldpDzAzxP" + } + } +} + +local map_ls = { -- Loads and stores. + shift = 24, mask = 0x31, + [0x10] = map_lrl, [0x30] = map_lsriro, + [0x20] = { + shift = 23, mask = 3, + map_lsp, map_lsp, map_lsp + }, + [0x21] = { + shift = 23, mask = 3, + map_lsp, map_lsp, map_lsp + }, + [0x31] = { + shift = 26, mask = 1, + [0] = { + shift = 30, mask = 3, + [0] = { + shift = 22, mask = 3, + [0] = "strbDwzU", "ldrbDwzU" + }, + { + shift = 22, mask = 3, + [0] = "strhDwzU", "ldrhDwzU" + }, + { + shift = 22, mask = 3, + [0] = "strDwzU", "ldrDwzU" + }, + { + shift = 22, mask = 3, + [0] = "strDxzU", "ldrDxzU" + } + }, + { + shift = 30, mask = 3, + [2] = { + shift = 22, mask = 3, + [0] = "strDszU", "ldrDszU" + }, + [3] = { + shift = 22, mask = 3, + [0] = "strDdzU", "ldrDdzU" + } + } + }, +} + +local map_datafp = { -- Data processing, SIMD and FP. + shift = 28, mask = 7, + { -- 001 + shift = 24, mask = 1, + [0] = { + shift = 21, mask = 1, + { + shift = 10, mask = 3, + [0] = { + shift = 12, mask = 1, + [0] = { + shift = 13, mask = 1, + [0] = { + shift = 14, mask = 1, + [0] = { + shift = 15, mask = 1, + [0] = { -- FP/int conversion. + shift = 31, mask = 1, + [0] = { + shift = 16, mask = 0xff, + [0x20] = "fcvtnsDwNs", [0x21] = "fcvtnuDwNs", + [0x22] = "scvtfDsNw", [0x23] = "ucvtfDsNw", + [0x24] = "fcvtasDwNs", [0x25] = "fcvtauDwNs", + [0x26] = "fmovDwNs", [0x27] = "fmovDsNw", + [0x28] = "fcvtpsDwNs", [0x29] = "fcvtpuDwNs", + [0x30] = "fcvtmsDwNs", [0x31] = "fcvtmuDwNs", + [0x38] = "fcvtzsDwNs", [0x39] = "fcvtzuDwNs", + [0x60] = "fcvtnsDwNd", [0x61] = "fcvtnuDwNd", + [0x62] = "scvtfDdNw", [0x63] = "ucvtfDdNw", + [0x64] = "fcvtasDwNd", [0x65] = "fcvtauDwNd", + [0x68] = "fcvtpsDwNd", [0x69] = "fcvtpuDwNd", + [0x70] = "fcvtmsDwNd", [0x71] = "fcvtmuDwNd", + [0x78] = "fcvtzsDwNd", [0x79] = "fcvtzuDwNd" + }, + { + shift = 16, mask = 0xff, + [0x20] = "fcvtnsDxNs", [0x21] = "fcvtnuDxNs", + [0x22] = "scvtfDsNx", [0x23] = "ucvtfDsNx", + [0x24] = "fcvtasDxNs", [0x25] = "fcvtauDxNs", + [0x28] = "fcvtpsDxNs", [0x29] = "fcvtpuDxNs", + [0x30] = "fcvtmsDxNs", [0x31] = "fcvtmuDxNs", + [0x38] = "fcvtzsDxNs", [0x39] = "fcvtzuDxNs", + [0x60] = "fcvtnsDxNd", [0x61] = "fcvtnuDxNd", + [0x62] = "scvtfDdNx", [0x63] = "ucvtfDdNx", + [0x64] = "fcvtasDxNd", [0x65] = "fcvtauDxNd", + [0x66] = "fmovDxNd", [0x67] = "fmovDdNx", + [0x68] = "fcvtpsDxNd", [0x69] = "fcvtpuDxNd", + [0x70] = "fcvtmsDxNd", [0x71] = "fcvtmuDxNd", + [0x78] = "fcvtzsDxNd", [0x79] = "fcvtzuDxNd" + } + } + }, + { -- FP data-processing, 1 source. + shift = 31, mask = 1, + [0] = { + shift = 22, mask = 3, + [0] = { + shift = 15, mask = 63, + [0] = "fmovDNf", "fabsDNf", "fnegDNf", + "fsqrtDNf", false, "fcvtDdNs", false, false, + "frintnDNf", "frintpDNf", "frintmDNf", "frintzDNf", + "frintaDNf", false, "frintxDNf", "frintiDNf", + }, + { + shift = 15, mask = 63, + [0] = "fmovDNf", "fabsDNf", "fnegDNf", + "fsqrtDNf", "fcvtDsNd", false, false, false, + "frintnDNf", "frintpDNf", "frintmDNf", "frintzDNf", + "frintaDNf", false, "frintxDNf", "frintiDNf", + } + } + } + }, + { -- FP compare. + shift = 31, mask = 1, + [0] = { + shift = 14, mask = 3, + [0] = { + shift = 23, mask = 1, + [0] = { + shift = 0, mask = 31, + [0] = "fcmpNMf", [8] = "fcmpNZf", + [16] = "fcmpeNMf", [24] = "fcmpeNZf", + } + } + } + } + }, + { -- FP immediate. + shift = 31, mask = 1, + [0] = { + shift = 5, mask = 31, + [0] = { + shift = 23, mask = 1, + [0] = "fmovDFf" + } + } + } + }, + { -- FP conditional compare. + shift = 31, mask = 1, + [0] = { + shift = 23, mask = 1, + [0] = { + shift = 4, mask = 1, + [0] = "fccmpNMVCf", "fccmpeNMVCf" + } + } + }, + { -- FP data-processing, 2 sources. + shift = 31, mask = 1, + [0] = { + shift = 23, mask = 1, + [0] = { + shift = 12, mask = 15, + [0] = "fmulDNMf", "fdivDNMf", "faddDNMf", "fsubDNMf", + "fmaxDNMf", "fminDNMf", "fmaxnmDNMf", "fminnmDNMf", + "fnmulDNMf" + } + } + }, + { -- FP conditional select. + shift = 31, mask = 1, + [0] = { + shift = 23, mask = 1, + [0] = "fcselDNMCf" + } + } + } + }, + { -- FP data-processing, 3 sources. + shift = 31, mask = 1, + [0] = { + shift = 15, mask = 1, + [0] = { + shift = 21, mask = 5, + [0] = "fmaddDNMAf", "fnmaddDNMAf" + }, + { + shift = 21, mask = 5, + [0] = "fmsubDNMAf", "fnmsubDNMAf" + } + } + } + } +} + +local map_br = { -- Branches, exception generating and system instructions. + shift = 29, mask = 7, + [0] = "bB", + { -- Compare & branch, immediate. + shift = 24, mask = 3, + [0] = "cbzDBg", "cbnzDBg", "tbzDTBw", "tbnzDTBw" + }, + { -- Conditional branch, immediate. + shift = 24, mask = 3, + [0] = { + shift = 4, mask = 1, + [0] = { + shift = 0, mask = 15, + [0] = "beqB", "bneB", "bhsB", "bloB", "bmiB", "bplB", "bvsB", "bvcB", + "bhiB", "blsB", "bgeB", "bltB", "bgtB", "bleB", "balB" + } + } + }, false, "blB", + { -- Compare & branch, immediate. + shift = 24, mask = 3, + [0] = "cbzDBg", "cbnzDBg", "tbzDTBx", "tbnzDTBx" + }, + { + shift = 24, mask = 3, + [0] = { -- Exception generation. + shift = 0, mask = 0xe0001f, + [0x200000] = "brkW" + }, + { -- System instructions. + shift = 0, mask = 0x3fffff, + [0x03201f] = "nop" + }, + { -- Unconditional branch, register. + shift = 0, mask = 0xfffc1f, + [0x1f0000] = "brNx", [0x3f0000] = "blrNx", + [0x5f0000] = "retNx" + }, + } +} + +local map_init = { + shift = 25, mask = 15, + [0] = false, false, false, false, map_ls, map_datar, map_ls, map_datafp, + map_datai, map_datai, map_br, map_br, map_ls, map_datar, map_ls, map_datafp +} + +------------------------------------------------------------------------------ + +local map_regs = { x = {}, w = {}, d = {}, s = {} } + +for i=0,30 do + map_regs.x[i] = "x"..i + map_regs.w[i] = "w"..i + map_regs.d[i] = "d"..i + map_regs.s[i] = "s"..i +end +map_regs.x[31] = "sp" +map_regs.w[31] = "wsp" +map_regs.d[31] = "d31" +map_regs.s[31] = "s31" + +local map_cond = { + [0] = "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", + "hi", "ls", "ge", "lt", "gt", "le", "al", +} + +local map_shift = { [0] = "lsl", "lsr", "asr", } + +local map_extend = { + [0] = "uxtb", "uxth", "uxtw", "uxtx", "sxtb", "sxth", "sxtw", "sxtx", +} + +------------------------------------------------------------------------------ + +-- Output a nicely formatted line with an opcode and operands. +local function putop(ctx, text, operands) + local pos = ctx.pos + local extra = "" + if ctx.rel then + local sym = ctx.symtab[ctx.rel] + if sym then + extra = "\t->"..sym + end + end + if ctx.hexdump > 0 then + ctx.out(format("%08x %s %-5s %s%s\n", + ctx.addr+pos, tohex(ctx.op), text, concat(operands, ", "), extra)) + else + ctx.out(format("%08x %-5s %s%s\n", + ctx.addr+pos, text, concat(operands, ", "), extra)) + end + ctx.pos = pos + 4 +end + +-- Fallback for unknown opcodes. +local function unknown(ctx) + return putop(ctx, ".long", { "0x"..tohex(ctx.op) }) +end + +local function match_reg(p, pat, regnum) + return map_regs[match(pat, p.."%w-([xwds])")][regnum] +end + +local function fmt_hex32(x) + if x < 0 then + return tohex(x) + else + return format("%x", x) + end +end + +local imm13_rep = { 0x55555555, 0x11111111, 0x01010101, 0x00010001, 0x00000001 } + +local function decode_imm13(op) + local imms = band(rshift(op, 10), 63) + local immr = band(rshift(op, 16), 63) + if band(op, 0x00400000) == 0 then + local len = 5 + if imms >= 56 then + if imms >= 60 then len = 1 else len = 2 end + elseif imms >= 48 then len = 3 elseif imms >= 32 then len = 4 end + local l = lshift(1, len)-1 + local s = band(imms, l) + local r = band(immr, l) + local imm = ror(rshift(-1, 31-s), r) + if len ~= 5 then imm = band(imm, lshift(1, l)-1) + rshift(imm, 31-l) end + imm = imm * imm13_rep[len] + local ix = fmt_hex32(imm) + if rshift(op, 31) ~= 0 then + return ix..tohex(imm) + else + return ix + end + else + local lo, hi = -1, 0 + if imms < 32 then lo = rshift(-1, 31-imms) else hi = rshift(-1, 63-imms) end + if immr ~= 0 then + lo, hi = ror(lo, immr), ror(hi, immr) + local x = immr == 32 and 0 or band(bxor(lo, hi), lshift(-1, 32-immr)) + lo, hi = bxor(lo, x), bxor(hi, x) + if immr >= 32 then lo, hi = hi, lo end + end + if hi ~= 0 then + return fmt_hex32(hi)..tohex(lo) + else + return fmt_hex32(lo) + end + end +end + +local function parse_immpc(op, name) + if name == "b" or name == "bl" then + return arshift(lshift(op, 6), 4) + elseif name == "adr" or name == "adrp" then + local immlo = band(rshift(op, 29), 3) + local immhi = lshift(arshift(lshift(op, 8), 13), 2) + return bor(immhi, immlo) + elseif name == "tbz" or name == "tbnz" then + return lshift(arshift(lshift(op, 13), 18), 2) + else + return lshift(arshift(lshift(op, 8), 13), 2) + end +end + +local function parse_fpimm8(op) + local sign = band(op, 0x100000) == 0 and 1 or -1 + local exp = bxor(rshift(arshift(lshift(op, 12), 5), 24), 0x80) - 131 + local frac = 16+band(rshift(op, 13), 15) + return sign * frac * 2^exp +end + +local function prefer_bfx(sf, uns, imms, immr) + if imms < immr or imms == 31 or imms == 63 then + return false + end + if immr == 0 then + if sf == 0 and (imms == 7 or imms == 15) then + return false + end + if sf ~= 0 and uns == 0 and (imms == 7 or imms == 15 or imms == 31) then + return false + end + end + return true +end + +-- Disassemble a single instruction. +local function disass_ins(ctx) + local pos = ctx.pos + local b0, b1, b2, b3 = byte(ctx.code, pos+1, pos+4) + local op = bor(lshift(b3, 24), lshift(b2, 16), lshift(b1, 8), b0) + local operands = {} + local suffix = "" + local last, name, pat + local vr + local map_reg + ctx.op = op + ctx.rel = nil + last = nil + local opat + opat = map_init[band(rshift(op, 25), 15)] + while type(opat) ~= "string" do + if not opat then return unknown(ctx) end + opat = opat[band(rshift(op, opat.shift), opat.mask)] or opat._ + end + name, pat = match(opat, "^([a-z0-9]*)(.*)") + local altname, pat2 = match(pat, "|([a-z0-9_.|]*)(.*)") + if altname then pat = pat2 end + if sub(pat, 1, 1) == "." then + local s2, p2 = match(pat, "^([a-z0-9.]*)(.*)") + suffix = suffix..s2 + pat = p2 + end + + local rt = match(pat, "[gf]") + if rt then + if rt == "g" then + map_reg = band(op, 0x80000000) ~= 0 and map_regs.x or map_regs.w + else + map_reg = band(op, 0x400000) ~= 0 and map_regs.d or map_regs.s + end + end + + local second0, immr + + for p in gmatch(pat, ".") do + local x = nil + if p == "D" then + local regnum = band(op, 31) + x = rt and map_reg[regnum] or match_reg(p, pat, regnum) + elseif p == "N" then + local regnum = band(rshift(op, 5), 31) + x = rt and map_reg[regnum] or match_reg(p, pat, regnum) + elseif p == "M" then + local regnum = band(rshift(op, 16), 31) + x = rt and map_reg[regnum] or match_reg(p, pat, regnum) + elseif p == "A" then + local regnum = band(rshift(op, 10), 31) + x = rt and map_reg[regnum] or match_reg(p, pat, regnum) + elseif p == "B" then + local addr = ctx.addr + pos + parse_immpc(op, name) + ctx.rel = addr + x = "0x"..tohex(addr) + elseif p == "T" then + x = bor(band(rshift(op, 26), 32), band(rshift(op, 19), 31)) + elseif p == "V" then + x = band(op, 15) + elseif p == "C" then + x = map_cond[band(rshift(op, 12), 15)] + elseif p == "c" then + local rn = band(rshift(op, 5), 31) + local rm = band(rshift(op, 16), 31) + local cond = band(rshift(op, 12), 15) + local invc = bxor(cond, 1) + x = map_cond[cond] + if altname and cond ~= 14 and cond ~= 15 then + local a1, a2 = match(altname, "([^|]*)|(.*)") + if rn == rm then + local n = #operands + operands[n] = nil + x = map_cond[invc] + if rn ~= 31 then + if a1 then name = a1 else name = altname end + else + operands[n-1] = nil + name = a2 + end + end + end + elseif p == "W" then + x = band(rshift(op, 5), 0xffff) + elseif p == "Y" then + x = band(rshift(op, 5), 0xffff) + local hw = band(rshift(op, 21), 3) + if altname and (hw == 0 or x ~= 0) then + name = altname + end + elseif p == "L" then + local rn = map_regs.x[band(rshift(op, 5), 31)] + local imm9 = arshift(lshift(op, 11), 23) + if band(op, 0x800) ~= 0 then + x = "["..rn..", #"..imm9.."]!" + else + x = "["..rn.."], #"..imm9 + end + elseif p == "U" then + local rn = map_regs.x[band(rshift(op, 5), 31)] + local sz = band(rshift(op, 30), 3) + local imm12 = lshift(arshift(lshift(op, 10), 20), sz) + if imm12 ~= 0 then + x = "["..rn..", #"..imm12.."]" + else + x = "["..rn.."]" + end + elseif p == "K" then + local rn = map_regs.x[band(rshift(op, 5), 31)] + local imm9 = arshift(lshift(op, 11), 23) + if imm9 ~= 0 then + x = "["..rn..", #"..imm9.."]" + else + x = "["..rn.."]" + end + elseif p == "O" then + local rn, rm = map_regs.x[band(rshift(op, 5), 31)] + local m = band(rshift(op, 13), 1) + if m == 0 then + rm = map_regs.w[band(rshift(op, 16), 31)] + else + rm = map_regs.x[band(rshift(op, 16), 31)] + end + x = "["..rn..", "..rm + local opt = band(rshift(op, 13), 7) + local s = band(rshift(op, 12), 1) + local sz = band(rshift(op, 30), 3) + -- extension to be applied + if opt == 3 then + if s == 0 then x = nil + else x = x..", lsl #"..sz.."]" end + elseif opt == 2 or opt == 6 or opt == 7 then + if s == 0 then x = x..", "..map_extend[opt].."]" + else x = x..", "..map_extend[opt].." #"..sz.."]" end + else + x = x.."]" + end + elseif p == "P" then + local opcv, sh = rshift(op, 26), 2 + if opcv >= 0x2a then sh = 4 elseif opcv >= 0x1b then sh = 3 end + local imm7 = lshift(arshift(lshift(op, 10), 25), sh) + local rn = map_regs.x[band(rshift(op, 5), 31)] + local ind = band(rshift(op, 23), 3) + if ind == 1 then + x = "["..rn.."], #"..imm7 + elseif ind == 2 then + if imm7 == 0 then + x = "["..rn.."]" + else + x = "["..rn..", #"..imm7.."]" + end + elseif ind == 3 then + x = "["..rn..", #"..imm7.."]!" + end + elseif p == "I" then + local shf = band(rshift(op, 22), 3) + local imm12 = band(rshift(op, 10), 0x0fff) + local n = #operands + local rn, rd = band(rshift(op, 5), 31), band(op, 31) + if altname == "mov" and shf == 0 and imm12 == 0 and (rn == 31 or rd == 31) then + name = altname + x = nil + elseif shf == 0 then + x = imm12 + elseif shf == 1 then + x = imm12..", lsl #12" + end + elseif p == "i" then + x = "#0x"..decode_imm13(op) + elseif p == "1" then + immr = band(rshift(op, 16), 63) + x = immr + elseif p == "2" then + x = band(rshift(op, 10), 63) + if altname then + local a1, a2, a3, a4, a5, a6 = + match(altname, "([^|]*)|([^|]*)|([^|]*)|([^|]*)|([^|]*)|(.*)") + local sf = band(rshift(op, 26), 32) + local uns = band(rshift(op, 30), 1) + if prefer_bfx(sf, uns, x, immr) then + name = a2 + x = x - immr + 1 + elseif immr == 0 and x == 7 then + local n = #operands + operands[n] = nil + if sf ~= 0 then + operands[n-1] = gsub(operands[n-1], "x", "w") + end + last = operands[n-1] + name = a6 + x = nil + elseif immr == 0 and x == 15 then + local n = #operands + operands[n] = nil + if sf ~= 0 then + operands[n-1] = gsub(operands[n-1], "x", "w") + end + last = operands[n-1] + name = a5 + x = nil + elseif x == 31 or x == 63 then + if x == 31 and immr == 0 and name == "sbfm" then + name = a4 + local n = #operands + operands[n] = nil + if sf ~= 0 then + operands[n-1] = gsub(operands[n-1], "x", "w") + end + last = operands[n-1] + else + name = a3 + end + x = nil + elseif band(x, 31) ~= 31 and immr == x+1 and name == "ubfm" then + name = a4 + last = "#"..(sf+32 - immr) + operands[#operands] = last + x = nil + elseif x < immr then + name = a1 + last = "#"..(sf+32 - immr) + operands[#operands] = last + x = x + 1 + end + end + elseif p == "3" then + x = band(rshift(op, 10), 63) + if altname then + local a1, a2 = match(altname, "([^|]*)|(.*)") + if x < immr then + name = a1 + local sf = band(rshift(op, 26), 32) + last = "#"..(sf+32 - immr) + operands[#operands] = last + x = x + 1 + elseif x >= immr then + name = a2 + x = x - immr + 1 + end + end + elseif p == "4" then + x = band(rshift(op, 10), 63) + local rn = band(rshift(op, 5), 31) + local rm = band(rshift(op, 16), 31) + if altname and rn == rm then + local n = #operands + operands[n] = nil + last = operands[n-1] + name = altname + end + elseif p == "5" then + x = band(rshift(op, 16), 31) + elseif p == "S" then + x = band(rshift(op, 10), 63) + if x == 0 then x = nil + else x = map_shift[band(rshift(op, 22), 3)].." #"..x end + elseif p == "X" then + local opt = band(rshift(op, 13), 7) + -- Width specifier . + if opt ~= 3 and opt ~= 7 then + last = map_regs.w[band(rshift(op, 16), 31)] + operands[#operands] = last + end + x = band(rshift(op, 10), 7) + -- Extension. + if opt == 2 + band(rshift(op, 31), 1) and + band(rshift(op, second0 and 5 or 0), 31) == 31 then + if x == 0 then x = nil + else x = "lsl #"..x end + else + if x == 0 then x = map_extend[band(rshift(op, 13), 7)] + else x = map_extend[band(rshift(op, 13), 7)].." #"..x end + end + elseif p == "R" then + x = band(rshift(op,21), 3) + if x == 0 then x = nil + else x = "lsl #"..x*16 end + elseif p == "z" then + local n = #operands + if operands[n] == "sp" then operands[n] = "xzr" + elseif operands[n] == "wsp" then operands[n] = "wzr" + end + elseif p == "Z" then + x = 0 + elseif p == "F" then + x = parse_fpimm8(op) + elseif p == "g" or p == "f" or p == "x" or p == "w" or + p == "d" or p == "s" then + -- These are handled in D/N/M/A. + elseif p == "0" then + if last == "sp" or last == "wsp" then + local n = #operands + operands[n] = nil + last = operands[n-1] + if altname then + local a1, a2 = match(altname, "([^|]*)|(.*)") + if not a1 then + name = altname + elseif second0 then + name, altname = a2, a1 + else + name, altname = a1, a2 + end + end + end + second0 = true + else + assert(false) + end + if x then + last = x + if type(x) == "number" then x = "#"..x end + operands[#operands+1] = x + end + end + + return putop(ctx, name..suffix, operands) +end + +------------------------------------------------------------------------------ + +-- Disassemble a block of code. +local function disass_block(ctx, ofs, len) + if not ofs then ofs = 0 end + local stop = len and ofs+len or #ctx.code + ctx.pos = ofs + ctx.rel = nil + while ctx.pos < stop do disass_ins(ctx) end +end + +-- Extended API: create a disassembler context. Then call ctx:disass(ofs, len). +local function create(code, addr, out) + local ctx = {} + ctx.code = code + ctx.addr = addr or 0 + ctx.out = out or io.write + ctx.symtab = {} + ctx.disass = disass_block + ctx.hexdump = 8 + return ctx +end + +-- Simple API: disassemble code (a string) at address and output via out. +local function disass(code, addr, out) + create(code, addr, out):disass() +end + +-- Return register name for RID. +local function regname(r) + if r < 32 then return map_regs.x[r] end + return map_regs.d[r-32] +end + +-- Public module functions. +return { + create = create, + disass = disass, + regname = regname +} + diff --git a/src/lj_arch.h b/src/lj_arch.h index cc5a0a66..3df602e3 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -226,7 +226,6 @@ #define LJ_TARGET_UNIFYROT 2 /* Want only IR_BROR. */ #define LJ_TARGET_GC64 1 #define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL -#define LJ_ARCH_NOJIT 1 /* NYI */ #define LJ_ARCH_VERSION 80 diff --git a/src/lj_asm.c b/src/lj_asm.c index 7ce58924..2cb5abea 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -171,6 +171,8 @@ IRFLDEF(FLOFS) #include "lj_emit_x86.h" #elif LJ_TARGET_ARM #include "lj_emit_arm.h" +#elif LJ_TARGET_ARM64 +#include "lj_emit_arm64.h" #elif LJ_TARGET_PPC #include "lj_emit_ppc.h" #elif LJ_TARGET_MIPS @@ -1563,6 +1565,8 @@ static void asm_loop(ASMState *as) #include "lj_asm_x86.h" #elif LJ_TARGET_ARM #include "lj_asm_arm.h" +#elif LJ_TARGET_ARM64 +#include "lj_asm_arm64.h" #elif LJ_TARGET_PPC #include "lj_asm_ppc.h" #elif LJ_TARGET_MIPS diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h new file mode 100644 index 00000000..0a2f5306 --- /dev/null +++ b/src/lj_asm_arm64.h @@ -0,0 +1,1823 @@ +/* +** ARM64 IR assembler (SSA IR -> machine code). +** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h +** +** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. +** Sponsored by Cisco Systems, Inc. +*/ + +/* -- Register allocator extensions --------------------------------------- */ + +/* Allocate a register with a hint. */ +static Reg ra_hintalloc(ASMState *as, IRRef ref, Reg hint, RegSet allow) +{ + Reg r = IR(ref)->r; + if (ra_noreg(r)) { + if (!ra_hashint(r) && !iscrossref(as, ref)) + ra_sethint(IR(ref)->r, hint); /* Propagate register hint. */ + r = ra_allocref(as, ref, allow); + } + ra_noweak(as, r); + return r; +} + +/* Allocate two source registers for three-operand instructions. */ +static Reg ra_alloc2(ASMState *as, IRIns *ir, RegSet allow) +{ + IRIns *irl = IR(ir->op1), *irr = IR(ir->op2); + Reg left = irl->r, right = irr->r; + if (ra_hasreg(left)) { + ra_noweak(as, left); + if (ra_noreg(right)) + right = ra_allocref(as, ir->op2, rset_exclude(allow, left)); + else + ra_noweak(as, right); + } else if (ra_hasreg(right)) { + ra_noweak(as, right); + left = ra_allocref(as, ir->op1, rset_exclude(allow, right)); + } else if (ra_hashint(right)) { + right = ra_allocref(as, ir->op2, allow); + left = ra_alloc1(as, ir->op1, rset_exclude(allow, right)); + } else { + left = ra_allocref(as, ir->op1, allow); + right = ra_alloc1(as, ir->op2, rset_exclude(allow, left)); + } + return left | (right << 8); +} + +/* -- Guard handling ------------------------------------------------------ */ + +/* Generate an exit stub group at the bottom of the reserved MCode memory. */ +static MCode *asm_exitstub_gen(ASMState *as, ExitNo group) +{ + MCode *mxp = as->mcbot; + int i; + if (mxp + 3*4+4*EXITSTUBS_PER_GROUP >= as->mctop) + asm_mclimit(as); + /* str lr, [sp]; bl ->vm_exit_handler; .long group. */ + *mxp++ = A64I_STRx | A64F_D(RID_LR) | A64F_N(RID_SP); + *mxp = A64I_BL | (((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu); + mxp++; + *mxp++ = group*EXITSTUBS_PER_GROUP; + for (i = 0; i < EXITSTUBS_PER_GROUP; i++) + *mxp++ = A64I_B | ((-3-i)&0x03ffffffu); + lj_mcode_sync(as->mcbot, mxp); + lj_mcode_commitbot(as->J, mxp); + as->mcbot = mxp; + as->mclim = as->mcbot + MCLIM_REDZONE; + return mxp - EXITSTUBS_PER_GROUP; +} + +/* Setup all needed exit stubs. */ +static void asm_exitstub_setup(ASMState *as, ExitNo nexits) +{ + ExitNo i; + if (nexits >= EXITSTUBS_PER_GROUP*LJ_MAX_EXITSTUBGR) + lj_trace_err(as->J, LJ_TRERR_SNAPOV); + for (i = 0; i < (nexits+EXITSTUBS_PER_GROUP-1)/EXITSTUBS_PER_GROUP; i++) + if (as->J->exitstubgroup[i] == NULL) + as->J->exitstubgroup[i] = asm_exitstub_gen(as, i); +} + +/* Emit conditional branch to exit for guard. */ +static void asm_guardcc(ASMState *as, A64CC cc) +{ + MCode *target = exitstub_addr(as->J, as->snapno); + MCode *p = as->mcp; + if (LJ_UNLIKELY(p == as->invmcp)) { + as->loopinv = 1; + *p = A64I_BL | ((target-p) & 0x03ffffffu); + emit_cond_branch(as, cc^1, p-1); + return; + } + /* No conditional calls. Emit b.cc/bl instead. */ + /* That's a bad idea. NYI: emit per-trace exit stubs instead, see PPC. */ + emit_branch(as, A64I_BL, target); + emit_cond_branch(as, cc^1, p); +} + +/* -- Operand fusion ------------------------------------------------------ */ + +/* Limit linear search to this distance. Avoids O(n^2) behavior. */ +#define CONFLICT_SEARCH_LIM 31 + +static int asm_isk32(ASMState *as, IRRef ref, int32_t *k) +{ + if (irref_isk(ref)) { + IRIns *ir = IR(ref); + if (ir->o == IR_KNULL || !irt_is64(ir->t)) { + *k = ir->i; + return 1; + } else if (checki32((int64_t)ir_k64(ir)->u64)) { + *k = (int32_t)ir_k64(ir)->u64; + return 1; + } + } + return 0; +} + +/* Check if there's no conflicting instruction between curins and ref. */ +static int noconflict(ASMState *as, IRRef ref, IROp conflict) +{ + IRIns *ir = as->ir; + IRRef i = as->curins; + if (i > ref + CONFLICT_SEARCH_LIM) + return 0; /* Give up, ref is too far away. */ + while (--i > ref) + if (ir[i].o == conflict) + return 0; /* Conflict found. */ + return 1; /* Ok, no conflict. */ +} + +/* Fuse the array base of colocated arrays. */ +static int32_t asm_fuseabase(ASMState *as, IRRef ref) +{ + IRIns *ir = IR(ref); + if (ir->o == IR_TNEW && ir->op1 <= LJ_MAX_COLOSIZE && + !neverfuse(as) && noconflict(as, ref, IR_NEWREF)) + return (int32_t)sizeof(GCtab); + return 0; +} + +#define FUSE_REG 0x40000000 + +/* Fuse array/hash/upvalue reference into register+offset operand. */ +static Reg asm_fuseahuref(ASMState *as, IRRef ref, int32_t *ofsp, RegSet allow, + A64Ins ins) +{ + IRIns *ir = IR(ref); + if (ra_noreg(ir->r)) { + if (ir->o == IR_AREF) { + if (mayfuse(as, ref)) { + if (irref_isk(ir->op2)) { + IRRef tab = IR(ir->op1)->op1; + int32_t ofs = asm_fuseabase(as, tab); + IRRef refa = ofs ? tab : ir->op1; + ofs += 8*IR(ir->op2)->i; + if (emit_checkofs(ins, ofs)) { + *ofsp = ofs; + return ra_alloc1(as, refa, allow); + } + } else { + Reg base = ra_alloc1(as, ir->op1, allow); + *ofsp = FUSE_REG|ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base)); + return base; + } + } + } else if (ir->o == IR_HREFK) { + if (mayfuse(as, ref)) { + int32_t ofs = (int32_t)(IR(ir->op2)->op2 * sizeof(Node)); + if (emit_checkofs(ins, ofs)) { + *ofsp = ofs; + return ra_alloc1(as, ir->op1, allow); + } + } + } else if (ir->o == IR_UREFC) { + if (irref_isk(ir->op1)) { + GCfunc *fn = ir_kfunc(IR(ir->op1)); + GCupval *uv = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv; + int64_t ofs = glofs(as, &uv->tv); + if (emit_checkofs(ins, ofs)) { + *ofsp = (int32_t)ofs; + return RID_GL; + } + } + } + } + *ofsp = 0; + return ra_alloc1(as, ref, allow); +} + +/* Fuse m operand into arithmetic/logic instructions. */ +static uint32_t asm_fuseopm(ASMState *as, A64Ins ai, IRRef ref, RegSet allow) +{ + IRIns *ir = IR(ref); + if (ra_hasreg(ir->r)) { + ra_noweak(as, ir->r); + return A64F_M(ir->r); + } else if (irref_isk(ref)) { + uint32_t m; + int64_t k = get_k64val(ir); + if ((ai & 0x1f000000) == 0x0a000000) + m = emit_isk13(k, irt_is64(ir->t)); + else + m = emit_isk12(k); + if (m) + return m; + } else if (mayfuse(as, ref)) { + if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR && irref_isk(ir->op2)) || + (ir->o == IR_ADD && ir->op1 == ir->op2)) { + A64Shift sh = ir->o == IR_BSHR ? A64SH_LSR : + ir->o == IR_BSAR ? A64SH_ASR : A64SH_LSL; + int shift = ir->o == IR_ADD ? 1 : + (IR(ir->op2)->i & (irt_is64(ir->t) ? 63 : 31)); + IRIns *irl = IR(ir->op1); + if (sh == A64SH_LSL && + irl->o == IR_CONV && + irl->op2 == ((IRT_I64<op1)) { + Reg m = ra_alloc1(as, irl->op1, allow); + return A64F_M(m) | A64F_EXSH(A64EX_SXTW, shift); + } else { + Reg m = ra_alloc1(as, ir->op1, allow); + return A64F_M(m) | A64F_SH(sh, shift); + } + } else if (ir->o == IR_CONV && + ir->op2 == ((IRT_I64<op1, allow); + return A64F_M(m) | A64F_EX(A64EX_SXTW); + } + } + return A64F_M(ra_allocref(as, ref, allow)); +} + +/* Fuse XLOAD/XSTORE reference into load/store operand. */ +static void asm_fusexref(ASMState *as, A64Ins ai, Reg rd, IRRef ref, + RegSet allow) +{ + IRIns *ir = IR(ref); + Reg base; + int32_t ofs = 0; + if (ra_noreg(ir->r) && canfuse(as, ir)) { + if (ir->o == IR_ADD) { + if (asm_isk32(as, ir->op2, &ofs) && emit_checkofs(ai, ofs)) + ref = ir->op1; + /* NYI: Fuse add with two registers. */ + } else if (ir->o == IR_STRREF) { + if (asm_isk32(as, ir->op2, &ofs)) { + ref = ir->op1; + } else if (asm_isk32(as, ir->op1, &ofs)) { + ref = ir->op2; + } else { + /* NYI: Fuse ADD with constant. */ + Reg rn = ra_alloc1(as, ir->op1, allow); + uint32_t m = asm_fuseopm(as, 0, ir->op2, rset_exclude(allow, rn)); + emit_lso(as, ai, rd, rd, sizeof(GCstr)); + emit_dn(as, A64I_ADDx^m, rd, rn); + return; + } + ofs += sizeof(GCstr); + if (!emit_checkofs(ai, ofs)) { + Reg rn = ra_alloc1(as, ref, allow); + Reg rm = ra_allock(as, ofs, rset_exclude(allow, rn)); + emit_dnm(as, (ai ^ 0x01204800), rd, rn, rm); + return; + } + } + } + base = ra_alloc1(as, ref, allow); + emit_lso(as, ai, (rd & 31), base, ofs); +} + +/* -- Calls --------------------------------------------------------------- */ + +/* Generate a call to a C function. */ +static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) +{ + uint32_t n, nargs = CCI_XNARGS(ci); + int32_t ofs = 0; + Reg gpr, fpr = REGARG_FIRSTFPR; + if ((void *)ci->func) + emit_call(as, (void *)ci->func); + for (gpr = REGARG_FIRSTGPR; gpr <= REGARG_LASTGPR; gpr++) + as->cost[gpr] = REGCOST(~0u, ASMREF_L); + gpr = REGARG_FIRSTGPR; + for (n = 0; n < nargs; n++) { /* Setup args. */ + IRRef ref = args[n]; + IRIns *ir = IR(ref); + if (ref) { + if (irt_isfp(ir->t)) { + if (fpr <= REGARG_LASTFPR) { + lua_assert(rset_test(as->freeset, fpr)); /* Must have been evicted. */ + ra_leftov(as, fpr, ref); + fpr++; + } else { + Reg r = ra_alloc1(as, ref, RSET_FPR); + emit_spstore(as, ir, r, ofs); + ofs += 8; + } + } else { + if (gpr <= REGARG_LASTGPR) { + lua_assert(rset_test(as->freeset, gpr)); /* Must have been evicted. */ + ra_leftov(as, gpr, ref); + gpr++; + } else { + Reg r = ra_alloc1(as, ref, RSET_GPR); + emit_spstore(as, ir, r, ofs); + ofs += 8; + } + } + } + } +} + +/* Setup result reg/sp for call. Evict scratch regs. */ +static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci) +{ + RegSet drop = RSET_SCRATCH; + if (ra_hasreg(ir->r)) + rset_clear(drop, ir->r); /* Dest reg handled below. */ + ra_evictset(as, drop); /* Evictions must be performed first. */ + if (ra_used(ir)) { + lua_assert(!irt_ispri(ir->t)); + if (irt_isfp(ir->t)) { + if (ci->flags & CCI_CASTU64) { + Reg dest = ra_dest(as, ir, RSET_FPR) & 31; + emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D_R : A64I_FMOV_S_R, + dest, RID_RET); + } else { + ra_destreg(as, ir, RID_FPRET); + } + } else { + ra_destreg(as, ir, RID_RET); + } + } + UNUSED(ci); +} + +static void asm_callx(ASMState *as, IRIns *ir) +{ + IRRef args[CCI_NARGS_MAX*2]; + CCallInfo ci; + IRRef func; + IRIns *irf; + ci.flags = asm_callx_flags(as, ir); + asm_collectargs(as, ir, &ci, args); + asm_setupresult(as, ir, &ci); + func = ir->op2; irf = IR(func); + if (irf->o == IR_CARG) { func = irf->op1; irf = IR(func); } + if (irref_isk(func)) { /* Call to constant address. */ + ci.func = (ASMFunction)(ir_k64(irf)->u64); + } else { /* Need a non-argument register for indirect calls. */ + Reg freg = ra_alloc1(as, func, RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED); + emit_n(as, A64I_BLR, freg); + ci.func = (ASMFunction)(void *)0; + } + asm_gencall(as, &ci, args); +} + +/* -- Returns ------------------------------------------------------------- */ + +/* Return to lower frame. Guard that it goes to the right spot. */ +static void asm_retf(ASMState *as, IRIns *ir) +{ + Reg base = ra_alloc1(as, REF_BASE, RSET_GPR); + void *pc = ir_kptr(IR(ir->op2)); + int32_t delta = 1+LJ_FR2+bc_a(*((const BCIns *)pc - 1)); + as->topslot -= (BCReg)delta; + if ((int32_t)as->topslot < 0) as->topslot = 0; + irt_setmark(IR(REF_BASE)->t); /* Children must not coalesce with BASE reg. */ + /* Need to force a spill on REF_BASE now to update the stack slot. */ + emit_lso(as, A64I_STRx, base, RID_SP, ra_spill(as, IR(REF_BASE))); + emit_setgl(as, base, jit_base); + emit_addptr(as, base, -8*delta); + asm_guardcc(as, CC_NE); + emit_nm(as, A64I_CMPx, RID_TMP, + ra_allock(as, i64ptr(pc), rset_exclude(RSET_GPR, base))); + emit_lso(as, A64I_LDRx, RID_TMP, base, -8); +} + +/* -- Type conversions ---------------------------------------------------- */ + +static void asm_tointg(ASMState *as, IRIns *ir, Reg left) +{ + Reg tmp = ra_scratch(as, rset_exclude(RSET_FPR, left)); + Reg dest = ra_dest(as, ir, RSET_GPR); + asm_guardcc(as, CC_NE); + emit_nm(as, A64I_FCMPd, (tmp & 31), (left & 31)); + emit_dn(as, A64I_FCVT_F64_S32, (tmp & 31), dest); + emit_dn(as, A64I_FCVT_S32_F64, dest, (left & 31)); +} + +static void asm_tobit(ASMState *as, IRIns *ir) +{ + RegSet allow = RSET_FPR; + Reg left = ra_alloc1(as, ir->op1, allow); + Reg right = ra_alloc1(as, ir->op2, rset_clear(allow, left)); + Reg tmp = ra_scratch(as, rset_clear(allow, right)); + Reg dest = ra_dest(as, ir, RSET_GPR); + emit_dn(as, A64I_FMOV_R_S, dest, (tmp & 31)); + emit_dnm(as, A64I_FADDd, (tmp & 31), (left & 31), (right & 31)); +} + +static void asm_conv(ASMState *as, IRIns *ir) +{ + IRType st = (IRType)(ir->op2 & IRCONV_SRCMASK); + int st64 = (st == IRT_I64 || st == IRT_U64 || st == IRT_P64); + int stfp = (st == IRT_NUM || st == IRT_FLOAT); + IRRef lref = ir->op1; + lua_assert(irt_type(ir->t) != st); + if (irt_isfp(ir->t)) { + Reg dest = ra_dest(as, ir, RSET_FPR); + if (stfp) { /* FP to FP conversion. */ + emit_dn(as, st == IRT_NUM ? A64I_FCVT_F32_F64 : A64I_FCVT_F64_F32, + (dest & 31), (ra_alloc1(as, lref, RSET_FPR) & 31)); + } else { /* Integer to FP conversion. */ + Reg left = ra_alloc1(as, lref, RSET_GPR); + A64Ins ai = irt_isfloat(ir->t) ? + (((IRT_IS64 >> st) & 1) ? + (st == IRT_I64 ? A64I_FCVT_F32_S64 : A64I_FCVT_F32_U64) : + (st == IRT_INT ? A64I_FCVT_F32_S32 : A64I_FCVT_F32_U32)) : + (((IRT_IS64 >> st) & 1) ? + (st == IRT_I64 ? A64I_FCVT_F64_S64 : A64I_FCVT_F64_U64) : + (st == IRT_INT ? A64I_FCVT_F64_S32 : A64I_FCVT_F64_U32)); + emit_dn(as, ai, (dest & 31), left); + } + } else if (stfp) { /* FP to integer conversion. */ + if (irt_isguard(ir->t)) { + /* Checked conversions are only supported from number to int. */ + lua_assert(irt_isint(ir->t) && st == IRT_NUM); + asm_tointg(as, ir, ra_alloc1(as, lref, RSET_FPR)); + } else { + Reg left = ra_alloc1(as, lref, RSET_FPR); + Reg dest = ra_dest(as, ir, RSET_GPR); + A64Ins ai = irt_is64(ir->t) ? + (st == IRT_NUM ? + (irt_isi64(ir->t) ? A64I_FCVT_S64_F64 : A64I_FCVT_U64_F64) : + (irt_isi64(ir->t) ? A64I_FCVT_S64_F32 : A64I_FCVT_U64_F32)) : + (st == IRT_NUM ? + (irt_isint(ir->t) ? A64I_FCVT_S32_F64 : A64I_FCVT_U32_F64) : + (irt_isint(ir->t) ? A64I_FCVT_S32_F32 : A64I_FCVT_U32_F32)); + emit_dn(as, ai, dest, (left & 31)); + } + } else if (st >= IRT_I8 && st <= IRT_U16) { /* Extend to 32 bit integer. */ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_alloc1(as, lref, RSET_GPR); + A64Ins ai = st == IRT_I8 ? A64I_SXTBw : + st == IRT_U8 ? A64I_UXTBw : + st == IRT_I16 ? A64I_SXTHw : A64I_UXTHw; + lua_assert(irt_isint(ir->t) || irt_isu32(ir->t)); + emit_dn(as, ai, dest, left); + } else { + Reg dest = ra_dest(as, ir, RSET_GPR); + if (irt_is64(ir->t)) { + if (st64 || !(ir->op2 & IRCONV_SEXT)) { + /* 64/64 bit no-op (cast) or 32 to 64 bit zero extension. */ + ra_leftov(as, dest, lref); /* Do nothing, but may need to move regs. */ + } else { /* 32 to 64 bit sign extension. */ + Reg left = ra_alloc1(as, lref, RSET_GPR); + emit_dn(as, A64I_SXTW, dest, left); + } + } else { + if (st64) { + /* This is either a 32 bit reg/reg mov which zeroes the hiword + ** or a load of the loword from a 64 bit address. + */ + Reg left = ra_alloc1(as, lref, RSET_GPR); + emit_dm(as, A64I_MOVw, dest, left); + } else { /* 32/32 bit no-op (cast). */ + ra_leftov(as, dest, lref); /* Do nothing, but may need to move regs. */ + } + } + } +} + +static void asm_strto(ASMState *as, IRIns *ir) +{ + const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_strscan_num]; + IRRef args[2]; + Reg dest = 0, tmp; + int destused = ra_used(ir); + int32_t ofs = 0; + ra_evictset(as, RSET_SCRATCH); + if (destused) { + if (ra_hasspill(ir->s)) { + ofs = sps_scale(ir->s); + destused = 0; + if (ra_hasreg(ir->r)) { + ra_free(as, ir->r); + ra_modified(as, ir->r); + emit_spload(as, ir, ir->r, ofs); + } + } else { + dest = ra_dest(as, ir, RSET_FPR); + } + } + asm_guardcc(as, CC_EQ); + if (destused) + emit_lso(as, A64I_LDRd, (dest & 31), RID_SP, 0); + emit_n(as, (A64I_CMPw^A64I_K12)|A64F_U12(0), RID_RET); + args[0] = ir->op1; /* GCstr *str */ + args[1] = ASMREF_TMP1; /* TValue *n */ + asm_gencall(as, ci, args); + tmp = ra_releasetmp(as, ASMREF_TMP1); + emit_opk(as, A64I_ADDx, tmp, RID_SP, ofs, RSET_GPR); +} + +/* -- Memory references --------------------------------------------------- */ + +/* Get pointer to TValue. */ +static void asm_tvptr(ASMState *as, Reg dest, IRRef ref) +{ + IRIns *ir = IR(ref); + if (irt_isnum(ir->t)) { + if (irref_isk(ref)) { + /* Use the number constant itself as a TValue. */ + ra_allockreg(as, i64ptr(ir_knum(ir)), dest); + } else { + /* Otherwise force a spill and use the spill slot. */ + emit_opk(as, A64I_ADDx, dest, RID_SP, ra_spill(as, ir), RSET_GPR); + } + } else { + /* Otherwise use g->tmptv to hold the TValue. */ + RegSet allow = rset_exclude(RSET_GPR, dest); + Reg src; + if (irref_isk(ref)) { + TValue k; + lj_ir_kvalue(as->J->L, &k, ir); + src = ra_allock(as, k.u64, allow); + emit_lso(as, A64I_STRx, src, dest, 0); + } else { + Reg type; + if (irt_ispri(ir->t)) { + src = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow); + emit_lso(as, A64I_STRx, src, dest, 0); + } else if (irt_isint(ir->t)) { + src = ra_alloc1(as, ref, allow); + type = ra_allock(as, (int64_t)irt_toitype(ir->t) << 47, allow); + emit_lso(as, A64I_STRx, RID_TMP, dest, 0); + emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), RID_TMP, type, src); + } else { + src = ra_alloc1(as, ref, allow); + type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow); + emit_lso(as, A64I_STRx, RID_TMP, dest, 0); + emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), RID_TMP, src, type); + } + } + ra_allockreg(as, i64ptr(&J2G(as->J)->tmptv), dest); + } +} + +static void asm_aref(ASMState *as, IRIns *ir) +{ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg idx, base; + if (irref_isk(ir->op2)) { + IRRef tab = IR(ir->op1)->op1; + int32_t ofs = asm_fuseabase(as, tab); + IRRef refa = ofs ? tab : ir->op1; + uint32_t k = emit_isk12(ofs + 8*IR(ir->op2)->i); + if (k) { + base = ra_alloc1(as, refa, RSET_GPR); + emit_dn(as, A64I_ADDx^k, dest, base); + return; + } + } + base = ra_alloc1(as, ir->op1, RSET_GPR); + idx = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, base)); + emit_dnm(as, A64I_ADDx | A64F_EXSH(A64EX_UXTW, 3), dest, base, idx); +} + +/* Inlined hash lookup. Specialized for key type and for const keys. +** The equivalent C code is: +** Node *n = hashkey(t, key); +** do { +** if (lj_obj_equal(&n->key, key)) return &n->val; +** } while ((n = nextnode(n))); +** return niltv(L); +*/ +static void asm_href(ASMState *as, IRIns *ir, IROp merge) +{ + RegSet allow = RSET_GPR; + int destused = ra_used(ir); + Reg dest = ra_dest(as, ir, allow); + Reg tab = ra_alloc1(as, ir->op1, rset_clear(allow, dest)); + Reg key = 0, tmp = RID_TMP; + IRRef refkey = ir->op2; + IRIns *irkey = IR(refkey); + int isk = irref_isk(ir->op2); + IRType1 kt = irkey->t; + uint32_t k = 0; + uint32_t khash; + MCLabel l_end, l_loop, l_next; + rset_clear(allow, tab); + + if (!isk) { + key = ra_alloc1(as, ir->op2, irt_isnum(kt) ? RSET_FPR : allow); + rset_clear(allow, key); + if (!irt_isstr(kt)) { + tmp = ra_scratch(as, allow); + rset_clear(allow, tmp); + } + } else if (irt_isnum(kt)) { + int64_t val = (int64_t)ir_knum(irkey)->u64; + if (!(k = emit_isk12(val))) { + key = ra_allock(as, val, allow); + rset_clear(allow, key); + } + } else if (!irt_ispri(kt)) { + if (!(k = emit_isk12(irkey->i))) { + key = ra_alloc1(as, refkey, allow); + rset_clear(allow, key); + } + } + + /* Key not found in chain: jump to exit (if merged) or load niltv. */ + l_end = emit_label(as); + as->invmcp = NULL; + if (merge == IR_NE) + asm_guardcc(as, CC_AL); + else if (destused) + emit_loada(as, dest, niltvg(J2G(as->J))); + + /* Follow hash chain until the end. */ + l_loop = --as->mcp; + emit_n(as, A64I_CMPx^A64I_K12^0, dest); + emit_lso(as, A64I_LDRx, dest, dest, offsetof(Node, next)); + l_next = emit_label(as); + + /* Type and value comparison. */ + if (merge == IR_EQ) + asm_guardcc(as, CC_EQ); + else + emit_cond_branch(as, CC_EQ, l_end); + + if (irt_isnum(kt)) { + if (isk) { + /* Assumes -0.0 is already canonicalized to +0.0. */ + if (k) + emit_n(as, A64I_CMPx^k, tmp); + else + emit_nm(as, A64I_CMPx, key, tmp); + emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64)); + } else { + Reg tisnum = ra_allock(as, LJ_TISNUM << 15, allow); + Reg ftmp = ra_scratch(as, rset_exclude(RSET_FPR, key)); + rset_clear(allow, tisnum); + emit_nm(as, A64I_FCMPd, key, ftmp); + emit_dn(as, A64I_FMOV_D_R, (ftmp & 31), (tmp & 31)); + emit_cond_branch(as, CC_LO, l_next); + emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), tisnum, tmp); + emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.n)); + } + } else if (irt_isaddr(kt)) { + Reg scr; + if (isk) { + int64_t kk = ((int64_t)irt_toitype(irkey->t) << 47) | irkey[1].tv.u64; + scr = ra_allock(as, kk, allow); + emit_nm(as, A64I_CMPx, scr, tmp); + emit_lso(as, A64I_LDRx, tmp, dest, offsetof(Node, key.u64)); + } else { + scr = ra_scratch(as, allow); + emit_nm(as, A64I_CMPx, tmp, scr); + emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key.u64)); + } + rset_clear(allow, scr); + } else { + Reg type, scr; + lua_assert(irt_ispri(kt) && !irt_isnil(kt)); + type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow); + scr = ra_scratch(as, rset_clear(allow, type)); + rset_clear(allow, scr); + emit_nm(as, A64I_CMPw, scr, type); + emit_lso(as, A64I_LDRx, scr, dest, offsetof(Node, key)); + } + + *l_loop = A64I_BCC | A64F_S19((as->mcp-l_loop) & 0x0007ffffu) | CC_NE; + if (!isk && irt_isaddr(kt)) { + Reg type = ra_allock(as, (int32_t)irt_toitype(kt), allow); + emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, key, type); + rset_clear(allow, type); + } + /* Load main position relative to tab->node into dest. */ + khash = isk ? ir_khash(irkey) : 1; + if (khash == 0) { + emit_lso(as, A64I_LDRx, dest, tab, offsetof(GCtab, node)); + } else { + emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 3), dest, tmp, dest); + emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 1), dest, dest, dest); + emit_lso(as, A64I_LDRx, tmp, tab, offsetof(GCtab, node)); + if (isk) { + Reg tmphash = ra_allock(as, khash, allow); + emit_dnm(as, A64I_ANDw, dest, dest, tmphash); + emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask)); + } else if (irt_isstr(kt)) { + /* Fetch of str->hash is cheaper than ra_allock. */ + emit_dnm(as, A64I_ANDw, dest, dest, tmp); + emit_lso(as, A64I_LDRw, tmp, key, offsetof(GCstr, hash)); + emit_lso(as, A64I_LDRw, dest, tab, offsetof(GCtab, hmask)); + } else { /* Must match with hash*() in lj_tab.c. */ + emit_dnm(as, A64I_ANDw, dest, dest, tmp); + emit_lso(as, A64I_LDRw, tmp, tab, offsetof(GCtab, hmask)); + emit_dnm(as, A64I_SUBw, dest, dest, tmp); + emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT3)), tmp, tmp, tmp); + emit_dnm(as, A64I_EORw, dest, dest, tmp); + emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT2)), dest, dest, dest); + emit_dnm(as, A64I_SUBw, tmp, tmp, dest); + emit_dnm(as, A64I_EXTRw | (A64F_IMMS(32-HASH_ROT1)), dest, dest, dest); + emit_dnm(as, A64I_EORw, tmp, tmp, dest); + if (irt_isnum(kt)) { + emit_dnm(as, A64I_ADDw, dest, dest, dest); + emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest); + emit_dm(as, A64I_MOVw, tmp, dest); + emit_dn(as, A64I_FMOV_R_D, dest, (key & 31)); + } else { + checkmclim(as); + emit_dm(as, A64I_MOVw, tmp, key); + emit_dnm(as, A64I_EORw, dest, dest, + ra_allock(as, irt_toitype(kt) << 15, allow)); + emit_dn(as, A64I_LSRx | A64F_IMMR(32)|A64F_IMMS(32), dest, dest); + emit_dm(as, A64I_MOVx, dest, key); + } + } + } +} + +static void asm_hrefk(ASMState *as, IRIns *ir) +{ + IRIns *kslot = IR(ir->op2); + IRIns *irkey = IR(kslot->op1); + int32_t ofs = (int32_t)(kslot->op2 * sizeof(Node)); + int32_t kofs = ofs + (int32_t)offsetof(Node, key); + int bigofs = !emit_checkofs(A64I_LDRx, ofs); + RegSet allow = RSET_GPR; + Reg dest = (ra_used(ir) || bigofs) ? ra_dest(as, ir, RSET_GPR) : RID_NONE; + Reg node = ra_alloc1(as, ir->op1, allow); + Reg key = ra_scratch(as, rset_clear(allow, node)); + Reg idx = node; + uint64_t k; + lua_assert(ofs % sizeof(Node) == 0); + rset_clear(allow, key); + if (bigofs) { + idx = dest; + rset_clear(allow, dest); + kofs = (int32_t)offsetof(Node, key); + } else if (ra_hasreg(dest)) { + emit_opk(as, A64I_ADDx, dest, node, ofs, allow); + } + asm_guardcc(as, CC_NE); + if (irt_ispri(irkey->t)) { + k = ~((int64_t)~irt_toitype(irkey->t) << 47); + } else if (irt_isnum(irkey->t)) { + k = ir_knum(irkey)->u64; + } else { + k = ((uint64_t)irt_toitype(irkey->t) << 47) | (uint64_t)ir_kgc(irkey); + } + emit_nm(as, A64I_CMPx, key, ra_allock(as, k, allow)); + emit_lso(as, A64I_LDRx, key, idx, kofs); + if (bigofs) + emit_opk(as, A64I_ADDx, dest, node, ofs, RSET_GPR); +} + +static void asm_uref(ASMState *as, IRIns *ir) +{ + Reg dest = ra_dest(as, ir, RSET_GPR); + if (irref_isk(ir->op1)) { + GCfunc *fn = ir_kfunc(IR(ir->op1)); + MRef *v = &gcref(fn->l.uvptr[(ir->op2 >> 8)])->uv.v; + emit_lsptr(as, A64I_LDRx, dest, v); + } else { + Reg uv = ra_scratch(as, RSET_GPR); + Reg func = ra_alloc1(as, ir->op1, RSET_GPR); + if (ir->o == IR_UREFC) { + asm_guardcc(as, CC_NE); + emit_n(as, (A64I_CMPx^A64I_K12) | A64F_U12(1), RID_TMP); + emit_opk(as, A64I_ADDx, dest, uv, + (int32_t)offsetof(GCupval, tv), RSET_GPR); + emit_lso(as, A64I_LDRB, RID_TMP, uv, (int32_t)offsetof(GCupval, closed)); + } else { + emit_lso(as, A64I_LDRx, dest, uv, (int32_t)offsetof(GCupval, v)); + } + emit_lso(as, A64I_LDRx, uv, func, + (int32_t)offsetof(GCfuncL, uvptr) + 8*(int32_t)(ir->op2 >> 8)); + } +} + +static void asm_fref(ASMState *as, IRIns *ir) +{ + UNUSED(as); UNUSED(ir); + lua_assert(!ra_used(ir)); +} + +static void asm_strref(ASMState *as, IRIns *ir) +{ + RegSet allow = RSET_GPR; + Reg dest = ra_dest(as, ir, allow); + Reg base = ra_alloc1(as, ir->op1, allow); + IRIns *irr = IR(ir->op2); + int32_t ofs = sizeof(GCstr); + uint32_t m; + rset_clear(allow, base); + if (irref_isk(ir->op2) && (m = emit_isk12(ofs + irr->i))) { + emit_dn(as, A64I_ADDx^m, dest, base); + } else { + emit_dn(as, (A64I_ADDx^A64I_K12) | A64F_U12(ofs), dest, dest); + emit_dnm(as, A64I_ADDx, dest, base, ra_alloc1(as, ir->op2, allow)); + } +} + +/* -- Loads and stores ---------------------------------------------------- */ + +static A64Ins asm_fxloadins(IRIns *ir) +{ + switch (irt_type(ir->t)) { + case IRT_I8: return A64I_LDRB ^ A64I_LS_S; + case IRT_U8: return A64I_LDRB; + case IRT_I16: return A64I_LDRH ^ A64I_LS_S; + case IRT_U16: return A64I_LDRH; + case IRT_NUM: return A64I_LDRd; + case IRT_FLOAT: return A64I_LDRs; + default: return irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw; + } +} + +static A64Ins asm_fxstoreins(IRIns *ir) +{ + switch (irt_type(ir->t)) { + case IRT_I8: case IRT_U8: return A64I_STRB; + case IRT_I16: case IRT_U16: return A64I_STRH; + case IRT_NUM: return A64I_STRd; + case IRT_FLOAT: return A64I_STRs; + default: return irt_is64(ir->t) ? A64I_STRx : A64I_STRw; + } +} + +static void asm_fload(ASMState *as, IRIns *ir) +{ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg idx; + A64Ins ai = asm_fxloadins(ir); + int32_t ofs; + if (ir->op1 == REF_NIL) { + idx = RID_GL; + ofs = (ir->op2 << 2) - GG_OFS(g); + } else { + idx = ra_alloc1(as, ir->op1, RSET_GPR); + if (ir->op2 == IRFL_TAB_ARRAY) { + ofs = asm_fuseabase(as, ir->op1); + if (ofs) { /* Turn the t->array load into an add for colocated arrays. */ + emit_dn(as, (A64I_ADDx^A64I_K12) | A64F_U12(ofs), dest, idx); + return; + } + } + ofs = field_ofs[ir->op2]; + } + emit_lso(as, ai, (dest & 31), idx, ofs); +} + +static void asm_fstore(ASMState *as, IRIns *ir) +{ + if (ir->r != RID_SINK) { + Reg src = ra_alloc1(as, ir->op2, RSET_GPR); + IRIns *irf = IR(ir->op1); + Reg idx = ra_alloc1(as, irf->op1, rset_exclude(RSET_GPR, src)); + int32_t ofs = field_ofs[irf->op2]; + emit_lso(as, asm_fxstoreins(ir), (src & 31), idx, ofs); + } +} + +static void asm_xload(ASMState *as, IRIns *ir) +{ + Reg dest = ra_dest(as, ir, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR); + lua_assert(!(ir->op2 & IRXLOAD_UNALIGNED)); + asm_fusexref(as, asm_fxloadins(ir), dest, ir->op1, RSET_GPR); +} + +static void asm_xstore(ASMState *as, IRIns *ir) +{ + if (ir->r != RID_SINK) { + Reg src = ra_alloc1(as, ir->op2, irt_isfp(ir->t) ? RSET_FPR : RSET_GPR); + asm_fusexref(as, asm_fxstoreins(ir), src, ir->op1, + rset_exclude(RSET_GPR, src)); + } +} + +static void asm_ahuvload(ASMState *as, IRIns *ir) +{ + Reg idx, tmp, type; + int32_t ofs = 0; + RegSet gpr = RSET_GPR, allow = irt_isnum(ir->t) ? RSET_FPR : RSET_GPR; + lua_assert(irt_isnum(ir->t) || irt_ispri(ir->t) || irt_isaddr(ir->t) || + irt_isint(ir->t)); + if (ra_used(ir)) { + Reg dest = ra_dest(as, ir, allow); + tmp = irt_isnum(ir->t) ? ra_scratch(as, rset_clear(gpr, dest)) : dest; + if (irt_isaddr(ir->t)) { + emit_dn(as, A64I_ANDx^emit_isk13(LJ_GCVMASK, 1), dest, dest); + } else if (irt_isnum(ir->t)) { + emit_dn(as, A64I_FMOV_D_R, (dest & 31), tmp); + } else if (irt_isint(ir->t)) { + emit_dm(as, A64I_MOVw, dest, dest); + } + } else { + tmp = ra_scratch(as, gpr); + } + type = ra_scratch(as, rset_clear(gpr, tmp)); + idx = asm_fuseahuref(as, ir->op1, &ofs, rset_clear(gpr, type), A64I_LDRx); + /* Always do the type check, even if the load result is unused. */ + asm_guardcc(as, irt_isnum(ir->t) ? CC_LS : CC_NE); + if (irt_type(ir->t) >= IRT_NUM) { + lua_assert(irt_isinteger(ir->t) || irt_isnum(ir->t)); + emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), + ra_allock(as, LJ_TISNUM << 15, rset_exclude(gpr, idx)), tmp); + } else if (irt_isaddr(ir->t)) { + emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(ir->t)), type); + emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp); + } else if (irt_isnil(ir->t)) { + emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp); + } else { + emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), + ra_allock(as, (irt_toitype(ir->t) << 15) | 0x7fff, allow), tmp); + } + if (ofs & FUSE_REG) + emit_dnm(as, (A64I_LDRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31)); + else + emit_lso(as, A64I_LDRx, tmp, idx, ofs); +} + +static void asm_ahustore(ASMState *as, IRIns *ir) +{ + if (ir->r != RID_SINK) { + RegSet allow = RSET_GPR; + Reg idx, src = RID_NONE, tmp = RID_TMP, type = RID_NONE; + int32_t ofs = 0; + if (irt_isnum(ir->t)) { + src = ra_alloc1(as, ir->op2, RSET_FPR); + idx = asm_fuseahuref(as, ir->op1, &ofs, allow, A64I_STRd); + if (ofs & FUSE_REG) + emit_dnm(as, (A64I_STRd^A64I_LS_R)|A64I_LS_UXTWx, (src & 31), idx, (ofs &31)); + else + emit_lso(as, A64I_STRd, (src & 31), idx, ofs); + } else { + if (!irt_ispri(ir->t)) { + src = ra_alloc1(as, ir->op2, allow); + rset_clear(allow, src); + if (irt_isinteger(ir->t)) + type = ra_allock(as, (int64_t)LJ_TISNUM << 47, allow); + else + type = ra_allock(as, irt_toitype(ir->t), allow); + } else { + tmp = type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t)<<47), allow); + } + idx = asm_fuseahuref(as, ir->op1, &ofs, rset_exclude(allow, type), + A64I_STRx); + if (ofs & FUSE_REG) + emit_dnm(as, (A64I_STRx^A64I_LS_R)|A64I_LS_UXTWx, tmp, idx, (ofs & 31)); + else + emit_lso(as, A64I_STRx, tmp, idx, ofs); + if (ra_hasreg(src)) { + if (irt_isinteger(ir->t)) { + emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), tmp, type, src); + } else { + emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), tmp, src, type); + } + } + } + } +} + +static void asm_sload(ASMState *as, IRIns *ir) +{ + int32_t ofs = 8*((int32_t)ir->op1-2); + IRType1 t = ir->t; + Reg dest = RID_NONE, base; + RegSet allow = RSET_GPR; + lua_assert(!(ir->op2 & IRSLOAD_PARENT)); /* Handled by asm_head_side(). */ + lua_assert(irt_isguard(t) || !(ir->op2 & IRSLOAD_TYPECHECK)); + if ((ir->op2 & IRSLOAD_CONVERT) && irt_isguard(t) && irt_isint(t)) { + dest = ra_scratch(as, RSET_FPR); + asm_tointg(as, ir, dest); + t.irt = IRT_NUM; /* Continue with a regular number type check. */ + } else if (ra_used(ir)) { + Reg tmp = RID_NONE; + if ((ir->op2 & IRSLOAD_CONVERT)) + tmp = ra_scratch(as, irt_isint(t) ? RSET_FPR : RSET_GPR); + lua_assert((irt_isnum(t)) || irt_isint(t) || irt_isaddr(t)); + dest = ra_dest(as, ir, irt_isnum(t) ? RSET_FPR : allow); + base = ra_alloc1(as, REF_BASE, rset_clear(allow, dest)); + if (irt_isaddr(t)) { + emit_dn(as, A64I_ANDx^emit_isk13(LJ_GCVMASK, 1), dest, dest); + } else if ((ir->op2 & IRSLOAD_CONVERT)) { + if (irt_isint(t)) { + emit_dn(as, A64I_FCVT_S32_F64, dest, (tmp & 31)); + /* If value is already loaded for type check, move it to FPR. */ + if ((ir->op2 & IRSLOAD_TYPECHECK)) + emit_dn(as, A64I_FMOV_D_R, (tmp & 31), dest); + else + dest = tmp; + t.irt = IRT_NUM; /* Check for original type. */ + } else { + emit_dn(as, A64I_FCVT_F64_S32, (dest & 31), tmp); + dest = tmp; + t.irt = IRT_INT; /* Check for original type. */ + } + } else if (irt_isint(t) && (ir->op2 & IRSLOAD_TYPECHECK)) { + emit_dm(as, A64I_MOVw, dest, dest); + } + goto dotypecheck; + } + base = ra_alloc1(as, REF_BASE, allow); +dotypecheck: + rset_clear(allow, base); + if ((ir->op2 & IRSLOAD_TYPECHECK)) { + Reg tmp; + if (ra_hasreg(dest) && rset_test(RSET_GPR, dest)) { + tmp = dest; + } else { + tmp = ra_scratch(as, allow); + rset_clear(allow, tmp); + } + if (irt_isnum(t) && !(ir->op2 & IRSLOAD_CONVERT)) + emit_dn(as, A64I_FMOV_D_R, (dest & 31), tmp); + /* Need type check, even if the load result is unused. */ + asm_guardcc(as, irt_isnum(t) ? CC_LS : CC_NE); + if (irt_type(t) >= IRT_NUM) { + lua_assert(irt_isinteger(t) || irt_isnum(t)); + emit_nm(as, A64I_CMPx | A64F_SH(A64SH_LSR, 32), + ra_allock(as, LJ_TISNUM << 15, allow), tmp); + } else if (irt_isnil(t)) { + emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(1), tmp); + } else if (irt_ispri(t)) { + emit_nm(as, A64I_CMPx, + ra_allock(as, ~((int64_t)~irt_toitype(t) << 47) , allow), tmp); + } else { + Reg type = ra_scratch(as, allow); + emit_n(as, (A64I_CMNx^A64I_K12) | A64F_U12(-irt_toitype(t)), type); + emit_dn(as, A64I_ASRx | A64F_IMMR(47), type, tmp); + } + emit_lso(as, A64I_LDRx, tmp, base, ofs); + return; + } + if (ra_hasreg(dest)) { + emit_lso(as, irt_isnum(t) ? A64I_LDRd : + (irt_isint(t) ? A64I_LDRw : A64I_LDRx), (dest & 31), base, ofs); + } +} + +/* -- Allocations --------------------------------------------------------- */ + +#if LJ_HASFFI +static void asm_cnew(ASMState *as, IRIns *ir) +{ + CTState *cts = ctype_ctsG(J2G(as->J)); + CTypeID id = (CTypeID)IR(ir->op1)->i; + CTSize sz; + CTInfo info = lj_ctype_info(cts, id, &sz); + const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco]; + IRRef args[4]; + RegSet allow = (RSET_GPR & ~RSET_SCRATCH); + lua_assert(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL)); + + as->gcsteps++; + asm_setupresult(as, ir, ci); /* GCcdata * */ + /* Initialize immutable cdata object. */ + if (ir->o == IR_CNEWI) { + int32_t ofs = sizeof(GCcdata); + Reg r = ra_alloc1(as, ir->op2, allow); + lua_assert(sz == 4 || sz == 8); + emit_lso(as, sz == 8 ? A64I_STRx : A64I_STRw, r, RID_RET, ofs); + } else if (ir->op2 != REF_NIL) { /* Create VLA/VLS/aligned cdata. */ + ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv]; + args[0] = ASMREF_L; /* lua_State *L */ + args[1] = ir->op1; /* CTypeID id */ + args[2] = ir->op2; /* CTSize sz */ + args[3] = ASMREF_TMP1; /* CTSize align */ + asm_gencall(as, ci, args); + emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info)); + return; + } + + /* Initialize gct and ctypeid. lj_mem_newgco() already sets marked. */ + { + Reg r = (id < 65536) ? RID_X1 : ra_allock(as, id, allow); + emit_lso(as, A64I_STRB, RID_TMP, RID_RET, offsetof(GCcdata, gct)); + emit_lso(as, A64I_STRH, r, RID_RET, offsetof(GCcdata, ctypeid)); + emit_d(as, A64I_MOVZw | A64F_U16(~LJ_TCDATA), RID_TMP); + if (id < 65536) emit_d(as, A64I_MOVZw | A64F_U16(id), RID_X1); + } + args[0] = ASMREF_L; /* lua_State *L */ + args[1] = ASMREF_TMP1; /* MSize size */ + asm_gencall(as, ci, args); + ra_allockreg(as, (int32_t)(sz+sizeof(GCcdata)), + ra_releasetmp(as, ASMREF_TMP1)); +} +#else +#define asm_cnew(as, ir) ((void)0) +#endif + +/* -- Write barriers ------------------------------------------------------ */ + +static void asm_tbar(ASMState *as, IRIns *ir) +{ + Reg tab = ra_alloc1(as, ir->op1, RSET_GPR); + Reg link = ra_scratch(as, rset_exclude(RSET_GPR, tab)); + Reg gr = ra_allock(as, i64ptr(J2G(as->J)), + rset_exclude(rset_exclude(RSET_GPR, tab), link)); + Reg mark = RID_TMP; + MCLabel l_end = emit_label(as); + emit_lso(as, A64I_STRx, link, tab, (int32_t)offsetof(GCtab, gclist)); + emit_lso(as, A64I_STRB, mark, tab, (int32_t)offsetof(GCtab, marked)); + emit_lso(as, A64I_STRx, tab, gr, + (int32_t)offsetof(global_State, gc.grayagain)); + emit_dn(as, A64I_ANDw^emit_isk13(~LJ_GC_BLACK, 0), mark, mark); + emit_lso(as, A64I_LDRx, link, gr, + (int32_t)offsetof(global_State, gc.grayagain)); + emit_cond_branch(as, CC_EQ, l_end); + emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), mark); + emit_lso(as, A64I_LDRB, mark, tab, (int32_t)offsetof(GCtab, marked)); +} + +static void asm_obar(ASMState *as, IRIns *ir) +{ + const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv]; + IRRef args[2]; + MCLabel l_end; + RegSet allow = RSET_GPR; + Reg obj, val, tmp; + /* No need for other object barriers (yet). */ + lua_assert(IR(ir->op1)->o == IR_UREFC); + ra_evictset(as, RSET_SCRATCH); + l_end = emit_label(as); + args[0] = ASMREF_TMP1; /* global_State *g */ + args[1] = ir->op1; /* TValue *tv */ + asm_gencall(as, ci, args); + ra_allockreg(as, i64ptr(J2G(as->J)), ra_releasetmp(as, ASMREF_TMP1) ); + obj = IR(ir->op1)->r; + tmp = ra_scratch(as, rset_exclude(allow, obj)); + emit_cond_branch(as, CC_EQ, l_end); + emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_BLACK, 0), tmp); + emit_cond_branch(as, CC_EQ, l_end); + emit_n(as, A64I_TSTw^emit_isk13(LJ_GC_WHITES, 0), RID_TMP); + val = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, obj)); + emit_lso(as, A64I_LDRB, tmp, obj, + (int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv)); + emit_lso(as, A64I_LDRB, RID_TMP, val, (int32_t)offsetof(GChead, marked)); +} + +/* -- Arithmetic and logic operations ------------------------------------- */ + +static void asm_fparith(ASMState *as, IRIns *ir, A64Ins ai) +{ + Reg dest = ra_dest(as, ir, RSET_FPR); + Reg right, left = ra_alloc2(as, ir, RSET_FPR); + right = (left >> 8); left &= 255; + emit_dnm(as, ai, (dest & 31), (left & 31), (right & 31)); +} + +static void asm_fpunary(ASMState *as, IRIns *ir, A64Ins ai) +{ + Reg dest = ra_dest(as, ir, RSET_FPR); + Reg left = ra_hintalloc(as, ir->op1, dest, RSET_FPR); + emit_dn(as, ai, (dest & 31), (left & 31)); +} + +static void asm_fpmath(ASMState *as, IRIns *ir) +{ + IRFPMathOp fpm = (IRFPMathOp)ir->op2; + if (fpm == IRFPM_SQRT) { + asm_fpunary(as, ir, A64I_FSQRTd); + } else if (fpm <= IRFPM_TRUNC) { + asm_fpunary(as, ir, fpm == IRFPM_FLOOR ? A64I_FRINTMd : + fpm == IRFPM_CEIL ? A64I_FRINTPd : A64I_FRINTZd); + } else if (fpm == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) { + return; + } else { + asm_callid(as, ir, IRCALL_lj_vm_floor + fpm); + } +} + +static int asm_swapops(ASMState *as, IRRef lref, IRRef rref) +{ + IRIns *ir; + if (irref_isk(rref)) + return 0; /* Don't swap constants to the left. */ + if (irref_isk(lref)) + return 1; /* But swap constants to the right. */ + ir = IR(rref); + if ((ir->o >= IR_BSHL && ir->o <= IR_BSAR) || + (ir->o == IR_ADD && ir->op1 == ir->op2) || + (ir->o == IR_CONV && ir->op2 == ((IRT_I64<o >= IR_BSHL && ir->o <= IR_BSAR) || + (ir->o == IR_ADD && ir->op1 == ir->op2) || + (ir->o == IR_CONV && ir->op2 == ((IRT_I64<op1, rref = ir->op2; + Reg left, dest = ra_dest(as, ir, RSET_GPR); + uint32_t m; + if ((ai & ~A64I_S) != A64I_SUBw && asm_swapops(as, lref, rref)) { + IRRef tmp = lref; lref = rref; rref = tmp; + } + left = ra_hintalloc(as, lref, dest, RSET_GPR); + if (irt_is64(ir->t)) ai |= A64I_X; + m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left)); + if (irt_isguard(ir->t)) { /* For IR_ADDOV etc. */ + asm_guardcc(as, CC_VS); + ai |= A64I_S; + } + emit_dn(as, ai^m, dest, left); +} + +static void asm_intop_s(ASMState *as, IRIns *ir, A64Ins ai) +{ + if (as->flagmcp == as->mcp) { /* Drop cmp r, #0. */ + as->flagmcp = NULL; + as->mcp++; + ai |= A64I_S; + } + asm_intop(as, ir, ai); +} + +static void asm_intneg(ASMState *as, IRIns *ir) +{ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR); + emit_dm(as, irt_is64(ir->t) ? A64I_NEGx : A64I_NEGw, dest, left); +} + +/* NYI: use add/shift for MUL(OV) with constants. FOLD only does 2^k. */ +static void asm_intmul(ASMState *as, IRIns *ir) +{ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_alloc1(as, ir->op1, rset_exclude(RSET_GPR, dest)); + Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left)); + if (irt_isguard(ir->t)) { /* IR_MULOV */ + asm_guardcc(as, CC_NE); + emit_dm(as, A64I_MOVw, dest, dest); /* Zero-extend. */ + emit_nm(as, A64I_CMPw | A64F_SH(A64SH_ASR, 31), RID_TMP, dest); + emit_dn(as, A64I_ASRx | A64F_IMMR(32), RID_TMP, dest); + emit_dnm(as, A64I_SMULL, dest, right, left); + } else { + emit_dnm(as, irt_is64(ir->t) ? A64I_MULx : A64I_MULw, dest, left, right); + } +} + +static void asm_add(ASMState *as, IRIns *ir) +{ + if (irt_isnum(ir->t)) { + asm_fparith(as, ir, A64I_FADDd); + return; + } + asm_intop_s(as, ir, A64I_ADDw); +} + +static void asm_sub(ASMState *as, IRIns *ir) +{ + if (irt_isnum(ir->t)) { + asm_fparith(as, ir, A64I_FSUBd); + return; + } + asm_intop_s(as, ir, A64I_SUBw); +} + +static void asm_mul(ASMState *as, IRIns *ir) +{ + if (irt_isnum(ir->t)) { + asm_fparith(as, ir, A64I_FMULd); + return; + } + asm_intmul(as, ir); +} + +static void asm_div(ASMState *as, IRIns *ir) +{ +#if LJ_HASFFI + if (!irt_isnum(ir->t)) + asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_divi64 : + IRCALL_lj_carith_divu64); + else +#endif + asm_fparith(as, ir, A64I_FDIVd); +} + +static void asm_pow(ASMState *as, IRIns *ir) +{ +#if LJ_HASFFI + if (!irt_isnum(ir->t)) + asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_powi64 : + IRCALL_lj_carith_powu64); + else +#endif + asm_callid(as, ir, IRCALL_lj_vm_powi); +} + +#define asm_addov(as, ir) asm_add(as, ir) +#define asm_subov(as, ir) asm_sub(as, ir) +#define asm_mulov(as, ir) asm_mul(as, ir) + +#define asm_abs(as, ir) asm_fpunary(as, ir, A64I_FABS) +#define asm_atan2(as, ir) asm_callid(as, ir, IRCALL_atan2) +#define asm_ldexp(as, ir) asm_callid(as, ir, IRCALL_ldexp) + +static void asm_mod(ASMState *as, IRIns *ir) +{ +#if LJ_HASFFI + if (!irt_isint(ir->t)) + asm_callid(as, ir, irt_isi64(ir->t) ? IRCALL_lj_carith_modi64 : + IRCALL_lj_carith_modu64); + else +#endif + asm_callid(as, ir, IRCALL_lj_vm_modi); +} + +static void asm_neg(ASMState *as, IRIns *ir) +{ + if (irt_isnum(ir->t)) { + asm_fpunary(as, ir, A64I_FNEGd); + return; + } + asm_intneg(as, ir); +} + +static void asm_bitop(ASMState *as, IRIns *ir, A64Ins ai) +{ + if (as->flagmcp == as->mcp && ai == A64I_ANDw) { + /* Try to drop cmp r, #0. */ + as->flagmcp = NULL; + as->mcp++; + ai += A64I_ANDSw - A64I_ANDw; + } + if (ir->op2 == 0) { + Reg dest = ra_dest(as, ir, RSET_GPR); + uint32_t m = asm_fuseopm(as, ai, ir->op1, RSET_GPR); + if (irt_is64(ir->t)) ai |= A64I_X; + emit_d(as, ai^m, dest); + } else { + asm_intop(as, ir, ai); + } +} + +#define asm_bnot(as, ir) asm_bitop(as, ir, A64I_MVNw) +#define asm_band(as, ir) asm_bitop(as, ir, A64I_ANDw) +#define asm_bor(as, ir) asm_bitop(as, ir, A64I_ORRw) +#define asm_bxor(as, ir) asm_bitop(as, ir, A64I_EORw) + +static void asm_bswap(ASMState *as, IRIns *ir) +{ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_alloc1(as, ir->op1, RSET_GPR); + emit_dn(as, irt_is64(ir->t) ? A64I_REVx : A64I_REVw, dest, left); +} + +static void asm_bitshift(ASMState *as, IRIns *ir, A64Ins ai, A64Shift sh) +{ + int shmask = irt_is64(ir->t) ? 63 : 31; + if (irref_isk(ir->op2)) { /* Constant shifts. */ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_alloc1(as, ir->op1, RSET_GPR); + int32_t shift = (IR(ir->op2)->i & shmask); + + if (shmask == 63) ai += A64I_UBFMx - A64I_UBFMw; + switch (sh) { + case A64SH_LSL: + emit_dn(as, ai | A64F_IMMS(shmask-shift) | A64F_IMMR(shmask-shift+1), dest, left); + break; + case A64SH_LSR: case A64SH_ASR: + emit_dn(as, ai | A64F_IMMS(shmask) | A64F_IMMR(shift), dest, left); + break; + case A64SH_ROR: + emit_dnm(as, ai | A64F_IMMS(shift), dest, left, left); + break; + } + } else { /* Variable-length shifts. */ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_alloc1(as, ir->op1, RSET_GPR); + Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left)); + emit_dnm(as, (shmask == 63 ? A64I_SHRx : A64I_SHRw) | A64F_BSH(sh), dest, left, right); + } +} + +#define asm_bshl(as, ir) asm_bitshift(as, ir, A64I_UBFMw, A64SH_LSL) +#define asm_bshr(as, ir) asm_bitshift(as, ir, A64I_UBFMw, A64SH_LSR) +#define asm_bsar(as, ir) asm_bitshift(as, ir, A64I_SBFMw, A64SH_ASR) +#define asm_bror(as, ir) asm_bitshift(as, ir, A64I_EXTRw, A64SH_ROR) +#define asm_brol(as, ir) lua_assert(0) + +static void asm_intmin_max(ASMState *as, IRIns *ir, A64CC cc) +{ + Reg dest = ra_dest(as, ir, RSET_GPR); + Reg left = ra_hintalloc(as, ir->op1, dest, RSET_GPR); + Reg right = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, left)); + emit_dnm(as, A64I_CSELw|A64F_CC(cc), dest, left, right); + emit_nm(as, A64I_CMPw, left, right); +} + +static void asm_fpmin_max(ASMState *as, IRIns *ir, A64CC fcc) +{ + Reg dest = (ra_dest(as, ir, RSET_FPR) & 31); + Reg right, left = ra_alloc2(as, ir, RSET_FPR); + right = ((left >> 8) & 31); left &= 31; + emit_dnm(as, A64I_FCSELd | A64F_CC(fcc), dest, left, right); + emit_nm(as, A64I_FCMPd, left, right); +} + +static void asm_min_max(ASMState *as, IRIns *ir, A64CC cc, A64CC fcc) +{ + if (irt_isnum(ir->t)) + asm_fpmin_max(as, ir, fcc); + else + asm_intmin_max(as, ir, cc); +} + +#define asm_max(as, ir) asm_min_max(as, ir, CC_GT, CC_HI) +#define asm_min(as, ir) asm_min_max(as, ir, CC_LT, CC_LO) + +/* -- Comparisons --------------------------------------------------------- */ + +/* Map of comparisons to flags. ORDER IR. */ +static const uint8_t asm_compmap[IR_ABC+1] = { + /* op FP swp int cc FP cc */ + /* LT */ CC_GE + (CC_HS << 4), + /* GE x */ CC_LT + (CC_HI << 4), + /* LE */ CC_GT + (CC_HI << 4), + /* GT x */ CC_LE + (CC_HS << 4), + /* ULT x */ CC_HS + (CC_LS << 4), + /* UGE */ CC_LO + (CC_LO << 4), + /* ULE x */ CC_HI + (CC_LO << 4), + /* UGT */ CC_LS + (CC_LS << 4), + /* EQ */ CC_NE + (CC_NE << 4), + /* NE */ CC_EQ + (CC_EQ << 4), + /* ABC */ CC_LS + (CC_LS << 4) /* Same as UGT. */ +}; + +/* FP comparisons. */ +static void asm_fpcomp(ASMState *as, IRIns *ir) +{ + Reg left, right; + A64Ins ai; + int swp = ((ir->o ^ (ir->o >> 2)) & ~(ir->o >> 3) & 1); + if (!swp && irref_isk(ir->op2) && ir_knum(IR(ir->op2))->u64 == 0) { + left = (ra_alloc1(as, ir->op1, RSET_FPR) & 31); + right = 0; + ai = A64I_FCMPZd; + } else { + left = ra_alloc2(as, ir, RSET_FPR); + if (swp) { + right = (left & 31); left = ((left >> 8) & 31); + } else { + right = ((left >> 8) & 31); left &= 31; + } + ai = A64I_FCMPd; + } + asm_guardcc(as, (asm_compmap[ir->o] >> 4)); + emit_nm(as, ai, left, right); +} + +/* Integer comparisons. */ +static void asm_intcomp(ASMState *as, IRIns *ir) +{ + A64CC oldcc, cc = (asm_compmap[ir->o] & 15); + A64Ins ai = irt_is64(ir->t) ? A64I_CMPx : A64I_CMPw; + IRRef lref = ir->op1, rref = ir->op2; + Reg left; + uint32_t m; + int cmpprev0 = 0; + lua_assert(irt_is64(ir->t) || irt_isint(ir->t) || + irt_isu32(ir->t) || irt_isaddr(ir->t) || irt_isu8(ir->t)); + if (asm_swapops(as, lref, rref)) { + IRRef tmp = lref; lref = rref; rref = tmp; + if (cc >= CC_GE) cc ^= 7; /* LT <-> GT, LE <-> GE */ + else if (cc > CC_NE) cc ^= 11; /* LO <-> HI, LS <-> HS */ + } + oldcc = cc; + if (irref_isk(rref) && IR(rref)->i == 0) { + IRIns *irl = IR(lref); + if (cc == CC_GE) cc = CC_PL; + else if (cc == CC_LT) cc = CC_MI; + else if (cc > CC_NE) goto notst; /* Other conds don't work with tst. */ + cmpprev0 = (irl+1 == ir); + /* Combine comp(BAND(left, right), 0) into tst left, right. */ + if (cmpprev0 && irl->o == IR_BAND && !ra_used(irl)) { + IRRef blref = irl->op1, brref = irl->op2; + uint32_t m2 = 0; + Reg bleft; + if (asm_swapops(as, blref, brref)) { + Reg tmp = blref; blref = brref; brref = tmp; + } + if (irref_isk(brref)) { + /* NYI: use tbz/tbnz, if applicable. */ + m2 = emit_isk13(IR(brref)->i, irt_is64(irl->t)); + if (!m2) + goto notst; /* Not beneficial if we miss a constant operand. */ + } + bleft = ra_alloc1(as, blref, RSET_GPR); + ai = (irt_is64(irl->t) ? A64I_TSTx : A64I_TSTw); + if (!m2) + m2 = asm_fuseopm(as, ai, brref, rset_exclude(RSET_GPR, bleft)); + asm_guardcc(as, cc); + emit_n(as, ai^m2, bleft); + return; + } + /* NYI: use cbz/cbnz for EQ/NE 0. */ + } +notst: + left = ra_alloc1(as, lref, RSET_GPR); + m = asm_fuseopm(as, ai, rref, rset_exclude(RSET_GPR, left)); + asm_guardcc(as, cc); + emit_n(as, ai^m, left); + /* Signed comparison with zero and referencing previous ins? */ + if (cmpprev0 && (oldcc <= CC_NE || oldcc >= CC_GE)) + as->flagmcp = as->mcp; /* Allow elimination of the compare. */ +} + +static void asm_comp(ASMState *as, IRIns *ir) +{ + if (irt_isnum(ir->t)) + asm_fpcomp(as, ir); + else + asm_intcomp(as, ir); +} + +#define asm_equal(as, ir) asm_comp(as, ir) + +/* -- Support for 64 bit ops in 32 bit mode ------------------------------- */ + +/* Hiword op of a split 64 bit op. Previous op must be the loword op. */ +static void asm_hiop(ASMState *as, IRIns *ir) +{ + UNUSED(as); UNUSED(ir); lua_assert(0); /* Unused on 64 bit. */ +} + +/* -- Profiling ----------------------------------------------------------- */ + +static void asm_prof(ASMState *as, IRIns *ir) +{ + uint32_t k = emit_isk13(HOOK_PROFILE, 0); + lua_assert(k != 0); + UNUSED(ir); + asm_guardcc(as, CC_NE); + emit_n(as, A64I_TSTw^k, RID_TMP); + emit_lsptr(as, A64I_LDRB, RID_TMP, (void *)&J2G(as->J)->hookmask); +} + +/* -- Stack handling ------------------------------------------------------ */ + +/* Check Lua stack size for overflow. Use exit handler as fallback. */ +static void asm_stack_check(ASMState *as, BCReg topslot, + IRIns *irp, RegSet allow, ExitNo exitno) +{ + Reg pbase; + uint32_t k; + if (irp) { + if (!ra_hasspill(irp->s)) { + pbase = irp->r; + lua_assert(ra_hasreg(pbase)); + } else if (allow) { + pbase = rset_pickbot(allow); + } else { + pbase = RID_RET; + emit_lso(as, A64I_LDRx, RID_RET, RID_SP, 0); /* Restore temp register. */ + } + } else { + pbase = RID_BASE; + } + emit_branch(as, A64I_BL, exitstub_addr(as->J, exitno)); + emit_cond_branch(as, CC_LS^1, as->mcp+1); + k = emit_isk12((8*topslot)); + lua_assert(k); + emit_n(as, A64I_CMPx^k, RID_TMP); + emit_dnm(as, A64I_SUBx, RID_TMP, RID_TMP, pbase); + emit_lso(as, A64I_LDRx, RID_TMP, RID_TMP, + (int32_t)offsetof(lua_State, maxstack)); + if (irp) { /* Must not spill arbitrary registers in head of side trace. */ + if (ra_hasspill(irp->s)) + emit_lso(as, A64I_LDRx, pbase, RID_SP, sps_scale(irp->s)); + emit_lso(as, A64I_LDRx, RID_TMP, RID_GL, glofs(as, &J2G(as->J)->cur_L)); + if (ra_hasspill(irp->s) && !allow) + emit_lso(as, A64I_STRx, RID_RET, RID_SP, 0); /* Save temp register. */ + } else { + emit_getgl(as, RID_TMP, cur_L); + } +} + +/* Restore Lua stack from on-trace state. */ +static void asm_stack_restore(ASMState *as, SnapShot *snap) +{ + SnapEntry *map = &as->T->snapmap[snap->mapofs]; +#ifdef LUA_USE_ASSERT + SnapEntry *flinks = &as->T->snapmap[snap_nextofs(as->T, snap)-1-LJ_FR2]; +#endif + MSize n, nent = snap->nent; + /* Store the value of all modified slots to the Lua stack. */ + for (n = 0; n < nent; n++) { + SnapEntry sn = map[n]; + BCReg s = snap_slot(sn); + int32_t ofs = 8*((int32_t)s-1-LJ_FR2); + IRRef ref = snap_ref(sn); + IRIns *ir = IR(ref); + if ((sn & SNAP_NORESTORE)) + continue; + if (irt_isnum(ir->t)) { + Reg src = ra_alloc1(as, ref, RSET_FPR); + emit_lso(as, A64I_STRd, (src & 31), RID_BASE, ofs); + } else { + RegSet allow = rset_exclude(RSET_GPR, RID_BASE); + lua_assert(irt_ispri(ir->t) || irt_isaddr(ir->t) || irt_isinteger(ir->t)); + if (!irref_isk(ref)) { + Reg type, src; + if (irt_is64(ir->t)) { + type = ra_allock(as, (int32_t)irt_toitype(ir->t), allow); + src = ra_alloc1(as, ref, rset_exclude(allow, type)); + emit_lso(as, A64I_STRx, RID_TMP, RID_BASE, ofs); + emit_dnm(as, A64I_ADDx | A64F_SH(A64SH_LSL, 47), RID_TMP, src, type); + } else if (irt_isinteger(ir->t)) { + type = ra_allock(as, (int64_t)LJ_TISNUM << 47, allow); + src = ra_alloc1(as, ref, rset_exclude(allow, type)); + emit_lso(as, A64I_STRx, RID_TMP, RID_BASE, ofs); + emit_dnm(as, A64I_ADDx | A64F_EX(A64EX_UXTW), RID_TMP, type, src); + } else { + type = ra_allock(as, ~((int64_t)~irt_toitype(ir->t) << 47), allow); + emit_lso(as, A64I_STRx, type, RID_BASE, ofs); + } + } else { + TValue k; + lj_ir_kvalue(as->J->L, &k, ir); + emit_lso(as, A64I_STRx, + ra_allock(as, tvisnil(&k) ? -1 : (int64_t)k.u64, allow), + RID_BASE, ofs); + } + } + checkmclim(as); + } + lua_assert(map + nent == flinks); +} + +/* -- GC handling --------------------------------------------------------- */ + +/* Check GC threshold and do one or more GC steps. */ +static void asm_gc_check(ASMState *as) +{ + const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit]; + IRRef args[2]; + MCLabel l_end; + Reg tmp1, tmp2; + ra_evictset(as, RSET_SCRATCH); + l_end = emit_label(as); + /* Exit trace if in GCSatomic or GCSfinalize. Avoids syncing GC objects. */ + asm_guardcc(as, CC_NE); /* Assumes asm_snap_prep() already done. */ + emit_n(as, A64I_CMPx^A64I_K12, RID_RET); + args[0] = ASMREF_TMP1; /* global_State *g */ + args[1] = ASMREF_TMP2; /* MSize steps */ + asm_gencall(as, ci, args); + tmp1 = ra_releasetmp(as, ASMREF_TMP1); + tmp2 = ra_releasetmp(as, ASMREF_TMP2); + emit_loadi(as, tmp2, as->gcsteps); + /* Jump around GC step if GC total < GC threshold. */ + emit_cond_branch(as, CC_LS, l_end); + emit_nm(as, A64I_CMPx, RID_TMP, tmp2); + emit_lso(as, A64I_LDRx, tmp2, tmp1, + (int32_t)offsetof(global_State, gc.threshold)); + emit_lso(as, A64I_LDRx, RID_TMP, tmp1, + (int32_t)offsetof(global_State, gc.total)); + ra_allockreg(as, i64ptr(J2G(as->J)), tmp1); + as->gcsteps = 0; + checkmclim(as); +} + +/* -- Loop handling ------------------------------------------------------- */ + +/* Fixup the loop branch. */ +static void asm_loop_fixup(ASMState *as) +{ + MCode *p = as->mctop; + MCode *target = as->mcp; + if (as->loopinv) { /* Inverted loop branch? */ + ptrdiff_t delta = target - (p - 2); + lua_assert(((delta + 0x40000) >> 19) == 0); + /* asm_guardcc already inverted the b.cc and patched the final bl. */ + p[-2] |= ((uint32_t)delta & 0x7ffff) << 5; + } else { + ptrdiff_t delta = target - (p - 1); + p[-1] = A64I_B | ((uint32_t)(delta) & 0x03ffffffu); + } +} + +/* -- Head of trace ------------------------------------------------------- */ + +/* Reload L register from g->cur_L. */ +static void asm_head_lreg(ASMState *as) +{ + IRIns *ir = IR(ASMREF_L); + if (ra_used(ir)) { + Reg r = ra_dest(as, ir, RSET_GPR); + emit_getgl(as, r, cur_L); + ra_evictk(as); + } +} + +/* Coalesce BASE register for a root trace. */ +static void asm_head_root_base(ASMState *as) +{ + IRIns *ir; + asm_head_lreg(as); + ir = IR(REF_BASE); + if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t))) + ra_spill(as, ir); + ra_destreg(as, ir, RID_BASE); +} + +/* Coalesce BASE register for a side trace. */ +static RegSet asm_head_side_base(ASMState *as, IRIns *irp, RegSet allow) +{ + IRIns *ir; + asm_head_lreg(as); + ir = IR(REF_BASE); + if (ra_hasreg(ir->r) && (rset_test(as->modset, ir->r) || irt_ismarked(ir->t))) + ra_spill(as, ir); + if (ra_hasspill(irp->s)) { + rset_clear(allow, ra_dest(as, ir, allow)); + } else { + Reg r = irp->r; + lua_assert(ra_hasreg(r)); + rset_clear(allow, r); + if (r != ir->r && !rset_test(as->freeset, r)) + ra_restore(as, regcost_ref(as->cost[r])); + ra_destreg(as, ir, r); + } + return allow; +} + +/* -- Tail of trace ------------------------------------------------------- */ + +/* Fixup the tail code. */ +static void asm_tail_fixup(ASMState *as, TraceNo lnk) +{ + MCode *p = as->mctop; + MCode *target; + /* Undo the sp adjustment in BC_JLOOP when exiting to the interpreter. */ + int32_t spadj = as->T->spadjust + (lnk ? 0 : sps_scale(SPS_FIXED)); + if (spadj == 0) { + as->mctop = --p; + } else { + /* Patch stack adjustment. */ + uint32_t k = emit_isk12(spadj); + lua_assert(k); + p[-2] = (A64I_ADDx^k) | A64F_D(RID_SP) | A64F_N(RID_SP); + } + /* Patch exit branch. */ + target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp; + p[-1] = A64I_B | (((target-p)+1)&0x03ffffffu); +} + +/* Prepare tail of code. */ +static void asm_tail_prep(ASMState *as) +{ + MCode *p = as->mctop - 1; /* Leave room for exit branch. */ + if (as->loopref) { + as->invmcp = as->mcp = p; + } else { + as->mcp = p-1; /* Leave room for stack pointer adjustment. */ + as->invmcp = NULL; + } + *p = 0; /* Prevent load/store merging. */ +} + +/* -- Trace setup --------------------------------------------------------- */ + +/* Ensure there are enough stack slots for call arguments. */ +static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci) +{ + IRRef args[CCI_NARGS_MAX*2]; + uint32_t i, nargs = CCI_XNARGS(ci); + int nslots = 0, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR; + asm_collectargs(as, ir, ci, args); + for (i = 0; i < nargs; i++) { + if (args[i] && irt_isfp(IR(args[i])->t)) { + if (nfpr > 0) nfpr--; else nslots += 2; + } else { + if (ngpr > 0) ngpr--; else nslots += 2; + } + } + if (nslots > as->evenspill) /* Leave room for args in stack slots. */ + as->evenspill = nslots; + return REGSP_HINT(RID_RET); +} + +static void asm_setup_target(ASMState *as) +{ + /* May need extra exit for asm_stack_check on side traces. */ + asm_exitstub_setup(as, as->T->nsnap + (as->parent ? 1 : 0)); +} + +/* -- Trace patching ------------------------------------------------------ */ + +/* Patch exit jumps of existing machine code to a new target. */ +void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target) +{ + MCode *p = T->mcode; + MCode *pe = (MCode *)((char *)p + T->szmcode); + MCode *cstart = NULL, *cend = p; + MCode *mcarea = lj_mcode_patch(J, p, 0); + MCode *px = exitstub_addr(J, exitno); + for (; p < pe; p++) { + /* Look for bl exitstub, replace with b target. */ + uint32_t ins = *p; + if ((ins & 0xfc000000u) == 0x94000000u && + ((ins ^ (px-p)) & 0x03ffffffu) == 0) { + *p = (ins & 0x7c000000u) | ((target-p) & 0x03ffffffu); + cend = p+1; + if (!cstart) cstart = p; + } + } + lua_assert(cstart != NULL); + lj_mcode_sync(cstart, cend); + lj_mcode_patch(J, mcarea, 1); +} + diff --git a/src/lj_ccall.c b/src/lj_ccall.c index b599be33..a3ae8b05 100644 --- a/src/lj_ccall.c +++ b/src/lj_ccall.c @@ -331,7 +331,7 @@ #define CCALL_HANDLE_COMPLEXARG \ /* Pass complex by value in separate (!) FPRs or on stack. */ \ - isfp = ctr->size == 2*sizeof(float) ? 2 : 1; + isfp = sz == 2*sizeof(float) ? 2 : 1; #define CCALL_HANDLE_REGARG \ if (LJ_TARGET_IOS && isva) { \ diff --git a/src/lj_dispatch.h b/src/lj_dispatch.h index 82708077..362d6202 100644 --- a/src/lj_dispatch.h +++ b/src/lj_dispatch.h @@ -107,6 +107,7 @@ typedef struct GG_State { #define J2G(J) (&J2GG(J)->g) #define G2J(gl) (&G2GG(gl)->J) #define L2J(L) (&L2GG(L)->J) +#define GG_G2J (GG_OFS(J) - GG_OFS(g)) #define GG_G2DISP (GG_OFS(dispatch) - GG_OFS(g)) #define GG_DISP2G (GG_OFS(g) - GG_OFS(dispatch)) #define GG_DISP2J (GG_OFS(J) - GG_OFS(dispatch)) diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h new file mode 100644 index 00000000..eb8f7fc7 --- /dev/null +++ b/src/lj_emit_arm64.h @@ -0,0 +1,397 @@ +/* +** ARM64 instruction emitter. +** Copyright (C) 2005-2016 Mike Pall. See Copyright Notice in luajit.h +** +** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. +** Sponsored by Cisco Systems, Inc. +*/ + +/* -- Constant encoding --------------------------------------------------- */ + +static uint64_t get_k64val(IRIns *ir) +{ + if (ir->o == IR_KINT64) { + return ir_kint64(ir)->u64; + } else if (ir->o == IR_KGC) { + return (uint64_t)ir_kgc(ir); + } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) { + return (uint64_t)ir_kptr(ir); + } else { + lua_assert(ir->o == IR_KINT || ir->o == IR_KNULL); + return ir->i; /* Sign-extended. */ + } +} + +/* Encode constant in K12 format for data processing instructions. */ +static uint32_t emit_isk12(int64_t n) +{ + uint64_t k = (n < 0) ? -n : n; + uint32_t m = (n < 0) ? 0x40000000 : 0; + if (k < 0x1000) { + return A64I_K12|m|A64F_U12(k); + } else if ((k & 0xfff000) == k) { + return A64I_K12|m|0x400000|A64F_U12(k>>12); + } + return 0; +} + +#define emit_clz64(n) __builtin_clzll(n) +#define emit_ctz64(n) __builtin_ctzll(n) + +/* Encode constant in K13 format for logical data processing instructions. */ +static uint32_t emit_isk13(uint64_t n, int is64) +{ + int inv = 0, w = 128, lz, tz; + if (n & 1) { n = ~n; w = 64; inv = 1; } /* Avoid wrap-around of ones. */ + if (!n) return 0; /* Neither all-zero nor all-ones are allowed. */ + do { /* Find the repeat width. */ + if (is64 && (uint32_t)(n^(n>>32))) break; + n = (uint32_t)n; w = 32; if ((n^(n>>16)) & 0xffff) break; + n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break; + n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break; + n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break; + n = n & 0x3; w = 2; + } while (0); + lz = emit_clz64(n); + tz = emit_ctz64(n); + if ((int64_t)(n << lz) >> (lz+tz) != -1ll) return 0; /* Non-contiguous? */ + if (inv) + return A64I_K13 | (((lz-w) & 127) << 16) | (((lz+tz-w-1) & 63) << 10); + else + return A64I_K13 | ((w-tz) << 16) | (((63-lz-tz-w-w) & 63) << 10); +} + +static uint32_t emit_isfpk64(uint64_t n) +{ + uint64_t etop9 = ((n >> 54) & 0x1ff); + if ((n << 16) == 0 && (etop9 == 0x100 || etop9 == 0x0ff)) { + return (uint32_t)(((n >> 48) & 0x7f) | ((n >> 56) & 0x80)); + } + return ~0u; +} + +/* -- Emit basic instructions --------------------------------------------- */ + +static void emit_dnm(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm) +{ + *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm); +} + +static void emit_dm(ASMState *as, A64Ins ai, Reg rd, Reg rm) +{ + *--as->mcp = ai | A64F_D(rd) | A64F_M(rm); +} + +static void emit_dn(ASMState *as, A64Ins ai, Reg rd, Reg rn) +{ + *--as->mcp = ai | A64F_D(rd) | A64F_N(rn); +} + +static void emit_nm(ASMState *as, A64Ins ai, Reg rn, Reg rm) +{ + *--as->mcp = ai | A64F_N(rn) | A64F_M(rm); +} + +static void emit_d(ASMState *as, A64Ins ai, Reg rd) +{ + *--as->mcp = ai | A64F_D(rd); +} + +static void emit_n(ASMState *as, A64Ins ai, Reg rn) +{ + *--as->mcp = ai | A64F_N(rn); +} + +static int emit_checkofs(A64Ins ai, int64_t ofs) +{ + int scale = (ai >> 30) & 3; + if (ofs < 0 || (ofs & ((1<= -256 && ofs <= 255) ? -1 : 0; + } else { + return (ofs < (4096<> 30) & 3; + lua_assert(ot); + /* Combine LDR/STR pairs to LDP/STP. */ + if ((sc == 2 || sc == 3) && + (!(ai & 0x400000) || rd != rn) && + as->mcp != as->mcloop) { + uint32_t prev = *as->mcp & ~A64F_D(31); + int ofsm = ofs - (1<>sc)) || + prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) { + aip = (A64F_A(rd) | A64F_D(*as->mcp & 31)); + } else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) || + prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) { + aip = (A64F_D(rd) | A64F_A(*as->mcp & 31)); + ofsm = ofs; + } else { + goto nopair; + } + if (ofsm >= (-64<mcp = aip | A64F_N(rn) | ((ofsm >> sc) << 15) | + (ai ^ ((ai == A64I_LDRx || ai == A64I_STRx) ? 0x50000000 : 0x90000000)); + return; + } + } +nopair: + if (ot == 1) + *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_U12(ofs >> sc); + else + *--as->mcp = (ai^A64I_LS_U) | A64F_D(rd) | A64F_N(rn) | A64F_S9(ofs & 0x1ff); +} + +/* -- Emit loads/stores --------------------------------------------------- */ + +/* Prefer rematerialization of BASE/L from global_State over spills. */ +#define emit_canremat(ref) ((ref) <= ASMREF_L) + +/* Try to find an N-step delta relative to other consts with N < lim. */ +static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim) +{ + RegSet work = ~as->freeset & RSET_GPR; + if (lim <= 1) return 0; /* Can't beat that. */ + while (work) { + Reg r = rset_picktop(work); + IRRef ref = regcost_ref(as->cost[r]); + lua_assert(r != rd); + if (ref < REF_TRUE) { + uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) : + get_k64val(IR(ref)); + int64_t delta = (int64_t)(k - kx); + if (delta == 0) { + emit_dm(as, A64I_MOVx, rd, r); + return 1; + } else { + uint32_t k12 = emit_isk12(delta < 0 ? -delta : delta); + if (k12) { + emit_dn(as, (delta < 0 ? A64I_SUBx : A64I_ADDx)^k12, rd, r); + return 1; + } + /* Do other ops or multi-step deltas pay off? Probably not. + ** E.g. XOR rarely helps with pointer consts. + */ + } + } + rset_clear(work, r); + } + return 0; /* Failed. */ +} + +static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64) +{ + uint32_t k13 = emit_isk13(u64, is64); + if (k13) { /* Can the constant be represented as a bitmask immediate? */ + emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO); + } else { + int i, zeros = 0, ones = 0, neg; + if (!is64) u64 = (int64_t)(int32_t)u64; /* Sign-extend. */ + /* Count homogeneous 16 bit fragments. */ + for (i = 0; i < 4; i++) { + uint64_t frag = (u64 >> i*16) & 0xffff; + zeros += (frag == 0); + ones += (frag == 0xffff); + } + neg = ones > zeros; /* Use MOVN if it pays off. */ + if (!emit_kdelta(as, rd, u64, 4 - (neg ? ones : zeros))) { + int shift = 0, lshift = 0; + uint64_t n64 = neg ? ~u64 : u64; + if (n64 != 0) { + /* Find first/last fragment to be filled. */ + shift = (63-emit_clz64(n64)) & ~15; + lshift = emit_ctz64(n64) & ~15; + } + /* MOVK requires the original value (u64). */ + while (shift > lshift) { + uint32_t u16 = (u64 >> shift) & 0xffff; + /* Skip fragments that are correctly filled by MOVN/MOVZ. */ + if (u16 != (neg ? 0xffff : 0)) + emit_d(as, is64 | A64I_MOVKw | A64F_U16(u16) | A64F_LSL16(shift), rd); + shift -= 16; + } + /* But MOVN needs an inverted value (n64). */ + emit_d(as, (neg ? A64I_MOVNx : A64I_MOVZx) | + A64F_U16((n64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd); + } + } +} + +/* Load a 32 bit constant into a GPR. */ +#define emit_loadi(as, rd, i) emit_loadk(as, rd, i, 0) + +/* Load a 64 bit constant into a GPR. */ +#define emit_loadu64(as, rd, i) emit_loadk(as, rd, i, A64I_X) + +#define emit_loada(as, r, addr) emit_loadu64(as, (r), (uintptr_t)(addr)) + +#define glofs(as, k) \ + ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g)) +#define mcpofs(as, k) \ + ((intptr_t)((uintptr_t)(k) - (uintptr_t)as->mcp)) +#define checkmcpofs(as, k) \ + ((((mcpofs(as, k)>>2) + 0x00040000) >> 19) == 0) + +static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow); + +/* Get/set from constant pointer. */ +static void emit_lsptr(ASMState *as, A64Ins ai, Reg r, void *p) +{ + /* First, check if ip + offset is in range. */ + if ((ai & 0x00400000) && checkmcpofs(as, p)) { + emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, p)>>2), r); + } else { + Reg base = RID_GL; /* Next, try GL + offset. */ + int64_t ofs = glofs(as, p); + if (!emit_checkofs(ai, ofs)) { /* Else split up into base reg + offset. */ + int64_t i64 = i64ptr(p); + base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r)); + ofs = i64 & 0x7fffull; + } + emit_lso(as, ai, r, base, ofs); + } +} + +/* Load 64 bit IR constant into register. */ +static void emit_loadk64(ASMState *as, Reg r, IRIns *ir) +{ + const uint64_t *k = &ir_k64(ir)->u64; + int64_t ofs; + if (r >= RID_MAX_GPR) { + uint32_t fpk = emit_isfpk64(*k); + if (fpk != ~0u) { + emit_d(as, A64I_FMOV_DI | A64F_FP8(fpk), (r & 31)); + return; + } + } + ofs = glofs(as, k); + if (emit_checkofs(A64I_LDRx, ofs)) { + emit_lso(as, r >= RID_MAX_GPR ? A64I_LDRd : A64I_LDRx, + (r & 31), RID_GL, ofs); + } else { + if (r >= RID_MAX_GPR) { + emit_dn(as, A64I_FMOV_D_R, (r & 31), RID_TMP); + r = RID_TMP; + } + if (checkmcpofs(as, k)) + emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, k)>>2), r); + else + emit_loadu64(as, r, *k); + } +} + +/* Get/set global_State fields. */ +#define emit_getgl(as, r, field) \ + emit_lsptr(as, A64I_LDRx, (r), (void *)&J2G(as->J)->field) +#define emit_setgl(as, r, field) \ + emit_lsptr(as, A64I_STRx, (r), (void *)&J2G(as->J)->field) + +/* Trace number is determined from pc of exit instruction. */ +#define emit_setvmstate(as, i) UNUSED(i) + +/* -- Emit control-flow instructions -------------------------------------- */ + +/* Label for internal jumps. */ +typedef MCode *MCLabel; + +/* Return label pointing to current PC. */ +#define emit_label(as) ((as)->mcp) + +static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target) +{ + MCode *p = as->mcp; + ptrdiff_t delta = target - (p - 1); + lua_assert(((delta + 0x40000) >> 19) == 0); + *--p = A64I_BCC | A64F_S19((uint32_t)delta & 0x7ffff) | cond; + as->mcp = p; +} + +static void emit_branch(ASMState *as, A64Ins ai, MCode *target) +{ + MCode *p = as->mcp; + ptrdiff_t delta = target - (p - 1); + lua_assert(((delta + 0x02000000) >> 26) == 0); + *--p = ai | ((uint32_t)delta & 0x03ffffffu); + as->mcp = p; +} + +#define emit_jmp(as, target) emit_branch(as, A64I_B, (target)) + +static void emit_call(ASMState *as, void *target) +{ + MCode *p = --as->mcp; + ptrdiff_t delta = (char *)target - (char *)p; + if ((((delta>>2) + 0x02000000) >> 26) == 0) { + *p = A64I_BL | ((uint32_t)(delta>>2) & 0x03ffffffu); + } else { /* Target out of range: need indirect call. But don't use R0-R7. */ + Reg r = ra_allock(as, i64ptr(target), + RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED); + *p = A64I_BLR | A64F_N(r); + } +} + +/* -- Emit generic operations --------------------------------------------- */ + +/* Generic move between two regs. */ +static void emit_movrr(ASMState *as, IRIns *ir, Reg dst, Reg src) +{ + if (dst >= RID_MAX_GPR) { + emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D : A64I_FMOV_S, + (dst & 31), (src & 31)); + return; + } + if (as->mcp != as->mcloop) { /* Swap early registers for loads/stores. */ + MCode ins = *as->mcp, swp = (src^dst); + if ((ins & 0xbf800000) == 0xb9000000) { + if (!((ins ^ (dst << 5)) & 0x000003e0)) + *as->mcp = ins ^ (swp << 5); /* Swap N in load/store. */ + if (!(ins & 0x00400000) && !((ins ^ dst) & 0x0000001f)) + *as->mcp = ins ^ swp; /* Swap D in store. */ + } + } + emit_dm(as, A64I_MOVx, dst, src); +} + +/* Generic load of register with base and (small) offset address. */ +static void emit_loadofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs) +{ + if (r >= RID_MAX_GPR) + emit_lso(as, irt_isnum(ir->t) ? A64I_LDRd : A64I_LDRs, (r & 31), base, ofs); + else + emit_lso(as, irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw, r, base, ofs); +} + +/* Generic store of register with base and (small) offset address. */ +static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs) +{ + if (r >= RID_MAX_GPR) + emit_lso(as, irt_isnum(ir->t) ? A64I_STRd : A64I_STRs, (r & 31), base, ofs); + else + emit_lso(as, irt_is64(ir->t) ? A64I_STRx : A64I_STRw, r, base, ofs); +} + +/* Emit an arithmetic operation with a constant operand. */ +static void emit_opk(ASMState *as, A64Ins ai, Reg dest, Reg src, + int32_t i, RegSet allow) +{ + uint32_t k = emit_isk12(i); + if (k) + emit_dn(as, ai^k, dest, src); + else + emit_dnm(as, ai, dest, src, ra_allock(as, i, allow)); +} + +/* Add offset to pointer. */ +static void emit_addptr(ASMState *as, Reg r, int32_t ofs) +{ + if (ofs) + emit_opk(as, ofs < 0 ? A64I_SUBx : A64I_ADDx, r, r, + ofs < 0 ? -ofs : ofs, rset_exclude(RSET_GPR, r)); +} + +#define emit_spsub(as, ofs) emit_addptr(as, RID_SP, -(ofs)) + diff --git a/src/lj_gdbjit.c b/src/lj_gdbjit.c index 8b72be7d..8bc2474c 100644 --- a/src/lj_gdbjit.c +++ b/src/lj_gdbjit.c @@ -296,6 +296,9 @@ enum { #elif LJ_TARGET_ARM DW_REG_SP = 13, DW_REG_RA = 14, +#elif LJ_TARGET_ARM64 + DW_REG_SP = 31, + DW_REG_RA = 30, #elif LJ_TARGET_PPC DW_REG_SP = 1, DW_REG_RA = 65, @@ -374,6 +377,8 @@ static const ELFheader elfhdr_template = { .machine = 62, #elif LJ_TARGET_ARM .machine = 40, +#elif LJ_TARGET_ARM64 + .machine = 183, #elif LJ_TARGET_PPC .machine = 20, #elif LJ_TARGET_MIPS @@ -563,6 +568,13 @@ static void LJ_FASTCALL gdbjit_ehframe(GDBJITctx *ctx) int i; for (i = 11; i >= 4; i--) { DB(DW_CFA_offset|i); DUV(2+(11-i)); } } +#elif LJ_TARGET_ARM64 + { + int i; + DB(DW_CFA_offset|31); DUV(2); + for (i = 28; i >= 19; i--) { DB(DW_CFA_offset|i); DUV(3+(28-i)); } + for (i = 15; i >= 8; i--) { DB(DW_CFA_offset|32|i); DUV(28-i); } + } #elif LJ_TARGET_PPC { int i; diff --git a/src/lj_target.h b/src/lj_target.h index abea8d5b..c069eb95 100644 --- a/src/lj_target.h +++ b/src/lj_target.h @@ -55,7 +55,7 @@ typedef uint32_t RegSP; /* Bitset for registers. 32 registers suffice for most architectures. ** Note that one set holds bits for both GPRs and FPRs. */ -#if LJ_TARGET_PPC || LJ_TARGET_MIPS +#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 typedef uint64_t RegSet; #else typedef uint32_t RegSet; @@ -69,7 +69,7 @@ typedef uint32_t RegSet; #define rset_set(rs, r) (rs |= RID2RSET(r)) #define rset_clear(rs, r) (rs &= ~RID2RSET(r)) #define rset_exclude(rs, r) (rs & ~RID2RSET(r)) -#if LJ_TARGET_PPC || LJ_TARGET_MIPS +#if LJ_TARGET_PPC || LJ_TARGET_MIPS || LJ_TARGET_ARM64 #define rset_picktop(rs) ((Reg)(__builtin_clzll(rs)^63)) #define rset_pickbot(rs) ((Reg)__builtin_ctzll(rs)) #else diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h index 57ab134f..0cef06d5 100644 --- a/src/lj_target_arm64.h +++ b/src/lj_target_arm64.h @@ -55,7 +55,8 @@ enum { /* Make use of all registers, except for x18, fp, lr and sp. */ #define RSET_FIXED \ - (RID2RSET(RID_X18)|RID2RSET(RID_FP)|RID2RSET(RID_LR)|RID2RSET(RID_SP)) + (RID2RSET(RID_X18)|RID2RSET(RID_FP)|RID2RSET(RID_LR)|RID2RSET(RID_SP)|\ + RID2RSET(RID_GL)) #define RSET_GPR (RSET_RANGE(RID_MIN_GPR, RID_MAX_GPR) - RSET_FIXED) #define RSET_FPR RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR) #define RSET_ALL (RSET_GPR|RSET_FPR) @@ -73,25 +74,235 @@ enum { #define REGARG_LASTFPR RID_D7 #define REGARG_NUMFPR 8 +/* -- Spill slots --------------------------------------------------------- */ + +/* Spill slots are 32 bit wide. An even/odd pair is used for FPRs. +** +** SPS_FIXED: Available fixed spill slots in interpreter frame. +** This definition must match with the vm_arm64.dasc file. +** Pre-allocate some slots to avoid sp adjust in every root trace. +** +** SPS_FIRST: First spill slot for general use. Reserve min. two 32 bit slots. +*/ +#define SPS_FIXED 4 +#define SPS_FIRST 2 + +#define SPOFS_TMP 0 + +#define sps_scale(slot) (4 * (int32_t)(slot)) +#define sps_align(slot) (((slot) - SPS_FIXED + 3) & ~3) + +/* -- Exit state ---------------------------------------------------------- */ + +/* This definition must match with the *.dasc file(s). */ +typedef struct { + lua_Number fpr[RID_NUM_FPR]; /* Floating-point registers. */ + intptr_t gpr[RID_NUM_GPR]; /* General-purpose registers. */ + int32_t spill[256]; /* Spill slots. */ +} ExitState; + +/* PC after instruction that caused an exit. Used to find the trace number. */ +#define EXITSTATE_PCREG RID_LR +/* Highest exit + 1 indicates stack check. */ +#define EXITSTATE_CHECKEXIT 1 + +#define EXITSTUB_SPACING 4 +#define EXITSTUBS_PER_GROUP 32 + + /* -- Instructions -------------------------------------------------------- */ /* Instruction fields. */ #define A64F_D(r) (r) -#define A64F_N(r) ((r) << 5) -#define A64F_A(r) ((r) << 10) -#define A64F_M(r) ((r) << 16) +#define A64F_N(r) ((r) << 5) +#define A64F_A(r) ((r) << 10) +#define A64F_M(r) ((r) << 16) +#define A64F_IMMS(x) ((x) << 10) +#define A64F_IMMR(x) ((x) << 16) #define A64F_U16(x) ((x) << 5) +#define A64F_U12(x) ((x) << 10) #define A64F_S26(x) (x) #define A64F_S19(x) ((x) << 5) +#define A64F_S9(x) ((x) << 12) +#define A64F_SH(sh, x) (((sh) << 22) | ((x) << 10)) +#define A64F_EX(ex) (A64I_EX | ((ex) << 13)) +#define A64F_EXSH(ex,x) (A64I_EX | ((ex) << 13) | ((x) << 10)) +#define A64F_FP8(x) ((x) << 13) +#define A64F_CC(cc) ((cc) << 12) +#define A64F_LSL16(x) (((x) / 16) << 21) +#define A64F_BSH(sh) ((sh) << 10) typedef enum A64Ins { + A64I_S = 0x20000000, + A64I_X = 0x80000000, + A64I_EX = 0x00200000, + A64I_K12 = 0x1a000000, + A64I_K13 = 0x18000000, + A64I_LS_U = 0x01000000, + A64I_LS_S = 0x00800000, + A64I_LS_R = 0x01200800, + A64I_LS_UXTWx = 0x00005000, + A64I_LS_LSLx = 0x00007000, + + A64I_ADDw = 0x0b000000, + A64I_ADDx = 0x8b000000, + A64I_ADDSw = 0x2b000000, + A64I_ADDSx = 0xab000000, + A64I_NEGw = 0x4b0003e0, + A64I_NEGx = 0xcb0003e0, + A64I_SUBw = 0x4b000000, + A64I_SUBx = 0xcb000000, + A64I_SUBSw = 0x6b000000, + A64I_SUBSx = 0xeb000000, + + A64I_MULw = 0x1b007c00, + A64I_MULx = 0x9b007c00, + A64I_SMULL = 0x9b207c00, + + A64I_ANDw = 0x0a000000, + A64I_ANDx = 0x8a000000, + A64I_ANDSw = 0x6a000000, + A64I_ANDSx = 0xea000000, + A64I_EORw = 0x4a000000, + A64I_EORx = 0xca000000, + A64I_ORRw = 0x2a000000, + A64I_ORRx = 0xaa000000, + A64I_TSTw = 0x6a00001f, + A64I_TSTx = 0xea00001f, + + A64I_CMPw = 0x6b00001f, + A64I_CMPx = 0xeb00001f, + A64I_CMNw = 0x2b00001f, + A64I_CMNx = 0xab00001f, + A64I_CCMPw = 0x7a400000, + A64I_CCMPx = 0xfa400000, + A64I_CSELw = 0x1a800000, + A64I_CSELx = 0x9a800000, + + A64I_ASRw = 0x13007c00, + A64I_ASRx = 0x9340fc00, + A64I_LSLx = 0xd3400000, + A64I_LSRx = 0xd340fc00, + A64I_SHRw = 0x1ac02000, + A64I_SHRx = 0x9ac02000, /* lsl/lsr/asr/ror x0, x0, x0 */ + A64I_REVw = 0x5ac00800, + A64I_REVx = 0xdac00c00, + + A64I_EXTRw = 0x13800000, + A64I_EXTRx = 0x93c00000, + A64I_SBFMw = 0x13000000, + A64I_SBFMx = 0x93400000, + A64I_SXTBw = 0x13001c00, + A64I_SXTHw = 0x13003c00, + A64I_SXTW = 0x93407c00, + A64I_UBFMw = 0x53000000, + A64I_UBFMx = 0xd3400000, + A64I_UXTBw = 0x53001c00, + A64I_UXTHw = 0x53003c00, + + A64I_MOVw = 0x2a0003e0, + A64I_MOVx = 0xaa0003e0, + A64I_MVNw = 0x2a2003e0, + A64I_MVNx = 0xaa2003e0, + A64I_MOVKw = 0x72800000, + A64I_MOVKx = 0xf2800000, A64I_MOVZw = 0x52800000, A64I_MOVZx = 0xd2800000, + A64I_MOVNw = 0x12800000, + A64I_MOVNx = 0x92800000, + + A64I_LDRB = 0x39400000, + A64I_LDRH = 0x79400000, + A64I_LDRw = 0xb9400000, + A64I_LDRx = 0xf9400000, A64I_LDRLw = 0x18000000, A64I_LDRLx = 0x58000000, - A64I_NOP = 0xd503201f, + A64I_STRB = 0x39000000, + A64I_STRH = 0x79000000, + A64I_STRw = 0xb9000000, + A64I_STRx = 0xf9000000, + A64I_STPw = 0x29000000, + A64I_STPx = 0xa9000000, + A64I_LDPw = 0x29400000, + A64I_LDPx = 0xa9400000, + A64I_B = 0x14000000, + A64I_BCC = 0x54000000, + A64I_BL = 0x94000000, A64I_BR = 0xd61f0000, + A64I_BLR = 0xd63f0000, + + A64I_NOP = 0xd503201f, + + /* FP */ + A64I_FADDd = 0x1e602800, + A64I_FSUBd = 0x1e603800, + A64I_FMADDd = 0x1f400000, + A64I_FMSUBd = 0x1f408000, + A64I_FNMADDd = 0x1f600000, + A64I_FNMSUBd = 0x1f608000, + A64I_FMULd = 0x1e600800, + A64I_FDIVd = 0x1e601800, + A64I_FNEGd = 0x1e614000, + A64I_FABS = 0x1e60c000, + A64I_FSQRTd = 0x1e61c000, + A64I_LDRs = 0xbd400000, + A64I_LDRd = 0xfd400000, + A64I_STRs = 0xbd000000, + A64I_STRd = 0xfd000000, + A64I_LDPs = 0x2d400000, + A64I_LDPd = 0x6d400000, + A64I_STPs = 0x2d000000, + A64I_STPd = 0x6d000000, + A64I_FCMPd = 0x1e602000, + A64I_FCMPZd = 0x1e602008, + A64I_FCSELd = 0x1e600c00, + A64I_FRINTMd = 0x1e654000, + A64I_FRINTPd = 0x1e64c000, + A64I_FRINTZd = 0x1e65c000, + + A64I_FCVT_F32_F64 = 0x1e624000, + A64I_FCVT_F64_F32 = 0x1e22c000, + A64I_FCVT_F32_S32 = 0x1e220000, + A64I_FCVT_F64_S32 = 0x1e620000, + A64I_FCVT_F32_U32 = 0x1e230000, + A64I_FCVT_F64_U32 = 0x1e630000, + A64I_FCVT_F32_S64 = 0x9e220000, + A64I_FCVT_F64_S64 = 0x9e620000, + A64I_FCVT_F32_U64 = 0x9e230000, + A64I_FCVT_F64_U64 = 0x9e630000, + A64I_FCVT_S32_F64 = 0x1e780000, + A64I_FCVT_S32_F32 = 0x1e380000, + A64I_FCVT_U32_F64 = 0x1e790000, + A64I_FCVT_U32_F32 = 0x1e390000, + A64I_FCVT_S64_F64 = 0x9e780000, + A64I_FCVT_S64_F32 = 0x9e380000, + A64I_FCVT_U64_F64 = 0x9e790000, + A64I_FCVT_U64_F32 = 0x9e390000, + + A64I_FMOV_S = 0x1e204000, + A64I_FMOV_D = 0x1e604000, + A64I_FMOV_R_S = 0x1e260000, + A64I_FMOV_S_R = 0x1e270000, + A64I_FMOV_R_D = 0x9e660000, + A64I_FMOV_D_R = 0x9e670000, + A64I_FMOV_DI = 0x1e601000, } A64Ins; +typedef enum A64Shift { + A64SH_LSL, A64SH_LSR, A64SH_ASR, A64SH_ROR +} A64Shift; + +typedef enum A64Extend { + A64EX_UXTB, A64EX_UXTH, A64EX_UXTW, A64EX_UXTX, + A64EX_SXTB, A64EX_SXTH, A64EX_SXTW, A64EX_SXTX, +} A64Extend; + +/* ARM condition codes. */ +typedef enum A64CC { + CC_EQ, CC_NE, CC_CS, CC_CC, CC_MI, CC_PL, CC_VS, CC_VC, + CC_HI, CC_LS, CC_GE, CC_LT, CC_GT, CC_LE, CC_AL, + CC_HS = CC_CS, CC_LO = CC_CC +} A64CC; + #endif diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index 7a881bdd..a6227bf7 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -236,12 +236,17 @@ |.macro mov_false, reg; movn reg, #0x8000, lsl #32; .endmacro |.macro mov_true, reg; movn reg, #0x0001, lsl #48; .endmacro | -#define GL_J(field) (GG_OFS(J) + (int)offsetof(jit_State, field)) +#define GL_J(field) (GG_G2J + (int)offsetof(jit_State, field)) | #define PC2PROTO(field) ((int)offsetof(GCproto, field)-(int)sizeof(GCproto)) | |.macro hotcheck, delta -| NYI +| lsr CARG1, PC, #1 +| and CARG1, CARG1, #126 +| add CARG1, CARG1, #GG_G2DISP+GG_DISP2HOT +| ldrh CARG2w, [GL, CARG1] +| subs CARG2, CARG2, #delta +| strh CARG2w, [GL, CARG1] |.endmacro | |.macro hotloop @@ -869,7 +874,7 @@ static void build_subroutines(BuildCtx *ctx) | bl extern lj_meta_for // (lua_State *L, TValue *base) | ldr INSw, [PC, #-4] |.if JIT - | uxtb TMP0, INS + | uxtb TMP0w, INSw |.endif | decode_RA RA, INS | decode_RD RC, INS @@ -1732,7 +1737,20 @@ static void build_subroutines(BuildCtx *ctx) |//----------------------------------------------------------------------- | |->vm_record: // Dispatch target for recording phase. - | NYI + |.if JIT + | ldrb CARG1w, GL->hookmask + | tst CARG1, #HOOK_VMEVENT // No recording while in vmevent. + | bne >5 + | // Decrement the hookcount for consistency, but always do the call. + | ldr CARG2w, GL->hookcount + | tst CARG1, #HOOK_ACTIVE + | bne >1 + | sub CARG2w, CARG2w, #1 + | tst CARG1, #LUA_MASKLINE|LUA_MASKCOUNT + | beq >1 + | str CARG2w, GL->hookcount + | b >1 + |.endif | |->vm_rethook: // Dispatch target for return hooks. | ldrb TMP2w, GL->hookmask @@ -1774,7 +1792,21 @@ static void build_subroutines(BuildCtx *ctx) | b <4 | |->vm_hotloop: // Hot loop counter underflow. - | NYI + |.if JIT + | ldr LFUNC:CARG3, [BASE, FRAME_FUNC] // Same as curr_topL(L). + | add CARG1, GL, #GG_G2DISP+GG_DISP2J + | and LFUNC:CARG3, CARG3, #LJ_GCVMASK + | str PC, SAVE_PC + | ldr CARG3, LFUNC:CARG3->pc + | mov CARG2, PC + | str L, [GL, #GL_J(L)] + | ldrb CARG3w, [CARG3, #PC2PROTO(framesize)] + | str BASE, L->base + | add CARG3, BASE, CARG3, lsl #3 + | str CARG3, L->top + | bl extern lj_trace_hot // (jit_State *J, const BCIns *pc) + | b <3 + |.endif | |->vm_callhook: // Dispatch target for call hooks. | mov CARG2, PC @@ -1804,7 +1836,54 @@ static void build_subroutines(BuildCtx *ctx) | br CRET1 | |->cont_stitch: // Trace stitching. - | NYI + |.if JIT + | // RA = resultptr, CARG4 = meta base + | ldr RB, SAVE_MULTRES + | ldr INSw, [PC, #-4] + | ldr TRACE:CARG3, [CARG4, #-40] // Save previous trace. + | subs RB, RB, #8 + | decode_RA RC, INS // Call base. + | and CARG3, CARG3, #LJ_GCVMASK + | beq >2 + |1: // Move results down. + | ldr CARG1, [RA] + | add RA, RA, #8 + | subs RB, RB, #8 + | str CARG1, [BASE, RC, lsl #3] + | add RC, RC, #1 + | bne <1 + |2: + | decode_RA RA, INS + | decode_RB RB, INS + | add RA, RA, RB + |3: + | cmp RA, RC + | bhi >9 // More results wanted? + | + | ldrh RAw, TRACE:CARG3->traceno + | ldrh RCw, TRACE:CARG3->link + | cmp RCw, RAw + | beq ->cont_nop // Blacklisted. + | cmp RCw, #0 + | bne =>BC_JLOOP // Jump to stitched trace. + | + | // Stitch a new trace to the previous trace. + | mov CARG1, #GL_J(exitno) + | str RA, [GL, CARG1] + | mov CARG1, #GL_J(L) + | str L, [GL, CARG1] + | str BASE, L->base + | add CARG1, GL, #GG_G2J + | mov CARG2, PC + | bl extern lj_dispatch_stitch // (jit_State *J, const BCIns *pc) + | ldr BASE, L->base + | b ->cont_nop + | + |9: // Fill up results with nil. + | str TISNIL, [BASE, RC, lsl #3] + | add RC, RC, #1 + | b <3 + |.endif | |->vm_profhook: // Dispatch target for profiler hook. #if LJ_HASPROFILE @@ -1822,10 +1901,120 @@ static void build_subroutines(BuildCtx *ctx) |//-- Trace exit handler ------------------------------------------------- |//----------------------------------------------------------------------- | + |.macro savex_, a, b + | stp d..a, d..b, [sp, #a*8] + | stp x..a, x..b, [sp, #32*8+a*8] + |.endmacro + | |->vm_exit_handler: - | NYI + |.if JIT + | sub sp, sp, #(64*8) + | savex_, 0, 1 + | savex_, 2, 3 + | savex_, 4, 5 + | savex_, 6, 7 + | savex_, 8, 9 + | savex_, 10, 11 + | savex_, 12, 13 + | savex_, 14, 15 + | savex_, 16, 17 + | savex_, 18, 19 + | savex_, 20, 21 + | savex_, 22, 23 + | savex_, 24, 25 + | savex_, 26, 27 + | savex_, 28, 29 + | stp d30, d31, [sp, #30*8] + | ldr CARG1, [sp, #64*8] // Load original value of lr. + | add CARG3, sp, #64*8 // Recompute original value of sp. + | mv_vmstate CARG4, EXIT + | ldr CARG2w, [CARG1, #-4]! // Get exit instruction. + | stp CARG1, CARG3, [sp, #62*8] // Store exit pc/sp in RID_LR/RID_SP. + | lsl CARG2, CARG2, #38 + | add CARG1, CARG1, CARG2, asr #36 + | ldr CARG2w, [lr] // Load exit stub group offset. + | sub CARG1, CARG1, lr + | sub CARG1, CARG1, #4 + | ldr L, GL->cur_L + | add CARG1, CARG2, CARG1, lsr #2 // Compute exit number. + | ldr BASE, GL->jit_base + | st_vmstate CARG4 + | str CARG1w, [GL, #GL_J(exitno)] + | str BASE, L->base + | str L, [GL, #GL_J(L)] + | str xzr, GL->jit_base + | add CARG1, GL, #GG_G2J + | mov CARG2, sp + | bl extern lj_trace_exit // (jit_State *J, ExitState *ex) + | // Returns MULTRES (unscaled) or negated error code. + | ldr CARG2, L->cframe + | ldr BASE, L->base + | and sp, CARG2, #CFRAME_RAWMASK + | ldr PC, SAVE_PC // Get SAVE_PC. + | str L, SAVE_L // Set SAVE_L (on-trace resume/yield). + | b >1 + |.endif + | |->vm_exit_interp: - | NYI + | // CARG1 = MULTRES or negated error code, BASE, PC and GL set. + |.if JIT + | ldr L, SAVE_L + |1: + | cmp CARG1w, #0 + | blt >9 // Check for error from exit. + | lsl RC, CARG1, #3 + | ldr LFUNC:CARG2, [BASE, FRAME_FUNC] + | movz TISNUM, #(LJ_TISNUM>>1)&0xffff, lsl #48 + | movz TISNUMhi, #(LJ_TISNUM>>1)&0xffff, lsl #16 + | movn TISNIL, #0 + | and LFUNC:CARG2, CARG2, #LJ_GCVMASK + | str RC, SAVE_MULTRES + | str BASE, L->base + | ldr CARG2, LFUNC:CARG2->pc + | str xzr, GL->jit_base + | mv_vmstate CARG4, INTERP + | ldr KBASE, [CARG2, #PC2PROTO(k)] + | // Modified copy of ins_next which handles function header dispatch, too. + | ldrb RBw, [PC] + | ldr INSw, [PC], #4 + | st_vmstate CARG4 + | cmp RBw, #BC_FUNCC+2 // Fast function? + | add TMP1, GL, INS, uxtb #3 + | bhs >4 + |2: + | cmp RBw, #BC_FUNCF // Function header? + | add TMP0, GL, RB, uxtb #3 + | ldr RB, [TMP0, #GG_G2DISP] + | decode_RA RA, INS + | lsr TMP0, INS, #16 + | csel RC, TMP0, RC, lo + | blo >5 + | ldr CARG3, [BASE, FRAME_FUNC] + | sub RC, RC, #8 + | add RA, BASE, RA, lsl #3 // Yes: RA = BASE+framesize*8, RC = nargs*8 + | and LFUNC:CARG3, CARG3, #LJ_GCVMASK + |5: + | br RB + | + |4: // Check frame below fast function. + | ldr CARG1, [BASE, FRAME_PC] + | ands CARG2, CARG1, #FRAME_TYPE + | bne <2 // Trace stitching continuation? + | // Otherwise set KBASE for Lua function below fast function. + | ldr CARG3, [CARG1, #-4] + | decode_RA CARG1, CARG3 + | sub CARG2, BASE, CARG1, lsl #3 + | ldr LFUNC:CARG3, [CARG2, #-32] + | and LFUNC:CARG3, CARG3, #LJ_GCVMASK + | ldr CARG3, LFUNC:CARG3->pc + | ldr KBASE, [CARG3, #PC2PROTO(k)] + | b <2 + | + |9: // Rethrow error from the right C frame. + | neg CARG2, CARG1 + | mov CARG1, L + | bl extern lj_err_throw // (lua_State *L, int errcode) + |.endif | |//----------------------------------------------------------------------- |//-- Math helper functions ---------------------------------------------- @@ -3387,6 +3576,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) if (op == BC_FORI) { | csel PC, RC, PC, gt } else if (op == BC_JFORI) { + | mov PC, RC | ldrh RCw, [RC, #-2] } else if (op == BC_IFORL) { | csel PC, RC, PC, le @@ -3488,7 +3678,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_JLOOP: |.if JIT - | NYI + | // RA = base (ignored), RC = traceno + | ldr CARG1, [GL, #GL_J(trace)] + | mov CARG2, #0 // Traces on ARM64 don't store the trace #, so use 0. + | ldr TRACE:RC, [CARG1, RC, lsl #3] + | st_vmstate CARG2 + | ldr RA, TRACE:RC->mcode + | str BASE, GL->jit_base + | str L, GL->tmpbuf.L + | sub sp, sp, #16 // See SPS_FIXED. Avoids sp adjust in every root trace. + | br RA |.endif break; @@ -3546,10 +3745,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_IFUNCV: | // BASE = new base, RA = BASE+framesize*8, CARG3 = LFUNC, RC = nargs*8 | ldr CARG1, L->maxstack + | movn TMP0, #~LJ_TFUNC | add TMP2, BASE, RC + | add LFUNC:CARG3, CARG3, TMP0, lsl #47 | add RA, RA, RC | add TMP0, RC, #16+FRAME_VARG - | str LFUNC:CARG3, [TMP2], #8 // Store (untagged) copy of LFUNC. + | str LFUNC:CARG3, [TMP2], #8 // Store (tagged) copy of LFUNC. | ldr KBASE, [PC, #-4+PC2PROTO(k)] | cmp RA, CARG1 | str TMP0, [TMP2], #8 // Store delta + FRAME_VARG. @@ -3736,8 +3937,8 @@ static void emit_asm_debug(BuildCtx *ctx) "\t.uleb128 0x1\n" "\t.sleb128 -8\n" "\t.byte 30\n" /* Return address is in lr. */ - "\t.uleb128 1\n" /* augmentation length */ - "\t.byte 0x1b\n" /* pcrel|sdata4 */ + "\t.uleb128 1\n" /* augmentation length */ + "\t.byte 0x1b\n" /* pcrel|sdata4 */ "\t.byte 0xc\n\t.uleb128 31\n\t.uleb128 0\n" /* def_cfa sp */ "\t.align 3\n" ".LECIE2:\n\n"); @@ -3748,7 +3949,7 @@ static void emit_asm_debug(BuildCtx *ctx) "\t.long .LASFDE3-.Lframe2\n" "\t.long lj_vm_ffi_call-.\n" "\t.long %d\n" - "\t.uleb128 0\n" /* augmentation length */ + "\t.uleb128 0\n" /* augmentation length */ "\t.byte 0xe\n\t.uleb128 32\n" /* def_cfa_offset */ "\t.byte 0x9d\n\t.uleb128 4\n" /* offset fp */ "\t.byte 0x9e\n\t.uleb128 3\n" /* offset lr */