DynASM/x86: Add AVX and AVX2 opcodes.

Thanks to Peter Cawley.
This commit is contained in:
Mike Pall 2015-10-24 18:43:47 +02:00
parent 7e22082480
commit 8a13c9cebf
2 changed files with 338 additions and 71 deletions

View File

@ -391,7 +391,12 @@ int dasm_encode(Dst_DECL, void *buffer)
case DASM_IMM_D: wd: dasmd(n); break; case DASM_IMM_D: wd: dasmd(n); break;
case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL; case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL;
case DASM_IMM_W: dasmw(n); break; case DASM_IMM_W: dasmw(n); break;
case DASM_VREG: { int t = *p++; if (t >= 2) n<<=3; cp[-1] |= n; break; } case DASM_VREG: {
int t = *p++;
if (t >= 5) n <<= 4; else if (t >= 2) n <<= 3;
cp[-1] ^= n;
break;
}
case DASM_REL_LG: p++; if (n >= 0) goto rel_pc; case DASM_REL_LG: p++; if (n >= 0) goto rel_pc;
b++; n = (int)(ptrdiff_t)D->globals[-n]; b++; n = (int)(ptrdiff_t)D->globals[-n];
case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */ case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */

View File

@ -27,9 +27,9 @@ local assert, unpack, setmetatable = assert, unpack or table.unpack, setmetatabl
local _s = string local _s = string
local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub
local concat, sort = table.concat, table.sort local concat, sort, remove = table.concat, table.sort, table.remove
local bit = bit or require("bit") local bit = bit or require("bit")
local band, shl, shr = bit.band, bit.lshift, bit.rshift local band, bxor, shl, shr = bit.band, bit.bxor, bit.lshift, bit.rshift
-- Inherited tables and callbacks. -- Inherited tables and callbacks.
local g_opt, g_arch local g_opt, g_arch
@ -299,7 +299,7 @@ local function mkrmap(sz, cl, names)
local iname = format("@%s%x%s", sz, i, needrex and "R" or "") local iname = format("@%s%x%s", sz, i, needrex and "R" or "")
if needrex then map_reg_needrex[iname] = true end if needrex then map_reg_needrex[iname] = true end
local name local name
if sz == "o" then name = format("xmm%d", i) if sz == "o" or sz == "y" then name = format("%s%d", cl, i)
elseif sz == "f" then name = format("st%d", i) elseif sz == "f" then name = format("st%d", i)
else name = format("r%d%s", i, sz == addrsize and "" or sz) end else name = format("r%d%s", i, sz == addrsize and "" or sz) end
map_archdef[name] = iname map_archdef[name] = iname
@ -334,21 +334,24 @@ mkrmap("f", "Rf")
-- SSE registers (oword sized, but qword and dword accessible). -- SSE registers (oword sized, but qword and dword accessible).
mkrmap("o", "xmm") mkrmap("o", "xmm")
-- AVX registers (yword sized, but oword, qword and dword accessible).
mkrmap("y", "ymm")
-- Operand size prefixes to codes. -- Operand size prefixes to codes.
local map_opsize = { local map_opsize = {
byte = "b", word = "w", dword = "d", qword = "q", oword = "o", tword = "t", byte = "b", word = "w", dword = "d", qword = "q", oword = "o", yword = "y",
aword = addrsize, tword = "t", aword = addrsize,
} }
-- Operand size code to number. -- Operand size code to number.
local map_opsizenum = { local map_opsizenum = {
b = 1, w = 2, d = 4, q = 8, o = 16, t = 10, b = 1, w = 2, d = 4, q = 8, o = 16, y = 32, t = 10,
} }
-- Operand size code to name. -- Operand size code to name.
local map_opsizename = { local map_opsizename = {
b = "byte", w = "word", d = "dword", q = "qword", o = "oword", t = "tword", b = "byte", w = "word", d = "dword", q = "qword", o = "oword", y = "yword",
f = "fpword", t = "tword", f = "fpword",
} }
-- Valid index register scale factors. -- Valid index register scale factors.
@ -460,7 +463,29 @@ local function wputszarg(sz, n)
end end
-- Put multi-byte opcode with operand-size dependent modifications. -- Put multi-byte opcode with operand-size dependent modifications.
local function wputop(sz, op, rex) local function wputop(sz, op, rex, vex)
if vex then
local tail
if vex.m == 1 and band(rex, 11) == 0 then
wputb(0xc5)
tail = shl(bxor(band(rex, 4), 4), 5)
else
wputb(0xc4)
wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m)
tail = shl(band(rex, 8), 4)
end
local reg, vreg = 0, nil
if vex.v then
reg = vex.v.reg
if not reg then werror("bad vex operand") end
if reg < 0 then reg = 0; vreg = vex.v.vreg end
end
if sz == "y" or vex.l then tail = tail + 4 end
wputb(tail + shl(bxor(reg, 15), 3) + vex.p)
if vreg then waction("VREG", vreg); wputxb(4) end
rex = 0
if op >= 256 then werror("bad vex opcode") end
end
local r local r
if rex ~= 0 and not x64 then werror("bad operand size") end if rex ~= 0 and not x64 then werror("bad operand size") end
if sz == "w" then wputb(102) end if sz == "w" then wputb(102) end
@ -881,9 +906,15 @@ end
-- "m"/"M" generates ModRM/SIB from the 1st/2nd operand. -- "m"/"M" generates ModRM/SIB from the 1st/2nd operand.
-- The spare 3 bits are either filled with the last hex digit or -- The spare 3 bits are either filled with the last hex digit or
-- the result from a previous "r"/"R". The opcode is restored. -- the result from a previous "r"/"R". The opcode is restored.
-- "u" Use VEX encoding, vvvv unused.
-- "v"/"V" Use VEX encoding, vvvv from 1st/2nd operand (the operand is
-- removed from the list used by future characters).
-- "L" Force VEX.L
-- --
-- All of the following characters force a flush of the opcode: -- All of the following characters force a flush of the opcode:
-- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand. -- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand.
-- "s" stores a 4 bit immediate from the last register operand,
-- followed by 4 zero bits.
-- "S" stores a signed 8 bit immediate from the last operand. -- "S" stores a signed 8 bit immediate from the last operand.
-- "U" stores an unsigned 8 bit immediate from the last operand. -- "U" stores an unsigned 8 bit immediate from the last operand.
-- "W" stores an unsigned 16 bit immediate from the last operand. -- "W" stores an unsigned 16 bit immediate from the last operand.
@ -1225,46 +1256,14 @@ local map_op = {
movups_2 = "rmo:0F10rM|mro:0F11Rm", movups_2 = "rmo:0F10rM|mro:0F11Rm",
orpd_2 = "rmo:660F56rM", orpd_2 = "rmo:660F56rM",
orps_2 = "rmo:0F56rM", orps_2 = "rmo:0F56rM",
packssdw_2 = "rmo:660F6BrM",
packsswb_2 = "rmo:660F63rM",
packuswb_2 = "rmo:660F67rM",
paddb_2 = "rmo:660FFCrM",
paddd_2 = "rmo:660FFErM",
paddq_2 = "rmo:660FD4rM",
paddsb_2 = "rmo:660FECrM",
paddsw_2 = "rmo:660FEDrM",
paddusb_2 = "rmo:660FDCrM",
paddusw_2 = "rmo:660FDDrM",
paddw_2 = "rmo:660FFDrM",
pand_2 = "rmo:660FDBrM",
pandn_2 = "rmo:660FDFrM",
pause_0 = "F390", pause_0 = "F390",
pavgb_2 = "rmo:660FE0rM",
pavgw_2 = "rmo:660FE3rM",
pcmpeqb_2 = "rmo:660F74rM",
pcmpeqd_2 = "rmo:660F76rM",
pcmpeqw_2 = "rmo:660F75rM",
pcmpgtb_2 = "rmo:660F64rM",
pcmpgtd_2 = "rmo:660F66rM",
pcmpgtw_2 = "rmo:660F65rM",
pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only. pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only.
pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:", pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:",
pmaddwd_2 = "rmo:660FF5rM",
pmaxsw_2 = "rmo:660FEErM",
pmaxub_2 = "rmo:660FDErM",
pminsw_2 = "rmo:660FEArM",
pminub_2 = "rmo:660FDArM",
pmovmskb_2 = "rr/do:660FD7rM", pmovmskb_2 = "rr/do:660FD7rM",
pmulhuw_2 = "rmo:660FE4rM",
pmulhw_2 = "rmo:660FE5rM",
pmullw_2 = "rmo:660FD5rM",
pmuludq_2 = "rmo:660FF4rM",
por_2 = "rmo:660FEBrM",
prefetchnta_1 = "xb:n0F180m", prefetchnta_1 = "xb:n0F180m",
prefetcht0_1 = "xb:n0F181m", prefetcht0_1 = "xb:n0F181m",
prefetcht1_1 = "xb:n0F182m", prefetcht1_1 = "xb:n0F182m",
prefetcht2_1 = "xb:n0F183m", prefetcht2_1 = "xb:n0F183m",
psadbw_2 = "rmo:660FF6rM",
pshufd_3 = "rmio:660F70rMU", pshufd_3 = "rmio:660F70rMU",
pshufhw_3 = "rmio:F30F70rMU", pshufhw_3 = "rmio:F30F70rMU",
pshuflw_3 = "rmio:F20F70rMU", pshuflw_3 = "rmio:F20F70rMU",
@ -1278,23 +1277,6 @@ local map_op = {
psrldq_2 = "rio:660F733mU", psrldq_2 = "rio:660F733mU",
psrlq_2 = "rmo:660FD3rM|rio:660F732mU", psrlq_2 = "rmo:660FD3rM|rio:660F732mU",
psrlw_2 = "rmo:660FD1rM|rio:660F712mU", psrlw_2 = "rmo:660FD1rM|rio:660F712mU",
psubb_2 = "rmo:660FF8rM",
psubd_2 = "rmo:660FFArM",
psubq_2 = "rmo:660FFBrM",
psubsb_2 = "rmo:660FE8rM",
psubsw_2 = "rmo:660FE9rM",
psubusb_2 = "rmo:660FD8rM",
psubusw_2 = "rmo:660FD9rM",
psubw_2 = "rmo:660FF9rM",
punpckhbw_2 = "rmo:660F68rM",
punpckhdq_2 = "rmo:660F6ArM",
punpckhqdq_2 = "rmo:660F6DrM",
punpckhwd_2 = "rmo:660F69rM",
punpcklbw_2 = "rmo:660F60rM",
punpckldq_2 = "rmo:660F62rM",
punpcklqdq_2 = "rmo:660F6CrM",
punpcklwd_2 = "rmo:660F61rM",
pxor_2 = "rmo:660FEFrM",
rcpps_2 = "rmo:0F53rM", rcpps_2 = "rmo:0F53rM",
rcpss_2 = "rro:F30F53rM|rx/od:", rcpss_2 = "rro:F30F53rM|rx/od:",
rsqrtps_2 = "rmo:0F52rM", rsqrtps_2 = "rmo:0F52rM",
@ -1421,6 +1403,223 @@ local map_op = {
aesimc_2 = "rmo:660F38DBrM", aesimc_2 = "rmo:660F38DBrM",
aeskeygenassist_3 = "rmio:660F3ADFrMU", aeskeygenassist_3 = "rmio:660F3ADFrMU",
pclmulqdq_3 = "rmio:660F3A44rMU", pclmulqdq_3 = "rmio:660F3A44rMU",
-- AVX FP ops
vaddsubpd_3 = "rrmoy:660FVD0rM",
vaddsubps_3 = "rrmoy:F20FVD0rM",
vandpd_3 = "rrmoy:660FV54rM",
vandps_3 = "rrmoy:0FV54rM",
vandnpd_3 = "rrmoy:660FV55rM",
vandnps_3 = "rrmoy:0FV55rM",
vblendpd_4 = "rrmioy:660F3AV0DrMU",
vblendps_4 = "rrmioy:660F3AV0CrMU",
vblendvpd_4 = "rrmroy:660F3AV4BrMs",
vblendvps_4 = "rrmroy:660F3AV4ArMs",
vbroadcastf128_2 = "rx/yo:660F38u1ArM",
vcmppd_4 = "rrmioy:660FVC2rMU",
vcmpps_4 = "rrmioy:0FVC2rMU",
vcmpsd_4 = "rrrio:F20FVC2rMU|rrxi/ooq:",
vcmpss_4 = "rrrio:F30FVC2rMU|rrxi/ood:",
vcomisd_2 = "rro:660Fu2FrM|rx/oq:",
vcomiss_2 = "rro:0Fu2FrM|rx/od:",
vcvtdq2pd_2 = "rro:F30FuE6rM|rx/oq:|rm/yo:",
vcvtdq2ps_2 = "rmoy:0Fu5BrM",
vcvtpd2dq_2 = "rmoy:F20FuE6rM",
vcvtpd2ps_2 = "rmoy:660Fu5ArM",
vcvtps2dq_2 = "rmoy:660Fu5BrM",
vcvtps2pd_2 = "rro:0Fu5ArM|rx/oq:|rm/yo:",
vcvtsd2si_2 = "rr/do:F20Fu2DrM|rx/dq:|rr/qo:|rxq:",
vcvtsd2ss_3 = "rrro:F20FV5ArM|rrx/ooq:",
vcvtsi2sd_3 = "rrm/ood:F20FV2ArM|rrm/ooq:F20FVX2ArM",
vcvtsi2ss_3 = "rrm/ood:F30FV2ArM|rrm/ooq:F30FVX2ArM",
vcvtss2sd_3 = "rrro:F30FV5ArM|rrx/ood:",
vcvtss2si_2 = "rr/do:F30Fu2DrM|rxd:|rr/qo:|rx/qd:",
vcvttpd2dq_2 = "rmo:660FuE6rM|rm/oy:660FuLE6rM",
vcvttps2dq_2 = "rmoy:F30Fu5BrM",
vcvttsd2si_2 = "rr/do:F20Fu2CrM|rx/dq:|rr/qo:|rxq:",
vcvttss2si_2 = "rr/do:F30Fu2CrM|rxd:|rr/qo:|rx/qd:",
vdppd_4 = "rrmio:660F3AV41rMU",
vdpps_4 = "rrmioy:660F3AV40rMU",
vextractf128_3 = "mri/oy:660F3AuL19RmU",
vextractps_3 = "mri/do:660F3Au17RmU",
vhaddpd_3 = "rrmoy:660FV7CrM",
vhaddps_3 = "rrmoy:F20FV7CrM",
vhsubpd_3 = "rrmoy:660FV7DrM",
vhsubps_3 = "rrmoy:F20FV7DrM",
vinsertf128_4 = "rrmi/yyo:660F3AV18rMU",
vinsertps_4 = "rrrio:660F3AV21rMU|rrxi/ood:",
vldmxcsr_1 = "xd:0FuAE2m",
vmaskmovps_3 = "rrxoy:660F38V2CrM|xrroy:660F38V2ERm",
vmaskmovpd_3 = "rrxoy:660F38V2DrM|xrroy:660F38V2FRm",
vmovapd_2 = "rmoy:660Fu28rM|mroy:660Fu29Rm",
vmovaps_2 = "rmoy:0Fu28rM|mroy:0Fu29Rm",
vmovd_2 = "rm/od:660Fu6ErM|rm/oq:660FuX6ErM|mr/do:660Fu7ERm|mr/qo:",
vmovq_2 = "rro:F30Fu7ErM|rx/oq:|xr/qo:660FuD6Rm",
vmovddup_2 = "rmy:F20Fu12rM|rro:|rx/oq:",
vmovhlps_3 = "rrro:0FV12rM",
vmovhpd_2 = "xr/qo:660Fu17Rm",
vmovhpd_3 = "rrx/ooq:660FV16rM",
vmovhps_2 = "xr/qo:0Fu17Rm",
vmovhps_3 = "rrx/ooq:0FV16rM",
vmovlhps_3 = "rrro:0FV16rM",
vmovlpd_2 = "xr/qo:660Fu13Rm",
vmovlpd_3 = "rrx/ooq:660FV12rM",
vmovlps_2 = "xr/qo:0Fu13Rm",
vmovlps_3 = "rrx/ooq:0FV12rM",
vmovmskpd_2 = "rr/do:660Fu50rM|rr/dy:660FuL50rM",
vmovmskps_2 = "rr/do:0Fu50rM|rr/dy:0FuL50rM",
vmovntpd_2 = "xroy:660Fu2BRm",
vmovntps_2 = "xroy:0Fu2BRm",
vmovsd_2 = "rx/oq:F20Fu10rM|xr/qo:F20Fu11Rm",
vmovsd_3 = "rrro:F20FV10rM",
vmovshdup_2 = "rmoy:F30Fu16rM",
vmovsldup_2 = "rmoy:F30Fu12rM",
vmovss_2 = "rx/od:F30Fu10rM|xr/do:F30Fu11Rm",
vmovss_3 = "rrro:F30FV10rM",
vmovupd_2 = "rmoy:660Fu10rM|mroy:660Fu11Rm",
vmovups_2 = "rmoy:0Fu10rM|mroy:0Fu11Rm",
vorpd_3 = "rrmoy:660FV56rM",
vorps_3 = "rrmoy:0FV56rM",
vpermilpd_3 = "rrmoy:660F38V0DrM|rmioy:660F3Au05rMU",
vpermilps_3 = "rrmoy:660F38V0CrM|rmioy:660F3Au04rMU",
vperm2f128_4 = "rrmiy:660F3AV06rMU",
vptestpd_2 = "rmoy:660F38u0FrM",
vptestps_2 = "rmoy:660F38u0ErM",
vrcpps_2 = "rmoy:0Fu53rM",
vrcpss_3 = "rrro:F30FV53rM|rrx/ood:",
vrsqrtps_2 = "rmoy:0Fu52rM",
vrsqrtss_3 = "rrro:F30FV52rM|rrx/ood:",
vroundpd_3 = "rmioy:660F3AV09rMU",
vroundps_3 = "rmioy:660F3AV08rMU",
vroundsd_4 = "rrrio:660F3AV0BrMU|rrxi/ooq:",
vroundss_4 = "rrrio:660F3AV0ArMU|rrxi/ood:",
vshufpd_4 = "rrmioy:660FVC6rMU",
vshufps_4 = "rrmioy:0FVC6rMU",
vsqrtps_2 = "rmoy:0Fu51rM",
vsqrtss_2 = "rro:F30Fu51rM|rx/od:",
vsqrtpd_2 = "rmoy:660Fu51rM",
vsqrtsd_2 = "rro:F20Fu51rM|rx/oq:",
vstmxcsr_1 = "xd:0FuAE3m",
vucomisd_2 = "rro:660Fu2ErM|rx/oq:",
vucomiss_2 = "rro:0Fu2ErM|rx/od:",
vunpckhpd_3 = "rrmoy:660FV15rM",
vunpckhps_3 = "rrmoy:0FV15rM",
vunpcklpd_3 = "rrmoy:660FV14rM",
vunpcklps_3 = "rrmoy:0FV14rM",
vxorpd_3 = "rrmoy:660FV57rM",
vxorps_3 = "rrmoy:0FV57rM",
vzeroall_0 = "0FuL77",
vzeroupper_0 = "0Fu77",
-- AVX2 FP ops
vbroadcastss_2 = "rx/od:660F38u18rM|rx/yd:|rro:|rr/yo:",
vbroadcastsd_2 = "rx/yq:660F38u19rM|rr/yo:",
-- *vgather* (!vsib)
vpermpd_3 = "rmiy:660F3AuX01rMU",
vpermps_3 = "rrmy:660F38V16rM",
-- AVX, AVX2 integer ops
-- In general, xmm requires AVX, ymm requires AVX2.
vlddqu_2 = "rxoy:F20FuF0rM",
vmaskmovdqu_2 = "rro:660FuF7rM",
vmovdqa_2 = "rmoy:660Fu6FrM|mroy:660Fu7FRm",
vmovdqu_2 = "rmoy:F30Fu6FrM|mroy:F30Fu7FRm",
vmovntdq_2 = "xroy:660FuE7Rm",
vmovntdqa_2 = "rxoy:660F38u2ArM",
vmpsadbw_4 = "rrmioy:660F3AV42rMU",
vpabsb_2 = "rmoy:660F38u1CrM",
vpabsd_2 = "rmoy:660F38u1ErM",
vpabsw_2 = "rmoy:660F38u1DrM",
vpackusdw_3 = "rrmoy:660F38V2BrM",
vpalignr_4 = "rrmioy:660F3AV0FrMU",
vpblendvb_4 = "rrmroy:660F3AV4CrMs",
vpblendw_4 = "rrmioy:660F3AV0ErMU",
vpclmulqdq_4 = "rrmio:660F3AV44rMU",
vpcmpeqq_3 = "rrmoy:660F38V29rM",
vpcmpestri_3 = "rmio:660F3Au61rMU",
vpcmpestrm_3 = "rmio:660F3Au60rMU",
vpcmpgtq_3 = "rrmoy:660F38V37rM",
vpcmpistri_3 = "rmio:660F3Au63rMU",
vpcmpistrm_3 = "rmio:660F3Au62rMU",
vpextrb_3 = "rri/do:660F3Au14nRmU|rri/qo:|xri/bo:",
vpextrw_3 = "rri/do:660FuC5rMU|xri/wo:660F3Au15nRmU",
vpextrd_3 = "mri/do:660F3Au16RmU",
vpextrq_3 = "mri/qo:660F3Au16RmU",
vphaddw_3 = "rrmoy:660F38V01rM",
vphaddd_3 = "rrmoy:660F38V02rM",
vphaddsw_3 = "rrmoy:660F38V03rM",
vphminposuw_2 = "rmo:660F38u41rM",
vphsubw_3 = "rrmoy:660F38V05rM",
vphsubd_3 = "rrmoy:660F38V06rM",
vphsubsw_3 = "rrmoy:660F38V07rM",
vpinsrb_4 = "rrri/ood:660F3AV20rMU|rrxi/oob:",
vpinsrw_4 = "rrri/ood:660FVC4rMU|rrxi/oow:",
vpinsrd_4 = "rrmi/ood:660F3AV22rMU",
vpinsrq_4 = "rrmi/ooq:660F3AVX22rMU",
vpmaddubsw_3 = "rrmoy:660F38V04rM",
vpmaxsb_3 = "rrmoy:660F38V3CrM",
vpmaxsd_3 = "rrmoy:660F38V3DrM",
vpmaxuw_3 = "rrmoy:660F38V3ErM",
vpmaxud_3 = "rrmoy:660F38V3FrM",
vpminsb_3 = "rrmoy:660F38V38rM",
vpminsd_3 = "rrmoy:660F38V39rM",
vpminuw_3 = "rrmoy:660F38V3ArM",
vpminud_3 = "rrmoy:660F38V3BrM",
vpmovmskb_2 = "rr/do:660FuD7rM|rr/dy:660FuLD7rM",
vpmovsxbw_2 = "rroy:660F38u20rM|rx/oq:|rx/yo:",
vpmovsxbd_2 = "rroy:660F38u21rM|rx/od:|rx/yq:",
vpmovsxbq_2 = "rroy:660F38u22rM|rx/ow:|rx/yd:",
vpmovsxwd_2 = "rroy:660F38u23rM|rx/oq:|rx/yo:",
vpmovsxwq_2 = "rroy:660F38u24rM|rx/od:|rx/yq:",
vpmovsxdq_2 = "rroy:660F38u25rM|rx/oq:|rx/yo:",
vpmovzxbw_2 = "rroy:660F38u30rM|rx/oq:|rx/yo:",
vpmovzxbd_2 = "rroy:660F38u31rM|rx/od:|rx/yq:",
vpmovzxbq_2 = "rroy:660F38u32rM|rx/ow:|rx/yd:",
vpmovzxwd_2 = "rroy:660F38u33rM|rx/oq:|rx/yo:",
vpmovzxwq_2 = "rroy:660F38u34rM|rx/od:|rx/yq:",
vpmovzxdq_2 = "rroy:660F38u35rM|rx/oq:|rx/yo:",
vpmuldq_3 = "rrmoy:660F38V28rM",
vpmulhrsw_3 = "rrmoy:660F38V0BrM",
vpmulld_3 = "rrmoy:660F38V40rM",
vpshufb_3 = "rrmoy:660F38V00rM",
vpshufd_3 = "rmioy:660Fu70rMU",
vpshufhw_3 = "rmioy:F30Fu70rMU",
vpshuflw_3 = "rmioy:F20Fu70rMU",
vpsignb_3 = "rrmoy:660F38V08rM",
vpsignw_3 = "rrmoy:660F38V09rM",
vpsignd_3 = "rrmoy:660F38V0ArM",
vpslldq_3 = "rrioy:660Fv737mU",
vpsllw_3 = "rrmoy:660FVF1rM|rrioy:660Fv716mU",
vpslld_3 = "rrmoy:660FVF2rM|rrioy:660Fv726mU",
vpsllq_3 = "rrmoy:660FVF3rM|rrioy:660Fv736mU",
vpsraw_3 = "rrmoy:660FVE1rM|rrioy:660Fv714mU",
vpsrad_3 = "rrmoy:660FVE2rM|rrioy:660Fv724mU",
vpsrldq_3 = "rrioy:660Fv733mU",
vpsrlw_3 = "rrmoy:660FVD1rM|rrioy:660Fv712mU",
vpsrld_3 = "rrmoy:660FVD2rM|rrioy:660Fv722mU",
vpsrlq_3 = "rrmoy:660FVD3rM|rrioy:660Fv732mU",
vptest_2 = "rmoy:660F38u17rM",
-- AVX2 integer ops
vbroadcasti128_2 = "rx/yo:660F38u5ArM",
vinserti128_4 = "rrmi/yyo:660F3AV38rMU",
vextracti128_3 = "mri/oy:660F3AuL39RmU",
vpblendd_4 = "rrmioy:660F3AV02rMU",
vpbroadcastb_2 = "rro:660F38u78rM|rx/ob:|rr/yo:|rx/yb:",
vpbroadcastw_2 = "rro:660F38u79rM|rx/ow:|rr/yo:|rx/yw:",
vpbroadcastd_2 = "rro:660F38u58rM|rx/od:|rr/yo:|rx/yd:",
vpbroadcastq_2 = "rro:660F38u59rM|rx/oq:|rr/yo:|rx/yq:",
vpermd_3 = "rrmy:660F38V36rM",
vpermq_3 = "rmiy:660F3AuX00rMU",
-- *vpgather* (!vsib)
vperm2i128_4 = "rrmiy:660F3AV46rMU",
vpmaskmovd_3 = "rrxoy:660F38V8CrM|xrroy:660F38V8ERm",
vpmaskmovq_3 = "rrxoy:660F38VX8CrM|xrroy:660F38VX8ERm",
vpsllvd_3 = "rrmoy:660F38V47rM",
vpsllvq_3 = "rrmoy:660F38VX47rM",
vpsravd_3 = "rrmoy:660F38V46rM",
vpsrlvd_3 = "rrmoy:660F38V45rM",
vpsrlvq_3 = "rrmoy:660F38VX45rM",
} }
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
@ -1471,28 +1670,58 @@ for cc,n in pairs{ b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 } do
map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+ map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+
end end
-- SSE FP arithmetic ops. -- SSE / AVX FP arithmetic ops.
for name,n in pairs{ sqrt = 1, add = 8, mul = 9, for name,n in pairs{ sqrt = 1, add = 8, mul = 9,
sub = 12, min = 13, div = 14, max = 15 } do sub = 12, min = 13, div = 14, max = 15 } do
map_op[name.."ps_2"] = format("rmo:0F5%XrM", n) map_op[name.."ps_2"] = format("rmo:0F5%XrM", n)
map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n) map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n)
map_op[name.."pd_2"] = format("rmo:660F5%XrM", n) map_op[name.."pd_2"] = format("rmo:660F5%XrM", n)
map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n) map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n)
if n ~= 1 then
map_op["v"..name.."ps_3"] = format("rrmoy:0FV5%XrM", n)
map_op["v"..name.."ss_3"] = format("rrro:F30FV5%XrM|rrx/ood:", n)
map_op["v"..name.."pd_3"] = format("rrmoy:660FV5%XrM", n)
map_op["v"..name.."sd_3"] = format("rrro:F20FV5%XrM|rrx/ooq:", n)
end
end
-- SSE2 / AVX / AVX2 integer arithmetic ops (66 0F leaf).
for name,n in pairs{
paddb = 0xFC, paddw = 0xFD, paddd = 0xFE, paddq = 0xD4,
paddsb = 0xEC, paddsw = 0xED, packssdw = 0x6B,
packsswb = 0x63, packuswb = 0x67, paddusb = 0xDC,
paddusw = 0xDD, pand = 0xDB, pandn = 0xDF, pavgb = 0xE0,
pavgw = 0xE3, pcmpeqb = 0x74, pcmpeqd = 0x76,
pcmpeqw = 0x75, pcmpgtb = 0x64, pcmpgtd = 0x66,
pcmpgtw = 0x65, pmaddwd = 0xF5, pmaxsw = 0xEE,
pmaxub = 0xDE, pminsw = 0xEA, pminub = 0xDA,
pmulhuw = 0xE4, pmulhw = 0xE5, pmullw = 0xD5,
pmuludq = 0xF4, por = 0xEB, psadbw = 0xF6, psubb = 0xF8,
psubw = 0xF9, psubd = 0xFA, psubq = 0xFB, psubsb = 0xE8,
psubsw = 0xE9, psubusb = 0xD8, psubusw = 0xD9,
punpckhbw = 0x68, punpckhwd = 0x69, punpckhdq = 0x6A,
punpckhqdq = 0x6D, punpcklbw = 0x60, punpcklwd = 0x61,
punpckldq = 0x62, punpcklqdq = 0x6C, pxor = 0xEF
} do
map_op[name.."_2"] = format("rmo:660F%02XrM", n)
map_op["v"..name.."_3"] = format("rrmoy:660FV%02XrM", n)
end end
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
local map_vexarg = { u = false, v = 1, V = 2 }
-- Process pattern string. -- Process pattern string.
local function dopattern(pat, args, sz, op, needrex) local function dopattern(pat, args, sz, op, needrex)
local digit, addin local digit, addin, vex
local opcode = 0 local opcode = 0
local szov = sz local szov = sz
local narg = 1 local narg = 1
local rex = 0 local rex = 0
-- Limit number of section buffer positions used by a single dasm_put(). -- Limit number of section buffer positions used by a single dasm_put().
-- A single opcode needs a maximum of 5 positions. -- A single opcode needs a maximum of 6 positions.
if secpos+5 > maxsecpos then wflush() end if secpos+6 > maxsecpos then wflush() end
-- Process each character. -- Process each character.
for c in gmatch(pat.."|", ".") do for c in gmatch(pat.."|", ".") do
@ -1506,6 +1735,8 @@ local function dopattern(pat, args, sz, op, needrex)
szov = nil szov = nil
elseif c == "X" then -- Force REX.W. elseif c == "X" then -- Force REX.W.
rex = 8 rex = 8
elseif c == "L" then -- Force VEX.L.
vex.l = true
elseif c == "r" then -- Merge 1st operand regno. into opcode. elseif c == "r" then -- Merge 1st operand regno. into opcode.
addin = args[1]; opcode = opcode + (addin.reg % 8) addin = args[1]; opcode = opcode + (addin.reg % 8)
if narg < 2 then narg = 2 end if narg < 2 then narg = 2 end
@ -1529,21 +1760,41 @@ local function dopattern(pat, args, sz, op, needrex)
if t.xreg and t.xreg > 7 then rex = rex + 2 end if t.xreg and t.xreg > 7 then rex = rex + 2 end
if s > 7 then rex = rex + 4 end if s > 7 then rex = rex + 4 end
if needrex then rex = rex + 16 end if needrex then rex = rex + 16 end
wputop(szov, opcode, rex); opcode = nil wputop(szov, opcode, rex, vex); opcode = nil
local imark = sub(pat, -1) -- Force a mark (ugly). local imark = sub(pat, -1) -- Force a mark (ugly).
-- Put ModRM/SIB with regno/last digit as spare. -- Put ModRM/SIB with regno/last digit as spare.
wputmrmsib(t, imark, s, addin and addin.vreg) wputmrmsib(t, imark, s, addin and addin.vreg)
addin = nil addin = nil
elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix
local b = band(opcode, 255); opcode = shr(opcode, 8)
local m = 1
if b == 0x38 then m = 2
elseif b == 0x3a then m = 3 end
if m ~= 1 then b = band(opcode, 255); opcode = shr(opcode, 8) end
if b ~= 0x0f then
werror("expected `0F', `0F38', or `0F3A' to precede `"..c..
"' in pattern `"..pat.."' for `"..op.."'")
end
local v = map_vexarg[c]
if v then v = remove(args, v) end
b = band(opcode, 255)
local p = 0
if b == 0x66 then p = 1
elseif b == 0xf3 then p = 2
elseif b == 0xf2 then p = 3 end
if p ~= 0 then opcode = shr(opcode, 8) end
if opcode ~= 0 then wputop(nil, opcode, 0); opcode = 0 end
vex = { m = m, p = p, v = v }
else else
if opcode then -- Flush opcode. if opcode then -- Flush opcode.
if szov == "q" and rex == 0 then rex = rex + 8 end if szov == "q" and rex == 0 then rex = rex + 8 end
if needrex then rex = rex + 16 end if needrex then rex = rex + 16 end
if addin and addin.reg == -1 then if addin and addin.reg == -1 then
wputop(szov, opcode - 7, rex) wputop(szov, opcode - 7, rex, vex)
waction("VREG", addin.vreg); wputxb(0) waction("VREG", addin.vreg); wputxb(0)
else else
if addin and addin.reg > 7 then rex = rex + 1 end if addin and addin.reg > 7 then rex = rex + 1 end
wputop(szov, opcode, rex) wputop(szov, opcode, rex, vex)
end end
opcode = nil opcode = nil
end end
@ -1580,6 +1831,14 @@ local function dopattern(pat, args, sz, op, needrex)
else else
wputlabel("REL_", imm, 2) wputlabel("REL_", imm, 2)
end end
elseif c == "s" then
local reg = a.reg
if reg < 0 then
wputb(0)
waction("VREG", a.vreg); wputxb(5)
else
wputb(shl(reg, 4))
end
else else
werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'") werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'")
end end
@ -1656,11 +1915,14 @@ map_op[".template__"] = function(params, template, nparams)
if pat == "" then pat = lastpat else lastpat = pat end if pat == "" then pat = lastpat else lastpat = pat end
if matchtm(tm, args) then if matchtm(tm, args) then
local prefix = sub(szm, 1, 1) local prefix = sub(szm, 1, 1)
if prefix == "/" then -- Match both operand sizes. if prefix == "/" then -- Exactly match leading operand sizes.
if args[1].opsize == sub(szm, 2, 2) and for i = #szm, 1, -1 do
args[2].opsize == sub(szm, 3, 3) then if i == 1 then
dopattern(pat, args, sz, params.op, needrex) -- Process pattern. dopattern(pat, args, sz, params.op, needrex) -- Process pattern.
return return
elseif args[i-1].opsize ~= sub(szm, i, i) then
break
end
end end
else -- Match common operand size. else -- Match common operand size.
local szp = sz local szp = sz