From 33171e818077838ed673e927ea593f3dc72efde8 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Thu, 10 Dec 2009 05:59:42 +0100 Subject: [PATCH] Add 64 bit instruction templates to DynASM x64 and fix REX encoding. --- dynasm/dasm_x86.lua | 113 ++++++++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 47 deletions(-) diff --git a/dynasm/dasm_x86.lua b/dynasm/dasm_x86.lua index f508540e..e43e4675 100644 --- a/dynasm/dasm_x86.lua +++ b/dynasm/dasm_x86.lua @@ -447,7 +447,7 @@ end -- Put operand-size dependent number or arg (defaults to dword). local function wputszarg(sz, n) - if not sz or sz == "d" then wputdarg(n) + if not sz or sz == "d" or sz == "q" then wputdarg(n) elseif sz == "w" then wputwarg(n) elseif sz == "b" then wputbarg(n) elseif sz == "s" then wputsbarg(n) @@ -457,11 +457,20 @@ end -- Put multi-byte opcode with operand-size dependent modifications. local function wputop(sz, op, rex) local r + if rex ~= 0 and not x64 then werror("bad operand size") end if sz == "w" then wputb(102) end -- Needs >32 bit numbers, but only for crc32 eax, word [ebx] if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end if op >= 16777216 then r = op % 16777216 wputb((op-r) / 16777216) op = r end - if op >= 65536 then r = op % 65536 wputb((op-r) / 65536) op = r end + if op >= 65536 then + if rex ~= 0 then + local opc3 = op - op % 256 + if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then + wputb(64 + rex % 16); rex = 0 + end + end + r = op % 65536 wputb((op-r) / 65536) op = r + end if op >= 256 then r = op % 256 local b = (op-r) / 256 @@ -844,6 +853,7 @@ end -- Hex chars are accumulated to form the opcode (left to right). -- "n" disables the standard opcode mods -- (otherwise: -1 for "b", o16 prefix for "w", rex.w for "q") +-- "W" Force REX.W. -- "r"/"R" adds the reg. number from the 1st/2nd operand to the opcode. -- "m"/"M" generates ModRM/SIB from the 1st/2nd operand. 
-- The spare 3 bits are either filled with the last hex digit or @@ -888,18 +898,20 @@ local map_op = { -- 38-3D: cmp... ds_0 = "3E", -- 3F: *aas - inc_1 = "rdw:40r|m:FF0m", - dec_1 = "rdw:48r|m:FF1m", - push_1 = "rdw:50r|mdw:FF6m|S.:6AS|ib:n6Ai|i.:68i", - pop_1 = "rdw:58r|mdw:8F0m", + inc_1 = x64 and "m:FF0m" or "rdw:40r|m:FF0m", + dec_1 = x64 and "m:FF1m" or "rdw:48r|m:FF1m", + push_1 = (x64 and "rqw:50r|mqw:FF6m" or "rdw:50r|mdw:FF6m").."|S.:6AS|ib:n6Ai|i.:68i", + pop_1 = x64 and "rqw:58r|mqw:8F0m" or "rdw:58r|mdw:8F0m", -- 60: *pusha, *pushad, *pushaw -- 61: *popa, *popad, *popaw -- 62: *bound rdw,x - -- 63: *arpl mw,rw + -- 63: x86: *arpl mw,rw + movsxd_2 = x64 and "rm/qd:63rM", fs_0 = "64", gs_0 = "65", o16_0 = "66", - a16_0 = "67", + a16_0 = not x64 and "67" or nil, + a32_0 = x64 and "67", -- 68: push idw -- 69: imul rdw,mdw,idw -- 6A: push ib @@ -925,11 +937,13 @@ local map_op = { -- 8E: *mov seg,mdw -- 8F: pop mdw nop_0 = "90", - xchg_2 = "Rrdw:90R|rRdw:90r|rm:87rM|mr:87Rm", + xchg_2 = "Rrqdw:90R|rRqdw:90r|rm:87rM|mr:87Rm", cbw_0 = "6698", cwde_0 = "98", + cdqe_0 = "4898", cwd_0 = "6699", cdq_0 = "99", + cqo_0 = "4899", -- 9A: *call iw:idw wait_0 = "9B", fwait_0 = "9B", @@ -941,6 +955,7 @@ local map_op = { popfd_0 = "9D", sahf_0 = "9E", lahf_0 = "9F", + -- !x64: mov with 64 bit immediate mov_2 = "OR:A3o|RO:A1O|mr:89Rm|rm:8BrM|rib:nB0ri|ridw:B8ri|mi:C70mi", movsb_0 = "A4", movsw_0 = "66A5", @@ -994,8 +1009,8 @@ local map_op = { -- E5: *in Rdw,ib -- E6: *out ib,Rb -- E7: *out ib,Rdw - call_1 = "md:FF2m|J.:E8J", - jmp_1 = "md:FF4m|J.:E9J", -- short: EB + call_1 = x64 and "mq:FF2m|J.:E8J" or "md:FF2m|J.:E8J", + jmp_1 = x64 and "mq:FF4m|J.:E9J" or "md:FF4m|J.:E9J", -- short: EB -- EA: *jmp iw:idw -- EB: jmp ib -- EC: *in Rb,dx @@ -1029,19 +1044,19 @@ local map_op = { div_1 = "m:F76m", idiv_1 = "m:F77m", - imul_2 = "rmdw:0FAFrM|rIdw:69rmI|rSdw:6BrmS|ridw:69rmi", - imul_3 = "rmIdw:69rMI|rmSdw:6BrMS|rmidw:69rMi", + imul_2 = 
"rmqdw:0FAFrM|rIqdw:69rmI|rSqdw:6BrmS|riqdw:69rmi", + imul_3 = "rmIqdw:69rMI|rmSqdw:6BrMS|rmiqdw:69rMi", - movzx_2 = "rm/db:0FB6rM|rm/wb:0FB6rM|rm/dw:0FB7rM", - movsx_2 = "rm/db:0FBErM|rm/wb:0FBErM|rm/dw:0FBFrM", + movzx_2 = "rm/db:0FB6rM|rm/qb:|rm/wb:0FB6rM|rm/dw:0FB7rM|rm/qw:", + movsx_2 = "rm/db:0FBErM|rm/qb:|rm/wb:0FBErM|rm/dw:0FBFrM|rm/qw:", - bswap_1 = "rd:0FC8r", - bsf_2 = "rmdw:0FBCrM", - bsr_2 = "rmdw:0FBDrM", - bt_2 = "mrdw:0FA3Rm|midw:0FBA4mU", - btc_2 = "mrdw:0FBBRm|midw:0FBA7mU", - btr_2 = "mrdw:0FB3Rm|midw:0FBA6mU", - bts_2 = "mrdw:0FABRm|midw:0FBA5mU", + bswap_1 = "rqd:0FC8r", + bsf_2 = "rmqdw:0FBCrM", + bsr_2 = "rmqdw:0FBDrM", + bt_2 = "mrqdw:0FA3Rm|miqdw:0FBA4mU", + btc_2 = "mrqdw:0FBBRm|miqdw:0FBA7mU", + btr_2 = "mrqdw:0FB3Rm|miqdw:0FBA6mU", + bts_2 = "mrqdw:0FABRm|miqdw:0FBA5mU", rdtsc_0 = "0F31", -- P1+ cpuid_0 = "0FA2", -- P1+ @@ -1141,23 +1156,23 @@ local map_op = { cvtpi2ps_2 = "rx/oq:0F2ArM", cvtps2dq_2 = "rmo:660F5BrM", cvtps2pd_2 = "rro:0F5ArM|rx/oq:", - cvtsd2si_2 = "rr/do:F20F2DrM|rx/dq:", + cvtsd2si_2 = "rr/do:F20F2DrM|rr/qo:|rx/dq:|rxq:", cvtsd2ss_2 = "rro:F20F5ArM|rx/oq:", - cvtsi2sd_2 = "rm/od:F20F2ArM", - cvtsi2ss_2 = "rm/od:F30F2ArM", + cvtsi2sd_2 = "rm/od:F20F2ArM|rm/oq:F20F2ArWM", + cvtsi2ss_2 = "rm/od:F30F2ArM|rm/oq:F30F2ArWM", cvtss2sd_2 = "rro:F30F5ArM|rx/od:", - cvtss2si_2 = "rr/do:F20F2CrM|rx/dd:", + cvtss2si_2 = "rr/do:F30F2DrM|rr/qo:|rxd:|rx/qd:", cvttpd2dq_2 = "rmo:660FE6rM", cvttps2dq_2 = "rmo:F30F5BrM", - cvttsd2si_2 = "rr/do:F20F2CrM|rx/dq:", - cvttss2si_2 = "rr/do:F30F2CrM|rx/dd:", + cvttsd2si_2 = "rr/do:F20F2CrM|rr/qo:|rx/dq:|rxq:", + cvttss2si_2 = "rr/do:F30F2CrM|rr/qo:|rxd:|rx/qd:", ldmxcsr_1 = "xd:0FAE2m", lfence_0 = "0FAEE8", maskmovdqu_2 = "rro:660FF7rM", mfence_0 = "0FAEF0", movapd_2 = "rmo:660F28rM|mro:660F29Rm", movaps_2 = "rmo:0F28rM|mro:0F29Rm", - movd_2 = "rm/od:660F6ErM|mr/do:660F7ERm", + movd_2 = "rm/od:660F6ErM|rm/oq:660F6EWrM|mr/do:660F7ERm|mr/qo:", movdqa_2 = "rmo:660F6FrM|mro:660F7FRm", movdqu_2 = 
"rmo:F30F6FrM|mro:F30F7FRm", movhlps_2 = "rro:0F12rM", @@ -1169,7 +1184,7 @@ local map_op = { movmskpd_2 = "rr/do:660F50rM", movmskps_2 = "rr/do:0F50rM", movntdq_2 = "xro:660FE7Rm", - movnti_2 = "xrd:0FC3Rm", + movnti_2 = "xrqd:0FC3Rm", movntpd_2 = "xro:660F2BRm", movntps_2 = "xro:0F2BRm", movq_2 = "rro:F30F7ErM|rx/oq:|xr/qo:n660FD6Rm", @@ -1304,7 +1319,7 @@ local map_op = { blendvps_3 = "rmRo:660F3814rM", dppd_3 = "rmio:660F3A41rMU", dpps_3 = "rmio:660F3A40rMU", - extractps_3 = "mri/do:660F3A17RmU", + extractps_3 = "mri/do:660F3A17RmU|rri/qo:660F3A17RWmU", insertps_3 = "rrio:660F3A41rMU|rxi/od:", movntdqa_2 = "rmo:660F382ArM", mpsadbw_3 = "rmio:660F3A42rMU", @@ -1312,14 +1327,14 @@ local map_op = { pblendvb_3 = "rmRo:660F3810rM", pblendw_3 = "rmio:660F3A0ErMU", pcmpeqq_2 = "rmo:660F3829rM", - pextrb_3 = "rri/do:660F3A14nRmU|xri/bo:", + pextrb_3 = "rri/do:660F3A14nRmU|rri/qo:|xri/bo:", pextrd_3 = "mri/do:660F3A16RmU", - -- x64: pextrq + pextrq_3 = "mri/qo:660F3A16RmU", -- pextrw is SSE2, mem operand is SSE4.1 only phminposuw_2 = "rmo:660F3841rM", - pinsrb_3 = "rri/od:660F3A20nrMU|rxi/ob:", - pinsrd_3 = "rmi/od:660F3A22rMU", - -- x64: pinsrq + pinsrb_3 = "rri/od:660F3A20nrMU|rxi/ob:", + pinsrd_3 = "rmi/od:660F3A22rMU", + pinsrq_3 = "rmi/oq:660F3A22rWMU", pmaxsb_2 = "rmo:660F383CrM", pmaxsd_2 = "rmo:660F383DrM", pmaxud_2 = "rmo:660F383FrM", @@ -1349,20 +1364,20 @@ local map_op = { roundss_3 = "rrio:660F3A0ArMU|rxi/od:", -- SSE4.2 ops - crc32_2 = "rmd:F20F38F1rM|rm/dw:66F20F38F1rM|rm/db:F20F38F0nrM", + crc32_2 = "rmqd:F20F38F1rM|rm/dw:66F20F38F1rM|rm/db:F20F38F0rM|rm/qb:", pcmpestri_3 = "rmio:660F3A61rMU", pcmpestrm_3 = "rmio:660F3A60rMU", pcmpgtq_2 = "rmo:660F3837rM", pcmpistri_3 = "rmio:660F3A63rMU", pcmpistrm_3 = "rmio:660F3A62rMU", - popcnt_2 = "rmdw:F30FB8rM", + popcnt_2 = "rmqdw:F30FB8rM", -- SSE4a extrq_2 = "rro:660F79rM", extrq_3 = "riio:660F780mUU", insertq_2 = "rro:F20F79rM", insertq_4 = "rriio:F20F78rMUU", - lzcnt_2 = "rmdw:F30FBDrM", + lzcnt_2 = 
"rmqdw:F30FBDrM", movntsd_2 = "xr/qo:nF20F2BRm", movntss_2 = "xr/do:F30F2BRm", -- popcnt is also in SSE4.2 @@ -1375,21 +1390,21 @@ for name,n in pairs{ add = 0, ["or"] = 1, adc = 2, sbb = 3, ["and"] = 4, sub = 5, xor = 6, cmp = 7 } do local n8 = n * 8 map_op[name.."_2"] = format( - "mr:%02XRm|rm:%02XrM|mI1dw:81%XmI|mS1dw:83%XmS|Ri1dwb:%02Xri|mi1dwb:81%Xmi", + "mr:%02XRm|rm:%02XrM|mI1qdw:81%XmI|mS1qdw:83%XmS|Ri1qdwb:%02Xri|mi1qdwb:81%Xmi", 1+n8, 3+n8, n, n, 5+n8, n) end -- Shift ops. for name,n in pairs{ rol = 0, ror = 1, rcl = 2, rcr = 3, shl = 4, shr = 5, sar = 7, sal = 4 } do - map_op[name.."_2"] = format("m1:D1%Xm|mC1dwb:D3%Xm|mi:C1%XmU", n, n, n) + map_op[name.."_2"] = format("m1:D1%Xm|mC1qdwb:D3%Xm|mi:C1%XmU", n, n, n) end -- Conditional ops. for cc,n in pairs(map_cc) do map_op["j"..cc.."_1"] = format("J.:0F8%XJ", n) -- short: 7%X map_op["set"..cc.."_1"] = format("mb:n0F9%X2m", n) - map_op["cmov"..cc.."_2"] = format("rmdw:0F4%XrM", n) -- P6+ + map_op["cmov"..cc.."_2"] = format("rmqdw:0F4%XrM", n) -- P6+ end -- FP arithmetic ops. @@ -1400,9 +1415,9 @@ for name,n in pairs{ add = 0, mul = 1, com = 2, comp = 3, local fn = "f"..name map_op[fn.."_1"] = format("ff:D8%02Xr|xd:D8%Xm|xq:nDC%Xm", nc, n, n) if n == 2 or n == 3 then - map_op[fn.."_2"] = format("Fff:D8%02XR|Fx2d:D8%XM|Fx2q:DC%XM", nc, n, n) + map_op[fn.."_2"] = format("Fff:D8%02XR|Fx2d:D8%XM|Fx2q:nDC%XM", nc, n, n) else - map_op[fn.."_2"] = format("Fff:D8%02XR|fFf:DC%02Xr|Fx2d:D8%XM|Fx2q:DC%XM", nc, nr, n, n) + map_op[fn.."_2"] = format("Fff:D8%02XR|fFf:DC%02Xr|Fx2d:D8%XM|Fx2q:nDC%XM", nc, nr, n, n) map_op[fn.."p_1"] = format("ff:DE%02Xr", nr) map_op[fn.."p_2"] = format("fFf:DE%02Xr", nr) end @@ -1450,6 +1465,8 @@ local function dopattern(pat, args, sz, op) addin = nil elseif c == "n" then -- Disable operand size mods for opcode. szov = nil + elseif c == "W" then -- Force REX.W. + rex = 8 elseif c == "r" then -- Merge 1st operand regno. into opcode. 
addin = args[1]; opcode = opcode + (addin.reg % 8) if narg < 2 then narg = 2 end @@ -1468,7 +1485,7 @@ local function dopattern(pat, args, sz, op) local nn = c == "m" and 1 or 2 local t = args[nn] if narg <= nn then narg = nn + 1 end - local rex = szov == "q" and 8 or 0 + if szov == "q" and rex == 0 then rex = rex + 8 end if t.reg and t.reg > 7 then rex = rex + 1 end if t.xreg and t.xreg > 7 then rex = rex + 2 end if s > 7 then rex = rex + 4 end @@ -1479,11 +1496,13 @@ local function dopattern(pat, args, sz, op) addin = nil else if opcode then -- Flush opcode. + if szov == "q" and rex == 0 then rex = rex + 8 end if addin and addin.reg == -1 then - wputop(szov, opcode + 1, 0) + wputop(szov, opcode + 1, rex) waction("VREG", addin.vreg); wputxb(0) else - wputop(szov, opcode, (addin and addin.reg > 7) and 4 or 0) + if addin and addin.reg > 7 then rex = rex + 1 end + wputop(szov, opcode, rex) end opcode = nil end