Added support for ymm registers in intrinsics

This commit is contained in:
fsfod 2016-03-29 11:08:36 +01:00
parent c0797d346b
commit f21526df9b
6 changed files with 113 additions and 13 deletions

View File

@ -2452,6 +2452,7 @@ typedef struct IntrinBuildState {
RegSet inset, outset, modregs; RegSet inset, outset, modregs;
uint32_t spadj, contexspill, contexofs; uint32_t spadj, contexspill, contexofs;
uint8_t outcontext; uint8_t outcontext;
char vzeroupper;
} IntrinBuildState; } IntrinBuildState;
static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info) static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
@ -2465,6 +2466,9 @@ static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
for (i = 0; i < intrins->insz; i++) { for (i = 0; i < intrins->insz; i++) {
Reg r = reg_rid(info->in[i]); Reg r = reg_rid(info->in[i]);
if (reg_kind(info->in[i]) == REGKIND_V256)
info->vzeroupper = 1;
if (reg_isgpr(info->in[i])) { if (reg_isgpr(info->in[i])) {
if (r == RID_CONTEXT) { if (r == RID_CONTEXT) {
/* Save the offset in the input context so we can load it last */ /* Save the offset in the input context so we can load it last */
@ -2477,6 +2481,9 @@ static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
} }
for (i = 0; i < intrins->outsz; i++) { for (i = 0; i < intrins->outsz; i++) {
if (reg_kind(info->out[i]) == REGKIND_V256)
info->vzeroupper = 1;
rset_set(info->outset, reg_rid(info->out[i])); rset_set(info->outset, reg_rid(info->out[i]));
} }
@ -2602,6 +2609,13 @@ restart:
emit_epilogue(as, spadj, info.modregs, intrins->outsz); emit_epilogue(as, spadj, info.modregs, intrins->outsz);
/* Zero the upper parts of the ymm registers if any ymm register was used.
** TODO: This shouldn't be needed for some AMD CPUs like Jaguar.
*/
if (info.vzeroupper) {
as->mcp = emit_vop(XV_VZEROUPPER, 0, 0, 0, as->mcp, 1);
}
/* If one of the output registers was the same as the outcontext we will /* If one of the output registers was the same as the outcontext we will
* of saved the output value to the stack earlier, now save it into context * of saved the output value to the stack earlier, now save it into context
*/ */

View File

@ -859,6 +859,10 @@ static void emit_loadfpr(ASMState *as, uint32_t reg, Reg base, int ofs)
case REGKIND_V128: case REGKIND_V128:
op = XO_MOVUPS; op = XO_MOVUPS;
break; break;
case REGKIND_V256:
op = XV_MOVUPS;
r |= VEX_256;
break;
} }
if (!rk_isvec(kind)) { if (!rk_isvec(kind)) {
@ -889,6 +893,10 @@ static void emit_savefpr(ASMState *as, Reg reg, Reg base, int ofs)
case REGKIND_V128: case REGKIND_V128:
op = XO_MOVUPSto; op = XO_MOVUPSto;
break; break;
case REGKIND_V256:
op = XV_MOVUPSto;
r |= VEX_256;
break;
} }
if (!rk_isvec(kind)) { if (!rk_isvec(kind)) {

View File

@ -97,8 +97,9 @@ static int parse_fprreg(const char *name, uint32_t len)
{ {
uint32_t rid = 0, kind = REGKIND_FPR64; uint32_t rid = 0, kind = REGKIND_FPR64;
uint32_t pos = 3; uint32_t pos = 3;
int flags = 0;
if (len < 3 || name[0] != 'x' || if (len < 3 || (name[0] != 'x' && name[0] != 'y') ||
name[1] != 'm' || name[2] != 'm') name[1] != 'm' || name[2] != 'm')
return -1; return -1;
@ -120,15 +121,20 @@ static int parse_fprreg(const char *name, uint32_t len)
return -1; return -1;
} }
if (pos < len) { if (name[0] == 'y') {
if (name[pos] == 'f') { kind = REGKIND_V256;
kind = REGKIND_FPR32; flags |= INTRINSFLAG_VEX256;
pos++; } else {
} else if (name[pos] == 'v') { if (pos < len) {
kind = REGKIND_V128; if (name[pos] == 'f') {
pos++; kind = REGKIND_FPR32;
} else { pos++;
kind = REGKIND_FPR64; } else if (name[pos] == 'v') {
kind = REGKIND_V128;
pos++;
} else {
kind = REGKIND_FPR64;
}
} }
} }
@ -136,12 +142,12 @@ static int parse_fprreg(const char *name, uint32_t len)
return -1; return -1;
} }
return reg_make(rid, kind); return reg_make(rid, kind) | flags;
} }
int lj_intrinsic_getreg(CTState *cts, GCstr *name) { int lj_intrinsic_getreg(CTState *cts, GCstr *name) {
if (strdata(name)[0] == 'x') { if (strdata(name)[0] == 'x' || strdata(name)[0] == 'y') {
return parse_fprreg(strdata(name), name->len); return parse_fprreg(strdata(name), name->len);
} else { } else {
cTValue *reginfotv = lj_tab_getstr(cts->miscmap, name); cTValue *reginfotv = lj_tab_getstr(cts->miscmap, name);

View File

@ -26,6 +26,8 @@ typedef enum INTRINSFLAGS {
INTRINSFLAG_CALLED = 0x20, INTRINSFLAG_CALLED = 0x20,
/* MODRM should always be set as indirect mode */ /* MODRM should always be set as indirect mode */
INTRINSFLAG_INDIRECT = 0x40, INTRINSFLAG_INDIRECT = 0x40,
/* Opcode uses ymm registers */
INTRINSFLAG_VEX256 = 0x4000,
/* Intrinsic is a template with no machine code set until instantiate at runtime with /* Intrinsic is a template with no machine code set until instantiate at runtime with
** user supplied code. ** user supplied code.
*/ */
@ -54,7 +56,7 @@ typedef struct AsmHeader {
_(FPR64, IRT_NUM, CTID_DOUBLE) \ _(FPR64, IRT_NUM, CTID_DOUBLE) \
_(FPR32, IRT_FLOAT, CTID_FLOAT) \ _(FPR32, IRT_FLOAT, CTID_FLOAT) \
_(V128, 0, 0) \ _(V128, 0, 0) \
_(FPR5, 0, 0) \ _(V256, 0, 0) \
_(FPR6, 0, 0) \ _(FPR6, 0, 0) \
_(FPR7, 0, 0) \ _(FPR7, 0, 0) \

View File

@ -214,11 +214,25 @@ typedef struct IntrinWrapState {
#define XO_f20f(o) ((uint32_t)(0x0ff2fc + (0x##o<<24))) #define XO_f20f(o) ((uint32_t)(0x0ff2fc + (0x##o<<24)))
#define XO_f30f(o) ((uint32_t)(0x0ff3fc + (0x##o<<24))) #define XO_f30f(o) ((uint32_t)(0x0ff3fc + (0x##o<<24)))
#define XV_0f(o) ((uint32_t)(0xf8c5c5 + (0x##o<<24)))
#define XV_660f38(o) ((uint32_t)(0x79e2c4 + (0x##o<<24))) #define XV_660f38(o) ((uint32_t)(0x79e2c4 + (0x##o<<24)))
#define XV_f20f38(o) ((uint32_t)(0x7be2c4 + (0x##o<<24))) #define XV_f20f38(o) ((uint32_t)(0x7be2c4 + (0x##o<<24)))
#define XV_f20f3a(o) ((uint32_t)(0x7be3c4 + (0x##o<<24))) #define XV_f20f3a(o) ((uint32_t)(0x7be3c4 + (0x##o<<24)))
#define XV_f30f38(o) ((uint32_t)(0x7ae2c4 + (0x##o<<24))) #define XV_f30f38(o) ((uint32_t)(0x7ae2c4 + (0x##o<<24)))
typedef enum VEXPP {
VEXPP_0f = 0,
VEXPP_66 = 1,
VEXPP_f3 = 2,
VEXPP_f2 = 3,
} VEXPP;
typedef enum VEXMAP {
VEXMAP_0F = 1,
VEXMAP_0F38 = 2,
VEXMAP_0F3A = 3,
} VEXMAP;
/* This list of x86 opcodes is not intended to be complete. Opcodes are only /* This list of x86 opcodes is not intended to be complete. Opcodes are only
** included when needed. Take a look at DynASM or jit.dis_x86 to see the ** included when needed. Take a look at DynASM or jit.dis_x86 to see the
** whole mess. ** whole mess.
@ -271,6 +285,10 @@ typedef enum {
XV_SHLX = XV_660f38(f7), XV_SHLX = XV_660f38(f7),
XV_SHRX = XV_f20f38(f7), XV_SHRX = XV_f20f38(f7),
XV_MOVUPS = XV_0f(10),
XV_MOVUPSto = XV_0f(11),
XV_VZEROUPPER = XV_0f(77),
/* Variable-length opcodes. XO_* prefix. */ /* Variable-length opcodes. XO_* prefix. */
XO_OR = XO_(0b), XO_OR = XO_(0b),
XO_MOV = XO_(8b), XO_MOV = XO_(8b),

View File

@ -165,6 +165,23 @@ if ffi.arch == "x64" then
assert_jit(444.575, testrex, 123.075, 321.5) assert_jit(444.575, testrex, 123.075, 321.5)
end) end)
it("fpr_vexrex(ymm)", function()
  -- Eight float lanes holding 0..7; the intrinsic is expected to hand the
  -- same lane values back through its first output.
  local lanes = ffi.new("float8", 0, 1, 2, 3, 4, 5, 6, 7)

  -- Declares ymm14 alongside seven GPR arguments to force a Vex.B base
  -- register when the vector is loaded and stored.
  assert_cdef([[void fpr_vexrex(float8 ymm14, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx, int32_t esi, int32_t edi, int32_t ebp) __mcode("?E")
__reglist(out, float8 ymm14, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx, int32_t esi, int32_t edi, int32_t ebp)
__reglist(mod, ymm1, ymm7)]])

  -- Build the intrinsic from the single byte "\x90" and call it; only the
  -- first return value (the ymm14 output) is inspected.
  local intrinsic = ffi.intrinsic("fpr_vexrex", "\x90", 1)
  local result = intrinsic(lanes, 1, 2, 3, 4, 5, 6, 7)

  local lane = 0
  while lane <= 7 do
    assert_equal(result[lane], lane)
    lane = lane + 1
  end
end)
end end
it("fpr_vec", function() it("fpr_vec", function()
@ -198,6 +215,41 @@ end
end end
end) end)
it("fpr_vec(ymm)", function()
  -- Asserts that lanes 0..7 of `vec` hold the lane index plus `bias`.
  local function expect_lanes(vec, bias)
    for lane = 0, 7 do
      assert_equal(vec[lane], lane + bias)
    end
  end

  assert_cdef([[void fpr_ymmvec(void* ymm7) __mcode("90_E") __reglist(out, float8 ymm7)]], "fpr_ymmvec")

  -- A plain float[8] array is passed where a vector is expected.
  local whole = ffi.new("float[8]", 0, 1, 2, 3, 4, 5, 6, 7)
  local echoed = ffi.C.fpr_ymmvec(whole)
  expect_lanes(echoed, 0)

  assert_cdef([[void fpr_ymmvec2(void* ymm0, void* ymm7) __mcode("90_E") __reglist(out, float8 ymm7, float8 ymm0)]], "fpr_ymmvec2")

  local halves = ffi.new("float[8]", 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5)
  local two_vec = ffi.C.fpr_ymmvec2

  -- Results come back in the __reglist(out, ...) order: ymm7 first, then ymm0.
  local out7, out0 = two_vec(whole, halves)
  expect_lanes(out0, 0)
  expect_lanes(out7, 0.5)

  -- Same call again, this time with a float8 cdata vector as the second input.
  halves = ffi.new("float8", 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5)
  out7, out0 = two_vec(whole, halves)
  expect_lanes(out0, 0)
  expect_lanes(out7, 0.5)
end)
it("idiv", function() it("idiv", function()
assert_cdef([[void idiv(int32_t eax, int32_t ecx) __mcode("99F7F9_E") __reglist(out, int32_t eax, int32_t edx)]], "idiv") assert_cdef([[void idiv(int32_t eax, int32_t ecx) __mcode("99F7F9_E") __reglist(out, int32_t eax, int32_t edx)]], "idiv")