diff --git a/src/lj_asm.c b/src/lj_asm.c index 1e83133b..c4f97a5c 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -2452,6 +2452,7 @@ typedef struct IntrinBuildState { RegSet inset, outset, modregs; uint32_t spadj, contexspill, contexofs; uint8_t outcontext; + char vzeroupper; /* Set to 1 by intrins_setup when an input or output register is REGKIND_V256. */ } IntrinBuildState; static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info) @@ -2465,6 +2466,9 @@ static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info) for (i = 0; i < intrins->insz; i++) { Reg r = reg_rid(info->in[i]); + if (reg_kind(info->in[i]) == REGKIND_V256) + info->vzeroupper = 1; + if (reg_isgpr(info->in[i])) { if (r == RID_CONTEXT) { /* Save the offset in the input context so we can load it last */ @@ -2477,6 +2481,9 @@ static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info) } for (i = 0; i < intrins->outsz; i++) { + if (reg_kind(info->out[i]) == REGKIND_V256) + info->vzeroupper = 1; + rset_set(info->outset, reg_rid(info->out[i])); } @@ -2602,6 +2609,13 @@ restart: emit_epilogue(as, spadj, info.modregs, intrins->outsz); + /* Zero upper parts of ymm registers if any ymm register was used. + ** TODO: This shouldn't be needed on some AMD CPUs like Jaguar. 
+ */ + if (info.vzeroupper) { + as->mcp = emit_vop(XV_VZEROUPPER, 0, 0, 0, as->mcp, 1); + } + /* If one of the output registers was the same as the outcontext we will * of saved the output value to the stack earlier, now save it into context */ diff --git a/src/lj_emit_x86.h b/src/lj_emit_x86.h index 760d74a9..8a1a0975 100644 --- a/src/lj_emit_x86.h +++ b/src/lj_emit_x86.h @@ -859,6 +859,10 @@ static void emit_loadfpr(ASMState *as, uint32_t reg, Reg base, int ofs) case REGKIND_V128: op = XO_MOVUPS; break; + case REGKIND_V256: + op = XV_MOVUPS; + r |= VEX_256; + break; } if (!rk_isvec(kind)) { @@ -889,6 +893,10 @@ static void emit_savefpr(ASMState *as, Reg reg, Reg base, int ofs) case REGKIND_V128: op = XO_MOVUPSto; break; + case REGKIND_V256: + op = XV_MOVUPSto; + r |= VEX_256; + break; } if (!rk_isvec(kind)) { diff --git a/src/lj_intrinsic.c b/src/lj_intrinsic.c index 1da30a34..635358ab 100644 --- a/src/lj_intrinsic.c +++ b/src/lj_intrinsic.c @@ -97,8 +97,9 @@ static int parse_fprreg(const char *name, uint32_t len) { uint32_t rid = 0, kind = REGKIND_FPR64; uint32_t pos = 3; + int flags = 0; - if (len < 3 || name[0] != 'x' || + if (len < 3 || (name[0] != 'x' && name[0] != 'y') || name[1] != 'm' || name[2] != 'm') return -1; @@ -120,15 +121,20 @@ static int parse_fprreg(const char *name, uint32_t len) return -1; } - if (pos < len) { - if (name[pos] == 'f') { - kind = REGKIND_FPR32; - pos++; - } else if (name[pos] == 'v') { - kind = REGKIND_V128; - pos++; - } else { - kind = REGKIND_FPR64; + if (name[0] == 'y') { + kind = REGKIND_V256; + flags |= INTRINSFLAG_VEX256; + } else { + if (pos < len) { + if (name[pos] == 'f') { + kind = REGKIND_FPR32; + pos++; + } else if (name[pos] == 'v') { + kind = REGKIND_V128; + pos++; + } else { + kind = REGKIND_FPR64; + } } } @@ -136,12 +142,12 @@ static int parse_fprreg(const char *name, uint32_t len) return -1; } - return reg_make(rid, kind); + return reg_make(rid, kind) | flags; } int lj_intrinsic_getreg(CTState *cts, GCstr *name) 
{ - if (strdata(name)[0] == 'x') { + if (strdata(name)[0] == 'x' || strdata(name)[0] == 'y') { return parse_fprreg(strdata(name), name->len); } else { cTValue *reginfotv = lj_tab_getstr(cts->miscmap, name); diff --git a/src/lj_intrinsic.h b/src/lj_intrinsic.h index f1235cb2..bae8741e 100644 --- a/src/lj_intrinsic.h +++ b/src/lj_intrinsic.h @@ -26,6 +26,8 @@ typedef enum INTRINSFLAGS { INTRINSFLAG_CALLED = 0x20, /* MODRM should always be set as indirect mode */ INTRINSFLAG_INDIRECT = 0x40, + /* Opcode uses ymm registers */ + INTRINSFLAG_VEX256 = 0x4000, /* Intrinsic is a template with no machine code set until instantiate at runtime with ** user supplied code. */ @@ -54,7 +56,7 @@ typedef struct AsmHeader { _(FPR64, IRT_NUM, CTID_DOUBLE) \ _(FPR32, IRT_FLOAT, CTID_FLOAT) \ _(V128, 0, 0) \ - _(FPR5, 0, 0) \ + _(V256, 0, 0) \ _(FPR6, 0, 0) \ _(FPR7, 0, 0) \ diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h index 75fce4cd..c5f23082 100644 --- a/src/lj_target_x86.h +++ b/src/lj_target_x86.h @@ -214,11 +214,25 @@ typedef struct IntrinWrapState { #define XO_f20f(o) ((uint32_t)(0x0ff2fc + (0x##o<<24))) #define XO_f30f(o) ((uint32_t)(0x0ff3fc + (0x##o<<24))) +#define XV_0f(o) ((uint32_t)(0xf8c5c5 + (0x##o<<24))) #define XV_660f38(o) ((uint32_t)(0x79e2c4 + (0x##o<<24))) #define XV_f20f38(o) ((uint32_t)(0x7be2c4 + (0x##o<<24))) #define XV_f20f3a(o) ((uint32_t)(0x7be3c4 + (0x##o<<24))) #define XV_f30f38(o) ((uint32_t)(0x7ae2c4 + (0x##o<<24))) +typedef enum VEXPP { + VEXPP_0f = 0, + VEXPP_66 = 1, + VEXPP_f3 = 2, + VEXPP_f2 = 3, +} VEXPP; + +typedef enum VEXMAP { + VEXMAP_0F = 1, + VEXMAP_0F38 = 2, + VEXMAP_0F3A = 3, +} VEXMAP; + /* This list of x86 opcodes is not intended to be complete. Opcodes are only ** included when needed. Take a look at DynASM or jit.dis_x86 to see the ** whole mess. 
@@ -271,6 +285,10 @@ typedef enum { XV_SHLX = XV_660f38(f7), XV_SHRX = XV_f20f38(f7), + XV_MOVUPS = XV_0f(10), + XV_MOVUPSto = XV_0f(11), + XV_VZEROUPPER = XV_0f(77), + /* Variable-length opcodes. XO_* prefix. */ XO_OR = XO_(0b), XO_MOV = XO_(8b), diff --git a/tests/intrinsic_spec.lua b/tests/intrinsic_spec.lua index c4701560..3a866104 100644 --- a/tests/intrinsic_spec.lua +++ b/tests/intrinsic_spec.lua @@ -165,6 +165,23 @@ if ffi.arch == "x64" then assert_jit(444.575, testrex, 123.075, 321.5) end) + + it("fpr_vexrex(ymm)", function() + local array = ffi.new("float8", 0, 1, 2, 3, 4, 5, 6, 7) + --force a Vex.B base register + + assert_cdef([[void fpr_vexrex(float8 ymm14, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx, int32_t esi, int32_t edi, int32_t ebp) __mcode("?E") + __reglist(out, float8 ymm14, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx, int32_t esi, int32_t edi, int32_t ebp) + __reglist(mod, ymm1, ymm7)]]) + + local ymmtest = ffi.intrinsic("fpr_vexrex", "\x90", 1) + + local ymmout = ymmtest(array, 1, 2, 3, 4, 5, 6, 7) + + for i=0,7 do + assert_equal(ymmout[i], i) + end + end) end it("fpr_vec", function() @@ -198,6 +215,41 @@ end end end) + it("fpr_vec(ymm)", function() + assert_cdef([[void fpr_ymmvec(void* ymm7) __mcode("90_E") __reglist(out, float8 ymm7)]], "fpr_ymmvec") + --test using plain array in place of a vector + local v1 = ffi.new("float[8]", 0, 1, 2, 3, 4, 5, 6, 7) + local ymmout = ffi.C.fpr_ymmvec(v1) + + for i=0,7 do + assert_equal(ymmout[i], i) + end + + assert_cdef([[void fpr_ymmvec2(void* ymm0, void* ymm7) __mcode("90_E") __reglist(out, float8 ymm7, float8 ymm0)]], "fpr_ymmvec2") + + local v2 = ffi.new("float[8]", 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5) + local ymmtest2 = ffi.C.fpr_ymmvec2 + local ymm7, ymm0 = ymmtest2(v1, v2) + + for i=0,7 do + assert_equal(ymm0[i], i) + end + for i=0,7 do + assert_equal(ymm7[i], i+0.5) + end + + --test using a cdata vector + v2 = ffi.new("float8", 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5) + ymm7, 
ymm0 = ymmtest2(v1, v2) + + for i=0,7 do + assert_equal(ymm0[i], i) + end + for i=0,7 do + assert_equal(ymm7[i], i+0.5) + end + end) + it("idiv", function() assert_cdef([[void idiv(int32_t eax, int32_t ecx) __mcode("99F7F9_E") __reglist(out, int32_t eax, int32_t edx)]], "idiv")