mirror of
https://github.com/LuaJIT/LuaJIT.git
synced 2025-02-08 15:34:09 +00:00
Added support for ymm registers in intrinsics
This commit is contained in:
parent
c0797d346b
commit
f21526df9b
14
src/lj_asm.c
14
src/lj_asm.c
@ -2452,6 +2452,7 @@ typedef struct IntrinBuildState {
|
|||||||
RegSet inset, outset, modregs;
|
RegSet inset, outset, modregs;
|
||||||
uint32_t spadj, contexspill, contexofs;
|
uint32_t spadj, contexspill, contexofs;
|
||||||
uint8_t outcontext;
|
uint8_t outcontext;
|
||||||
|
char vzeroupper;
|
||||||
} IntrinBuildState;
|
} IntrinBuildState;
|
||||||
|
|
||||||
static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
|
static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
|
||||||
@ -2465,6 +2466,9 @@ static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
|
|||||||
for (i = 0; i < intrins->insz; i++) {
|
for (i = 0; i < intrins->insz; i++) {
|
||||||
Reg r = reg_rid(info->in[i]);
|
Reg r = reg_rid(info->in[i]);
|
||||||
|
|
||||||
|
if (reg_kind(info->in[i]) == REGKIND_V256)
|
||||||
|
info->vzeroupper = 1;
|
||||||
|
|
||||||
if (reg_isgpr(info->in[i])) {
|
if (reg_isgpr(info->in[i])) {
|
||||||
if (r == RID_CONTEXT) {
|
if (r == RID_CONTEXT) {
|
||||||
/* Save the offset in the input context so we can load it last */
|
/* Save the offset in the input context so we can load it last */
|
||||||
@ -2477,6 +2481,9 @@ static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < intrins->outsz; i++) {
|
for (i = 0; i < intrins->outsz; i++) {
|
||||||
|
if (reg_kind(info->out[i]) == REGKIND_V256)
|
||||||
|
info->vzeroupper = 1;
|
||||||
|
|
||||||
rset_set(info->outset, reg_rid(info->out[i]));
|
rset_set(info->outset, reg_rid(info->out[i]));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2602,6 +2609,13 @@ restart:
|
|||||||
|
|
||||||
emit_epilogue(as, spadj, info.modregs, intrins->outsz);
|
emit_epilogue(as, spadj, info.modregs, intrins->outsz);
|
||||||
|
|
||||||
|
/* Zero upper parts of ymm registers if any ymm registers were used.
|
||||||
|
** TODO: This shouldn't be needed for some AMD CPUs like Jaguar.
|
||||||
|
*/
|
||||||
|
if (info.vzeroupper) {
|
||||||
|
as->mcp = emit_vop(XV_VZEROUPPER, 0, 0, 0, as->mcp, 1);
|
||||||
|
}
|
||||||
|
|
||||||
/* If one of the output registers was the same as the outcontext we will
|
/* If one of the output registers was the same as the outcontext we will
|
||||||
* have saved the output value to the stack earlier, now save it into context
|
* have saved the output value to the stack earlier, now save it into context
|
||||||
*/
|
*/
|
||||||
|
@ -859,6 +859,10 @@ static void emit_loadfpr(ASMState *as, uint32_t reg, Reg base, int ofs)
|
|||||||
case REGKIND_V128:
|
case REGKIND_V128:
|
||||||
op = XO_MOVUPS;
|
op = XO_MOVUPS;
|
||||||
break;
|
break;
|
||||||
|
case REGKIND_V256:
|
||||||
|
op = XV_MOVUPS;
|
||||||
|
r |= VEX_256;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!rk_isvec(kind)) {
|
if (!rk_isvec(kind)) {
|
||||||
@ -889,6 +893,10 @@ static void emit_savefpr(ASMState *as, Reg reg, Reg base, int ofs)
|
|||||||
case REGKIND_V128:
|
case REGKIND_V128:
|
||||||
op = XO_MOVUPSto;
|
op = XO_MOVUPSto;
|
||||||
break;
|
break;
|
||||||
|
case REGKIND_V256:
|
||||||
|
op = XV_MOVUPSto;
|
||||||
|
r |= VEX_256;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!rk_isvec(kind)) {
|
if (!rk_isvec(kind)) {
|
||||||
|
@ -97,8 +97,9 @@ static int parse_fprreg(const char *name, uint32_t len)
|
|||||||
{
|
{
|
||||||
uint32_t rid = 0, kind = REGKIND_FPR64;
|
uint32_t rid = 0, kind = REGKIND_FPR64;
|
||||||
uint32_t pos = 3;
|
uint32_t pos = 3;
|
||||||
|
int flags = 0;
|
||||||
|
|
||||||
if (len < 3 || name[0] != 'x' ||
|
if (len < 3 || (name[0] != 'x' && name[0] != 'y') ||
|
||||||
name[1] != 'm' || name[2] != 'm')
|
name[1] != 'm' || name[2] != 'm')
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
@ -120,15 +121,20 @@ static int parse_fprreg(const char *name, uint32_t len)
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pos < len) {
|
if (name[0] == 'y') {
|
||||||
if (name[pos] == 'f') {
|
kind = REGKIND_V256;
|
||||||
kind = REGKIND_FPR32;
|
flags |= INTRINSFLAG_VEX256;
|
||||||
pos++;
|
} else {
|
||||||
} else if (name[pos] == 'v') {
|
if (pos < len) {
|
||||||
kind = REGKIND_V128;
|
if (name[pos] == 'f') {
|
||||||
pos++;
|
kind = REGKIND_FPR32;
|
||||||
} else {
|
pos++;
|
||||||
kind = REGKIND_FPR64;
|
} else if (name[pos] == 'v') {
|
||||||
|
kind = REGKIND_V128;
|
||||||
|
pos++;
|
||||||
|
} else {
|
||||||
|
kind = REGKIND_FPR64;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -136,12 +142,12 @@ static int parse_fprreg(const char *name, uint32_t len)
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
return reg_make(rid, kind);
|
return reg_make(rid, kind) | flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
int lj_intrinsic_getreg(CTState *cts, GCstr *name) {
|
int lj_intrinsic_getreg(CTState *cts, GCstr *name) {
|
||||||
|
|
||||||
if (strdata(name)[0] == 'x') {
|
if (strdata(name)[0] == 'x' || strdata(name)[0] == 'y') {
|
||||||
return parse_fprreg(strdata(name), name->len);
|
return parse_fprreg(strdata(name), name->len);
|
||||||
} else {
|
} else {
|
||||||
cTValue *reginfotv = lj_tab_getstr(cts->miscmap, name);
|
cTValue *reginfotv = lj_tab_getstr(cts->miscmap, name);
|
||||||
|
@ -26,6 +26,8 @@ typedef enum INTRINSFLAGS {
|
|||||||
INTRINSFLAG_CALLED = 0x20,
|
INTRINSFLAG_CALLED = 0x20,
|
||||||
/* MODRM should always be set as indirect mode */
|
/* MODRM should always be set as indirect mode */
|
||||||
INTRINSFLAG_INDIRECT = 0x40,
|
INTRINSFLAG_INDIRECT = 0x40,
|
||||||
|
/* Opcode uses ymm registers */
|
||||||
|
INTRINSFLAG_VEX256 = 0x4000,
|
||||||
/* Intrinsic is a template with no machine code set until instantiated at runtime with
|
/* Intrinsic is a template with no machine code set until instantiated at runtime with
|
||||||
** user supplied code.
|
** user supplied code.
|
||||||
*/
|
*/
|
||||||
@ -54,7 +56,7 @@ typedef struct AsmHeader {
|
|||||||
_(FPR64, IRT_NUM, CTID_DOUBLE) \
|
_(FPR64, IRT_NUM, CTID_DOUBLE) \
|
||||||
_(FPR32, IRT_FLOAT, CTID_FLOAT) \
|
_(FPR32, IRT_FLOAT, CTID_FLOAT) \
|
||||||
_(V128, 0, 0) \
|
_(V128, 0, 0) \
|
||||||
_(FPR5, 0, 0) \
|
_(V256, 0, 0) \
|
||||||
_(FPR6, 0, 0) \
|
_(FPR6, 0, 0) \
|
||||||
_(FPR7, 0, 0) \
|
_(FPR7, 0, 0) \
|
||||||
|
|
||||||
|
@ -214,11 +214,25 @@ typedef struct IntrinWrapState {
|
|||||||
#define XO_f20f(o) ((uint32_t)(0x0ff2fc + (0x##o<<24)))
|
#define XO_f20f(o) ((uint32_t)(0x0ff2fc + (0x##o<<24)))
|
||||||
#define XO_f30f(o) ((uint32_t)(0x0ff3fc + (0x##o<<24)))
|
#define XO_f30f(o) ((uint32_t)(0x0ff3fc + (0x##o<<24)))
|
||||||
|
|
||||||
|
#define XV_0f(o) ((uint32_t)(0xf8c5c5 + (0x##o<<24)))
|
||||||
#define XV_660f38(o) ((uint32_t)(0x79e2c4 + (0x##o<<24)))
|
#define XV_660f38(o) ((uint32_t)(0x79e2c4 + (0x##o<<24)))
|
||||||
#define XV_f20f38(o) ((uint32_t)(0x7be2c4 + (0x##o<<24)))
|
#define XV_f20f38(o) ((uint32_t)(0x7be2c4 + (0x##o<<24)))
|
||||||
#define XV_f20f3a(o) ((uint32_t)(0x7be3c4 + (0x##o<<24)))
|
#define XV_f20f3a(o) ((uint32_t)(0x7be3c4 + (0x##o<<24)))
|
||||||
#define XV_f30f38(o) ((uint32_t)(0x7ae2c4 + (0x##o<<24)))
|
#define XV_f30f38(o) ((uint32_t)(0x7ae2c4 + (0x##o<<24)))
|
||||||
|
|
||||||
|
typedef enum VEXPP {
|
||||||
|
VEXPP_0f = 0,
|
||||||
|
VEXPP_66 = 1,
|
||||||
|
VEXPP_f3 = 2,
|
||||||
|
VEXPP_f2 = 3,
|
||||||
|
} VEXPP;
|
||||||
|
|
||||||
|
typedef enum VEXMAP {
|
||||||
|
VEXMAP_0F = 1,
|
||||||
|
VEXMAP_0F38 = 2,
|
||||||
|
VEXMAP_0F3A = 3,
|
||||||
|
} VEXMAP;
|
||||||
|
|
||||||
/* This list of x86 opcodes is not intended to be complete. Opcodes are only
|
/* This list of x86 opcodes is not intended to be complete. Opcodes are only
|
||||||
** included when needed. Take a look at DynASM or jit.dis_x86 to see the
|
** included when needed. Take a look at DynASM or jit.dis_x86 to see the
|
||||||
** whole mess.
|
** whole mess.
|
||||||
@ -271,6 +285,10 @@ typedef enum {
|
|||||||
XV_SHLX = XV_660f38(f7),
|
XV_SHLX = XV_660f38(f7),
|
||||||
XV_SHRX = XV_f20f38(f7),
|
XV_SHRX = XV_f20f38(f7),
|
||||||
|
|
||||||
|
XV_MOVUPS = XV_0f(10),
|
||||||
|
XV_MOVUPSto = XV_0f(11),
|
||||||
|
XV_VZEROUPPER = XV_0f(77),
|
||||||
|
|
||||||
/* Variable-length opcodes. XO_* prefix. */
|
/* Variable-length opcodes. XO_* prefix. */
|
||||||
XO_OR = XO_(0b),
|
XO_OR = XO_(0b),
|
||||||
XO_MOV = XO_(8b),
|
XO_MOV = XO_(8b),
|
||||||
|
@ -165,6 +165,23 @@ if ffi.arch == "x64" then
|
|||||||
|
|
||||||
assert_jit(444.575, testrex, 123.075, 321.5)
|
assert_jit(444.575, testrex, 123.075, 321.5)
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
it("fpr_vexrex(ymm)", function()
|
||||||
|
local array = ffi.new("float8", 0, 1, 2, 3, 4, 5, 6, 7)
|
||||||
|
--force a Vex.B base register
|
||||||
|
|
||||||
|
assert_cdef([[void fpr_vexrex(float8 ymm14, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx, int32_t esi, int32_t edi, int32_t ebp) __mcode("?E")
|
||||||
|
__reglist(out, float8 ymm14, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx, int32_t esi, int32_t edi, int32_t ebp)
|
||||||
|
__reglist(mod, ymm1, ymm7)]])
|
||||||
|
|
||||||
|
local ymmtest = ffi.intrinsic("fpr_vexrex", "\x90", 1)
|
||||||
|
|
||||||
|
local ymmout = ymmtest(array, 1, 2, 3, 4, 5, 6, 7)
|
||||||
|
|
||||||
|
for i=0,7 do
|
||||||
|
assert_equal(ymmout[i], i)
|
||||||
|
end
|
||||||
|
end)
|
||||||
end
|
end
|
||||||
|
|
||||||
it("fpr_vec", function()
|
it("fpr_vec", function()
|
||||||
@ -198,6 +215,41 @@ end
|
|||||||
end
|
end
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
it("fpr_vec(ymm)", function()
|
||||||
|
assert_cdef([[void fpr_ymmvec(void* ymm7) __mcode("90_E") __reglist(out, float8 ymm7)]], "fpr_ymmvec")
|
||||||
|
--test using plain array in place of a vector
|
||||||
|
local v1 = ffi.new("float[8]", 0, 1, 2, 3, 4, 5, 6, 7)
|
||||||
|
local ymmout = ffi.C.fpr_ymmvec(v1)
|
||||||
|
|
||||||
|
for i=0,7 do
|
||||||
|
assert_equal(ymmout[i], i)
|
||||||
|
end
|
||||||
|
|
||||||
|
assert_cdef([[void fpr_ymmvec2(void* ymm0, void* ymm7) __mcode("90_E") __reglist(out, float8 ymm7, float8 ymm0)]], "fpr_ymmvec2")
|
||||||
|
|
||||||
|
local v2 = ffi.new("float[8]", 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5)
|
||||||
|
local ymmtest2 = ffi.C.fpr_ymmvec2
|
||||||
|
local ymm7, ymm0 = ymmtest2(v1, v2)
|
||||||
|
|
||||||
|
for i=0,7 do
|
||||||
|
assert_equal(ymm0[i], i)
|
||||||
|
end
|
||||||
|
for i=0,7 do
|
||||||
|
assert_equal(ymm7[i], i+0.5)
|
||||||
|
end
|
||||||
|
|
||||||
|
--test using a cdata vector
|
||||||
|
v2 = ffi.new("float8", 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5)
|
||||||
|
ymm7, ymm0 = ymmtest2(v1, v2)
|
||||||
|
|
||||||
|
for i=0,7 do
|
||||||
|
assert_equal(ymm0[i], i)
|
||||||
|
end
|
||||||
|
for i=0,7 do
|
||||||
|
assert_equal(ymm7[i], i+0.5)
|
||||||
|
end
|
||||||
|
end)
|
||||||
|
|
||||||
it("idiv", function()
|
it("idiv", function()
|
||||||
assert_cdef([[void idiv(int32_t eax, int32_t ecx) __mcode("99F7F9_E") __reglist(out, int32_t eax, int32_t edx)]], "idiv")
|
assert_cdef([[void idiv(int32_t eax, int32_t ecx) __mcode("99F7F9_E") __reglist(out, int32_t eax, int32_t edx)]], "idiv")
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user