mirror of
https://github.com/LuaJIT/LuaJIT.git
synced 2025-02-08 15:34:09 +00:00
Implement support for opcodes with dynamic registers
This commit is contained in:
parent
e6fecee925
commit
de4c0b6ea5
147
src/lj_asm.c
147
src/lj_asm.c
@ -2452,41 +2452,53 @@ typedef struct IntrinBuildState {
|
||||
RegSet inset, outset, modregs;
|
||||
uint32_t spadj, contexspill, contexofs;
|
||||
uint8_t outcontext;
|
||||
char vzeroupper;
|
||||
char vzeroupper, fuse;
|
||||
} IntrinBuildState;
|
||||
|
||||
static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
|
||||
{
|
||||
MSize offset = 0, i;
|
||||
int dynreg = intrin_regmode(intrins);
|
||||
int dynout = intrin_dynrout(intrins);
|
||||
memcpy(info->in, intrins->in, LJ_INTRINS_MAXREG);
|
||||
memcpy(info->out, intrins->out, LJ_INTRINS_MAXREG);
|
||||
|
||||
info->contexofs = -1;
|
||||
info->fuse = dynreg &&
|
||||
!(intrins->flags & (INTRINSFLAG_NOFUSE|INTRINSFLAG_INDIRECT));
|
||||
|
||||
for (i = 0; i < intrins->insz; i++) {
|
||||
Reg r = reg_rid(info->in[i]);
|
||||
int isdyn = dynreg && i < intrins->dyninsz;
|
||||
lua_assert(!isdyn || reg_isdyn(info->in[i]));
|
||||
|
||||
if (reg_kind(info->in[i]) == REGKIND_V256)
|
||||
info->vzeroupper = 1;
|
||||
|
||||
if (reg_isgpr(info->in[i])) {
|
||||
if (r == RID_CONTEXT) {
|
||||
if (!isdyn && r == RID_CONTEXT) {
|
||||
/* Save the offset in the input context so we can load it last */
|
||||
info->contexofs = offset;
|
||||
}
|
||||
offset += sizeof(intptr_t);
|
||||
}
|
||||
|
||||
if (!isdyn)
|
||||
rset_set(info->inset, r);
|
||||
}
|
||||
|
||||
for (i = 0; i < intrins->outsz; i++) {
|
||||
if (reg_kind(info->out[i]) == REGKIND_V256)
|
||||
info->vzeroupper = 1;
|
||||
if (i == 0 && dynout) continue;
|
||||
|
||||
rset_set(info->outset, reg_rid(info->out[i]));
|
||||
}
|
||||
|
||||
/* Don't try to fuse if a fixed register is the same as the input context */
|
||||
if (info->contexofs != -1)
|
||||
info->fuse = 0;
|
||||
|
||||
/* TODO: dynamic output context register selection */
|
||||
info->outcontext = RID_OUTCONTEXT;
|
||||
info->modregs |= info->outset|info->inset;
|
||||
@ -2506,6 +2518,7 @@ static void intrins_loadregs(ASMState *as, CIntrinsic *intrins, IntrinBuildState
|
||||
|
||||
/* Finally load the input register conflicting with the input context */
|
||||
if (rset_test(info->inset, RID_CONTEXT) && info->contexofs != -1) {
|
||||
lua_assert(!info->fuse);
|
||||
emit_loadofsirt(as, IRT_INTP, RID_CONTEXT, RID_CONTEXT, info->contexofs);
|
||||
}
|
||||
|
||||
@ -2513,6 +2526,16 @@ static void intrins_loadregs(ASMState *as, CIntrinsic *intrins, IntrinBuildState
|
||||
for (i = 0; i < intrins->insz; i++) {
|
||||
uint32_t reg = info->in[i];
|
||||
Reg r = reg_rid(reg);
|
||||
if (i == 0 && info->fuse) {
|
||||
lua_assert(info->contexofs == -1);
|
||||
/* The load is fused into the modrm of the opcode emitted in emit_intrins */
|
||||
if (reg_isgpr(reg)) {
|
||||
gpr++;
|
||||
} else {
|
||||
fpr++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (reg_isgpr(reg)) {
|
||||
if (r != RID_CONTEXT)
|
||||
@ -2559,6 +2582,32 @@ static void intrins_saveregs(ASMState *as, CIntrinsic *intrins, IntrinBuildState
|
||||
}
|
||||
}
|
||||
|
||||
/* Replace placeholder register ids with platform specific registers */
|
||||
static RegSet pickdynlist(uint8_t *list, MSize sz, RegSet freeset)
|
||||
{
|
||||
MSize i;
|
||||
RegSet free = freeset;
|
||||
|
||||
for (i = 0; i < sz; i++) {
|
||||
RegSet rset = free & (reg_isgpr(list[i]) ? RSET_GPR : RSET_FPR);
|
||||
Reg r;
|
||||
|
||||
/* Try to use scratch register first */
|
||||
if ((rset & RSET_SCRATCH) != 0) {
|
||||
rset = rset & RSET_SCRATCH;
|
||||
}
|
||||
|
||||
r = rset_pickbot(rset);
|
||||
lua_assert(rset_test(free, r));
|
||||
|
||||
list[i] = reg_setrid(list[i], r);
|
||||
rset_clear(free, r);
|
||||
}
|
||||
|
||||
/* Return register set of extra used registers */
|
||||
return freeset & ~free;
|
||||
}
|
||||
|
||||
/*
|
||||
** Stack spill slots and gpr slots in the context are always the size of a native pointer
|
||||
** The output context register is always spilled to a fixed stack offset
|
||||
@ -2577,7 +2626,10 @@ static void wrap_intrins(jit_State *J, CIntrinsic *intrins, IntrinWrapState *sta
|
||||
AsmHeader *hdr;
|
||||
MCode *asmofs = NULL, *origtop;
|
||||
void* target = state->target;
|
||||
uint8_t *in = info.in, *out = info.out;
|
||||
int spadj = 0;
|
||||
int dynreg = intrin_regmode(intrins);
|
||||
Reg rout = RID_NONE, rin = RID_NONE;
|
||||
|
||||
lj_asm_setup_intrins(J, as);
|
||||
origtop = as->mctop;
|
||||
@ -2586,6 +2638,60 @@ static void wrap_intrins(jit_State *J, CIntrinsic *intrins, IntrinWrapState *sta
|
||||
info.modregs = state->mod;
|
||||
intrins_setup(intrins, &info);
|
||||
|
||||
/* Pick some ABI specific scratch registers for the opcode's input/output registers */
|
||||
if (dynreg) {
|
||||
RegSet scatch = RSET_ALL & ~info.inset;
|
||||
int inofs = 0;
|
||||
lua_assert(intrins->dyninsz <= intrins->insz);
|
||||
/* Avoid unnecessary spill of the output context */
|
||||
if (intrins->outsz != 0)
|
||||
rset_clear(scatch, info.outcontext);
|
||||
|
||||
if (dynreg == DYNREG_OPEXT || dynreg == DYNREG_TWOSTORE || reg_isvec(in[0])) {
|
||||
info.fuse = 0;
|
||||
}
|
||||
|
||||
if (info.fuse) {
|
||||
inofs++;
|
||||
rin = RID_CONTEXT;
|
||||
rset_clear(scatch, RID_CONTEXT);
|
||||
}
|
||||
|
||||
if ((intrins->dyninsz-inofs) > 0) {
|
||||
rset_clear(scatch, RID_CONTEXT);
|
||||
/* Merge in registers used for dynamic input registers */
|
||||
info.inset |= pickdynlist(in+inofs, intrins->dyninsz-inofs, scatch);
|
||||
}
|
||||
|
||||
if (rin == RID_NONE)
|
||||
rin = reg_rid(in[0]);
|
||||
|
||||
/* Allocate the dynamic output register */
|
||||
if (intrins->outsz > 0 && intrin_dynrout(intrins)) {
|
||||
if (dynreg == DYNREG_INOUT) {
|
||||
rout = reg_rid(in[1]);
|
||||
out[0] = reg_setrid(out[0], rout);
|
||||
} else if (dynreg == DYNREG_OPEXT) {
|
||||
/* Destructive single register opcode */
|
||||
rout = out[0] = reg_setrid(out[0], rin);
|
||||
} else {
|
||||
scatch = RSET_INIT & ~info.outset;
|
||||
rset_clear(scatch, info.outcontext);
|
||||
scatch = pickdynlist(out, 1, scatch);
|
||||
rout = reg_rid(out[0]);
|
||||
}
|
||||
|
||||
rset_set(info.outset, rout);
|
||||
}
|
||||
|
||||
if (rout == RID_NONE && intrins->dyninsz > 1) {
|
||||
lua_assert(reg_isdyn(intrins->in[1]));
|
||||
rout = reg_rid(in[1]);
|
||||
}
|
||||
|
||||
info.modregs |= info.inset|info.outset;
|
||||
}
|
||||
|
||||
/* Used for picking scratch register when loading or saving boxed values */
|
||||
as->modset = info.modregs|RID_CONTEXT;
|
||||
|
||||
@ -2651,16 +2757,49 @@ restart:
|
||||
emit_storeofsirt(as, IRT_INTP, info.outcontext, RID_SP, TEMPSPILL);
|
||||
}
|
||||
|
||||
if (intrins->flags & INTRINSFLAG_CALLED) {
|
||||
Reg rin = 0;
|
||||
#if LJ_TARGET_X86ORX64
|
||||
/* Setup modrm to be a load from the input context pointer; we assume the offset
|
||||
* will be to the first value in either the gpr or fpr part of the context
|
||||
* because the first input register should always be the dynamic one for opcodes
|
||||
*/
|
||||
as->mrm.idx = RID_NONE;
|
||||
as->mrm.scale = XM_SCALE1;
|
||||
as->mrm.ofs = 0;
|
||||
|
||||
if (dynreg) {
|
||||
if (info.fuse || (intrins->flags & INTRINSFLAG_INDIRECT)) {
|
||||
lua_assert(!reg_isvec(in[0]));
|
||||
as->mrm.base = rin;
|
||||
rin = RID_MRM;
|
||||
|
||||
if (info.fuse) {
|
||||
/* Set the fused offset into the input context */
|
||||
if (reg_isfp(in[0])) {
|
||||
as->mrm.ofs = offsetof(RegContext, fpr);
|
||||
} else {
|
||||
as->mrm.ofs = offsetof(RegContext, gpr);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
as->mrm.base = RID_NONE;
|
||||
lua_assert(rin != RID_NONE);
|
||||
}
|
||||
} else if(intrins->flags & INTRINSFLAG_CALLED) {
|
||||
#if LJ_64
|
||||
/* Pick a scratch register in case the relative distance for the call is
|
||||
** larger than a signed 32bit value
|
||||
*/
|
||||
rin = intrinsic_scratch(as, RSET_GPR);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
if (intrins->flags & INTRINSFLAG_CALLED) {
|
||||
/* emit a call to the target which may be collocated after us */
|
||||
emit_intrins(as, intrins, rin, (uintptr_t)target);
|
||||
} else if (dynreg) {
|
||||
/* Write an opcode to the wrapper */
|
||||
asmofs = emit_intrins(as, intrins, rin, rout);
|
||||
} else {
|
||||
/* Append the user supplied machine code */
|
||||
asmofs = asm_mcode(as, state->target, state->targetsz);
|
||||
|
@ -1204,6 +1204,18 @@ static void cp_decl_msvcattribute(CPState *cp, CPDecl *decl)
|
||||
|
||||
#if LJ_HASINTRINSICS
|
||||
|
||||
/*
** Read the current token as an integer literal and return it packed into one
** byte. Any token other than CTOK_INTEGER is reported through cp_err_token().
** Negative values are narrowed through int8_t so they keep their signed
** 8 bit encoding; non-negative values are narrowed to uint8_t directly.
** NYI: immediate values larger than 8 bits.
*/
static uint8_t getsignedbyte(CPState *cp)
{
  int32_t num;

  if (cp->tok != CTOK_INTEGER)
    cp_err_token(cp, CTOK_INTEGER);

  num = cp->val.i32;

  /* Flatten negative values to a signed 8 bit number */
  if (num < 0)
    return (uint8_t)(int8_t)num;

  return (uint8_t)num;
}
|
||||
|
||||
static void cp_decl_mcode(CPState *cp, CPDecl *decl)
|
||||
{
|
||||
/* Check we're declared after a function definition */
|
||||
@ -1224,6 +1236,17 @@ static void cp_decl_mcode(CPState *cp, CPDecl *decl)
|
||||
decl->redir = cp->str;
|
||||
|
||||
cp_next(cp);
|
||||
/* Check if we have immediate and prefix byte values */
|
||||
if (cp_opt(cp, ',')) {
|
||||
/* NYI: immediate values larger than 8 bits */
|
||||
decl->bits = (CTSize)getsignedbyte(cp);
|
||||
cp_next(cp);
|
||||
|
||||
if (cp_opt(cp, ',')) {
|
||||
decl->bits |= getsignedbyte(cp) << 8;
|
||||
cp_next(cp);
|
||||
}
|
||||
}
|
||||
cp_check(cp, ')');
|
||||
/* Mark the function as an intrinsic */
|
||||
decl->stack[decl->top-1].info |= CTF_INTRINS;
|
||||
|
@ -175,7 +175,15 @@ typedef int (LJ_FASTCALL *IntrinsicWrapper)(void *incontext, void* outcontext);
|
||||
|
||||
typedef struct CIntrinsic {
|
||||
IntrinsicWrapper wrapped;
|
||||
union {
|
||||
uint8_t in[8];
|
||||
struct {
|
||||
uint8_t opregs[5]; /* cmpxchg8b */
|
||||
uint8_t immb;
|
||||
uint8_t prefix; /* prefix byte see INTRINSFLAG_PREFIX */
|
||||
uint8_t dyninsz; /* dynamic input register count */
|
||||
};
|
||||
};
|
||||
union {
|
||||
uint8_t out[8];
|
||||
struct {
|
||||
|
@ -650,7 +650,29 @@ static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
|
||||
static MCode* emit_intrins(ASMState *as, CIntrinsic *intrins, Reg r1,
|
||||
uintptr_t r2)
|
||||
{
|
||||
if (intrins->flags & INTRINSFLAG_CALLED) {
|
||||
uint32_t regmode = intrin_regmode(intrins);
|
||||
if (regmode) {
|
||||
if (regmode == DYNREG_OPEXT) {
|
||||
r2 = intrin_getopextb(intrins);
|
||||
}
|
||||
|
||||
/* force 64 bit operands */
|
||||
if (intrins->flags & INTRINSFLAG_REXW) {
|
||||
r2 |= REX_64;
|
||||
}
|
||||
|
||||
if (intrins->flags & INTRINSFLAG_IMMB) {
|
||||
*--as->mcp = intrins->immb;
|
||||
}
|
||||
|
||||
emit_mrm(as, intrins->opcode, (Reg)r2, r1);
|
||||
|
||||
if (intrins->flags & INTRINSFLAG_PREFIX) {
|
||||
*--as->mcp = intrins->prefix;
|
||||
}
|
||||
|
||||
checkmclim(as);
|
||||
} else if (intrins->flags & INTRINSFLAG_CALLED) {
|
||||
lua_assert(r2);
|
||||
emit_call_(as, (MCode*)r2, r1);
|
||||
return NULL;
|
||||
|
@ -24,6 +24,7 @@
|
||||
/*
** Extra bits that can be attached to a register entry value next to its
** register id and kind.
*/
typedef enum RegFlags {
|
||||
REGFLAG_64BIT = REGKIND_GPR64 << 6, /* 64 bit override */
|
||||
/* NOTE(review): no use of REGFLAG_BLACKLIST is visible here - confirm meaning */
REGFLAG_BLACKLIST = 1 << 17,
|
||||
/* Entry is a placeholder for a dynamically assigned register (e.g. "gpr32") */
REGFLAG_DYN = 1 << 18,
|
||||
}RegFlags;
|
||||
|
||||
typedef struct RegEntry {
|
||||
@ -55,6 +56,8 @@ RegEntry reglut[] = {
|
||||
#if LJ_64
|
||||
GPRDEF_R64(MKREG_GPR64)
|
||||
#endif
|
||||
{"gpr32", REGFLAG_DYN|RID_DYN_GPR},
|
||||
{"gpr64", REGFLAG_64BIT|REGFLAG_DYN|RID_DYN_GPR}
|
||||
};
|
||||
|
||||
static CTypeID register_intrinsic(lua_State *L, CIntrinsic* src, CType *func)
|
||||
@ -118,7 +121,9 @@ static int parse_fprreg(const char *name, uint32_t len)
|
||||
}
|
||||
rid += RID_MIN_FPR;
|
||||
} else {
|
||||
return -1;
|
||||
/* Unnumbered reg is considered a placeholder for a dynamic reg */
|
||||
flags = REGFLAG_DYN;
|
||||
rid = RID_DYN_FPR;
|
||||
}
|
||||
|
||||
if (name[0] == 'y') {
|
||||
@ -192,7 +197,7 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid,
|
||||
CTypeID liststart)
|
||||
{
|
||||
CTState *cts = ctype_cts(L);
|
||||
uint32_t i, count = 0;
|
||||
uint32_t i, count = 0, dyncount = 0;
|
||||
RegSet rset = 0;
|
||||
const char *listname;
|
||||
uint8_t *regout = NULL;
|
||||
@ -232,11 +237,20 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid,
|
||||
|
||||
r = reg_rid(reg);
|
||||
|
||||
/* Check for duplicate registers in the list */
|
||||
if (reg & REGFLAG_DYN) {
|
||||
if (regsetid == REGSET_MOD)
|
||||
lj_err_callerv(L, LJ_ERR_FFI_BADREG, "cannot use dynamic register", strdata(str), listname);
|
||||
|
||||
if (++dyncount > LJ_INTRINS_MAXDYNREG) {
|
||||
lj_err_callerv(L, LJ_ERR_FFI_BADREG, "too many dynamic", strdata(str), listname);
|
||||
}
|
||||
} else {
|
||||
/* Check for duplicate fixed registers in the list */
|
||||
if (rset_test(rset, r)) {
|
||||
lj_err_callerv(L, LJ_ERR_FFI_BADREG, "duplicate", strdata(str), listname);
|
||||
}
|
||||
rset_set(rset, r);
|
||||
}
|
||||
|
||||
if (regsetid == REGSET_OUT && reg_isgpr(reg)) {
|
||||
CType *ct = ctype_rawchild(cts, ctarg);
|
||||
@ -261,6 +275,9 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid,
|
||||
|
||||
if (regsetid == REGSET_IN) {
|
||||
intrins->insz = (uint8_t)count;
|
||||
if (dyncount != 0) {
|
||||
intrins->dyninsz = dyncount;
|
||||
}
|
||||
} else if (regsetid == REGSET_OUT) {
|
||||
intrins->outsz = (uint8_t)count;
|
||||
}
|
||||
@ -268,15 +285,91 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid,
|
||||
return rset;
|
||||
}
|
||||
|
||||
/*
** Parse the mode characters of an opcode string.
**
** op   Pointer to the first mode character (not required to be terminated).
** len  Number of characters to parse.
**
** Recognized characters:
**   m, M  modrm memory operand (sets m to 1 or 2)
**   r, R  modrm register operand ('r' sets r to 1; 'R' sets r to 2 when no
**         register character came before it, otherwise to 3)
**   U     INTRINSFLAG_IMMB
**   C     INTRINSFLAG_CALLED
**   X     INTRINSFLAG_REXW
**   P     INTRINSFLAG_PREFIX
**   I     INTRINSFLAG_INDIRECT
**   E     INTRINSFLAG_EXPLICTREGS
**
** Returns the collected INTRINSFLAG_* bits merged with the DYNREG_* register
** mode derived from the r/m characters. On failure a negative value is
** returned: -(index+1) for an unrecognized character at 'index', or -1 for an
** unsupported combination of r/m characters.
*/
static int parse_opmode(const char *op, MSize len)
{
  MSize i = 0;
  int m = 0;
  int r = 0;
  int flags = 0;

  for (; i < len; i++) {
    switch (op[i]) {
      /* modrm memory operand */
      case 'm':
        m = 1;
        break;
      case 'M':
        m = 2;
        break;
      /* modrm register */
      case 'r':
        r = 1;
        break;
      case 'R':
        r = r == 0 ? 2 : 3;
        break;
      case 'U':
        flags |= INTRINSFLAG_IMMB;
        break;
      case 'C':
        flags |= INTRINSFLAG_CALLED;
        break;
      case 'X':
        flags |= INTRINSFLAG_REXW;
        break;
      case 'P':
        flags |= INTRINSFLAG_PREFIX;
        break;
      case 'I':
        flags |= INTRINSFLAG_INDIRECT;
        break;
      case 'E':
        flags |= INTRINSFLAG_EXPLICTREGS;
        break;

      default:
        /* return index of invalid flag */
        return -(int)(i+1);
    }
  }

  /* Logical AND: both sides are truth values, not bit masks */
  if ((r || m) && !(flags & INTRINSFLAG_REGMODEMASK)) {

    /* 'Rm' mem/r is left reg is right */
    if (r == 2 && m == 1) {
      flags |= DYNREG_TWOSTORE; /* MR */
    } else if (r == 0 && m == 1) {
      flags |= DYNREG_OPEXT;
    } else if ((r == 1 && m == 2) || r == 3) {
      flags |= DYNREG_ONE; /* RM */
    } else {
      return -1;
    }

    /* if neither of the operands is listed as memory disable trying to fuse a load in */
    if (r == 3) {
      flags |= INTRINSFLAG_NOFUSE; /* rR */
    }
  }

  return flags;
}
|
||||
|
||||
static void setopcode(lua_State *L, CIntrinsic *intrins, uint32_t opcode)
|
||||
{
|
||||
int len;
|
||||
uint32_t opext = 0;
|
||||
|
||||
if (opcode == 0) {
|
||||
lj_err_callermsg(L, "bad opcode literal");
|
||||
}
|
||||
|
||||
#if LJ_TARGET_X86ORX64
|
||||
/* the LSB of the opcode should be the register number */
|
||||
if (intrin_regmode(intrins) == DYNREG_OPEXT) {
|
||||
opext = (opcode & 7);
|
||||
opcode = opcode >> 4;
|
||||
}
|
||||
|
||||
if (opcode <= 0xff) {
|
||||
len = 1;
|
||||
} else if (opcode <= 0xffff) {
|
||||
@ -288,11 +381,16 @@ static void setopcode(lua_State *L, CIntrinsic *intrins, uint32_t opcode)
|
||||
}
|
||||
|
||||
opcode = lj_bswap(opcode);
|
||||
|
||||
if (len < 4) {
|
||||
opcode |= (uint8_t)(int8_t)-(len+1);
|
||||
} else {
|
||||
lj_err_callermsg(L, "bad opcode literal");
|
||||
}
|
||||
|
||||
if (intrin_regmode(intrins) == DYNREG_OPEXT) {
|
||||
intrin_setopextb(intrins, opext);
|
||||
}
|
||||
#endif
|
||||
|
||||
intrins->opcode = opcode;
|
||||
@ -303,6 +401,7 @@ static int parse_opstr(lua_State *L, GCstr *opstr, CIntrinsic *intrins, int* bui
|
||||
const char *op = strdata(opstr);
|
||||
uint32_t opcode = 0;
|
||||
uint32_t i;
|
||||
int flags;
|
||||
|
||||
/* Parse the opcode number if this is not a template */
|
||||
if (op[0] != '?') {
|
||||
@ -320,11 +419,22 @@ static int parse_opstr(lua_State *L, GCstr *opstr, CIntrinsic *intrins, int* bui
|
||||
opcode = (opcode << 4) + (d & 15);
|
||||
}
|
||||
|
||||
if (*op == '_') op++;
|
||||
} else {
|
||||
*buildflags |= INTRINSFLAG_TEMPLATE;
|
||||
op++;
|
||||
}
|
||||
|
||||
flags = parse_opmode(op, opstr->len - (MSize)(op-strdata(opstr)));
|
||||
|
||||
if (flags < 0) {
|
||||
lj_err_callerv(L, LJ_ERR_FFI_BADOPSTR, strdata(opstr), "bad mode flags");
|
||||
} else {
|
||||
intrins->flags |= flags;
|
||||
}
|
||||
/* Flags only used during construction of the intrinsic in the upper bits*/
|
||||
*buildflags |= flags & 0xffff0000;
|
||||
|
||||
return opcode;
|
||||
}
|
||||
|
||||
@ -378,7 +488,8 @@ CTypeID lj_intrinsic_template(lua_State *L, int narg)
|
||||
intrins = lj_intrinsic_get(cts, ct->size);
|
||||
|
||||
/* Can't be a template if it is an opcode */
|
||||
if ((intrins->opcode && intrins->outsz <= 4) || intrins->wrapped)
|
||||
if (intrin_regmode(intrins) != DYNREG_FIXED || (intrins->opcode && intrins->outsz <= 4) ||
|
||||
intrins->wrapped)
|
||||
lj_err_arg(L, narg, LJ_ERR_FFI_INVTYPE);
|
||||
|
||||
return id;
|
||||
@ -407,21 +518,80 @@ int lj_intrinsic_create(lua_State *L)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
** Derive a dynamic register descriptor from a C type.
**
** cts  C type state, used to resolve child types.
** ct   The (raw) type of a parameter or return value.
**
** Numbers map to the dynamic FPR or GPR placeholder depending on whether they
** are floating point; pointers map to a pointer sized GPR unless they point
** at a vector, which is treated like the vector itself; vectors of 16 or 32
** bytes map to the dynamic FPR placeholder. Flags shared by the whole
** intrinsic (INTRINSFLAG_REXW, INTRINSFLAG_VEX256) are or'ed into the
** register id handed to reg_make().
**
** Returns the value of reg_make(rid, kind), or -1 when the type cannot be
** held in a single register.
*/
static int inferreg(CTState *cts, CType *ct)
{
  CType *vec = NULL;  /* Vector type to classify, if any */
  int rid = -1, kind = -1;

  if (ctype_isnum(ct->info)) {
    CTSize sz = ct->size;

    if (ctype_isfp(ct->info)) {
      if (sz > 8)
        return -1;
      rid = RID_DYN_FPR;
      kind = sz == 4 ? REGKIND_FPR32 : REGKIND_FPR64;
    } else if (sz == 8) {
      if (LJ_32)
        return -1; /* NYI: 64 bit pair registers */
      rid = RID_DYN_GPR;
      rid |= INTRINSFLAG_REXW;
      kind = REGKIND_GPR64;
    } else {
      rid = RID_DYN_GPR;
      kind = ct->info & CTF_UNSIGNED ? REGKIND_GPR32CD : REGKIND_GPRI32;
    }
  } else if (ctype_isptr(ct->info)) {
    CType *target = ctype_raw(cts, ctype_cid(ct->info));

    if (ctype_isvector(target->info)) {
      /* A pointer to a vector is classified by the vector it points at */
      vec = target;
    } else {
      rid = RID_DYN_GPR;
      kind = LJ_32 ? REGKIND_GPR32CD : REGKIND_GPR64;
    }
  } else if (ctype_isvector(ct->info)) {
    vec = ct;
  } else {
    lua_assert(ctype_iscomplex(ct->info));
    return -1;
  }

  if (vec != NULL) {
    CType *elem = ctype_raw(cts, ctype_cid(vec->info));
    CTypeID elemid = ctype_typeid(cts, elem);

    /* Only vectors of basic number types, 16 or 32 bytes wide */
    if (elemid < CTID_BOOL || elemid > CTID_DOUBLE)
      return -1;
    if (vec->size != 16 && vec->size != 32)
      return -1;

    if (vec->size == 32) {
      kind = REGKIND_V256;
      rid = RID_DYN_FPR | INTRINSFLAG_VEX256;
    } else {
      kind = REGKIND_V128;
      rid = RID_DYN_FPR;
    }
  }

  return reg_make(rid, kind);
}
|
||||
|
||||
GCcdata *lj_intrinsic_createffi(CTState *cts, CType *func)
|
||||
{
|
||||
GCcdata *cd;
|
||||
CIntrinsic *intrins = lj_intrinsic_get(cts, func->size);
|
||||
CTypeID id = ctype_typeid(cts, func);
|
||||
RegSet mod = intrin_getmodrset(cts, intrins);
|
||||
uint32_t op = intrins->opcode;
|
||||
void* mcode = ((char*)&op) + (4-intrin_oplen(intrins));
|
||||
|
||||
if (intrins->opcode == 0) {
|
||||
lj_err_callermsg(cts->L, "expected non template intrinsic");
|
||||
}
|
||||
|
||||
/* Build the interpreter wrapper */
|
||||
if (intrin_regmode(intrins) == DYNREG_FIXED) {
|
||||
uint32_t op = intrins->opcode;
|
||||
void* mcode = ((char*)&op) + (4-intrin_oplen(intrins));
|
||||
intrins->wrapped = lj_intrinsic_buildwrap(cts->L, intrins, mcode,
|
||||
intrin_oplen(intrins), mod);
|
||||
} else {
|
||||
intrins->wrapped = lj_intrinsic_buildwrap(cts->L, intrins, NULL, 0, mod);
|
||||
}
|
||||
|
||||
cd = lj_cdata_new(cts, id, CTSIZE_PTR);
|
||||
*(void **)cdataptr(cd) = intrins->wrapped;
|
||||
@ -433,8 +603,9 @@ int lj_intrinsic_fromcdef(lua_State *L, CTypeID fid, GCstr *opstr, uint32_t imm)
|
||||
CTState *cts = ctype_cts(L);
|
||||
CType *func = ctype_get(cts, fid);
|
||||
CTypeID sib = func->sib, retid = ctype_cid(func->info);
|
||||
RegSet routset = 0;
|
||||
uint32_t opcode;
|
||||
int buildflags = 0;
|
||||
int buildflags = 0, dynout = 0;
|
||||
CIntrinsic _intrins;
|
||||
CIntrinsic* intrins = &_intrins;
|
||||
memset(intrins, 0, sizeof(CIntrinsic));
|
||||
@ -445,18 +616,62 @@ int lj_intrinsic_fromcdef(lua_State *L, CTypeID fid, GCstr *opstr, uint32_t imm)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (sib) {
|
||||
if (buildflags & INTRINSFLAG_EXPLICTREGS) {
|
||||
process_reglist(L, intrins, REGSET_IN, sib);
|
||||
} else {
|
||||
/* Infer the types of input register based on parameter types */
|
||||
while (sib != 0) {
|
||||
CType *arg = ctype_get(cts, sib);
|
||||
CType *ct = ctype_rawchild(cts, arg);
|
||||
int reg = inferreg(cts, ct);
|
||||
sib = arg->sib;
|
||||
|
||||
if (reg == -1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Save the register info in place of the argument index */
|
||||
arg->size = reg & 0xff;
|
||||
setarg_casttype(cts, arg, ct);
|
||||
|
||||
/* Merge shared register flags */
|
||||
intrins->flags |= reg & 0xff00;
|
||||
|
||||
intrins->in[intrins->insz++] = reg & 0xff;
|
||||
intrins->dyninsz++;
|
||||
if (intrins->dyninsz > LJ_INTRINS_MAXDYNREG)
|
||||
return 0;
|
||||
|
||||
if (sib != 0 && intrins->insz == LJ_INTRINS_MAXREG) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (retid != CTID_VOID) {
|
||||
CType *ct = ctype_get(cts, retid);
|
||||
|
||||
/* Check if the intrinsic had __reglist declared on it */
|
||||
if (ctype_isfield(ct->info)) {
|
||||
process_reglist(L, intrins, REGSET_OUT, retid);
|
||||
routset = process_reglist(L, intrins, REGSET_OUT, retid);
|
||||
sib = retid;
|
||||
} else {
|
||||
int reg = inferreg(cts, ct);
|
||||
|
||||
if (reg == -1) {
|
||||
return 0;
|
||||
}
|
||||
/* Merge shared register flags */
|
||||
intrins->flags |= reg & 0xff00;
|
||||
|
||||
/* Create a field entry for the return value that we make the ctype child
|
||||
** of the function.
|
||||
*/
|
||||
sib = lj_ctype_new(cts, &ct);
|
||||
ct->info = CTINFO(CT_FIELD, retid);
|
||||
ct->size = reg;
|
||||
intrins->out[intrins->outsz++] = reg & 0xff;
|
||||
dynout = 1;
|
||||
}
|
||||
} else {
|
||||
sib = retid;
|
||||
@ -466,6 +681,60 @@ int lj_intrinsic_fromcdef(lua_State *L, CTypeID fid, GCstr *opstr, uint32_t imm)
|
||||
if (opcode) {
|
||||
setopcode(L, intrins, opcode);
|
||||
}
|
||||
if (intrin_regmode(intrins) == DYNREG_FIXED) {
|
||||
/* dyninsz is overlapped by input registers 6/7/8 */
|
||||
if ((intrins->insz < 6 && intrins->dyninsz > 0) || dynout) {
|
||||
lj_err_callerv(L, LJ_ERR_FFI_BADOPSTR, strdata(opstr),
|
||||
"no register mode specified for dynamic registers");
|
||||
}
|
||||
}
|
||||
|
||||
#if LJ_TARGET_X86ORX64
|
||||
/* Validate dynamic register count for the specified register mode*/
|
||||
if (intrin_regmode(intrins) == DYNREG_ONE){
|
||||
if (intrins->dyninsz == 2 && intrins->outsz == 1 && routset == 0) {
|
||||
/* Infer destructive opcode if the single out */
|
||||
intrin_setregmode(intrins, DYNREG_INOUT);
|
||||
} else if(intrins->dyninsz == 2){
|
||||
intrin_setregmode(intrins, DYNREG_TWOIN);
|
||||
} else if (intrins->dyninsz == 0 || intrins->outsz == 0 ||
|
||||
!reg_isdyn(intrins->out[0])) {
|
||||
return 0;
|
||||
}
|
||||
}else if (intrin_regmode(intrins) == DYNREG_TWOSTORE) {
|
||||
if (intrins->dyninsz == 1 && intrins->outsz != 0) {
|
||||
intrin_setregmode(intrins, DYNREG_ONESTORE);
|
||||
} else if (intrins->insz == 0 || intrins->dyninsz == 0) {
|
||||
/* Store opcodes need at least an address the value could be an immediate */
|
||||
return 0;
|
||||
}
|
||||
} else if (intrin_regmode(intrins) == DYNREG_OPEXT) {
|
||||
if (intrins->dyninsz != 1)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Swap the registers from their declared order to match how they're
|
||||
** processed
|
||||
*/
|
||||
if (intrin_regmode(intrins) >= DYNREG_SWAPREGS) {
|
||||
uint8_t temp = intrins->in[0];
|
||||
intrins->in[0] = intrins->in[1]; intrins->in[1] = temp;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (intrins->flags & INTRINSFLAG_PREFIX) {
|
||||
intrins->prefix = (uint8_t)imm;
|
||||
/* Prefix value should be declared before an immediate value in the
|
||||
** __mcode definition the second number declared is shifted right when
|
||||
** packed in the ctype.
|
||||
*/
|
||||
imm >>= 8;
|
||||
}
|
||||
|
||||
if (intrins->flags & INTRINSFLAG_IMMB) {
|
||||
intrins->immb = (uint8_t)(imm & 0xff);
|
||||
}
|
||||
|
||||
register_intrinsic(L, intrins, ctype_get(cts, fid));
|
||||
|
||||
lua_assert(sib > 0 && sib < cts->top);
|
||||
@ -567,6 +836,18 @@ int lj_intrinsic_call(CTState *cts, CType *ct)
|
||||
lj_cconv_ct_tv(cts, d, (uint8_t *)dp, o, CCF_ARG(narg+1) | CCF_INTRINS_ARG);
|
||||
}
|
||||
|
||||
/* Swap input values around to match the platform ordering the wrapper expects */
|
||||
if (intrin_regmode(intrins) >= DYNREG_SWAPREGS &&
|
||||
reg_isgpr(intrins->in[0]) == reg_isgpr(intrins->in[1])) {
|
||||
if (reg_isgpr(intrins->in[0])) {
|
||||
intptr_t temp = context.gpr[0];
|
||||
context.gpr[0] = context.gpr[1]; context.gpr[1] = temp;
|
||||
} else {
|
||||
double temp = context.fpr[0];
|
||||
context.fpr[0] = context.fpr[1]; context.fpr[1] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
/* Pass in the return type chain so the results are typed */
|
||||
outcontent = setup_results(L, intrins, ctype_cid(ctype_get(cts, funcid)->info));
|
||||
|
||||
|
@ -14,20 +14,54 @@
|
||||
#define LJ_INTRINS_MAXREG 8
|
||||
#endif
|
||||
|
||||
/* The max number of dynamic registers in each reglist(in/out)*/
|
||||
#define LJ_INTRINS_MAXDYNREG 2
|
||||
|
||||
/*
** Register values exchanged with an intrinsic wrapper: one array of general
** purpose register slots and one array of floating point register slots, each
** LJ_INTRINS_MAXREG entries long.
*/
typedef struct LJ_ALIGN(16) RegContext {
|
||||
/* GPR slots, each the size of a native pointer */
intptr_t gpr[LJ_INTRINS_MAXREG];
|
||||
/* FPR slots, each the size of a double */
double fpr[LJ_INTRINS_MAXREG];
|
||||
} RegContext;
|
||||
|
||||
/*
** Register mode of an intrinsic, stored in the INTRINSFLAG_REGMODEMASK bits of
** its flags (see intrin_regmode/intrin_setregmode).
*/
typedef enum REGMODE {
|
||||
/* No register mode: the intrinsic uses no dynamic registers */
DYNREG_FIXED = 0,
|
||||
/* One input register and optionally one output */
|
||||
DYNREG_ONE,
|
||||
/* 1(R) register in, 1 out(M) which can be a memory address to store the value */
|
||||
DYNREG_ONESTORE,
|
||||
/* 2 in, 0 out; the first must always be treated as indirect */
|
||||
DYNREG_TWOSTORE,
|
||||
/* One input(M) register; the second operand is part of the opcode */
|
||||
DYNREG_OPEXT,
|
||||
/* Two input registers and one output that has the same RID as the second input */
|
||||
DYNREG_INOUT,
|
||||
/* Two input registers with M dynamic output register */
|
||||
DYNREG_TWOIN,
|
||||
|
|
||||
/* Modes >= DYNREG_SWAPREGS have their first two input registers swapped */
DYNREG_SWAPREGS = DYNREG_INOUT,
|
||||
} REGMODE;
|
||||
|
||||
typedef enum INTRINSFLAGS {
|
||||
INTRINSFLAG_REGMODEMASK = 7,
|
||||
|
||||
INTRINSFLAG_MEMORYSIDE = 0x08, /* has memory side effects so needs an IR memory barrier */
|
||||
|
||||
/* Intrinsic should be emitted as a naked function that is called */
|
||||
INTRINSFLAG_CALLED = 0x20,
|
||||
/* MODRM should always be set as indirect mode */
|
||||
INTRINSFLAG_INDIRECT = 0x40,
|
||||
/* Don't fuse load into op */
|
||||
INTRINSFLAG_NOFUSE = 0x80,
|
||||
/* Force REX.w 64 bit size override bit to be set for x64 */
|
||||
INTRINSFLAG_REXW = 0x100,
|
||||
/* Emit a user supplied prefix byte before the opcode and its REX byte */
|
||||
INTRINSFLAG_PREFIX = 0x200,
|
||||
/* Opcode has an immediate byte that needs to be set at construction time */
|
||||
INTRINSFLAG_IMMB = 0x400,
|
||||
|
||||
/* Opcode uses ymm registers */
|
||||
INTRINSFLAG_VEX256 = 0x4000,
|
||||
/* Input parameters names explicitly declare input registers */
|
||||
INTRINSFLAG_EXPLICTREGS = 0x10000,
|
||||
/* Intrinsic is a template with no machine code set until instantiated at runtime with
|
||||
** user supplied code.
|
||||
*/
|
||||
@ -47,7 +81,19 @@ typedef struct AsmHeader {
|
||||
uint32_t totalzs;
|
||||
} AsmHeader;
|
||||
|
||||
/* Register mode (a REGMODE value) kept in the low bits of the intrinsic's flags */
#define intrin_regmode(intrins) ((intrins)->flags & INTRINSFLAG_REGMODEMASK)
/* Replace the register mode bits and leave every other flag untouched */
#define intrin_setregmode(intrins, mode) \
  ((intrins)->flags = ((intrins)->flags & ~INTRINSFLAG_REGMODEMASK)|(mode))

/* The opcode extension of a DYNREG_OPEXT intrinsic is stored in out[3] */
#define intrin_getopextb(intrins) ((intrins)->out[3])
/*
** Written as one comma expression so the macro stays a single statement when
** used as the body of an unbraced if/else.
*/
#define intrin_setopextb(intrins, opext) \
  (lua_assert((intrins)->outsz < 4), ((intrins)->out[3] = (opext)))
/* The low byte of the opcode holds -(length+1); recover the length from it */
#define intrin_oplen(intrins) ((-(int8_t)(intrins)->opcode)-1)

/* True if the intrinsic has a register mode and its first output is a dynamic register */
#define intrin_dynrout(intrins) \
  (intrin_regmode(intrins) && reg_isdyn((intrins)->out[0]))
|
||||
/* Get the optional RegSet of registers modified by the intrinsic */
|
||||
#define intrin_getmodrset(cts, intrins) \
|
||||
((ctype_get(cts, (intrins)->id)->size >> 16) ? \
|
||||
ctype_get(cts, ctype_get(cts, (intrins)->id)->size >> 16)->size : 0)
|
||||
@ -91,6 +137,7 @@ CTypeID1 regkind_ct[16];
|
||||
#define reg_isgpr(reg) (reg_rid(reg) < RID_MAX_GPR)
|
||||
#define reg_isfp(reg) (reg_rid(reg) >= RID_MIN_FPR)
|
||||
#define reg_isvec(reg) (reg_rid(reg) >= RID_MIN_FPR && reg_kind(reg) >= REGKIND_VEC_START)
|
||||
#define reg_isdyn(reg) (reg_rid(reg) == RID_DYN_GPR || reg_rid(reg) == RID_DYN_FPR)
|
||||
|
||||
#define reg_irt(reg) (reg_isgpr(reg) ? rk_irtgpr(reg_kind(reg)) : rk_irtfpr(reg_kind(reg)))
|
||||
#define rk_irtgpr(kind) ((IRType)regkind_it[(kind)])
|
||||
|
@ -74,6 +74,9 @@ enum {
|
||||
RID_CONTEXT = RID_ECX,
|
||||
RID_OUTCONTEXT = RID_EDX,
|
||||
#endif
|
||||
/* Placeholder register ids for dynamic register entries in intrinsics */
|
||||
RID_DYN_FPR = RID_MAX_FPR-1,
|
||||
RID_DYN_GPR = RID_SP,
|
||||
};
|
||||
|
||||
/* -- Register sets ------------------------------------------------------- */
|
||||
|
@ -353,6 +353,21 @@ context("__mcode", function()
|
||||
assert_equal(ffi.C.multi1(1.1), 1)
|
||||
end)
|
||||
|
||||
it("bad dynamic registers", function()
  -- Every declaration in this list must be rejected by the cdef parser.
  local rejected = {
    -- No modrm specified for the implicit output register declared by having
    -- a non void return type
    [[int32_t dynerr1() __mcode("90");]],
    [[void dynerr2(int32_t a) __mcode("90");]],
    [[int32_t dynerr3(int32_t a) __mcode("90");]],
    -- no dynamic registers listed
    [[void dynerr4() __mcode("90m");]],
    [[void dynerr5() __mcode("90rM");]],
    [[void dynerr6() __mcode("90Mr");]],
    -- need 2 in or 1 in and a return type
    [[void dynerr7(int32_t a) __mcode("90rM");]],
    -- too many dynamic registers
    [[void dynerr8(int a, int b, int c) __mcode("90rR");]],
  }

  for idx = 1, #rejected do
    assert_cdeferr(rejected[idx])
  end
end)
|
||||
|
||||
it("bad ffi types mcode", function()
|
||||
assert_cdeferr([[void testffi1(float a2, ...) __mcode("90");]])
|
||||
assert_cdeferr([[void testffi2(complex a2) __mcode("90");]])
|
||||
@ -382,6 +397,131 @@ context("__mcode", function()
|
||||
assert_error(function() idiv(1, 2, 3, 4) end)
|
||||
end)
|
||||
|
||||
-- Checks that a dynamic-register intrinsic can return a pointer value:
-- "03rM" is declared here as addptr(nptr, n) and is expected to yield nptr + n.
it("output pointers", function()
  assert_cdef([[const char* addptr(const char* nptr, int32_t n) __mcode("03rM");]], "addptr")
  local s = "0123456789abcdefghijklmnopqrstvwxyz"

  -- Adding zero must hand back the original string pointer unchanged.
  local ptr = ffi.C.addptr(s, 0)
  assert_equal(ptr, ffi.cast("const char*", s))
  assert_equal(ptr[0], string.byte(s))

  -- Presumably called by assert_jitchecker with the loop index and the value
  -- returned by the tested function (verify against the helper's definition).
  local function checker(i, sptr)
    -- Compare the printed forms explicitly; passing the second string as the
    -- assert message (as before) made this check unable to fail.
    assert(tostring(sptr) == tostring(ptr+i))
    assert(sptr == ptr+i)
  end

  assert_jitchecker(checker, function(i)
    return (ffi.C.addptr(s, i))
  end)
end)
|
||||
|
||||
it("signed/unsigned numbers", function()
|
||||
assert_cdef([[int32_t sub_signed(int32_t n, int32_t i) __mcode("2brM");]], "sub_signed")
|
||||
assert_cdef([[uint32_t sub_unsigned(uint32_t n, uint32_t i) __mcode("2brM");]], "sub_unsigned")
|
||||
assert_cdef([[uint32_t sub_signedun(int32_t n, int32_t i) __mcode("2brM");]], "sub_signedun")
|
||||
|
||||
assert_equal(tonumber(ffi.C.sub_unsigned(3, 1)), 2)
|
||||
|
||||
local function unsignedtest(n1, n2)
|
||||
return (tonumber(ffi.C.sub_unsigned(n1, n2)))
|
||||
end
|
||||
|
||||
assert_jit(2, unsignedtest, 3, 1)
|
||||
assert_jit(2999999999, unsignedtest, 3000000000, 1)
|
||||
--wrap around
|
||||
assert_jit(4294967295, unsignedtest, 300, 301)
|
||||
|
||||
local function unsignedtest_boxed(n1, n2)
|
||||
return (ffi.C.sub_unsigned(n1, n2))
|
||||
end
|
||||
|
||||
assert_jit(ffi.new("uint32_t", 2), unsignedtest_boxed, 3, 1)
|
||||
assert_jit(ffi.new("uint32_t", 2999999999), unsignedtest_boxed, 3000000000, 1)
|
||||
--wrap around
|
||||
assert_jit(ffi.new("uint32_t", 4294967295), unsignedtest_boxed, 300, 301)
|
||||
|
||||
local function signedtest(n1, n2)
|
||||
return (ffi.C.sub_signed(n1, n2))
|
||||
end
|
||||
|
||||
assert_jit(-2, signedtest, -1, 1)
|
||||
assert_noexit(3, signedtest, -1, -4)
|
||||
end)
|
||||
|
||||
it("op encode", function()
|
||||
assert_cdef([[int32_t not32(int32_t n) __mcode("F72m");]], "not32")
|
||||
|
||||
local function test_not(i)
|
||||
return (ffi.C.not32(i))
|
||||
end
|
||||
|
||||
assert_jit(-1, test_not, 0)
|
||||
assert_noexit(0, test_not, -1)
|
||||
|
||||
assert_cdef([[int32_t add_imm3(int32_t n) __mcode("830mU", 3);]], "add_imm3")
|
||||
|
||||
local function checker(i, n)
|
||||
return i+3, n
|
||||
end
|
||||
assert_jitchecker(checker, function(i)
|
||||
return (ffi.C.add_imm3(i))
|
||||
end)
|
||||
end)
|
||||
|
||||
it("prefix byte", function()
|
||||
assert_cdef([[void atomicadd(int32_t* nptr, int32_t n) __mcode("01mRIP", 0xF0);]], "atomicadd")
|
||||
|
||||
local sum = 0
|
||||
local function checker(i, jsum)
|
||||
sum = sum+i
|
||||
if(jsum ~= sum) then
|
||||
return jsum, sum
|
||||
end
|
||||
end
|
||||
|
||||
local numptr = ffi.new("int32_t[1]", 0)
|
||||
|
||||
assert_jitchecker(checker, function(i)
|
||||
ffi.C.atomicadd(numptr, i)
|
||||
return numptr[0]
|
||||
end)
|
||||
end)
|
||||
|
||||
if ffi.arch == "x64" then
|
||||
it("prefix64", function()
|
||||
assert_cdef([[void atomicadd64(int64_t* nptr, int64_t n) __mcode("01mRIP", 0xF0);]], "atomicadd64")
|
||||
|
||||
local sum = 0
|
||||
local function checker(i, jsum)
|
||||
sum = sum+i
|
||||
assert(jsum == sum)
|
||||
end
|
||||
|
||||
local numptr = ffi.new("int64_t[1]", 0)
|
||||
|
||||
assert_jitchecker(checker, function(i)
|
||||
ffi.C.atomicadd64(numptr, i)
|
||||
return numptr[0]
|
||||
end)
|
||||
end)
|
||||
end
|
||||
|
||||
it("prefix and imm byte", function()
|
||||
assert_cdef([[void atomicadd1(int32_t* nptr) __mcode("830mIUP", 0xF0, 0x01);]], "atomicadd1")
|
||||
|
||||
local function checker(i, jsum)
|
||||
if(jsum ~= i) then
|
||||
return i, jsum
|
||||
end
|
||||
end
|
||||
|
||||
local numptr = ffi.new("int32_t[1]", 0)
|
||||
|
||||
assert_jitchecker(checker, function(i)
|
||||
ffi.C.atomicadd1(numptr)
|
||||
return numptr[0]
|
||||
end)
|
||||
end)
|
||||
it("idiv(template)", function()
|
||||
assert_cdef([[void idivT(int32_t eax, int32_t ecx) __mcode("?E") __reglist(out, int32_t eax, int32_t edx)]])
|
||||
--trying to create template intrinsic through C library should always fail
|
||||
@ -416,6 +556,117 @@ context("__mcode", function()
|
||||
assert_exit(10, test_idiv, 10, 5)
|
||||
end)
|
||||
|
||||
it("prefetch", function()
|
||||
assert_cdef([[void prefetch0(void* mem) __mcode("0F181mI")]], "prefetch0")
|
||||
assert_cdef([[void prefetch1(void* mem) __mcode("0F182mI")]], "prefetch1")
|
||||
assert_cdef([[void prefetch2(void* mem) __mcode("0F183mI")]], "prefetch2")
|
||||
assert_cdef([[void prefetchnta(void* mem) __mcode("0F180mI")]], "prefetchnta")
|
||||
|
||||
local asm = ffi.C
|
||||
local kmem = ffi.new("int[4]")
|
||||
local mem = 1
|
||||
mem = mem and ffi.new("int[8]", 1, 2, 3, 4, 5, 6, 7, 8)
|
||||
|
||||
local function testprefetch(a, b, c)
|
||||
local n = a+b
|
||||
local ptr = mem+c
|
||||
|
||||
asm.prefetch2(ptr)
|
||||
asm.prefetch1(kmem)
|
||||
asm.prefetch0(mem+a)
|
||||
asm.prefetchnta(mem)
|
||||
|
||||
asm.prefetch0(kmem+a)
|
||||
asm.prefetch1(kmem+b)
|
||||
return (ptr) ~= 0 and ptr[0] + ptr[3]
|
||||
end
|
||||
|
||||
assert_jit(11, testprefetch, 1, 2, 3)
|
||||
end)
|
||||
|
||||
it("cmpxchg", function()
|
||||
assert_cdef([[void cmpxchg(int32_t* gpr32, int32_t gpr32, int32_t eax) __mcode("0FB1mRPEI", 0xF0) __reglist(out, int32_t eax);]], "cmpxchg")
|
||||
|
||||
local kptr32 = ffi.new("int32_t[1]", 0)
|
||||
int4[0] = 0
|
||||
|
||||
local function checker(i, n, eax)
|
||||
assert(n == i)
|
||||
assert(kptr32[0] == i)
|
||||
assert(eax == i-1)
|
||||
end
|
||||
|
||||
local function test_cmpxchg(i)
|
||||
local eax = ffi.C.cmpxchg(kptr32, i, i-1)
|
||||
return kptr32[0], eax
|
||||
end
|
||||
|
||||
assert_jitchecker(checker, test_cmpxchg)
|
||||
--test not equal non swapping
|
||||
local num, eax = test_cmpxchg(0)
|
||||
assert_equal(eax, kptr32[0])
|
||||
|
||||
num, eax = test_cmpxchg(kptr32[0]+1)
|
||||
assert_equal(eax, kptr32[0]-1)
|
||||
end)
|
||||
|
||||
if ffi.arch == "x64" then
|
||||
it("cmpxchg64", function()
|
||||
assert_cdef([[void cmpxchg64(int64_t* gpr64, int64_t gpr64, int64_t rax) __mcode("0FB1mRPEIX", 0xF0) __reglist(out, int64_t rax);]], "cmpxchg64")
|
||||
|
||||
local kptr64 = ffi.new("int64_t[1]", 0)
|
||||
|
||||
local function test_cmpxchg64(i)
|
||||
local rax = ffi.C.cmpxchg64(kptr64, -i, -(i-1))
|
||||
return kptr64[0], rax
|
||||
end
|
||||
|
||||
local function checker(i, newval, rax)
|
||||
assert(newval == -i)
|
||||
assert(kptr64[0] == -i)
|
||||
assert(rax == -(i-1))
|
||||
end
|
||||
|
||||
assert_jitchecker(checker, test_cmpxchg64, 2)
|
||||
|
||||
--test not equal non swapping
|
||||
local num, rax = test_cmpxchg64(0, 1)
|
||||
assert_equal(rax, kptr64[0])
|
||||
end)
|
||||
end
|
||||
|
||||
it("cmpxchg8b", function()
|
||||
|
||||
ffi.cdef([[typedef struct int32pair {
|
||||
int32_t i1;
|
||||
int32_t i2;
|
||||
} __attribute__((aligned(8))) int32pair;]])
|
||||
|
||||
assert_cdef([[void cmpxchg8b(void* gpr32, int32_t eax, int32_t edx, int32_t ebx, int32_t ecx) __mcode("0FC71mPEI", 0xf0)
|
||||
__reglist(out, int32_t eax, int32_t edx);]], "cmpxchg8b")
|
||||
|
||||
local int32pair = ffi.new("int32pair")
|
||||
int32pair.i1 = 1
|
||||
int32pair.i2 = -1
|
||||
|
||||
local function test_cmpxchg8b(i)
|
||||
local eax,edx = ffi.C.cmpxchg8b(int32pair, i, -i, i+1, -(i+1))
|
||||
return int32pair.i1, int32pair.i2, eax, edx
|
||||
end
|
||||
|
||||
local function checker(i, n1, n2, eax, edx)
|
||||
assert(n1 == i+1)
|
||||
assert(n2 == -(i+1))
|
||||
assert(int32pair.i1 == i+1)
|
||||
assert(int32pair.i2 == -(i+1))
|
||||
|
||||
assert(eax == i)
|
||||
assert(edx == -i)
|
||||
end
|
||||
|
||||
assert_jitchecker(checker, test_cmpxchg8b)
|
||||
end)
|
||||
|
||||
it("cpuid_brand", function()
|
||||
assert_cdef([[void cpuid(int32_t eax, int32_t ecx) __mcode("0FA2_E") __reglist(out, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx);]], "cpuid")
|
||||
|
||||
@ -526,6 +777,204 @@ context("__reglist", function()
|
||||
end)
|
||||
end)
|
||||
|
||||
it("popcnt", function()
|
||||
assert_cdef([[int32_t popcnt(int32_t n) __mcode("f30fb8rM");]], "popcnt")
|
||||
|
||||
local popcnt = ffi.C.popcnt
|
||||
|
||||
assert_equal(popcnt(7), 3)
|
||||
assert_equal(popcnt(1024), 1)
|
||||
assert_equal(popcnt(1023), 10)
|
||||
|
||||
local function testpopcnt(num)
|
||||
return (popcnt(num))
|
||||
end
|
||||
|
||||
assert_jit(10, testpopcnt, 1023)
|
||||
assert_noexit(32, testpopcnt, -1)
|
||||
assert_noexit(0, testpopcnt, 0)
|
||||
assert_noexit(1, testpopcnt, 1)
|
||||
|
||||
ffi.cdef([[int32_t popcntuf(int32_t n) __mcode("f30fb8rR");]])
|
||||
--check unfused
|
||||
popcnt = ffi.C.popcntuf
|
||||
|
||||
assert_equal(popcnt(7), 3)
|
||||
assert_equal(popcnt(1024), 1)
|
||||
end)
|
||||
|
||||
-- Exercises the scalar double add opcode through a dynamic-register intrinsic,
-- both in the "rM" form and in the "rR" form used for the unfused check.
it("addsd", function()
  assert_cdef([[double addsd(double n1, double n2) __mcode("F20F58rM");]], "addsd")
  local addsd = ffi.C.addsd

  -- Kept local so the test does not leak helpers into the global table.
  local function test_addsd(n1, n2)
    return (addsd(n1, n2))
  end

  assert_equal(3, addsd(1, 2))
  assert_equal(0, addsd(0, 0))

  assert_jit(-3, test_addsd, -4.5, 1.5)
  assert_noexit(3, test_addsd, 4.5, -1.5)

  --check dual num exit
  assert_equal(5, test_addsd(3 , 2))

  --test same ref input
  local function test_addsd2(n)
    return (addsd(n, n))
  end

  assert_jit(3, test_addsd2, 1.5)
  assert_noexit(-3, test_addsd2, -1.5)

  --check dual num exit
  assert_equal(6, test_addsd2(3))

  --check unfused
  ffi.cdef([[double addsduf(double n1, double n2) __mcode("F20F58rR");]])
  addsd = ffi.C.addsduf

  assert_equal(3, addsd(1, 2))
  assert_equal(0, addsd(0, 0))
end)
|
||||
|
||||
-- Exercises the scalar float add opcode through a dynamic-register intrinsic,
-- both in the "rM" form and in the "rR" form used for the unfused check.
it("addss", function()
  assert_cdef([[float addss(float n1, float n2) __mcode("F30F58rM");]], "addss")
  local addss = ffi.C.addss

  -- Helpers are local (and named after the opcode under test) so they neither
  -- leak into nor clobber same-named globals from other tests.
  local function test_addss(n1, n2)
    return (addss(n1, n2))
  end

  assert_equal(3, addss(1, 2))
  assert_equal(0, addss(0, 0))

  assert_jit(-3, test_addss, -4.5, 1.5)
  assert_noexit(3, test_addss, 4.5, -1.5)
  --check dual num exit
  assert_equal(5, test_addss(3, 2))

  --test same ref input
  local function test_addss2(n)
    return (addss(n, n))
  end

  assert_jit(-9, test_addss2, -4.5)
  assert_noexit(3, test_addss2, 1.5)

  --check unfused
  ffi.cdef[[float addssuf(float n1, float n2) __mcode("F30F58rR");]]
  addss = ffi.C.addssuf

  assert_equal(3, addss(1, 2))
  assert_equal(0, addss(0, 0))
end)
|
||||
|
||||
-- Exercises a vector opcode that takes an immediate byte ("U"): shufps with
-- immediate 0 broadcasts element 0, and with 0x1b reverses the four elements.
it("shufps", function()
  assert_cdef([[float4 shufps(float4 v1, float4 v2) __mcode("0FC6rMU", 0);]], "shufps")

  local shufps = ffi.C.shufps

  local v = ffi.new("float4", 1.5, 2.25, 3.125, 4.0625)
  -- NOTE(review): vzero and test_shufps are not referenced by any assertion
  -- below; kept (as locals) in case a JIT check is meant to be added.
  local vzero = ffi.new("float4", 1)

  local function test_shufps(v1, v2)
    return (shufps(v1, v2))
  end

  local vout = shufps(v, v)
  assert_equal(vout[0], 1.5)
  assert_equal(vout[1], 1.5)
  assert_equal(vout[2], 1.5)
  assert_equal(vout[3], 1.5)

  assert_cdef([[float4 shufpsrev(float4 v1, float4 v2) __mcode("0FC6rMU", 0x1b);]], "shufpsrev")

  -- Reuse vout instead of declaring a second, shadowing local.
  vout = ffi.C.shufpsrev(v, v)

  assert_equal(vout[0], 4.0625)
  assert_equal(vout[1], 3.125)
  assert_equal(vout[2], 2.25)
  assert_equal(vout[3], 1.5)
end)
|
||||
|
||||
context("mixed register type opcodes", function()
|
||||
|
||||
-- Mixed register kinds: double input, integer output (truncating conversion).
it("cvttsd2s", function()
  assert_cdef([[int cvttsd2s(double n) __mcode("F20F2CrM");]], "cvttsd2s")
  local cvttsd2s = ffi.C.cvttsd2s

  -- Local so the helper does not leak into the global table.
  local function test_cvttsd2s(n)
    return (cvttsd2s(n))
  end

  assert_equal(0, cvttsd2s(-0))
  assert_equal(1, cvttsd2s(1))
  assert_equal(1, cvttsd2s(1.2))

  assert_jit(3, test_cvttsd2s, 3.3)
  assert_noexit(-1, test_cvttsd2s, -1.5)
  --check dual num exit
  assert_equal(5, test_cvttsd2s(5))

  --check unfused
  ffi.cdef([[int cvttsd2suf(double n) __mcode("F20F2CrR");]])
  cvttsd2s = ffi.C.cvttsd2suf

  assert_equal(0, cvttsd2s(-0))
  assert_equal(1, cvttsd2s(1))
  assert_equal(1, cvttsd2s(1.2))
end)
|
||||
|
||||
-- Mixed register kinds: 32-bit integer input, double output.
it("cvtsi2sd", function()
  assert_cdef([[double cvtsi2sd(int32_t n) __mcode("F20F2ArM");]], "cvtsi2sd")
  local cvtsi2sd = ffi.C.cvtsi2sd

  -- Local so the helper does not leak into the global table. It reads the
  -- cvtsi2sd upvalue, so the reassignment below switches it to the unfused form.
  local function test_cvtsi2sd(n1, n2)
    return (cvtsi2sd(n1)+n2)
  end

  assert_equal(0.5, test_cvtsi2sd(0, 0.5))
  assert_equal(1.25, test_cvtsi2sd(1.0, 0.25))
  assert_equal(-1.5, test_cvtsi2sd(-2, 0.5))

  assert_jit(3.25, test_cvtsi2sd, 3, 0.25)
  assert_noexit(-1.5, test_cvtsi2sd, -2, 0.5)

  --check dual num exit
  assert_equal(11, test_cvtsi2sd(5, 6))

  --check unfused
  ffi.cdef([[double cvtsi2sduf(int32_t n) __mcode("F20F2ArR");]])
  cvtsi2sd = ffi.C.cvtsi2sduf
  assert_equal(0.5, test_cvtsi2sd(0, 0.5))
  assert_equal(1.25, test_cvtsi2sd(1.0, 0.25))
  assert_equal(-1.5, test_cvtsi2sd(-2, 0.5))
end)
|
||||
|
||||
it("pextrw", function()
|
||||
local v = ffi.new("byte16", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
|
||||
|
||||
assert_cdef([[int32_t pextrw_0(byte16 v) __mcode("660FC5mRU", 0);]], "pextrw_0")
|
||||
assert_equal(0x0201, ffi.C.pextrw_0(v))
|
||||
|
||||
assert_cdef([[int32_t pextrw_7(byte16 v) __mcode("660FC5mRU", 7);]], "pextrw_7")
|
||||
assert_equal(0x100f, ffi.C.pextrw_7(v))
|
||||
end)
|
||||
|
||||
-- Mixed register kinds: vector input/output plus a 32-bit integer source whose
-- low 16 bits are inserted at the word index given by the immediate byte.
it("pinsrw", function()
  assert_cdef([[int4 pinsrw_0(byte16 v, int32_t word) __mcode("660FC4rMU", 0);]], "pinsrw_0")

  local v = ffi.new("byte16", 0)
  local vout = ffi.C.pinsrw_0(v, 0xf0f1)
  assert_equal(0xf0f1, vout[0])

  assert_cdef([[int4 pinsrw_7(byte16 v, int32_t word) __mcode("660FC4rMU", 7);]], "pinsrw_7")
  -- Previously this called pinsrw_0 a second time, so pinsrw_7 was never run.
  vout = ffi.C.pinsrw_7(v, 0xf0f1)
  -- Word 7 is the upper half of 32-bit element 3: 0xf0f10000 read as a signed
  -- int32 is -252641280. Element 0 must stay untouched.
  assert_equal(0, vout[0])
  assert_equal(-252641280, vout[3])
end)
|
||||
end)
|
||||
|
||||
end)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user