diff --git a/src/lj_emit_x86.h b/src/lj_emit_x86.h index ec4d6eed..56342858 100644 --- a/src/lj_emit_x86.h +++ b/src/lj_emit_x86.h @@ -669,6 +669,10 @@ static MCode* emit_intrins(ASMState *as, CIntrinsic *intrins, Reg r1, if (intrins->flags & INTRINSFLAG_IMMB) { *--as->mcp = intrins->immb; } + /* Tell emit_op the opcode is 4 bytes long */ + if (intrins->flags & INTRINSFLAG_LARGEOP) { + r2 |= OP4B; + } emit_mrm(as, intrins->opcode, (Reg)r2, r1); diff --git a/src/lj_intrinsic.c b/src/lj_intrinsic.c index a8419c84..b92f994f 100644 --- a/src/lj_intrinsic.c +++ b/src/lj_intrinsic.c @@ -385,7 +385,7 @@ static void setopcode(lua_State *L, CIntrinsic *intrins, uint32_t opcode) if (len < 4) { opcode |= (uint8_t)(int8_t)-(len+1); } else { - lj_err_callermsg(L, "bad opcode literal"); + intrins->flags |= INTRINSFLAG_LARGEOP; } if (intrin_regmode(intrins) == DYNREG_OPEXT) { diff --git a/src/lj_intrinsic.h b/src/lj_intrinsic.h index e275c891..d8ab7670 100644 --- a/src/lj_intrinsic.h +++ b/src/lj_intrinsic.h @@ -57,6 +57,8 @@ typedef enum INTRINSFLAGS { INTRINSFLAG_PREFIX = 0x200, /* Opcode has an immediate byte that needs to be set at construction time */ INTRINSFLAG_IMMB = 0x400, + /* Opcode is 4 bytes long, which is larger than the emit system normally handles on x86/x64 */ + INTRINSFLAG_LARGEOP = 0x800, /* Opcode uses ymm registers */ INTRINSFLAG_VEX256 = 0x4000, @@ -89,7 +91,7 @@ typedef struct AsmHeader { #define intrin_setopextb(intrins, opext) \ lua_assert((intrins)->outsz < 4); \ ((intrins)->out[3] = (opext)) -#define intrin_oplen(intrins) ((-(int8_t)(intrins)->opcode)-1) +#define intrin_oplen(intrins) (((intrins)->flags & INTRINSFLAG_LARGEOP) ? 
4 : (-(int8_t)(intrins)->opcode)-1) /* odd numbered have an dynamic output */ #define intrin_dynrout(intrins) (intrin_regmode(intrins) && reg_isdyn(intrins->out[0])) diff --git a/tests/intrinsic_spec.lua b/tests/intrinsic_spec.lua index 2728c95f..cb36bf0a 100644 --- a/tests/intrinsic_spec.lua +++ b/tests/intrinsic_spec.lua @@ -899,8 +899,47 @@ it("shufps", function() assert_equal(vout[3], 1.5) end) +it("phaddd 4byte opcode", function() + + ffi.cdef([[int4 phaddd(int4 v1, int4 v2) __mcode("660F3802rM");]]) + + local phaddd = ffi.C.phaddd + + local function hsum(v) + local result = phaddd(v, v) + result = phaddd(result, result) + return result[0] + end + + local v = ffi.new("int4", 1, 2, 3, 4) + local vzero = ffi.new("int4", 0) + + assert_equal(hsum(v), 10) + assert_equal(hsum(vzero), 0) +end) + context("mixed register type opcodes", function() + it("pcmpstr", function() + ffi.cdef([[void pcmpistri(byte16 string, byte16 mask) __mcode("660F3A63rMU", 0x2) __reglist(out, int32_t ecx)]]) + + local charlist = ffi.new("byte16", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) + local string = ffi.new("byte16", 2, 2, 3, 2, 1, 1, 2, 3, 4, 5, 6, 7, 7, 7, 2, 2) + + local ecx = ffi.C.pcmpistri(charlist, string) + assert_equal(ecx, 4) + + ffi.cdef([[ + void pcmpistrm(byte16 string, byte16 mask) __mcode("660F3A62rMU", 0x40) __reglist(out, byte16 xmm0v); + int32_t pmovmskb(byte16 mask) __mcode("660FD7rM"); + ]]) + + local mask = ffi.C.pcmpistrm(charlist, string) + mask = ffi.C.pmovmskb(mask) + + assert_equal(mask, 48) + end) + it("cvttsd2s", function() assert_cdef([[int cvttsd2s(double n) __mcode("F20F2CrM");]], "cvttsd2s") local cvttsd2s = ffi.C.cvttsd2s