diff --git a/src/lj_emit_x86.h b/src/lj_emit_x86.h
index ec4d6eed..56342858 100644
--- a/src/lj_emit_x86.h
+++ b/src/lj_emit_x86.h
@@ -669,6 +669,10 @@ static MCode* emit_intrins(ASMState *as, CIntrinsic *intrins, Reg r1,
     if (intrins->flags & INTRINSFLAG_IMMB) {
       *--as->mcp = intrins->immb;
     }
+    /* Tell emit_op the opcode is 4 bytes long */
+    if (intrins->flags & INTRINSFLAG_LARGEOP) {
+      r2 |= OP4B;
+    }
 
     emit_mrm(as, intrins->opcode, (Reg)r2, r1);
 
diff --git a/src/lj_intrinsic.c b/src/lj_intrinsic.c
index a8419c84..b92f994f 100644
--- a/src/lj_intrinsic.c
+++ b/src/lj_intrinsic.c
@@ -385,7 +385,7 @@ static void setopcode(lua_State *L, CIntrinsic *intrins, uint32_t opcode)
   if (len < 4) {
     opcode |= (uint8_t)(int8_t)-(len+1);
   } else {
-    lj_err_callermsg(L, "bad opcode literal");
+    intrins->flags |= INTRINSFLAG_LARGEOP;
   }
 
   if (intrin_regmode(intrins) == DYNREG_OPEXT) {
diff --git a/src/lj_intrinsic.h b/src/lj_intrinsic.h
index e275c891..d8ab7670 100644
--- a/src/lj_intrinsic.h
+++ b/src/lj_intrinsic.h
@@ -57,6 +57,8 @@ typedef enum INTRINSFLAGS {
   INTRINSFLAG_PREFIX   = 0x200,
   /* Opcode has an immediate byte that needs to be set at construction time */
   INTRINSFLAG_IMMB     = 0x400,
+  /* Opcode is 4 bytes long, which is larger than the x86/x64 emitter normally handles */
+  INTRINSFLAG_LARGEOP  = 0x800,
  
   /* Opcode uses ymm registers */
   INTRINSFLAG_VEX256   = 0x4000,
@@ -89,7 +91,7 @@ typedef struct AsmHeader {
 #define intrin_setopextb(intrins, opext) \
   lua_assert((intrins)->outsz < 4); \
   ((intrins)->out[3] = (opext))
-#define intrin_oplen(intrins) ((-(int8_t)(intrins)->opcode)-1)
+#define intrin_oplen(intrins) (((intrins)->flags & INTRINSFLAG_LARGEOP) ? 4 : (-(int8_t)(intrins)->opcode)-1)
 
 /* odd numbered have an dynamic output */
 #define intrin_dynrout(intrins) (intrin_regmode(intrins) && reg_isdyn(intrins->out[0]))
diff --git a/tests/intrinsic_spec.lua b/tests/intrinsic_spec.lua
index 2728c95f..cb36bf0a 100644
--- a/tests/intrinsic_spec.lua
+++ b/tests/intrinsic_spec.lua
@@ -899,8 +899,47 @@ it("shufps", function()
   assert_equal(vout[3], 1.5)
 end)
 
+it("phaddd 4byte opcode", function()
+  -- The opcode literal 660F3802 is four bytes long, the case this test targets.
+  ffi.cdef([[int4 phaddd(int4 v1, int4 v2) __mcode("660F3802rM");]])
+
+  local phaddd = ffi.C.phaddd
+  -- Two pairwise horizontal adds leave the sum of all four lanes in lane 0.
+  local function hsum(v)  -- local, so the helper does not leak into the global table
+    local result = phaddd(v, v)
+    result = phaddd(result, result)
+    return result[0]
+  end
+
+  local v = ffi.new("int4", 1, 2, 3, 4)
+  local vzero = ffi.new("int4", 0)
+
+  assert_equal(hsum(v), 10)
+  assert_equal(hsum(vzero), 0)
+end)
+
 context("mixed register type opcodes", function()
 
+  it("pcmpstr", function()
+    ffi.cdef([[void pcmpistri(byte16 string, byte16 mask) __mcode("660F3A63rMU", 0x2) __reglist(out, int32_t ecx)]])
+    -- imm8 0x2 selects "equal any" on bytes: ecx is the index of the first haystack byte found in charlist.
+    local charlist = ffi.new("byte16", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
+    local haystack = ffi.new("byte16", 2, 2, 3, 2, 1, 1, 2, 3, 4, 5, 6, 7, 7, 7, 2, 2)
+
+    local ecx = ffi.C.pcmpistri(charlist, haystack)
+    assert_equal(ecx, 4)
+
+    ffi.cdef([[
+      void pcmpistrm(byte16 string, byte16 mask) __mcode("660F3A62rMU", 0x40) __reglist(out, byte16 xmm0v);
+      int32_t pmovmskb(byte16 mask) __mcode("660FD7rM");
+    ]])
+    -- imm8 0x40 requests a byte mask in xmm0; bytes 4 and 5 match, so pmovmskb yields 0x30 (48).
+    local mask = ffi.C.pcmpistrm(charlist, haystack)
+    mask = ffi.C.pmovmskb(mask)
+
+    assert_equal(mask, 48)
+  end)
+
   it("cvttsd2s", function()  
     assert_cdef([[int cvttsd2s(double n) __mcode("F20F2CrM");]], "cvttsd2s")
     local cvttsd2s = ffi.C.cvttsd2s