diff --git a/src/lj_asm.c b/src/lj_asm.c
index 1e83133b..c4f97a5c 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -2452,6 +2452,7 @@ typedef struct IntrinBuildState {
   RegSet inset, outset, modregs;
   uint32_t spadj, contexspill, contexofs;
   uint8_t outcontext;
+  char vzeroupper;
 } IntrinBuildState;
 
 static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
@@ -2465,6 +2466,9 @@ static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
   for (i = 0; i < intrins->insz; i++) {
     Reg r = reg_rid(info->in[i]);
     
+    if (reg_kind(info->in[i]) == REGKIND_V256)
+      info->vzeroupper = 1;
+
     if (reg_isgpr(info->in[i])) {
       if (r == RID_CONTEXT) {
         /* Save the offset in the input context so we can load it last */
@@ -2477,6 +2481,9 @@ static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
   }
 
   for (i = 0; i < intrins->outsz; i++) {
+    if (reg_kind(info->out[i]) == REGKIND_V256)
+      info->vzeroupper = 1;
+
     rset_set(info->outset, reg_rid(info->out[i]));
   }
 
@@ -2602,6 +2609,13 @@ restart:
 
   emit_epilogue(as, spadj, info.modregs, intrins->outsz);
 
+  /* Zero the upper parts of the ymm registers if any ymm register was used.
+  ** TODO: This shouldn't be needed for some AMD CPUs like Jaguar.
+  */
+  if (info.vzeroupper) {
+    as->mcp = emit_vop(XV_VZEROUPPER, 0, 0, 0, as->mcp, 1);
+  }
+
   /* If one of the output registers was the same as the outcontext we will
    * of saved the output value to the stack earlier, now save it into context
    */
diff --git a/src/lj_emit_x86.h b/src/lj_emit_x86.h
index 760d74a9..8a1a0975 100644
--- a/src/lj_emit_x86.h
+++ b/src/lj_emit_x86.h
@@ -859,6 +859,10 @@ static void emit_loadfpr(ASMState *as, uint32_t reg, Reg base, int ofs)
   case REGKIND_V128:
     op = XO_MOVUPS;
     break;
+  case REGKIND_V256:
+    op = XV_MOVUPS;
+    r |= VEX_256;
+    break;
   }
 
   if (!rk_isvec(kind)) {
@@ -889,6 +893,10 @@ static void emit_savefpr(ASMState *as, Reg reg, Reg base, int ofs)
   case REGKIND_V128:
     op = XO_MOVUPSto;
     break;
+  case REGKIND_V256:
+    op = XV_MOVUPSto;
+    r |= VEX_256;
+    break;
   }
 
   if (!rk_isvec(kind)) {
diff --git a/src/lj_intrinsic.c b/src/lj_intrinsic.c
index 1da30a34..635358ab 100644
--- a/src/lj_intrinsic.c
+++ b/src/lj_intrinsic.c
@@ -97,8 +97,9 @@ static int parse_fprreg(const char *name, uint32_t len)
 {
   uint32_t rid = 0, kind = REGKIND_FPR64;
   uint32_t pos = 3;
+  int flags = 0;
 
-  if (len < 3 || name[0] != 'x' || 
+  if (len < 3 || (name[0] != 'x' && name[0] != 'y') || 
       name[1] != 'm' || name[2] != 'm')
     return -1;
 
@@ -120,15 +121,20 @@ static int parse_fprreg(const char *name, uint32_t len)
     return -1;
   }
 
-  if (pos < len) {
-    if (name[pos] == 'f') {
-      kind = REGKIND_FPR32;
-      pos++;
-    } else if (name[pos] == 'v') {
-      kind = REGKIND_V128;
-      pos++;
-    } else {
-      kind = REGKIND_FPR64;
+  if (name[0] == 'y') {
+    kind = REGKIND_V256;
+    flags |= INTRINSFLAG_VEX256;
+  } else {
+    if (pos < len) {
+      if (name[pos] == 'f') {
+        kind = REGKIND_FPR32;
+        pos++;
+      } else if (name[pos] == 'v') {
+        kind = REGKIND_V128;
+        pos++;
+      } else {
+        kind = REGKIND_FPR64;
+      }
     }
   }
 
@@ -136,12 +142,12 @@ static int parse_fprreg(const char *name, uint32_t len)
     return -1;
   }
 
-  return reg_make(rid, kind);
+  return reg_make(rid, kind) | flags;
 }
 
 int lj_intrinsic_getreg(CTState *cts, GCstr *name) {
 
-  if (strdata(name)[0] == 'x') {
+  if (strdata(name)[0] == 'x' || strdata(name)[0] == 'y') {
     return parse_fprreg(strdata(name), name->len);
   } else {
     cTValue *reginfotv = lj_tab_getstr(cts->miscmap, name);
diff --git a/src/lj_intrinsic.h b/src/lj_intrinsic.h
index f1235cb2..bae8741e 100644
--- a/src/lj_intrinsic.h
+++ b/src/lj_intrinsic.h
@@ -26,6 +26,8 @@ typedef enum INTRINSFLAGS {
   INTRINSFLAG_CALLED = 0x20,
   /* MODRM should always be set as indirect mode */
   INTRINSFLAG_INDIRECT = 0x40,
+  /* Opcode uses ymm registers */
+  INTRINSFLAG_VEX256   = 0x4000,
   /* Intrinsic is a template with no machine code set until instantiate at runtime with
   ** user supplied code.
   */
@@ -54,7 +56,7 @@ typedef struct AsmHeader {
   _(FPR64, IRT_NUM,   CTID_DOUBLE) \
   _(FPR32, IRT_FLOAT, CTID_FLOAT) \
   _(V128,  0,         0) \
-  _(FPR5,  0,         0) \
+  _(V256,  0,         0) \
   _(FPR6,  0,         0) \
   _(FPR7,  0,         0) \
 
diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h
index 75fce4cd..c5f23082 100644
--- a/src/lj_target_x86.h
+++ b/src/lj_target_x86.h
@@ -214,11 +214,25 @@ typedef struct IntrinWrapState {
 #define XO_f20f(o)	((uint32_t)(0x0ff2fc + (0x##o<<24)))
 #define XO_f30f(o)	((uint32_t)(0x0ff3fc + (0x##o<<24)))
 
+#define XV_0f(o)	((uint32_t)(0xf8c5c5 + (0x##o<<24)))
 #define XV_660f38(o)	((uint32_t)(0x79e2c4 + (0x##o<<24)))
 #define XV_f20f38(o)	((uint32_t)(0x7be2c4 + (0x##o<<24)))
 #define XV_f20f3a(o)	((uint32_t)(0x7be3c4 + (0x##o<<24)))
 #define XV_f30f38(o)	((uint32_t)(0x7ae2c4 + (0x##o<<24)))
 
+typedef enum VEXPP {  /* Values of the 2-bit VEX.pp (implied SIMD prefix) field. */
+  VEXPP_0f = 0,  /* No implied prefix. */
+  VEXPP_66 = 1,  /* Implied 66 prefix (cf. low bits of 0x79 in XV_660f38). */
+  VEXPP_f3 = 2,  /* Implied F3 prefix (cf. low bits of 0x7a in XV_f30f38). */
+  VEXPP_f2 = 3,  /* Implied F2 prefix (cf. low bits of 0x7b in XV_f20f38). */
+} VEXPP;
+
+typedef enum VEXMAP {  /* Values of the VEX opcode-map select field. */
+  VEXMAP_0F = 1,  /* Opcode map 0F. */
+  VEXMAP_0F38 = 2,  /* Opcode map 0F 38 (cf. 0xe2 in the XV_*0f38 macros). */
+  VEXMAP_0F3A = 3,  /* Opcode map 0F 3A (cf. 0xe3 in XV_f20f3a). */
+} VEXMAP;
+
 /* This list of x86 opcodes is not intended to be complete. Opcodes are only
 ** included when needed. Take a look at DynASM or jit.dis_x86 to see the
 ** whole mess.
@@ -271,6 +285,10 @@ typedef enum {
   XV_SHLX =	XV_660f38(f7),
   XV_SHRX =	XV_f20f38(f7),
 
+  XV_MOVUPS =    XV_0f(10),
+  XV_MOVUPSto =  XV_0f(11),
+  XV_VZEROUPPER = XV_0f(77),
+
   /* Variable-length opcodes. XO_* prefix. */
   XO_OR =	XO_(0b),
   XO_MOV =	XO_(8b),
diff --git a/tests/intrinsic_spec.lua b/tests/intrinsic_spec.lua
index c4701560..3a866104 100644
--- a/tests/intrinsic_spec.lua
+++ b/tests/intrinsic_spec.lua
@@ -165,6 +165,23 @@ if ffi.arch == "x64" then
     
     assert_jit(444.575, testrex, 123.075, 321.5)
   end)
+ 
+  it("fpr_vexrex(ymm)", function()
+    local array = ffi.new("float8", 0, 1, 2, 3, 4, 5, 6, 7)
+    -- All seven listed GPRs are used as in/out registers; presumably that leaves only r8+ for the base register, forcing VEX.B (TODO confirm)
+    
+    assert_cdef([[void fpr_vexrex(float8 ymm14, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx, int32_t esi, int32_t edi, int32_t ebp) __mcode("?E") 
+                                  __reglist(out, float8 ymm14, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx, int32_t esi, int32_t edi, int32_t ebp) 
+                                  __reglist(mod, ymm1, ymm7)]])
+                    
+    local ymmtest = ffi.intrinsic("fpr_vexrex", "\x90", 1)  -- presumably supplies the machine code at runtime: one 0x90 byte (x86 NOP)
+
+    local ymmout = ymmtest(array, 1, 2, 3, 4, 5, 6, 7)  -- only the first result (ymm14) is kept; the GPR results are discarded
+    
+    for i=0,7 do
+      assert_equal(ymmout[i], i)  -- the vector is expected to round-trip unchanged
+    end
+  end)
 end
   
   it("fpr_vec", function()
@@ -198,6 +215,41 @@ end
     end
   end) 
 
+  it("fpr_vec(ymm)", function()
+    assert_cdef([[void fpr_ymmvec(void* ymm7) __mcode("90_E") __reglist(out, float8 ymm7)]], "fpr_ymmvec")
+    -- a plain float[8] array is passed where the ymm input is declared as void*
+    local v1 = ffi.new("float[8]", 0, 1, 2, 3, 4, 5, 6, 7)
+    local ymmout = ffi.C.fpr_ymmvec(v1)
+    
+    for i=0,7 do
+      assert_equal(ymmout[i], i)  -- output vector is expected to equal the input array
+    end
+  
+    assert_cdef([[void fpr_ymmvec2(void* ymm0, void* ymm7) __mcode("90_E") __reglist(out, float8 ymm7, float8 ymm0)]], "fpr_ymmvec2")
+    
+    local v2 = ffi.new("float[8]", 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5) 
+    local ymmtest2 = ffi.C.fpr_ymmvec2
+    local ymm7, ymm0 = ymmtest2(v1, v2)  -- v1 -> ymm0, v2 -> ymm7; results are received in __reglist(out) order (ymm7, ymm0)
+    
+    for i=0,7 do
+      assert_equal(ymm0[i], i)  -- ymm0 carried v1
+    end    
+    for i=0,7 do
+      assert_equal(ymm7[i], i+0.5)  -- ymm7 carried v2
+    end
+    
+    -- same call again, this time with a float8 cdata vector as the second argument
+    v2 = ffi.new("float8", 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5) 
+    ymm7, ymm0 = ymmtest2(v1, v2)
+    
+    for i=0,7 do
+      assert_equal(ymm0[i], i)
+    end 
+    for i=0,7 do
+      assert_equal(ymm7[i], i+0.5)
+    end
+  end)
+  
   it("idiv", function()
     assert_cdef([[void idiv(int32_t eax, int32_t ecx) __mcode("99F7F9_E") __reglist(out, int32_t eax, int32_t edx)]], "idiv")