Added support for ymm registers in intrinsics

2025-02-08 15:34:09 +00:00 · 2016-03-29 11:08:36 +01:00 · 2016-03-29 11:08:36 +01:00 · f21526df9b
commit f21526df9b
parent c0797d346b
6 changed files with 113 additions and 13 deletions
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@ -2452,6 +2452,7 @@ typedef struct IntrinBuildState {
  RegSet inset, outset, modregs;
  uint32_t spadj, contexspill, contexofs;
  uint8_t outcontext;
  char vzeroupper;
 } IntrinBuildState;
 static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
@ -2465,6 +2466,9 @@ static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
  for (i = 0; i < intrins->insz; i++) {
    Reg r = reg_rid(info->in[i]);
    if (reg_kind(info->in[i]) == REGKIND_V256)
      info->vzeroupper = 1;
    if (reg_isgpr(info->in[i])) {
      if (r == RID_CONTEXT) {
        /* Save the offset in the input context so we can load it last */
@ -2477,6 +2481,9 @@ static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
  }
  for (i = 0; i < intrins->outsz; i++) {
    if (reg_kind(info->out[i]) == REGKIND_V256)
      info->vzeroupper = 1;
    rset_set(info->outset, reg_rid(info->out[i]));
  }
@ -2602,6 +2609,13 @@ restart:
  emit_epilogue(as, spadj, info.modregs, intrins->outsz);
  /* Zero the upper halves of the ymm registers if any ymm register was used.
  ** TODO: This shouldn't be needed for some AMD CPUs like Jaguar.
  */
  if (info.vzeroupper) {
    as->mcp = emit_vop(XV_VZEROUPPER, 0, 0, 0, as->mcp, 1);
  }
  /* If one of the output registers was the same as the outcontext we will
   * have saved the output value to the stack earlier; now save it into the context
   */
--- a/src/lj_emit_x86.h
+++ b/src/lj_emit_x86.h
@ -859,6 +859,10 @@ static void emit_loadfpr(ASMState *as, uint32_t reg, Reg base, int ofs)
  case REGKIND_V128:
    op = XO_MOVUPS;
    break;
  case REGKIND_V256:
    op = XV_MOVUPS;
    r |= VEX_256;
    break;
  }
  if (!rk_isvec(kind)) {
@ -889,6 +893,10 @@ static void emit_savefpr(ASMState *as, Reg reg, Reg base, int ofs)
  case REGKIND_V128:
    op = XO_MOVUPSto;
    break;
  case REGKIND_V256:
    op = XV_MOVUPSto;
    r |= VEX_256;
    break;
  }
  if (!rk_isvec(kind)) {
--- a/src/lj_intrinsic.c
+++ b/src/lj_intrinsic.c
@ -97,8 +97,9 @@ static int parse_fprreg(const char *name, uint32_t len)
 {
  uint32_t rid = 0, kind = REGKIND_FPR64;
  uint32_t pos = 3;
  int flags = 0;
-  if (len < 3 || name[0] != 'x' || 
+  if (len < 3 || (name[0] != 'x' && name[0] != 'y') || 
      name[1] != 'm' || name[2] != 'm')
    return -1;
@ -120,15 +121,20 @@ static int parse_fprreg(const char *name, uint32_t len)
    return -1;
  }
-  if (pos < len) {
+  if (name[0] == 'y') {
-    if (name[pos] == 'f') {
+    kind = REGKIND_V256;
-      kind = REGKIND_FPR32;
+    flags |= INTRINSFLAG_VEX256;
-      pos++;
+  } else {
-    } else if (name[pos] == 'v') {
+    if (pos < len) {
-      kind = REGKIND_V128;
+      if (name[pos] == 'f') {
-      pos++;
+        kind = REGKIND_FPR32;
-    } else {
+        pos++;
-      kind = REGKIND_FPR64;
+      } else if (name[pos] == 'v') {
        kind = REGKIND_V128;
        pos++;
      } else {
        kind = REGKIND_FPR64;
      }
    }
  }
@ -136,12 +142,12 @@ static int parse_fprreg(const char *name, uint32_t len)
    return -1;
  }
-  return reg_make(rid, kind);
+  return reg_make(rid, kind) | flags;
 }
 int lj_intrinsic_getreg(CTState *cts, GCstr *name) {
-  if (strdata(name)[0] == 'x') {
+  if (strdata(name)[0] == 'x' || strdata(name)[0] == 'y') {
    return parse_fprreg(strdata(name), name->len);
  } else {
    cTValue *reginfotv = lj_tab_getstr(cts->miscmap, name);
--- a/src/lj_intrinsic.h
+++ b/src/lj_intrinsic.h
@ -26,6 +26,8 @@ typedef enum INTRINSFLAGS {
  INTRINSFLAG_CALLED = 0x20,
  /* MODRM should always be set as indirect mode */
  INTRINSFLAG_INDIRECT = 0x40,
  /* Opcode uses ymm registers */
  INTRINSFLAG_VEX256   = 0x4000,
  /* Intrinsic is a template with no machine code set until it is instantiated at
  ** runtime with user-supplied code.
  */
@ -54,7 +56,7 @@ typedef struct AsmHeader {
  _(FPR64, IRT_NUM,   CTID_DOUBLE) \
  _(FPR32, IRT_FLOAT, CTID_FLOAT) \
  _(V128,  0,         0) \
-  _(FPR5,  0,         0) \
+  _(V256,  0,         0) \
  _(FPR6,  0,         0) \
  _(FPR7,  0,         0) \
--- a/src/lj_target_x86.h
+++ b/src/lj_target_x86.h
@ -214,11 +214,25 @@ typedef struct IntrinWrapState {
 #define XO_f20f(o)	((uint32_t)(0x0ff2fc + (0x##o<<24)))
 #define XO_f30f(o)	((uint32_t)(0x0ff3fc + (0x##o<<24)))
 /* VEX opcode templates (XV_ prefix). Each macro adds the opcode byte `o`
 ** (given as bare hex digits), shifted into bits 24-31, to a fixed 24-bit
 ** constant and yields the result as a uint32_t.
 ** NOTE(review): the fixed constant presumably carries the VEX prefix bytes
 ** (0xc4/0xc5 lead byte) for the legacy prefix and opcode map named in the
 ** macro -- confirm against the emitter that consumes these values.
 */
 #define XV_0f(o)	((uint32_t)(0xf8c5c5 + (0x##o<<24)))
 #define XV_660f38(o)	((uint32_t)(0x79e2c4 + (0x##o<<24)))
 #define XV_f20f38(o)	((uint32_t)(0x7be2c4 + (0x##o<<24)))
 #define XV_f20f3a(o)	((uint32_t)(0x7be3c4 + (0x##o<<24)))
 #define XV_f30f38(o)	((uint32_t)(0x7ae2c4 + (0x##o<<24)))
 /* Selector values for the implied legacy prefix of a VEX-encoded opcode.
 ** NOTE(review): the numbering matches the 2-bit "pp" field of the VEX prefix
 ** as described in the Intel SDM -- confirm against the emitter's usage.
 */
 typedef enum VEXPP {
   VEXPP_0f = 0x0,  /* No implied prefix (plain 0F opcode). */
   VEXPP_66 = 0x1,  /* Implied 66 prefix. */
   VEXPP_f3 = 0x2,  /* Implied F3 prefix. */
   VEXPP_f2 = 0x3   /* Implied F2 prefix. */
 } VEXPP;
 /* Selector values for the opcode map of a VEX-encoded opcode.
 ** NOTE(review): the numbering matches the "mmmmm" map-select field of the
 ** 3-byte VEX prefix as described in the Intel SDM -- confirm against the
 ** emitter's usage.
 */
 typedef enum VEXMAP {
   VEXMAP_0F   = 0x1,  /* Opcodes with the 0F escape. */
   VEXMAP_0F38 = 0x2,  /* Opcodes with the 0F 38 escape. */
   VEXMAP_0F3A = 0x3   /* Opcodes with the 0F 3A escape. */
 } VEXMAP;
 /* This list of x86 opcodes is not intended to be complete. Opcodes are only
 ** included when needed. Take a look at DynASM or jit.dis_x86 to see the
 ** whole mess.
@ -271,6 +285,10 @@ typedef enum {
  XV_SHLX =	XV_660f38(f7),
  XV_SHRX =	XV_f20f38(f7),
  XV_MOVUPS =    XV_0f(10),
  XV_MOVUPSto =  XV_0f(11),
  XV_VZEROUPPER = XV_0f(77),
  /* Variable-length opcodes. XO_* prefix. */
  XO_OR =	XO_(0b),
  XO_MOV =	XO_(8b),
--- a/tests/intrinsic_spec.lua
+++ b/tests/intrinsic_spec.lua
@ -165,6 +165,23 @@ if ffi.arch == "x64" then
    assert_jit(444.575, testrex, 123.075, 321.5)
  end)
  it("fpr_vexrex(ymm)", function()
    -- Eight-lane input whose lane values equal their own lane index.
    local input = ffi.new("float8", 0, 1, 2, 3, 4, 5, 6, 7)
    --force a Vex.B base register
    -- The declaration is a template ("?E"); ymm14 is both input and output and
    -- every listed GPR is passed in and returned.
    local declaration = [[void fpr_vexrex(float8 ymm14, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx, int32_t esi, int32_t edi, int32_t ebp) __mcode("?E") 
                                  __reglist(out, float8 ymm14, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx, int32_t esi, int32_t edi, int32_t ebp) 
                                  __reglist(mod, ymm1, ymm7)]]
    assert_cdef(declaration)
    -- Instantiate the template with a single 0x90 byte as its machine code.
    local intrinsic = ffi.intrinsic("fpr_vexrex", "\x90", 1)
    local result = intrinsic(input, 1, 2, 3, 4, 5, 6, 7)
    -- The first return value is the ymm14 output; each lane must round-trip.
    for lane = 0, 7 do
      assert_equal(result[lane], lane)
    end
  end)
 end
  it("fpr_vec", function()
@ -198,6 +215,41 @@ end
    end
  end) 
  it("fpr_vec(ymm)", function()
    -- Asserts that each of the eight lanes of `vec` equals its index plus `offset`.
    local function check_lanes(vec, offset)
      for lane = 0, 7 do
        assert_equal(vec[lane], lane + offset)
      end
    end

    assert_cdef([[void fpr_ymmvec(void* ymm7) __mcode("90_E") __reglist(out, float8 ymm7)]], "fpr_ymmvec")
    --test using plain array in place of a vector
    local whole = ffi.new("float[8]", 0, 1, 2, 3, 4, 5, 6, 7)
    check_lanes(ffi.C.fpr_ymmvec(whole), 0)

    assert_cdef([[void fpr_ymmvec2(void* ymm0, void* ymm7) __mcode("90_E") __reglist(out, float8 ymm7, float8 ymm0)]], "fpr_ymmvec2")
    local halves = ffi.new("float[8]", 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5)
    local ymmvec2 = ffi.C.fpr_ymmvec2
    -- Inputs are (ymm0, ymm7); outputs come back in the order (ymm7, ymm0).
    local out7, out0 = ymmvec2(whole, halves)
    check_lanes(out0, 0)
    check_lanes(out7, 0.5)

    --test using a cdata vector
    halves = ffi.new("float8", 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5)
    out7, out0 = ymmvec2(whole, halves)
    check_lanes(out0, 0)
    check_lanes(out7, 0.5)
  end)
  it("idiv", function()
    assert_cdef([[void idiv(int32_t eax, int32_t ecx) __mcode("99F7F9_E") __reglist(out, int32_t eax, int32_t edx)]], "idiv")