FFI: Optimize ffi.copy() and ffi.fill().
This commit is contained in:
parent 5ebe4990ba
commit a7d1dbacb1

src/lj_crecord.c (287 changed lines)
--- a/src/lj_crecord.c
+++ b/src/lj_crecord.c
@@ -91,25 +91,7 @@ static CTypeID argv2ctype(jit_State *J, TRef tr, cTValue *o)
   }
 }
 
-/* -- Convert C type to C type -------------------------------------------- */
-
-/*
-** This code mirrors the code in lj_cconv.c. It performs the same steps
-** for the trace recorder that lj_cconv.c does for the interpreter.
-**
-** One major difference is that we can get away with much fewer checks
-** here. E.g. checks for casts, constness or correct types can often be
-** omitted, even if they might fail. The interpreter subsequently throws
-** an error, which aborts the trace.
-**
-** All operations are specialized to their C types, so the on-trace
-** outcome must be the same as the outcome in the interpreter. If the
-** interpreter doesn't throw an error, then the trace is correct, too.
-** Care must be taken not to generate invalid (temporary) IR or to
-** trigger asserts.
-*/
-
-/* Convert CType to IRType. */
+/* Convert CType to IRType (if possible). */
 static IRType crec_ct2irt(CTState *cts, CType *ct)
 {
   if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct);
@@ -135,6 +117,253 @@ static IRType crec_ct2irt(CTState *cts, CType *ct)
   return IRT_CDATA;
 }
 
+/* -- Optimized memory fill and copy -------------------------------------- */
+
+/* Maximum length and unroll of inlined copy/fill. */
+#define CREC_COPY_MAXUNROLL	16
+#define CREC_COPY_MAXLEN	128
+
+#define CREC_FILL_MAXUNROLL	16
+#if LJ_TARGET_UNALIGNED
+#define CREC_FILL_MAXLEN	(CTSIZE_PTR * CREC_FILL_MAXUNROLL)
+#else
+#define CREC_FILL_MAXLEN	CREC_FILL_MAXUNROLL
+#endif
+
+/* Number of windowed registers used for optimized memory copy. */
+#if LJ_TARGET_X86
+#define CREC_COPY_REGWIN	2
+#elif LJ_TARGET_PPC || LJ_TARGET_MIPS
+#define CREC_COPY_REGWIN	8
+#else
+#define CREC_COPY_REGWIN	4
+#endif
+
+/* List of memory offsets for copy/fill. */
+typedef struct CRecMemList {
+  CTSize ofs;	/* Offset in bytes. */
+  IRType tp;	/* Type of load/store. */
+  TRef trofs;	/* TRef of interned offset. */
+  TRef trval;	/* TRef of load value. */
+} CRecMemList;
+
+/* Generate copy list for element-wise struct copy. */
+static MSize crec_copy_struct(CRecMemList *ml, CTState *cts, CType *ct)
+{
+  CTypeID fid = ct->sib;
+  MSize mlp = 0;
+  while (fid) {
+    CType *df = ctype_get(cts, fid);
+    fid = df->sib;
+    if (ctype_isfield(df->info)) {
+      CType *cct;
+      IRType tp;
+      if (!gcref(df->name)) continue;  /* Ignore unnamed fields. */
+      cct = ctype_rawchild(cts, df);  /* Field type. */
+      tp = crec_ct2irt(cts, cct);
+      if (tp == IRT_CDATA) return 0;  /* NYI: aggregates. */
+      if (mlp >= CREC_COPY_MAXUNROLL) return 0;
+      ml[mlp].ofs = df->size;
+      ml[mlp].tp = tp;
+      mlp++;
+      if (ctype_iscomplex(cct->info)) {
+        if (mlp >= CREC_COPY_MAXUNROLL) return 0;
+        ml[mlp].ofs = df->size + (cct->size >> 1);
+        ml[mlp].tp = tp;
+        mlp++;
+      }
+    } else if (!ctype_isconstval(df->info)) {
+      /* NYI: bitfields and sub-structures. */
+      return 0;
+    }
+  }
+  return mlp;
+}
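crec_copy_struct flattens a struct copy into a list of (offset, IR type) entries, one per named field and two for complex fields, so the copy becomes typed loads and stores of the payload only: padding bytes are never touched, and anything it cannot express (nested aggregates, bitfields) returns 0 to force the generic path. A minimal standalone sketch of the same idea, using a hypothetical example struct and plain sizes instead of IRTypes (illustration only, not part of the patch):

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Hypothetical example struct: 3 padding bytes sit between 'a' and 'b'. */
typedef struct { int8_t a; int32_t b; double c; } Example;

/* Flat copy list, one (offset, size) entry per named field. */
typedef struct { size_t ofs, size; } MemEntry;

static const MemEntry example_list[] = {
  { offsetof(Example, a), sizeof(int8_t) },
  { offsetof(Example, b), sizeof(int32_t) },
  { offsetof(Example, c), sizeof(double) },
};

static void copy_by_list(void *dst, const void *src,
                         const MemEntry *ml, size_t n)
{
  size_t i;
  for (i = 0; i < n; i++)  /* Field-wise load/store; padding is skipped. */
    memcpy((char *)dst + ml[i].ofs, (const char *)src + ml[i].ofs, ml[i].size);
}

int main(void)
{
  Example s = { 1, 2, 3.5 }, d;
  memset(&d, 0xff, sizeof(d));
  copy_by_list(&d, &s, example_list, 3);
  printf("%d %d %g\n", (int)d.a, (int)d.b, d.c);  /* 1 2 3.5 */
  return 0;
}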
+
+/* Generate unrolled copy list, from highest to lowest step size/alignment. */
+static MSize crec_copy_unroll(CRecMemList *ml, CTSize len, CTSize step,
+                              IRType tp)
+{
+  CTSize ofs = 0;
+  MSize mlp = 0;
+  if (tp == IRT_CDATA) tp = IRT_U8 + 2*lj_fls(step);
+  do {
+    while (ofs + step <= len) {
+      if (mlp >= CREC_COPY_MAXUNROLL) return 0;
+      ml[mlp].ofs = ofs;
+      ml[mlp].tp = tp;
+      mlp++;
+      ofs += step;
+    }
+    step >>= 1;
+    tp -= 2;
+  } while (ofs < len);
+  return mlp;
+}
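crec_copy_unroll covers the constant length greedily: it emits the widest access that still fits, then halves the step (and drops to the next narrower IRType) until the whole length is covered. A hypothetical 23-byte raw copy starting at an 8-byte step is thus split as 8+8+4+2+1. A standalone sketch of just that schedule, with plain byte widths instead of IRTypes and without the CREC_COPY_MAXUNROLL cap:

#include <stdio.h>
#include <stddef.h>

/* Print the access schedule the greedy unroll would generate:
** widest step first, halving until the whole length is covered. */
static void unroll_schedule(size_t len, size_t step)
{
  size_t ofs = 0;
  do {
    while (ofs + step <= len) {
      printf("  %zu-byte access at offset %zu\n", step, ofs);
      ofs += step;
    }
    step >>= 1;
  } while (ofs < len);
}

int main(void)
{
  printf("len=23, step=8:\n");
  unroll_schedule(23, 8);  /* 8@0, 8@8, 4@16, 2@20, 1@22 */
  return 0;
}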
+
+/*
+** Emit copy list with windowed loads/stores.
+** LJ_TARGET_UNALIGNED: may emit unaligned loads/stores (not marked as such).
+*/
+static void crec_copy_emit(jit_State *J, CRecMemList *ml, MSize mlp,
+                           TRef trdst, TRef trsrc)
+{
+  MSize i, j, rwin = 0;
+  for (i = 0, j = 0; i < mlp; ) {
+    TRef trofs = lj_ir_kintp(J, ml[i].ofs);
+    TRef trsptr = emitir(IRT(IR_ADD, IRT_PTR), trsrc, trofs);
+    ml[i].trval = emitir(IRT(IR_XLOAD, ml[i].tp), trsptr, 0);
+    ml[i].trofs = trofs;
+    i++;
+    rwin += (LJ_SOFTFP && ml[i].tp == IRT_NUM) ? 2 : 1;
+    if (rwin >= CREC_COPY_REGWIN || i >= mlp) {  /* Flush buffered stores. */
+      rwin = 0;
+      for ( ; j < i; j++) {
+        TRef trdptr = emitir(IRT(IR_ADD, IRT_PTR), trdst, ml[j].trofs);
+        emitir(IRT(IR_XSTORE, ml[j].tp), trdptr, ml[j].trval);
+      }
+    }
+  }
+}
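crec_copy_emit does not emit load/store pairs one element at a time: it buffers up to CREC_COPY_REGWIN loaded values and only then flushes the corresponding stores. Grouping the loads ahead of the stores keeps them from being ordered behind stores to a possibly aliasing destination, while the window cap bounds how many values are live at once to roughly what the target can keep in registers. A rough host-side analogue of that windowing, with a hypothetical window of 4 word-sized elements (not the recorder code):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define REGWIN 4  /* Hypothetical window size, cf. CREC_COPY_REGWIN. */

/* Copy n words in windows: all loads of a window first, then its stores. */
static void copy_windowed(uintptr_t *dst, const uintptr_t *src, size_t n)
{
  size_t i = 0;
  while (i < n) {
    uintptr_t buf[REGWIN];  /* Stands in for the buffered register window. */
    size_t j, w = n - i < REGWIN ? n - i : REGWIN;
    for (j = 0; j < w; j++) buf[j] = src[i + j];  /* Loads first... */
    for (j = 0; j < w; j++) dst[i + j] = buf[j];  /* ...then stores. */
    i += w;
  }
}

int main(void)
{
  uintptr_t a[6] = {1, 2, 3, 4, 5, 6}, b[6] = {0};
  copy_windowed(b, a, 6);
  printf("%lu %lu\n", (unsigned long)b[0], (unsigned long)b[5]);  /* 1 6 */
  return 0;
}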
+
+/* Optimized memory copy. */
+static void crec_copy(jit_State *J, TRef trdst, TRef trsrc, TRef trlen,
+                      CType *ct)
+{
+  if (tref_isk(trlen)) {  /* Length must be constant. */
+    CRecMemList ml[CREC_COPY_MAXUNROLL];
+    MSize mlp = 0;
+    CTSize step = 1, len = (CTSize)IR(tref_ref(trlen))->i;
+    IRType tp = IRT_CDATA;
+    int needxbar = 0;
+    if (len == 0) return;  /* Shortcut. */
+    if (len > CREC_COPY_MAXLEN) goto fallback;
+    if (ct) {
+      CTState *cts = ctype_ctsG(J2G(J));
+      lua_assert(ctype_isarray(ct->info) || ctype_isstruct(ct->info));
+      if (ctype_isarray(ct->info)) {
+        CType *cct = ctype_rawchild(cts, ct);
+        tp = crec_ct2irt(cts, cct);
+        if (tp == IRT_CDATA) goto rawcopy;
+        step = lj_ir_type_size[tp];
+        lua_assert((len & (step-1)) == 0);
+      } else if ((ct->info & CTF_UNION)) {
+        step = (1u << ctype_align(ct->info));
+        goto rawcopy;
+      } else {
+        mlp = crec_copy_struct(ml, cts, ct);
+        goto emitcopy;
+      }
+    } else {
+    rawcopy:
+      needxbar = 1;
+      if (LJ_TARGET_UNALIGNED || step >= CTSIZE_PTR)
+        step = CTSIZE_PTR;
+    }
+    mlp = crec_copy_unroll(ml, len, step, tp);
+  emitcopy:
+    if (mlp) {
+      crec_copy_emit(J, ml, mlp, trdst, trsrc);
+      if (needxbar)
+        emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
+      return;
+    }
+  }
+fallback:
+  /* Call memcpy. Always needs a barrier to disable alias analysis. */
+  lj_ir_call(J, IRCALL_memcpy, trdst, trsrc, trlen);
+  emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
+}
+
+/* Generate unrolled fill list, from highest to lowest step size/alignment. */
+static MSize crec_fill_unroll(CRecMemList *ml, CTSize len, CTSize step)
+{
+  CTSize ofs = 0;
+  MSize mlp = 0;
+  IRType tp = IRT_U8 + 2*lj_fls(step);
+  do {
+    while (ofs + step <= len) {
+      if (mlp >= CREC_COPY_MAXUNROLL) return 0;
+      ml[mlp].ofs = ofs;
+      ml[mlp].tp = tp;
+      mlp++;
+      ofs += step;
+    }
+    step >>= 1;
+    tp -= 2;
+  } while (ofs < len);
+  return mlp;
+}
+
+/*
+** Emit stores for fill list.
+** LJ_TARGET_UNALIGNED: may emit unaligned stores (not marked as such).
+*/
+static void crec_fill_emit(jit_State *J, CRecMemList *ml, MSize mlp,
+                           TRef trdst, TRef trfill)
+{
+  MSize i;
+  for (i = 0; i < mlp; i++) {
+    TRef trofs = lj_ir_kintp(J, ml[i].ofs);
+    TRef trdptr = emitir(IRT(IR_ADD, IRT_PTR), trdst, trofs);
+    emitir(IRT(IR_XSTORE, ml[i].tp), trdptr, trfill);
+  }
+}
+
+/* Optimized memory fill. */
+static void crec_fill(jit_State *J, TRef trdst, TRef trlen, TRef trfill,
+                      CTSize step)
+{
+  if (tref_isk(trlen)) {  /* Length must be constant. */
+    CRecMemList ml[CREC_FILL_MAXUNROLL];
+    MSize mlp;
+    CTSize len = (CTSize)IR(tref_ref(trlen))->i;
+    if (len == 0) return;  /* Shortcut. */
+    if (len > CREC_FILL_MAXLEN) goto fallback;
+    if (LJ_TARGET_UNALIGNED || step >= CTSIZE_PTR)
+      step = CTSIZE_PTR;
+    mlp = crec_fill_unroll(ml, len, step);
+    if (!mlp) goto fallback;
+    if (tref_isk(trfill) || ml[0].tp != IRT_U8)
+      trfill = emitconv(trfill, IRT_INT, IRT_U8, 0);
+    if (ml[0].tp != IRT_U8) {  /* Scatter U8 to U16/U32/U64. */
+      if (CTSIZE_PTR == 8 && ml[0].tp == IRT_U64) {
+        if (tref_isk(trfill))  /* Pointless on x64 with zero-extended regs. */
+          trfill = emitconv(trfill, IRT_U64, IRT_U32, 0);
+        trfill = emitir(IRT(IR_MUL, IRT_U64), trfill,
+                        lj_ir_kint64(J, U64x(01010101,01010101)));
+      } else {
+        trfill = emitir(IRTI(IR_MUL), trfill,
+                   lj_ir_kint(J, ml[0].tp == IRT_U16 ? 0x0101 : 0x01010101));
+      }
+    }
+    crec_fill_emit(J, ml, mlp, trdst, trfill);
+  } else {
+  fallback:
+    /* Call memset. Always needs a barrier to disable alias analysis. */
+    lj_ir_call(J, IRCALL_memset, trdst, trfill, trlen);  /* Note: arg order! */
+  }
+  emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
+}
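For fills wider than one byte, crec_fill scatters the fill byte across the store width by multiplying it with 0x0101 (16 bit), 0x01010101 (32 bit) or 0x0101010101010101 (64 bit), so each wide XSTORE writes the byte into every lane. A standalone check of the trick against memset() (endianness does not matter, since every byte is identical):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
  uint8_t v = 0xAB;
  uint64_t wide = (uint64_t)v * UINT64_C(0x0101010101010101);  /* Scatter byte to all 8 lanes. */
  uint8_t a[8], b[8];
  memcpy(a, &wide, 8);  /* One 8-byte store of the scattered value... */
  memset(b, v, 8);      /* ...equals eight 1-byte stores. */
  printf("%s\n", memcmp(a, b, 8) == 0 ? "match" : "mismatch");  /* match */
  return 0;
}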
+
+/* -- Convert C type to C type -------------------------------------------- */
+
+/*
+** This code mirrors the code in lj_cconv.c. It performs the same steps
+** for the trace recorder that lj_cconv.c does for the interpreter.
+**
+** One major difference is that we can get away with much fewer checks
+** here. E.g. checks for casts, constness or correct types can often be
+** omitted, even if they might fail. The interpreter subsequently throws
+** an error, which aborts the trace.
+**
+** All operations are specialized to their C types, so the on-trace
+** outcome must be the same as the outcome in the interpreter. If the
+** interpreter doesn't throw an error, then the trace is correct, too.
+** Care must be taken not to generate invalid (temporary) IR or to
+** trigger asserts.
+*/
+
 /* Determine whether a passed number or cdata number is non-zero. */
 static int crec_isnonzero(CType *s, void *p)
 {
@@ -1298,26 +1527,32 @@ void LJ_FASTCALL recff_ffi_copy(jit_State *J, RecordFFData *rd)
       trlen = emitir(IRTI(IR_FLOAD), J->base[1], IRFL_STR_LEN);
       trlen = emitir(IRTI(IR_ADD), trlen, lj_ir_kint(J, 1));
     }
-    lj_ir_call(J, IRCALL_memcpy, trdst, trsrc, trlen);
-    emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
     rd->nres = 0;
+    crec_copy(J, trdst, trsrc, trlen, NULL);
   }  /* else: interpreter will throw. */
 }
 
 void LJ_FASTCALL recff_ffi_fill(jit_State *J, RecordFFData *rd)
 {
   CTState *cts = ctype_ctsG(J2G(J));
-  TRef tr = J->base[0], trlen = J->base[1], trfill = J->base[2];
-  if (tr && trlen) {
-    tr = crec_ct_tv(J, ctype_get(cts, CTID_P_VOID), 0, tr, &rd->argv[0]);
+  TRef trdst = J->base[0], trlen = J->base[1], trfill = J->base[2];
+  if (trdst && trlen) {
+    CTSize step = 1;
+    if (tviscdata(&rd->argv[0])) {  /* Get alignment of original destination. */
+      CTSize sz;
+      CType *ct = ctype_raw(cts, cdataV(&rd->argv[0])->ctypeid);
+      if (ctype_isptr(ct->info))
+        ct = ctype_rawchild(cts, ct);
+      step = (1u<<ctype_align(lj_ctype_info(cts, ctype_typeid(cts, ct), &sz)));
+    }
+    trdst = crec_ct_tv(J, ctype_get(cts, CTID_P_VOID), 0, trdst, &rd->argv[0]);
     trlen = crec_toint(J, cts, trlen, &rd->argv[1]);
     if (trfill)
      trfill = crec_toint(J, cts, trfill, &rd->argv[2]);
    else
      trfill = lj_ir_kint(J, 0);
-    lj_ir_call(J, IRCALL_memset, tr, trfill, trlen);
-    emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
     rd->nres = 0;
+    crec_fill(J, trdst, trlen, trfill, step);
   }  /* else: interpreter will throw. */
 }
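Taken together, a small constant-length ffi.fill() no longer records a memset() call: recff_ffi_fill derives the starting store width from the destination's ctype alignment, crec_fill widens it to the pointer size where unaligned stores are cheap, scatters the byte, and emits a handful of wide stores. Roughly what a recorded 16-byte fill reduces to on a 64-bit target (illustration of the effect, not the emitted IR or machine code):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Two wide stores of the scattered byte stand in for the memset() call. */
static void fill16_unrolled(void *p, uint8_t v)
{
  uint64_t w = (uint64_t)v * UINT64_C(0x0101010101010101);
  memcpy((char *)p, &w, 8);      /* Stands in for an 8-byte XSTORE at offset 0... */
  memcpy((char *)p + 8, &w, 8);  /* ...and at offset 8. */
}

int main(void)
{
  uint8_t a[16], b[16];
  fill16_unrolled(a, 0x5a);
  memset(b, 0x5a, sizeof(b));
  printf("%s\n", memcmp(a, b, sizeof(a)) == 0 ? "match" : "mismatch");  /* match */
  return 0;
}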