From 6b22c1df166fe4ea81ec622fb40fd6501ab1af3d Mon Sep 17 00:00:00 2001 From: Albert <-> Date: Thu, 18 Jan 2024 20:49:50 -0500 Subject: [PATCH] Strings --- src/lib_aux.c | 3 +- src/lib_base.c | 2 +- src/lib_ffi.c | 2 +- src/lib_io.c | 2 +- src/lib_os.c | 2 +- src/lj_api.c | 2 +- src/lj_arena.c | 65 +++- src/lj_arena.h | 4 +- src/lj_ctype.c | 2 +- src/lj_def.h | 5 +- src/lj_ffrecord.c | 4 +- src/lj_gc.c | 805 +++++++++++++++++++++++++++++++++++++++---- src/lj_gc.h | 181 +++++++++- src/lj_intrin.h | 27 +- src/lj_meta.c | 11 +- src/lj_obj.h | 47 ++- src/lj_opt_fold.c | 4 +- src/lj_record.c | 2 +- src/lj_state.c | 12 +- src/lj_state.h | 1 + src/lj_str.c | 335 +++++++++++------- src/lj_str.h | 6 +- src/luajit_rolling.h | 4 + src/vm_x64.dasc | 3 +- 24 files changed, 1268 insertions(+), 263 deletions(-) diff --git a/src/lib_aux.c b/src/lib_aux.c index 1607648b..0d2696c6 100644 --- a/src/lib_aux.c +++ b/src/lib_aux.c @@ -403,6 +403,7 @@ LUA_API lua_State *luaJIT_newstate(lua_Alloc f, void *ud, luaJIT_allocpages allocp, luaJIT_freepages freep, luaJIT_reallochuge realloch, + luaJIT_reallocraw rawalloc, void *page_ud) { lua_State *L; @@ -417,7 +418,7 @@ LUA_API lua_State *luaJIT_newstate(lua_Alloc f, void *ud, ud = NULL; } #endif - L = lj_newstate(f, ud, allocp, freep, realloch, page_ud); + L = lj_newstate(f, ud, allocp, freep, realloch, rawalloc, page_ud); if (L) { G(L)->panic = panic; diff --git a/src/lib_base.c b/src/lib_base.c index 4e6f8a30..8658672c 100644 --- a/src/lib_base.c +++ b/src/lib_base.c @@ -416,7 +416,7 @@ LJLIB_CF(load) SBufExt *sbx = bufV(L->base); s = sbx->r; len = sbufxlen(sbx); - if (!name) name = &G(L)->strempty; /* Buffers are not NUL-terminated. */ + if (!name) name = G(L)->strempty; /* Buffers are not NUL-terminated. */ } else { GCstr *str = lj_lib_checkstr(L, 1); s = strdata(str); diff --git a/src/lib_ffi.c b/src/lib_ffi.c index 065ac923..563d66f8 100644 --- a/src/lib_ffi.c +++ b/src/lib_ffi.c @@ -860,7 +860,7 @@ LUALIB_API int luaopen_ffi(lua_State *L) LJ_LIB_REG(L, NULL, ffi_clib); LJ_LIB_REG(L, NULL, ffi_callback); /* NOBARRIER: the key is new and lj_tab_newkey() handles the barrier. */ - settabV(L, lj_tab_setstr(L, cts->miscmap, &cts->g->strempty), tabV(L->top-1)); + settabV(L, lj_tab_setstr(L, cts->miscmap, cts->g->strempty), tabV(L->top-1)); L->top--; lj_clib_default(L, tabV(L->top-1)); /* Create ffi.C default namespace. */ lua_pushliteral(L, LJ_OS_NAME); diff --git a/src/lib_io.c b/src/lib_io.c index 21b2d31c..c1ff92b4 100644 --- a/src/lib_io.c +++ b/src/lib_io.c @@ -185,7 +185,7 @@ static int io_file_readlen(lua_State *L, FILE *fp, MSize m) } else { int c = getc(fp); ungetc(c, fp); - setstrV(L, L->top++, &G(L)->strempty); + setstrV(L, L->top++, G(L)->strempty); return (c != EOF); } } diff --git a/src/lib_os.c b/src/lib_os.c index cf0df281..dc6ce0bb 100644 --- a/src/lib_os.c +++ b/src/lib_os.c @@ -221,7 +221,7 @@ LJLIB_CF(os_date) sz += (sz|1); } } else { - setstrV(L, L->top++, &G(L)->strempty); + setstrV(L, L->top++, G(L)->strempty); } return 1; } diff --git a/src/lj_api.c b/src/lj_api.c index 9c462f54..2319835f 100644 --- a/src/lj_api.c +++ b/src/lj_api.c @@ -800,7 +800,7 @@ LUA_API void lua_concat(lua_State *L, int n) copyTV(L, L->top-1, L->top+LJ_FR2); } while (--n > 0); } else if (n == 0) { /* Push empty string. */ - setstrV(L, L->top, &G(L)->strempty); + setstrV(L, L->top, G(L)->strempty); incr_top(L); } /* else n == 1: nothing to do. 
*/ diff --git a/src/lj_arena.c b/src/lj_arena.c index 8147ce52..c117206b 100644 --- a/src/lj_arena.c +++ b/src/lj_arena.c @@ -265,13 +265,22 @@ static void *lj_arena_api_reallochuge(void *ud, void *p, size_t osz, size_t nsz) return newp; } -#define lj_arena_firstalloc(g, arena, id, atype, type) \ +static void *lj_arena_api_rawalloc(void *ud, void *p, size_t osz, size_t nsz) +{ + if (!p) + return RESERVE_AND_COMMIT_PAGES(nsz); + if (!nsz) + UNRESERVE_PAGES(p, osz); + return NULL; +} + +#define lj_arena_firstalloc(g, arena, id, atype, type, init) \ { \ atype *a = (atype *)lj_arena_alloc(&g->gc.ctx); \ if (!a) \ return 0; \ arena = &a->hdr; \ - do_arena_init(a, g, id, atype, type); \ + init(a, g, id, atype, type); \ } static int lj_blob_firstalloc(global_State *g, GCAblob **h) @@ -290,7 +299,7 @@ static int lj_blob_firstalloc(global_State *g, GCAblob **h) int lj_arena_init(struct global_State *g, luaJIT_allocpages allocp, luaJIT_freepages freep, luaJIT_reallochuge realloch, - void *page_ud) + luaJIT_reallocraw rawalloc, void *page_ud) { g->gc.bloblist_alloc = 32; g->gc.bloblist = @@ -306,10 +315,11 @@ int lj_arena_init(struct global_State *g, luaJIT_allocpages allocp, g->gc.bloblist_wr = 1; /* All must be provided to override */ - if (allocp && freep && realloch) { + if (allocp && freep && realloch && rawalloc) { g->gc.ctx.allocpages = allocp; g->gc.ctx.freepages = freep; g->gc.ctx.reallochuge = realloch; + g->gc.ctx.rawalloc = rawalloc; g->gc.ctx.pageud = page_ud; } else { arena_alloc *arenas = (arena_alloc*)g->allocf(g->allocd, NULL, 0, sizeof(arena_alloc)); @@ -319,6 +329,7 @@ int lj_arena_init(struct global_State *g, luaJIT_allocpages allocp, g->gc.ctx.allocpages = &lj_arena_api_allocpages; g->gc.ctx.freepages = &lj_arena_api_freepages; g->gc.ctx.reallochuge = &lj_arena_api_reallochuge; + g->gc.ctx.rawalloc = &lj_arena_api_rawalloc; g->gc.ctx.pageud = arenas; arenas->g = g; @@ -334,11 +345,13 @@ int lj_arena_init(struct global_State *g, luaJIT_allocpages allocp, } /* Allocate all arenas */ - lj_arena_firstalloc(g, g->gc.tab, ~LJ_TTAB, GCAtab, GCtab); - lj_arena_firstalloc(g, g->gc.fintab, ~LJ_TTAB, GCAtab, GCtab); - lj_arena_firstalloc(g, g->gc.uv, ~LJ_TUPVAL, GCAupval, GCupval); - lj_arena_firstalloc(g, g->gc.func, ~LJ_TFUNC, GCAfunc, GCfunc); - lj_arena_firstalloc(g, g->gc.udata, ~LJ_TUDATA, GCAudata, GCudata); + lj_arena_firstalloc(g, g->gc.tab, ~LJ_TTAB, GCAtab, GCtab, do_arena_init); + lj_arena_firstalloc(g, g->gc.fintab, ~LJ_TTAB, GCAtab, GCtab, do_arena_init); + lj_arena_firstalloc(g, g->gc.uv, ~LJ_TUPVAL, GCAupval, GCupval, do_arena_init); + lj_arena_firstalloc(g, g->gc.func, ~LJ_TFUNC, GCAfunc, GCfunc, do_arena_init); + lj_arena_firstalloc(g, g->gc.udata, ~LJ_TUDATA, GCAudata, GCudata, do_arena_init); + lj_arena_firstalloc(g, g->gc.str_small, ~LJ_TSTR, GCAstr, GCstr, do_smallstr_arena_init); + g->gc.str = &lj_arena_str_med(g)->hdr; ((GCAudata *)g->gc.udata)->free4_h = ((GCAudata *)g->gc.udata)->free_h; @@ -359,6 +372,36 @@ static void release_chain_arena(struct global_State *g, GCArenaHdr *a) } } +static void release_string_table(struct global_State *g) +{ + /* The string table is more tricky. Entries are either valid pointers + * or on the freelist. Start by zeroing all freelist entries + */ + int32_t at = g->str.secondary_slot_free_head; + while (at != -1) { + int32_t n = (int32_t)mrefu(g->str.secondary_list[at]); + setmrefu(g->str.secondary_list[at], 0); + at = n; + } + + /* Remaining nonzero entries are valid pointers. 
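 * (The freelist pass above zeroed every non-pointer slot, so anything
 * still nonzero must be a live secondary string table arena.)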
*/ + for (uint32_t i = 0; i < g->str.secondary_list_capacity; i++) { + void *a = mref(g->str.secondary_list[i], void); + if (a) + release_one_arena(g, a); + } +} + +static void release_huge_strings(struct global_State *g) +{ + GCArenaHdr *a = g->gc.str_huge; + while (a) { + GCAstr *s = (GCAstr *)a; + a = a->gray; + lj_arena_freehuge(&g->gc.ctx, s, s->free_h); + } +} + static void release_all_arenas(struct global_State *g) { release_chain_arena(g, g->gc.tab); @@ -366,6 +409,10 @@ static void release_all_arenas(struct global_State *g) release_chain_arena(g, g->gc.uv); release_chain_arena(g, g->gc.func); release_chain_arena(g, g->gc.udata); + release_chain_arena(g, g->gc.str_small); + release_chain_arena(g, g->gc.str); + release_huge_strings(g); + release_string_table(g); for (uint32_t i = 0; i < g->gc.bloblist_wr; i++) { GCAblob *a = g->gc.bloblist[i]; if (a->flags & GCA_BLOB_HUGE) diff --git a/src/lj_arena.h b/src/lj_arena.h index 4147ccb0..e081c774 100644 --- a/src/lj_arena.h +++ b/src/lj_arena.h @@ -11,6 +11,7 @@ typedef unsigned (*luaJIT_allocpages)(void *ud, void **pages, unsigned n); typedef void (*luaJIT_freepages)(void *ud, void **pages, unsigned n); typedef void *(*luaJIT_reallochuge)(void *ud, void *p, size_t osz, size_t nsz); +typedef void *(*luaJIT_reallocraw)(void *ud, void *p, size_t osz, size_t nsz); #define ARENA_SHIFT 16 #define ARENA_SIZE (1u << ARENA_SHIFT) @@ -65,12 +66,13 @@ typedef struct arena_context { luaJIT_allocpages allocpages; luaJIT_freepages freepages; luaJIT_reallochuge reallochuge; + luaJIT_reallocraw rawalloc; void *pageud; } arena_context; int lj_arena_init(struct global_State *g, luaJIT_allocpages allocp, luaJIT_freepages freep, luaJIT_reallochuge realloch, - void *page_ud); + luaJIT_reallocraw rawalloc, void *page_ud); void lj_arena_cleanup(struct global_State *g); /* Add ARENA_FREELIST_CHUNK free arenas */ diff --git a/src/lj_ctype.c b/src/lj_ctype.c index 8a4a55f8..92b905d7 100644 --- a/src/lj_ctype.c +++ b/src/lj_ctype.c @@ -364,7 +364,7 @@ cTValue *lj_ctype_meta(CTState *cts, CTypeID id, MMS mm) } if (ctype_isptr(ct->info) && ctype_isfunc(ctype_get(cts, ctype_cid(ct->info))->info)) - tv = lj_tab_getstr(cts->miscmap, &cts->g->strempty); + tv = lj_tab_getstr(cts->miscmap, cts->g->strempty); else tv = lj_tab_getinth(cts->miscmap, -(int32_t)id); if (tv && tvistab(tv) && diff --git a/src/lj_def.h b/src/lj_def.h index 5bb7173a..cac50a36 100644 --- a/src/lj_def.h +++ b/src/lj_def.h @@ -55,12 +55,13 @@ typedef unsigned int uintptr_t; #define LJ_MAX_BUF LJ_MAX_MEM32 /* Max. buffer length. */ #define LJ_MAX_UDATA LJ_MAX_MEM32 /* Max. userdata length. */ -#define LJ_MAX_STRTAB (1<<26) /* Max. string table size. */ +#define LJ_MAX_STRTAB (1<<22) /* Max. string table size. */ #define LJ_MAX_HBITS 26 /* Max. hash bits. */ #define LJ_MAX_ABITS 28 /* Max. bits of array key. */ #define LJ_MAX_ASIZE ((1<<(LJ_MAX_ABITS-1))+1) /* Max. array part size. */ #define LJ_MAX_COLOSIZE 0 /* Max. elems for colocated array. */ #define LJ_COLO_ENABLED 1 +#define LJ_HUGE_STR_THRESHOLD 4000 #define LJ_MAX_LINE LJ_MAX_MEM32 /* Max. source code line number. */ #define LJ_MAX_XLEVEL 200 /* Max. syntactic nesting level. */ @@ -77,7 +78,7 @@ typedef unsigned int uintptr_t; /* Minimum table/buffer sizes. */ #define LJ_MIN_GLOBAL 6 /* Min. global table size (hbits). */ #define LJ_MIN_REGISTRY 2 /* Min. registry size (hbits). */ -#define LJ_MIN_STRTAB 256 /* Min. string table size (pow2). */ +#define LJ_MIN_STRTAB 64 /* Min. string table size (pow2). */ #define LJ_MIN_SBUF 32 /* Min. 
string buffer length. */ #define LJ_MIN_VECSZ 8 /* Min. size for growable vectors. */ #define LJ_MIN_IRSZ 32 /* Min. size for growable IR. */ diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c index d3029da0..d37e89c9 100644 --- a/src/lj_ffrecord.c +++ b/src/lj_ffrecord.c @@ -849,7 +849,7 @@ static void LJ_FASTCALL recff_string_range(jit_State *J, RecordFFData *rd) J->base[0] = emitir(IRT(IR_SNEW, IRT_STR), trptr, trslen); } else { /* Range underflow: return empty string. */ emitir(IRTGI(IR_LT), trend, trstart); - J->base[0] = lj_ir_kstr(J, &J2G(J)->strempty); + J->base[0] = lj_ir_kstr(J, J2G(J)->strempty); } } else { /* Return string.byte result(s). */ ptrdiff_t i, len = end - start; @@ -886,7 +886,7 @@ static void LJ_FASTCALL recff_string_char(jit_State *J, RecordFFData *rd) tr = emitir(IRTG(IR_BUFPUT, IRT_PGC), tr, J->base[i]); J->base[0] = emitir(IRTG(IR_BUFSTR, IRT_STR), tr, hdr); } else if (i == 0) { - J->base[0] = lj_ir_kstr(J, &J2G(J)->strempty); + J->base[0] = lj_ir_kstr(J, J2G(J)->strempty); } UNUSED(rd); } diff --git a/src/lj_gc.c b/src/lj_gc.c index 31675377..b36c9ad8 100644 --- a/src/lj_gc.c +++ b/src/lj_gc.c @@ -41,6 +41,9 @@ ((x)->gch.gcflags = (((x)->gch.gcflags & (uint8_t)~LJ_GC_COLORS) | (g)->gc.currentblack)) #define isfinalized(u) ((u)->gcflags & LJ_GC_FINALIZED) +#define lj_huge_str_size(len) \ + len + 1 + offsetof(GCAstr, mark[2]) + sizeof(GCstr) + /* -- Mark phase ---------------------------------------------------------- */ #define gray_enq(a, g) \ @@ -106,7 +109,8 @@ static LJ_NOINLINE uintptr_t move_blob(global_State *g, uintptr_t src, MSize sz) /* ORDER LJ_T */ const uint32_t kInverseDividers[~LJ_TNUMX] = { - 0, 0, 0, 0, 0, + 0, 0, 0, 0, + MULTIPLICATIVE_INVERSE(sizeof(GCstr)), MULTIPLICATIVE_INVERSE(sizeof(GCupval)), 0, 0, MULTIPLICATIVE_INVERSE(sizeof(GCfunc)), @@ -115,7 +119,8 @@ const uint32_t kInverseDividers[~LJ_TNUMX] = { MULTIPLICATIVE_INVERSE(sizeof(GCudata)), }; const uint32_t kDividers[~LJ_TNUMX] = { - 0, 0, 0, 0, 0, + 0, 0, 0, 0, + sizeof(GCstr), sizeof(GCupval), 0, 0, sizeof(GCfunc), @@ -131,6 +136,20 @@ static void gc_mark_type(global_State *g, GCobj *o, int gct) { lj_assertG(gct == o->gch.gct, "GC type mismatch obj %d / param %d", o->gch.gct, gct); + if (gct == ~LJ_TSTR) { + /* There is a choice, we can either modify the object here, or we can + * put it on the gray queue and process it normally. If we mark it black + * here we can avoid the mark & bit branch below and triggering further + * barriers and avoid arena traversal. Doing it this way also allows + * a permanent gray state for fixed objects. + */ + GCAcommon *a = arena(o); + uint32_t idx = (uint32_t)(objmask(o) >> 4); + a->mark[aidxh(idx)] |= abit(aidxl(idx)); + o->gch.gcflags = (o->gch.gcflags & ~LJ_GC_BLACKS) | g->gc.currentblack; + return; + } + if (LJ_LIKELY(kInverseDividers[gct])) { /* Generic arena marking */ GCAcommon *a = arena(o); @@ -159,7 +178,7 @@ static void gc_mark_type(global_State *g, GCobj *o, int gct) lj_assertG(iswhite(g, o), "mark of non-white object"); lj_assertG(!checkdead(g, o), "mark of dead object"); white2gray(o); - if (gct != ~LJ_TSTR && gct != ~LJ_TCDATA) { + if (gct != ~LJ_TCDATA) { lj_assertG(gct == ~LJ_TTHREAD || gct == ~LJ_TPROTO || gct == ~LJ_TTRACE, "bad GC type %d", gct); lj_assertG(o->gch.gcflags & LJ_GC_GRAY, "not gray?"); @@ -716,7 +735,7 @@ typedef void (LJ_FASTCALL *GCFreeFunc)(global_State *g, GCobj *o); /* GC free functions for LJ_TSTR .. LJ_TUDATA. 
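 * Strings no longer live on the GC root list, so the ~LJ_TSTR slot below
 * is NULL; string memory is reclaimed by the arena sweeps instead.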
ORDER LJ_T */ static const GCFreeFunc gc_freefunc[] = { - (GCFreeFunc)lj_str_free, + (GCFreeFunc)0, (GCFreeFunc)0, (GCFreeFunc)lj_state_free, (GCFreeFunc)lj_func_freeproto, @@ -779,10 +798,21 @@ static void gc_free_arena(global_State *g, GCArenaHdr *a) if (!a->free[FREE_HIGH_INDEX(otype)]) \ free &= ~(1ull << FREE_HIGH_INDEX(otype)); \ } +#define sweep_fixup2(atype, otype) \ + free &= FREE_MASK(otype) & ~1ull; \ + a->free[0] = 0; \ + a->free[1] &= FREE_LOW2(atype, otype); \ + if (!a->free[1]) \ + free &= ~2ull; \ + if (HIGH_ELEMENTS_OCCUPIED(otype) != 0) { \ + a->free[FREE_HIGH_INDEX(otype)] &= FREE_HIGH(otype); \ + if (!a->free[FREE_HIGH_INDEX(otype)]) \ + free &= ~(1ull << FREE_HIGH_INDEX(otype)); \ + } /* The first arena in the list is the primary one. It is being allocated out of * and can never be put on the freelist or released */ -#define sweep_free(atype, src, freevar) \ +#define sweep_free(atype, src, freevar, cond, ...) \ if (LJ_LIKELY(g->gc.src != &a->hdr)) { \ if (LJ_UNLIKELY(I256_EQ_64_MASK(any, zero) == 0xF)) { \ GCArenaHdr *x = &a->hdr; \ @@ -790,10 +820,11 @@ static void gc_free_arena(global_State *g, GCArenaHdr *a) if (x == g->gc.freevar) { \ g->gc.freevar = x->freenext; \ } \ + __VA_ARGS__ \ gc_free_arena(g, x); \ continue; \ } \ - if (LJ_UNLIKELY(free && !a->free_h)) { \ + if (LJ_UNLIKELY(cond)) { \ free_enq(&a->hdr, g->gc.freevar); \ } \ } @@ -918,7 +949,7 @@ static void *gc_sweep_tab1_i256(global_State *g, GCAtab *a) sweep_fixup(GCAtab, GCtab); - sweep_free(GCAtab, tab, free_tab); + sweep_free(GCAtab, tab, free_tab, free && !a->free_h); a->free_h = free; a = (GCAtab *)a->hdr.next; @@ -962,7 +993,7 @@ static void *gc_sweep_fintab1_i256(global_State *g, GCAtab *a) sweep_fixup(GCAtab, GCtab); - sweep_free(GCAtab, fintab, free_fintab); + sweep_free(GCAtab, fintab, free_fintab, free && !a->free_h); a->free_h = free; a = (GCAtab *)a->hdr.next; @@ -1113,7 +1144,7 @@ static void *gc_sweep_func_i256(global_State *g, GCAfunc *a, uint32_t lim) sweep_fixup(GCAfunc, GCfunc); - sweep_free(GCAfunc, func, free_func); + sweep_free(GCAfunc, func, free_func, free && !a->free_h); a->free_h = free; a = (GCAfunc *)a->hdr.next; @@ -1151,7 +1182,7 @@ static void *gc_sweep_uv_i256(global_State *g, GCAupval *a, uint32_t lim) sweep_fixup(GCAupval, GCupval); - sweep_free(GCAupval, uv, free_uv); + sweep_free(GCAupval, uv, free_uv, free && !a->free_h); a->free_h = free; a = (GCAupval *)a->hdr.next; @@ -1285,6 +1316,330 @@ static void *gc_sweep_udata1(global_State *g, GCAudata *a) return a->hdr.next; } +StrTab *get_strtab(global_State *g, uint32_t hid) +{ + if (hid >= 0xFC000000) { + return strtab_primary(g, hid); + } else { + return strtab_secondary(g, hid); + } +} + +/* Clear one string table entry. 
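 * The slot is nulled and its stored hash is bit-flipped so it can never
 * match a lookup again. prev_len packs the predecessor's hid in its high
 * bits with a live-entry count in the low nibble; when the count reaches
 * zero, a chained (secondary) table is freed.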
+ * Precondition: the entry referred to by hid must have a matching string + */ +static void gc_clear_strtab(global_State *g, uint32_t hid) +{ + StrTab *st; + uint32_t i = hid & 0xF; + g->str.num--; + /* Primary 111111, (22-bit array index), (4-bit entry index) + * Secondary (19-bit array index), (9-bit arena index), (4-bit entry index) + */ + lj_assertG(i != 0xF, "Invalid hid field - low index 15"); + if (hid >= 0xFC000000) { /* Primary always has the top 6 bits set */ + st = strtab_primary(g, hid); + lj_assertG(((hid & 0x3FFFFFF) >> 4) <= g->str.mask, + "Invalid hid field - primary list exceeded"); + lj_assertG(gcrefu(st->strs[i]) > 1, "Clearing null string"); + lj_assertG(((GCstr *)(gcrefu(st->strs[i]) & ~(uintptr_t)1))->hid == + hid, + "Mismatch, str->hid != hid"); + + setgcrefnull(st->strs[i]); + /* By flipping the bits we eliminate matches because the low N bits must + * match the index of the chain + */ + st->hashes[i] = ~st->hashes[i]; + st->prev_len--; + return; + } + lj_assertG((hid >> 13) < g->str.secondary_list_capacity, + "Invalid hid field - secondary list exceeded"); + lj_assertG(((hid >> 4) & 0x1FF) < STRTAB_ENTRIES_PER_ARENA, + "Invalid hid field - bad arena index"); + st = strtab_secondary(g, hid); + lj_assertG(gcrefu(st->strs[i]) > 1, "Clearing null string"); + lj_assertG(((GCstr *)(gcrefu(st->strs[i]) & ~(uintptr_t)1))->hid == + hid, + "Mismatch, str->hid != hid"); + setgcrefnull(st->strs[i]); + st->hashes[i] = ~st->hashes[i]; + st->prev_len--; + if(!(st->prev_len & 0xF)) { + lj_mem_freechainedstrtab(g, st); + } +} + +static void clean_str_small(global_State *g, GCstr *strs, uint64_t mask, uint64_t *free) +{ + do { + uint32_t i = tzcount64(mask); + mask = reset_lowest64(mask); + uint64_t v = free[i]; + do { + uint32_t j = tzcount64(v); + v = reset_lowest64(v); + + gc_clear_strtab(g, strs[(i << 6) + j].hid); + } while (v); + } while (mask); +} + +static void free_str_small(global_State *g, GCArenaHdr *h) +{ + GCstr *s = (GCstr*)h; + /* If the arena is considered dirty then every element is in use */ + for (uint32_t i = ELEMENTS_OCCUPIED(GCAstr, GCstr); + i < ARENA_SIZE / sizeof(GCstr); i += 2) { + gc_clear_strtab(g, s[i].hid); + } +} + +static void *gc_sweep_str_small(global_State *g, GCAstr *a, uint32_t lim) +{ + I256 v; + I256 x; + I256 t; + I256 any; + I256 zero; + I256 mask; + uint64_t temp_buf[67]; + uint64_t *temp = (uint64_t*)(((uintptr_t)temp_buf + 31) & ~31ull); + I256_ZERO(any); + I256_ZERO(zero); + I256_BCAST_8(mask, 0x55); + + for (; a && lim; lim--) { + uint64_t free = ~0ull; + uint32_t count = 0; + uint64_t free_mask = 0; + + lj_assertG((a->hdr.flags & LJ_GC_SWEEPS) != LJ_GC_SWEEPS, "both bits cannot be set!"); + + lj_assertG(!(a->hdr.flags & g->gc.currentsweep), "sweeping swept arena"); + a->hdr.flags ^= LJ_GC_SWEEPS; + + for (uint32_t i = 0; i < SIMD_WORDS_FOR_TYPE(GCstr); i++) { + /* + * count += popcount(mark) + * free = ~fixed & ~mark & 0x55 (implemented as (fixed | mark) ^ 0x55) + * mark = 0 + */ + I256_LOADA(v, &a->mark[i * SIMD_MULTIPLIER]); + I256_LOADA(x, &a->fixed[i * SIMD_MULTIPLIER]); + /* compute popcount(mark[i] | (mark[i+1] << 1)) */ + /* This should be slightly faster than doing it in scalar */ + I256_SHL_64(t, v, 1); + I256_SHUFFLE_64(t, t, 0xF); + I256_OR(t, t, v); + count += popcount64(I256_EXTRACT(t, 0)) + popcount64(I256_EXTRACT(t, 2)); + + I256_OR(t, v, x); + I256_OR(any, any, t); + if (!isminor(g)) + I256_STOREA(&a->mark[i * SIMD_MULTIPLIER], zero); + I256_XOR(t, t, mask); + + I256_LOADA(v, &a->free[i * SIMD_MULTIPLIER]); + 
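      /* v still holds the previous free bitmap and t the recomputed one
       * ((fixed|mark) ^ 0x55, per the note above). The XOR below isolates
       * the slots whose free bit just flipped, i.e. the newly freed
       * strings; temp and free_mask hand this delta to clean_str_small()
       * so their stale string table entries can be purged. */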
I256_XOR(v, v, t); + I256_STOREA(&temp[i * SIMD_MULTIPLIER], v); + free_mask |= I256_NEQ_64_MASK(v, zero) << (4 * i); + I256_STOREA(&a->free[i * SIMD_MULTIPLIER], t); + free ^= I256_EQ_64_MASK(t, zero) << (SIMD_MULTIPLIER * i); + } + + free_mask &= ~1ull; + temp[1] &= FREE_LOW2(GCAstr, GCstr); + if (!temp[1]) + free_mask &= ~2ull; + + sweep_fixup2(GCAstr, GCstr); + + sweep_free(GCAstr, str_small, free_str_small, free && !a->free_h, + if (a->hdr.flags & LJ_GC_SWEEP_DIRTY) free_str_small(g, x); + else clean_str_small(g, (GCstr *)a, free_mask, temp);); + + g->str.num_small += count; + if (a->hdr.flags & LJ_GC_SWEEP_DIRTY) { + g->str.num_dead += ((ARENA_SIZE - sizeof(GCstr)) >> 5) - count; + } else if(free_mask) { + /* This isn't a dirty arena, so we must eagerly clean */ + clean_str_small(g, (GCstr*)a, free_mask, temp); + } + a->free_h = free; + a = (GCAstr *)a->hdr.next; + } + return a; +} + +static void *gc_sweep_str_small1(global_State *g, GCAstr *a) +{ + return gc_sweep_str_small(g, a, 1); +} + +/* Rescan this arena, aggregate adjacent free blocks and chain all free blocks + * together. + */ +static void gc_aggregate_str_freelist(global_State *g, GCAstr *a) +{ + uint32_t *pnext = &a->free_start; + FreeBlock *prev = NULL; + FreeBlock *b; + /* i is the current word, j is the current bit in that word. */ + uint32_t i, j; + /* at is the current byte offset, walk_at is the byte offset of the + * next entry in the previously existing freelist */ + uint32_t at, walk_at = a->free_start; + /* end is one past the end of the chunk starting at 'at' */ + uint32_t end = 0; + /* free contains a 1 if this starts a free block (mark & ~free) */ + uint64_t free; + + a->in_use = ARENA_SIZE - sizeof(GCAstr); + + /* This arena consists of + * Free blocks (mark & ~free) + * Newly freed strings (also mark & ~free) + * Valid strings (~mark & free) + * Extents (~mark & ~free) + * + * All current free blocks are chained, in-order into the freelist, so + * we can identify newly freed strings by whether the next free entry + * is at the expected offset. + */ + for (i = 1; i < 64; i++) { + free = ~a->free[i] & a->mark[i]; + while(free) { + j = tzcount64(free); + free = reset_lowest64(free); + at = (i << 10) | (j << 4); + + b = (FreeBlock *)((char *)a + at); + if (at == walk_at) { + walk_at = b->next; + + /* If this is the expected entry then continue walking the freelist. + * This may coalesce with the previous one + */ + if (at == end) { + prev->size += b->size; + end += b->size << 4; + /* Change to extent */ + a->mark[i] ^= abit(j); + continue; + } + } else { + /* This is a newly freed thing. */ + GCstr *str = (GCstr *)b; + uint32_t len = (str->len >> 4) + 2; + gc_clear_strtab(g, str->hid); + if (at == end) { + /* This coalesces with the previous entry. */ + prev->size += len; + end += len << 4; + /* Change to extent */ + a->mark[i] ^= abit(j); + continue; + } + /* New entry */ + b->size = len; + } + *pnext = at; + pnext = &b->next; + prev = b; + end = at + (b->size << 4); + a->in_use -= b->size << 4; + } + } + *pnext = 0; +} + +/* Allocation arena sweeping + * + * Small strings are collected lazily, to make the actual sweeping very fast. + * Lazy sweeping has a problem with a GC as it will never actually "free" + * memory unless entire arenas get released. This isn't a problem for other + * types as the accounting can still be done, and for small strings we + * can use a cheap popcount to compute the real active consumption, however + * for allocated strings we have to scan. 
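 * (Each arena's in_use is recomputed during this pass and summed into
 * g->gc.strings; the huge-string sweep step later swaps that sum into
 * g->gc.total in place of the old estimate.)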
+ * + * If we didn't compute space used then allocations would act as a ratchet, + * new strings would go into freed space and either not increment the total + * and so not get included in the pacing, or falsely increment it and then + * either never disappear or disappear incorrectly. + * + * The solution is to do a full sweep and eager collection here. We might + * as well also compact free areas. + */ +static void *gc_sweep_str_med(global_State *g, GCAstr *a, uint32_t lim) +{ + I256 v; + I256 m; + I256 f; + I256 b; + I256 any; + I256 new_free; + I256 zero; + I256_ZERO(zero); + for (; a && lim; lim--) { + I256_ZERO(any); + I256_ZERO(new_free); + lj_assertG((a->hdr.flags & LJ_GC_SWEEPS) != LJ_GC_SWEEPS, + "both bits cannot be set!"); + + lj_assertG(!(a->hdr.flags & g->gc.currentsweep), "sweeping swept arena"); + a->hdr.flags ^= LJ_GC_SWEEPS; + + for (uint32_t i = 0; i < SIMD_WORDS_FOR_TYPE(GCstr); i++) { + /* + * (fixed, free, mark) -> (free, mark) + * 111 -> 10 + * 110 -> 10 + * 101 -> INVALID + * 100 -> INVALID + * 011 -> 10 + * 010 -> 01 + * 001 -> 01 + * 000 -> 00 + * free = (free & mark) | fixed + * mark = (free ^ mark) & ~fixed + */ + I256_LOADA(m, &a->mark[i * SIMD_MULTIPLIER]); + I256_LOADA(f, &a->fixed[i * SIMD_MULTIPLIER]); + I256_LOADA(b, &a->free[i * SIMD_MULTIPLIER]); + I256_AND(v, m, b); + I256_OR(v, v, f); + I256_OR(any, any, v); + I256_STOREA(&a->free[i * SIMD_MULTIPLIER], v); + I256_ANDNOT(v, b, v); + I256_OR(new_free, new_free, v); + I256_XOR(v, m, b); + I256_ANDNOT(v, v, f); + I256_STOREA(&a->mark[i * SIMD_MULTIPLIER], v); + } + int has_new_free = 0; + if (I256_EQ_64_MASK(new_free, zero) != 0xF) { + /* Even in the case where all strings are freed, we still need to remove + * the newly freed ones from the string table, so this can't be skipped. + */ + has_new_free = !(a->hdr.flags & LJ_GC_ON_FREE_LIST); + a->hdr.flags |= LJ_GC_ON_FREE_LIST; + gc_aggregate_str_freelist(g, a); + } + sweep_free(GCAstr, str, free_str, has_new_free); + + g->gc.strings += a->in_use; + a = (GCAstr *)a->hdr.next; + } + return a; +} + +static void *gc_sweep_str_med1(global_State *g, GCAstr *a) +{ + return gc_sweep_str_med(g, a, 1); +} + /* Partial sweep of a GC list. */ static GCRef *gc_sweep(global_State *g, GCRef *p, uint32_t lim) { @@ -1305,29 +1660,6 @@ static GCRef *gc_sweep(global_State *g, GCRef *p, uint32_t lim) return p; } -/* Sweep one string interning table chain. Preserves hashalg bit. */ -static void gc_sweepstr(global_State *g, GCRef *chain) -{ - /* Mask with other white and LJ_GC_FIXED. Or LJ_GC_SFIXED on shutdown. */ - int sweep = g->gc.safecolor; - uint8_t mask = isminor(g) ? 0xFF : ~LJ_GC_COLORS; - uintptr_t u = gcrefu(*chain); - GCRef q; - GCRef *p = &q; - GCobj *o; - setgcrefp(q, (u & ~(uintptr_t)1)); - while ((o = gcref(*p)) != NULL) { - if ((o->gch.gcflags & sweep)) { /* Black or current white? */ - o->gch.gcflags &= mask; /* String is alive. */ - p = &o->gch.nextgc; - } else { /* Otherwise string is dead, free it. */ - setgcrefr(*p, o->gch.nextgc); - lj_str_free(g, gco2str(o)); - } - } - setgcrefp(*chain, (gcrefu(q) | (u & 1))); -} - /* Check whether we can clear a key or a value slot from a table. */ static int gc_mayclear(global_State *g, cTValue *o, int val) { @@ -1497,13 +1829,10 @@ void lj_gc_finalize_cdata(lua_State *L) void lj_gc_freeall(global_State *g) { GCArenaHdr *a; - MSize i, strmask; + /* Free everything, except super-fixed objects (the main thread). 
*/ g->gc.safecolor = LJ_GC_SFIXED; gc_fullsweep(g, &g->gc.root); - strmask = g->str.mask; - for (i = 0; i <= strmask; i++) /* Free all string hash chains. */ - gc_sweepstr(g, &g->str.tab[i]); /* Only track malloced data from this point. */ g->gc.total = g->gc.malloc; @@ -1553,7 +1882,7 @@ static void atomic(global_State *g, lua_State *L) /* Prepare for sweep phase. */ /* Gray is for strings which are gray while sweeping */ - g->gc.safecolor = g->gc.currentblack | LJ_GC_GRAY | LJ_GC_FIXED | LJ_GC_SFIXED; + g->gc.safecolor = g->gc.currentblack | LJ_GC_GRAY | LJ_GC_SFIXED; if (!isminor(g)) { /* Need to keep the thread list around */ setgcrefnull(g->gc.grayagain_th); @@ -1570,9 +1899,19 @@ static void atomic(global_State *g, lua_State *L) * next cycle anyway. * This is also why we cannot just assert that total >= malloc + accum * even though in practice that will almost always hold. + * + * String memory is not computed during mark as the objects aren't traversed + * and uniqueness is therefore not enforced. Instead, we use the old string + * memory as an estimate and do a fixup as we sweep. */ - g->gc.total = g->gc.malloc + g->gc.accum; + g->gc.total = g->gc.malloc + g->gc.accum + g->gc.old_strings; g->gc.estimate = g->gc.total - (GCSize)udsize; /* Initial estimate. */ + /* Strings are counted during sweep */ + g->gc.old_strings = g->gc.strings; + g->gc.strings = 0; + + g->str.num_small = 0; + g->str.num_dead = 0; /* We must clear the first arena of each type in here as the allocator * only checks when a new arena is acquired. Alternately a new arena @@ -1585,6 +1924,8 @@ static void atomic(global_State *g, lua_State *L) gc_sweep_func1(g, (GCAfunc *)g->gc.func); gc_sweep_uv1(g, (GCAupval *)g->gc.uv); gc_sweep_udata1(g, (GCAudata *)g->gc.udata); + gc_sweep_str_small1(g, (GCAstr*)g->gc.str_small); + gc_sweep_str_med1(g, (GCAstr*)g->gc.str); lj_assertG(g->gc.bloblist_wr > 0, "no blobs?"); g->gc.bloblist_sweep = g->gc.bloblist_wr - 2; @@ -1592,6 +1933,33 @@ static void atomic(global_State *g, lua_State *L) g->gc.bloblist_usage[g->gc.bloblist_wr - 1] = 0; } +static void gc_sweep_hugestrings(global_State *g, uint32_t count) +{ + GCAstr **n = mref(g->gc.sweep, GCAstr*); + GCAstr *a = *n; + while(a) { + GCstr *s = (GCstr *)((char *)a + offsetof(GCAstr, mark[2])); + if (!--count) { + setmref(g->gc.sweep, n); + return; + } + if((a->free_start | a->mark[0])) { + a->mark[0] = 0; + g->gc.strings += a->free_h; /* This is the total size */ + n = (GCAstr**)&a->hdr.gray; + a = (GCAstr*)a->hdr.gray; + } else { + GCAstr *f = a; + *n = (GCAstr*)a->hdr.gray; + a = (GCAstr*)a->hdr.gray; + gc_clear_strtab(g, s->hid); + + lj_arena_freehuge(&g->gc.ctx, f, lj_huge_str_size(s->len)); + } + } + setmrefu(g->gc.sweep, 0); +} + static void gc_sweepblobs(global_State *g) { GCAblob **list = g->gc.bloblist; @@ -1640,28 +2008,14 @@ static size_t gc_onestep(lua_State *L) if (tvref(g->jit_base)) /* Don't run atomic phase on trace. */ return LJ_MAX_MEM; atomic(g, L); - g->gc.state = GCSsweepstring; /* Start of sweep phase. */ - g->gc.sweepstr = 0; + g->gc.state = GCSsweep; /* Start of sweep phase. */ return 0; - case GCSsweepstring: { - GCSize old = g->gc.total; - gc_sweepstr(g, &g->str.tab[g->gc.sweepstr++]); /* Sweep one chain. */ - if (g->gc.sweepstr > g->str.mask) - g->gc.state = GCSsweep; /* All string hash chains sweeped. 
*/ - lj_assertG(old >= g->gc.total, "sweep increased memory"); - g->gc.estimate -= old - g->gc.total; - return 0; - } case GCSsweep: { GCSize old = g->gc.total; setmref(g->gc.sweep, gc_sweep(g, mref(g->gc.sweep, GCRef), GCSWEEPMAX)); lj_assertG(old >= g->gc.total, "sweep increased memory"); g->gc.estimate -= old - g->gc.total; if (gcref(*mref(g->gc.sweep, GCRef)) == NULL) { - if (g->str.num <= (g->str.mask >> 2) && - g->str.mask > LJ_MIN_STRTAB * 2 - 1) { - lj_str_resize(L, g->str.mask >> 1); /* Shrink string table. */ - } g->gc.state = GCSsweep_blob; } /* TODO: make this non-atomic again */ @@ -1670,8 +2024,44 @@ static size_t gc_onestep(lua_State *L) case GCSsweep_blob: { if (~g->gc.bloblist_sweep) gc_sweepblobs(g); + g->gc.state = GCSsweep_smallstring; + setmref(g->gc.sweep, find_unswept(g, g->gc.str_small->next)); + return GCSWEEPCOST; + } + case GCSsweep_smallstring: { + if (mrefu(g->gc.sweep)) { + setmref(g->gc.sweep, gc_sweep_str_small(g, mref(g->gc.sweep, GCAstr), 10)); + } else { + g->gc.state = GCSsweep_string; + g->gc.strings += (GCSize)g->str.num_small << 5; + g->str.num += g->str.num_small; + setmref(g->gc.sweep, find_unswept(g, g->gc.str->next)); + } + return GCSWEEPCOST; + } + case GCSsweep_string: { + if (mrefu(g->gc.sweep)) { + setmref(g->gc.sweep, gc_sweep_str_med(g, mref(g->gc.sweep, GCAstr), 10)); + } else { + g->gc.state = GCSsweep_hugestring; + setmref(g->gc.sweep, &g->gc.str_huge); + } + return GCSWEEPCOST; + } + case GCSsweep_hugestring: { + if (mrefu(g->gc.sweep)) { + gc_sweep_hugestrings(g, 20); + return GCSWEEPCOST; + } g->gc.state = GCSsweep_func; setmref(g->gc.sweep, find_unswept(g, g->gc.func->next)); + /* String memory is known at this point, fixup total */ + g->gc.total -= g->gc.old_strings; + g->gc.total += g->gc.strings; + if (g->str.num <= (g->str.mask >> 2) && + g->str.mask > LJ_MIN_STRTAB * 2 - 1) { + lj_str_resize(L, g->str.mask >> 1); /* Shrink string table. */ + } return GCSWEEPCOST; } case GCSsweep_func: @@ -1748,6 +2138,8 @@ static size_t gc_onestep(lua_State *L) g->gc.state = GCSpause; /* End of GC cycle. */ g->gc.debt = 0; return 0; + case GCScompact_strtab: + return 0; default: lj_assertG(0, "bad GC state"); return 0; @@ -1949,7 +2341,7 @@ int checkdead(global_State *g, GCobj *o) * If we are reusing an arena we need to move it to the front of the queue for * the type and possibly sweep it */ -#define NEW_ARENA(fn, atype, otype, idtype, var, freevar, sweepfn, ...) \ +#define NEW_ARENA(fn, atype, otype, idtype, var, freevar, sweepfn, init, ...) \ static atype *fn(global_State *g) \ { \ atype *o; \ @@ -1970,7 +2362,7 @@ int checkdead(global_State *g, GCobj *o) o = (atype *)lj_arena_alloc(&g->gc.ctx); \ if (LJ_UNLIKELY(!o)) \ lj_err_mem(&gcref(g->cur_L)->th); \ - do_arena_init(o, g, idtype, atype, otype); \ + init(o, g, idtype, atype, otype); \ g->gc.var->prev = &o->hdr; \ o->hdr.next = g->gc.var; \ g->gc.var = &o->hdr; \ @@ -1979,13 +2371,14 @@ int checkdead(global_State *g, GCobj *o) } /* All bitmap allocators are basically the same */ -#define BM_ALLOC(type, arena, newfn, otype) \ +#define BM_ALLOC(type, arena, newfn, otype, ...) 
\ global_State *g = G(L); \ uint32_t i, j; \ uint64_t f; \ otype *x; \ type *o = (type *)g->gc.arena; \ if (LJ_UNLIKELY(!o->free_h)) { \ + __VA_ARGS__ \ o = newfn(g); \ } \ i = tzcount64(o->free_h); \ @@ -1999,14 +2392,170 @@ int checkdead(global_State *g, GCobj *o) x = &((otype *)o)[(i << 6) + j]; \ lj_assertG((char *)x + sizeof(otype) - (char *)o <= ARENA_SIZE, "out of bounds") -NEW_ARENA(lj_arena_tab, GCAtab, GCtab, ~LJ_TTAB, tab, free_tab, gc_sweep_tab) +NEW_ARENA(lj_arena_tab, GCAtab, GCtab, ~LJ_TTAB, tab, free_tab, gc_sweep_tab, + do_arena_init) NEW_ARENA(lj_arena_fintab, GCAtab, GCtab, ~LJ_TTAB, fintab, free_fintab, - gc_sweep_fintab) -NEW_ARENA(lj_arena_uv, GCAupval, GCupval, ~LJ_TUPVAL, uv, free_uv, gc_sweep_uv) + gc_sweep_fintab, do_arena_init) +NEW_ARENA(lj_arena_uv, GCAupval, GCupval, ~LJ_TUPVAL, uv, free_uv, gc_sweep_uv, do_arena_init) NEW_ARENA(lj_arena_func, GCAfunc, GCfunc, ~LJ_TFUNC, func, free_func, - gc_sweep_func) + gc_sweep_func, do_arena_init) NEW_ARENA(lj_arena_udata, GCAudata, GCudata, ~LJ_TUDATA, udata, free_udata, - gc_sweep_udata, o->free4_h = o->free_h;) + gc_sweep_udata, do_arena_init, o->free4_h = o->free_h;) +NEW_ARENA(lj_arena_str_small, GCAstr, GCstr, ~LJ_TSTR, str_small, + free_str_small, gc_sweep_str_small, do_smallstr_arena_init) + +GCAstr *lj_arena_str_med_new(global_State *g) +{ + GCAstr *o; + FreeBlock *b; + o = (GCAstr *)lj_arena_alloc(&g->gc.ctx); + if (LJ_UNLIKELY(!o)) + lj_err_mem(&gcref(g->cur_L)->th); + /* Zero the first 16 byte slot to clear out any existing object data. */ + memset(o, 0, sizeof(GCAstr) + sizeof(FreeBlock)); + o->hdr.obj_type = ~LJ_TSTR; + o->hdr.flags = g->gc.currentsweep; + o->free_start = sizeof(GCAstr); + o->mark[ELEMENTS_OCCUPIED(GCAstr, GCstr) / 64] = + abit(ELEMENTS_OCCUPIED(GCAstr, GCstr) % 64); + b = (FreeBlock *)(o + 1); + b->size = (ARENA_SIZE - sizeof(GCAstr)) >> 4; + if (LJ_LIKELY(g->gc.str)) + g->gc.str->prev = &o->hdr; + o->hdr.next = g->gc.str; + g->gc.str = &o->hdr; + return o; +} + +GCAstr* lj_arena_str_med(global_State *g) +{ + GCAstr *o; + if (LJ_LIKELY(g->gc.free_str)) { + o = (GCAstr *)g->gc.free_str; + lj_assertG(o->hdr.flags & LJ_GC_ON_FREE_LIST, "LJ_GC_ON_FREE_LIST not set"); + relink(g->gc.free_str, g->gc.str); + g->gc.str = &o->hdr; + o->hdr.freenext = o->hdr.freeprev = NULL; + if (LJ_UNLIKELY(!(g->gc.currentsweep & o->hdr.flags))) { + if (LJ_UNLIKELY(mref(g->gc.sweep, GCAstr) == o)) { + setmref(g->gc.sweep, o->hdr.next); + } + gc_sweep_str_med1(g, o); + } + o->hdr.flags &= ~LJ_GC_ON_FREE_LIST; + lj_assertG(o->free_start != 0, "no free data?"); + return o; + } + return lj_arena_str_med_new(g); +} + +StrTab* lj_mem_allocstrtab(lua_State *L, uint32_t *id) +{ + global_State *g = G(L); + if (g->str.secondary_arena_free_head < 0) { + /* No arenas with free space */ + if(LJ_UNLIKELY(g->str.secondary_slot_free_head < 0)) { + /* Array is full */ + uint32_t newsz = g->str.secondary_list_capacity * 2; + if (newsz > STRING_SECONDARY_MAXIMUM_SIZE) { + if (g->str.secondary_list_capacity == STRING_SECONDARY_MAXIMUM_SIZE) { + lj_err_mem(L); + } + newsz = STRING_SECONDARY_MAXIMUM_SIZE; + } + lj_mem_reallocvec(L, g->str.secondary_list, g->str.secondary_list_capacity, newsz, MRef); + for(uint32_t i = g->str.secondary_list_capacity; i < newsz - 1; i++) { + setmrefu(g->str.secondary_list[i], i+1); + } + setmrefu(g->str.secondary_list[newsz - 1], ~0ull); + g->str.secondary_slot_free_head = (int32_t)g->str.secondary_list_capacity; + g->str.secondary_list_capacity = newsz; + } + + MRef *ref = 
&g->str.secondary_list[g->str.secondary_slot_free_head]; + int32_t next = (int32_t)mrefu(*ref); + GCAstrtab *o = (GCAstrtab *)lj_arena_alloc(&g->gc.ctx); + setmref(*ref, o); + o->next = -1; + o->prev = -1; + o->index = (uint32_t)g->str.secondary_slot_free_head; + o->count = 0; +#if LJ_64 + o->free_h = 0x3F; + for(uint32_t i = 0; i < 5; i++) + o->free[i] = ~0ull; + o->free[5] = 0x1FFFFF; +#else +#error "Need 32-bit string table layout" +#endif + g->str.secondary_arena_free_head = g->str.secondary_slot_free_head; + g->str.secondary_slot_free_head = next; + } + + GCAstrtab *st = mref(g->str.secondary_list[g->str.secondary_arena_free_head], GCAstrtab); + uint32_t i = tzcount32(st->free_h); + uint32_t j = tzcount64(st->free[i]); + StrTab *ret = &st->entries[(i << 6) + j]; + *id = (g->str.secondary_arena_free_head << 13) | (i << 10) | (j << 4); + st->free[i] = reset_lowest64(st->free[i]); + if(!st->free[i]) + st->free_h = reset_lowest32(st->free_h); + if(++st->count == STRTAB_ENTRIES_PER_ARENA) { + g->str.secondary_arena_free_head = st->next; + if (st->next != -1) { + mref(g->str.secondary_list[st->next], GCAstrtab)->prev = -1; + } + } + memset(ret, 0, sizeof(StrTab)); + return ret; +} + +void lj_mem_freechainedstrtab(global_State *g, StrTab *st) +{ + /* Need to unchain this */ + StrTab *prev = get_strtab(g, st->prev_len); + prev->next = st->next; + if (st->next) { + st->next->prev_len = + (st->next->prev_len & 0xF) | (st->prev_len & 0xFFFFFFF0); + } + lj_mem_freestrtab(g, st); +} + +void lj_mem_freestrtab(global_State *g, StrTab *st) +{ + GCAstrtab *a = gcat(st, GCAstrtab); + int32_t index = (int32_t)(st - &a->entries[0]); + + if(!--a->count && index != g->str.secondary_arena_free_head) { + if(a->prev >= 0) { + GCAstrtab *p = mref(g->str.secondary_list[a->prev], GCAstrtab); + p->next = a->next; + } else { + g->str.secondary_arena_free_head = a->next; + } + if(a->next >= 0) { + GCAstrtab *n = mref(g->str.secondary_list[a->next], GCAstrtab); + n->prev = a->prev; + } + + setmrefu(g->str.secondary_list[a->index], g->str.secondary_slot_free_head); + g->str.secondary_slot_free_head = a->index; + lj_arena_free(&g->gc.ctx, a); + return; + } + if(!a->free_h) { + int32_t n = g->str.secondary_arena_free_head; + if(n >= 0) { + mref(g->str.secondary_list[n], GCAstrtab)->prev = a->index; + } + a->prev = -1; + a->next = g->str.secondary_arena_free_head; + g->str.secondary_arena_free_head = a->index; + } + a->free[aidxh(index)] |= abit(aidxl(index)); + a->free_h |= abit(aidxh(index)); +} static void lj_arena_newblobspace(global_State *g) { @@ -2107,10 +2656,128 @@ GCtab *lj_mem_alloctabempty_gc(lua_State *L) return x; } +GCstr* lj_mem_allocstr_huge(lua_State *L, MSize len) +{ + /* mark[0] contains our bit. + * mark[1] is not 16-byte aligned. 
+ * mark[2] is the first legal address + * Special logic prevents fixed from being accessed in fixstring() + */ + global_State *g = G(L); + size_t size = lj_huge_str_size(len); + GCAstr *a = (GCAstr*)lj_arena_allochuge(&g->gc.ctx, size); + a->hdr.gray = g->gc.str_huge; + g->gc.str_huge = &a->hdr; + a->mark[0] = 0; + a->free_h = size; + a->free_start = 0; + g->gc.total += size; + g->gc.strings += size; + return (GCstr *)((char *)a + offsetof(GCAstr, mark[2])); +} + +LJ_STATIC_ASSERT(ELEMENTS_OCCUPIED(GCAstr, GCstr) * sizeof(GCstr) == sizeof(GCAstr)); + +GCstr* lj_mem_allocstr_med(lua_State *L, MSize len) +{ + /* # of blocks required for the payload */ + uint32_t n = (len >> 4) + 2; + global_State *g = G(L); + GCAstr *a = (GCAstr*)g->gc.str; + char *at = (char*)a + a->free_start; + FreeBlock *prev = NULL; + /* # of free arenas to try before getting a new one. + * Maybe vary this by size? + */ + uint32_t count = 3; + + if (!a->free_start) { + a = lj_arena_str_med(g); + at = (char *)a + a->free_start; + } + + while (1) { + FreeBlock *f = (FreeBlock*)at; + uint32_t idx; + if (f->size >= n) { + a->in_use += n << 4; + g->gc.total += n << 4; + g->gc.strings += n << 4; + if (f->size != n) { + f->size -= n; + at += f->size << 4; + idx = (uint32_t)(at - (char*)a) >> 4; + a->free[aidxh(idx)] |= abit(aidxl(idx)); + return (GCstr*)at; + } else if(prev) { + prev->next = f->next; + } else { + a->free_start = f->next; + } + idx = (uint32_t)(at - (char*)a) >> 4; + a->mark[aidxh(idx)] ^= abit(aidxl(idx)); + a->free[aidxh(idx)] ^= abit(aidxl(idx)); + return (GCstr*)at; + } + if (f->next == 0) { + if (!--count) { + /* Give up on the freelist, we are just burning through free arenas */ + a = lj_arena_str_med_new(g); + } else { + a = lj_arena_str_med(g); + } + at = (char *)a + a->free_start; + prev = NULL; + } else { + at = (char*)a + f->next; + prev = f; + } + } +} + GCstr *lj_mem_allocstr(lua_State *L, MSize len) { - GCstr *str = lj_mem_newt(L, lj_str_size(len), GCstr); - return str; + if(len > 15) { + if(len > LJ_HUGE_STR_THRESHOLD) { + return lj_mem_allocstr_huge(L, len); + } + return lj_mem_allocstr_med(L, len); + } + /* Small string. We can't use the macro because string resurrection may + * be randomly clearing free bits and won't fixup free_h. */ + global_State *g = G(L); + uint32_t i, j; + uint64_t f; + GCstr *x; + GCAstr *o = (GCAstr *)g->gc.str_small; + while (1) { + if (LJ_UNLIKELY(!o->free_h)) { + o->hdr.flags |= LJ_GC_SWEEP_DIRTY; + o = lj_arena_str_small(g); + } + i = tzcount64(o->free_h); + if (LJ_LIKELY(o->free[i])) + break; + o->free_h = reset_lowest64(o->free_h); + } + + j = tzcount64(o->free[i]); + lj_assertG((i << 6) + j >= ELEMENTS_OCCUPIED(GCAstr, GCstr), "bad arena"); + f = reset_lowest64(o->free[i]); + o->free[i] = f; + if (!f) + o->free_h = reset_lowest64(o->free_h); + x = &((GCstr *)o)[(i << 6) + j]; + lj_assertG((char *)x + sizeof(GCstr) - (char *)o <= ARENA_SIZE, + "out of bounds"); + + g->gc.total += sizeof(GCstr) * 2; + g->gc.strings += sizeof(GCstr) * 2; + if (o->hdr.flags & LJ_GC_SWEEP_DIRTY) { + /* This string is already in the string table, so remove it. 
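   * The slot's previous occupant was swept lazily, so its string table
   * entry is still live under the stale hid left in this header.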
*/ + gc_clear_strtab(g, x->hid); + } + return x; } GCupval *lj_mem_allocuv(lua_State *L) @@ -2333,3 +3000,13 @@ void lj_mem_registergc_udata(lua_State *L, GCudata *ud) uint32_t idx = aidx(ud); a->fin_req[aidxh(idx)] |= abit(aidxl(idx)); } + +void *lj_mem_newpages(global_State *g, size_t sz) +{ + return g->gc.ctx.rawalloc(g->gc.ctx.pageud, NULL, 0, sz); +} + +void lj_mem_freepages(global_State *g, void *ptr, size_t sz) +{ + g->gc.ctx.rawalloc(g->gc.ctx.pageud, ptr, sz, 0); +} diff --git a/src/lj_gc.h b/src/lj_gc.h index 0e0721c3..4a505eb5 100644 --- a/src/lj_gc.h +++ b/src/lj_gc.h @@ -12,9 +12,12 @@ /* Garbage collector states. Order matters. */ enum { - GCSpause, GCSpropagate, GCSatomic, GCSsweepstring, GCSsweep, - GCSsweep_blob, GCSsweep_func, GCSsweep_tab, GCSsweep_fintab, - GCSsweep_uv, GCSsweep_udata, GCSfinalize_arena, GCSfinalize + GCSpause, GCSpropagate, GCSatomic, GCSsweep, GCSsweep_blob, + GCSsweep_smallstring, GCSsweep_string, GCSsweep_hugestring, + GCSsweep_func, GCSsweep_tab, GCSsweep_fintab, + GCSsweep_uv, GCSsweep_udata, GCSfinalize_arena, GCSfinalize, + /* These last states are optional */ + GCSclean_smallstr, GCScompact_strchain, GCScompact_strtab, }; /* Bitmasks for marked field of GCobj. */ @@ -25,7 +28,6 @@ enum { #define LJ_GC_WEAKKEY 0x10 #define LJ_GC_WEAKVAL 0x08 #define LJ_GC_CDATA_FIN 0x10 -#define LJ_GC_FIXED 0x20 #define LJ_GC_SFIXED 0x40 #define LJ_GC_MARK_MASK 0xE0 @@ -43,6 +45,12 @@ enum { #define LJ_GC_SWEEP0 0x01 #define LJ_GC_SWEEP1 0x02 #define LJ_GC_SWEEPS (LJ_GC_SWEEP0 | LJ_GC_SWEEP1) +/* If set this arena has new free elements and must be rescanned */ +#define LJ_GC_ON_FREE_LIST 0x8 +/* If set lazy sweeping knows this arena is dirty. */ +#define LJ_GC_SWEEP_DIRTY 0x10 + +#define LJ_STR_SECONDARY 0x10 /* Macros to test and set GCobj colors. 
*/ #define iswhite(g, x) (!((x)->gch.gcflags & (g)->gc.currentblackgray)) @@ -58,15 +66,29 @@ LJ_FUNC int checkdead(global_State *g, GCobj *o); #define makewhite(x) \ ((x)->gch.gcflags = ((x)->gch.gcflags & (uint8_t)~LJ_GC_COLORS)) #define black2gray(x) ((x)->gch.gcflags |= (uint8_t)LJ_GC_GRAY) -#define fixstring(s) ((s)->gcflags |= LJ_GC_FIXED) + +#define fixstring(s) \ + { \ + GCstr *str = (s); \ + uint32_t idx = aidx(str); \ + str->gcflags = LJ_GC_GRAY; \ + if (str->len > LJ_HUGE_STR_THRESHOLD) \ + gcat(str, GCAstr)->free_start = 1; \ + else \ + gcat(str, GCAstr)->fixed[aidxh(idx)] |= abit(aidxl(idx)); \ + } #define markfinalized(x) ((x)->gch.gcflags |= LJ_GC_FINALIZED) -#define maybe_resurrect_str(g, s) \ - if (LJ_UNLIKELY(iswhite(g, obj2gco(s)) && (g)->gc.state == GCSsweepstring && \ - ((s)->hash & (g)->str.mask) >= (g)->gc.sweepstr)) \ - { \ - (s)->gcflags |= (g)->gc.currentblack; \ - } +#define maybe_resurrect_str(g, s) \ + { \ + GCAstr *a = gcat(s, GCAstr); \ + uint32_t idx = aidx(s); \ + uint64_t bit = abit(aidxl(idx)); \ + a->mark[aidxh(idx)] |= bit; \ + /* If this is a small string then we may need to clear the free bit */ \ + if ((s)->len <= 15) a->free[aidxh(idx)] &= ~bit; \ + } + #define isminor(g) (g->gc.gcmode & LJ_GCMODE_MINORSWEEP) @@ -163,8 +185,23 @@ static LJ_AINLINE void lj_mem_free(global_State *g, void *p, size_t osize) #define lj_mem_newt(L, s, t) ((t *)lj_mem_new(L, (s))) #define lj_mem_freet(g, p) lj_mem_free(g, (p), sizeof(*(p))) +void *lj_mem_newpages(global_State *g, size_t sz); +void lj_mem_freepages(global_State *g, void *ptr, size_t sz); + /* New GC */ +#define st_ref(o) ((GCstr *)(gcrefu(o) & ~(uintptr_t)1)) +#define st_alg(o) ((gcrefu(o) & 1)) + +typedef struct StrTab { + StrHash hashes[15]; + uint32_t prev_len; + GCRef strs[15]; + struct StrTab *next; +} StrTab; + +LJ_STATIC_ASSERT(sizeof(StrTab) % 64 == 0); + LJ_FUNC GCtab *lj_mem_alloctab(lua_State *L, uint32_t asize); LJ_FUNC GCtab *lj_mem_alloctabempty_gc(lua_State *L); LJ_FUNC GCstr *lj_mem_allocstr(lua_State *L, MSize len); @@ -173,6 +210,9 @@ LJ_FUNC GCupval *lj_mem_allocuv(lua_State *L); LJ_FUNC GCudata *lj_mem_allocudata(lua_State *L, MSize bytes); LJ_FUNC GCfunc *lj_mem_allocfunc(lua_State *L, MSize bytes); +LJ_FUNC StrTab* lj_mem_allocstrtab(lua_State *L, uint32_t *id); +LJ_FUNC void lj_mem_freestrtab(global_State *g, StrTab *st); +LJ_FUNC void lj_mem_freechainedstrtab(global_State *g, StrTab *st); LJ_FUNC void *lj_mem_newblob(lua_State *L, MSize sz); LJ_FUNC void *lj_mem_reallocblob(lua_State *L, void *p, MSize osz, MSize nsz); @@ -211,7 +251,8 @@ typedef uint64_t bitmap_t; #define FREE_EXTRA_MASK(type) (~0ull >> (WORD_BITS - WORDS_FOR_TYPE(type))) #define FREE_MASK(type) (~0ull >> (WORD_BITS - WORDS_FOR_TYPE_UNROUNDED(type))) -#define FREE_LOW(atype, type) ~0ull << ELEMENTS_OCCUPIED(atype, type) +#define FREE_LOW(atype, type) (~0ull << ELEMENTS_OCCUPIED(atype, type)) +#define FREE_LOW2(atype, type) (~0ull << (ELEMENTS_OCCUPIED(atype, type) - 64)) /* The else branch of the ternary is incorrect and must be guarded against, * but it eliminates UB an a warning. It should be resolved at compile time */ #define FREE_HIGH(type) \ @@ -254,6 +295,24 @@ LJ_STATIC_ASSERT(MAX_BMARRAY_SIZE <= WORD_BITS); if (HIGH_ELEMENTS_OCCUPIED(otype) != 0) \ a->free[FREE_HIGH_INDEX(otype)] = FREE_HIGH(otype) +/* Small strings are 32-byte objects in 16-byte granularity so only every other + * object is valid. 
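 * EVERY_OTHER_OBJECT below encodes that pattern: bits 0, 2, 4, ... are set
 * (0x55 in every byte), so the free/mark bitmaps only ever carry bits for
 * even slot indices.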
+ */ +#define EVERY_OTHER_OBJECT 0x5555555555555555ull + +#define do_smallstr_arena_init(a, g, id, atype, otype) \ + memset(a, 0, sizeof(atype)); \ + a->hdr.obj_type = id; \ + a->hdr.flags = g->gc.currentsweep; \ + a->free_h = FREE_MASK(otype) ^ 1; \ + for (uint32_t i = 0; i < WORDS_FOR_TYPE_UNROUNDED(otype); i++) { \ + a->free[i] = EVERY_OTHER_OBJECT; \ + } \ + a->free[0] = 0; \ + a->free[1] = FREE_LOW2(atype, otype) & EVERY_OTHER_OBJECT; \ + if (HIGH_ELEMENTS_OCCUPIED(otype) != 0) \ + a->free[FREE_HIGH_INDEX(otype)] = FREE_HIGH(otype) & EVERY_OTHER_OBJECT + typedef struct GCAcommon { GCArenaHdr hdr; bitmap_t unspecified; @@ -318,6 +377,40 @@ typedef struct GCAfunc { bitmap_t gray[WORDS_FOR_TYPE(GCfunc)]; } GCAfunc; +/* This is designed to overlay on sid & len which are not useful for freeing */ +typedef struct FreeBlock { + uint32_t gc_hdr; + uint32_t next; + uint32_t gcstr_hid; + uint32_t size; /* This is # of 16-byte chunks */ +} FreeBlock; + +/* Ensure the overlay does not clobber hid in the string */ +LJ_STATIC_ASSERT(offsetof(FreeBlock, gcstr_hid) == offsetof(GCstr, hid)); +LJ_STATIC_ASSERT(offsetof(FreeBlock, size) == offsetof(GCstr, len)); + +/* For the general purpose allocator: + * (free mark) + * 0 0 - Extent + * 0 1 - Free block + * 1 0 - In use, white + * 1 1 - In use, black + * + * Allocation is a simple LL chained series of FreeBlock + */ +typedef struct GCAstr { + GCArenaHdr hdr; + bitmap_t free_h; + uint32_t in_use; + uint32_t free_start; + bitmap_t mark[64]; + /* No gray bitmap, padding not required. + * fixed acts as the old GC_FIXED and acts as a permanent mark. + */ + bitmap_t fixed[64]; + bitmap_t free[64]; +} GCAstr; + /* All offsets must match the common arena */ LJ_STATIC_ASSERT(offsetof(GCAtab, gray) == offsetof(GCAcommon, gray)); LJ_STATIC_ASSERT(offsetof(GCAtab, mark) == offsetof(GCAcommon, mark)); @@ -327,6 +420,7 @@ LJ_STATIC_ASSERT(offsetof(GCAupval, gray) == offsetof(GCAcommon, gray)); LJ_STATIC_ASSERT(offsetof(GCAupval, mark) == offsetof(GCAcommon, mark)); LJ_STATIC_ASSERT(offsetof(GCAfunc, gray) == offsetof(GCAcommon, gray)); LJ_STATIC_ASSERT(offsetof(GCAfunc, mark) == offsetof(GCAcommon, mark)); +LJ_STATIC_ASSERT(offsetof(GCAstr, mark) == offsetof(GCAcommon, mark)); #if LJ_HASJIT typedef struct GCAtrace { @@ -342,4 +436,67 @@ LJ_STATIC_ASSERT(offsetof(GCAtrace, gray) == offsetof(GCAcommon, gray)); LJ_STATIC_ASSERT(offsetof(GCAtrace, mark) == offsetof(GCAcommon, mark)); #endif +/* Strings & the String Table + * + * Strings are arena allocated like other objects. Strings are assumed to be + * 16-byte granularity in all cases, which is the smallest permitted size. + * + * Strings are classified into small, medium or huge. + * Small strings have a (NUL-inclusive) payload of <= 16 bytes and are bitmap + * allocated with every other entry being ignored by the sweep code. + * Huge strings have a payload > 3000 bytes and have a dedicated allocation + * Medium strings use a scanning allocator in custom arena. + * + * The string table consists of two areas, the primary and secondary areas. + * Each area consists of some number of StrTab objects, each containing up to + * 15 (hash, GCstr*) mappings. + * The primary area is an array of up to LJ_MAX_STRTAB entries. + * The secondary area is an array of arenas each split into + * STRTAB_ENTRIES_PER_ARENA entries. 
+ * + * Each string holds an reference to where it lives in the string table + * For primary: + * 111111, (22-bit array index), (4-bit entry index) + * For secondary: + * (19-bit array index), (9-bit arena index), (4-bit entry index) + * + * This implies that the theoretical maximum number of strings allowed is + * 15 * LJ_MAX_STRTAB + 15 * STRTAB_ENTRIES_PER_ARENA * 0x7DFFF + * At present for 64-bit this is 62914560 + 2639825925 = 2702740485 + * + * Lazy string collection + * Small strings are collected normally but are removed from the string + * table lazily. This allows string sweep to be extremely fast. + * + * Full collection is done when the new string is allocated - we are + * touching the memory anyway, or when the entire arena is being freed. + */ + +#define STRTAB_ENTRIES_PER_ARENA ((ARENA_SIZE - 64) / sizeof(StrTab)) +#define STRTAB_UPPER_SHIFT ((ARENA_SIZE - 64) / sizeof(StrTab)) + +#define STRING_SECONDARY_MAXIMUM_SIZE 0x7DFFF + +/* */ +typedef struct GCAstrtab { + int32_t next; + int32_t prev; + int32_t index; + uint16_t count; + uint16_t free_h; + /* TODO: layout for 32-bit mode */ + uint64_t free[6]; + StrTab entries[STRTAB_ENTRIES_PER_ARENA]; +} GCAstrtab; + +LJ_STATIC_ASSERT(sizeof(GCAstrtab) == ARENA_SIZE); + +#define strtab_primary(g, id) \ + &mref((g)->str.tab, StrTab)[((id) >> 4) & 0x3FFFFF] +#define strtab_secondary(g, id) \ + &mref((g)->str.secondary_list[(id) >> 13], GCAstrtab) \ + ->entries[((id) >> 4) & 0x1FF] + +GCAstr *lj_arena_str_med(global_State *g); + #endif diff --git a/src/lj_intrin.h b/src/lj_intrin.h index d0c9a5a5..d47d5c55 100644 --- a/src/lj_intrin.h +++ b/src/lj_intrin.h @@ -15,8 +15,18 @@ #define tzcount64(x) (unsigned)_tzcnt_u64(x) /* x & (x - 1) */ -#define reset_lowest32(x) _blsr_u32(x) -#define reset_lowest64(x) _blsr_u64(x) +#define reset_lowest32(x) (uint32_t)_blsr_u32(x) +#define reset_lowest64(x) (uint64_t)_blsr_u64(x) + +/* x ^ (x - 1) */ +#define mask_lowest32(x) (uint32_t)_blsmsk_u32(x) +#define mask_lowest64(x) (uint64_t)_blsmsk_u64(x) + +/* x & ~y */ +#define and_not32(x, y) (uint32_t)_andn_u32(y, x) +#define and_not64(x, y) (uint64_t)_andn_u64(y, x) + +#define popcount64(x) (unsigned)_mm_popcnt_u64(x) /* 256 bit SIMD */ #define LJ_SIMD_256 1 @@ -24,12 +34,21 @@ #define I256_ZERO(o) o = _mm256_setzero_si256() /* vpxor a, a, a; vpcmpeqq a, a, a sets all bits to 1 */ #define I256_ONES(o) o = _mm256_cmpeq_epi64(_mm256_setzero_si256(), _mm256_setzero_si256()) +#define I256_BCAST_8(o, v) o = _mm256_set1_epi8((char)v) +#define I256_BCAST_32(o, v) o = _mm256_set1_epi32((int)v) +#define I256_NEQ_64_MASK(x, y) ((uint64_t)_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_cmpeq_epi64(x, y))) ^ 0xF) #define I256_EQ_64_MASK(x, y) (uint64_t)_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_cmpeq_epi64(x, y))) +#define I256_EQ_32_MASK(x, y) (uint64_t)_mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpeq_epi32(x, y))) #define I256_AND(o, x, y) o = _mm256_and_si256(x, y) #define I256_XOR(o, x, y) o = _mm256_xor_si256(x, y) #define I256_OR(o, x, y) o = _mm256_or_si256(x, y) -#define I256_LOADA(o, ptr) o = _mm256_load_si256((__m256i *)ptr) -#define I256_STOREA(ptr, v) _mm256_store_si256((__m256i *)ptr, v) +#define I256_ANDNOT(o, x, y) o = _mm256_andnot_si256(y, x) /* x & ~y */ +#define I256_SHL_64(o, x, n) o = _mm256_slli_epi64(x, n) +#define I256_SHUFFLE_64(o, x, mask) \ + o = _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(x), mask)) +#define I256_LOADA(o, ptr) o = _mm256_load_si256((__m256i *)(ptr)) +#define I256_STOREA(ptr, v) 
_mm256_store_si256((__m256i *)(ptr), v) +#define I256_EXTRACT(x, n) (uint64_t)_mm256_extract_epi64(x, n) #else #error "No intrinsics defined for arch" diff --git a/src/lj_meta.c b/src/lj_meta.c index 5940097e..65826602 100644 --- a/src/lj_meta.c +++ b/src/lj_meta.c @@ -34,12 +34,19 @@ void lj_meta_init(lua_State *L) global_State *g = G(L); const char *p, *q; uint32_t mm; + GCstr *prev = NULL; for (mm = 0, p = metanames; *p; mm++, p = q) { GCstr *s; for (q = p+2; *q && *q != '_'; q++) ; s = lj_str_new(L, p, (size_t)(q-p)); - /* NOBARRIER: g->gcroot[] is a GC root. */ - setgcref(g->gcroot[GCROOT_MMNAME+mm], obj2gco(s)); + lj_assertX(q-p <= 15, "metamethod %d not short string", mm); + lj_assertX(!prev || prev + 2 == s, "unexpected string ordering"); + prev = s; + (void)prev; + fixstring(s); + if (mm == 0) { + setgcrefp(g->meta_root, s); + } } } diff --git a/src/lj_obj.h b/src/lj_obj.h index ee537583..a6bc2f5d 100644 --- a/src/lj_obj.h +++ b/src/lj_obj.h @@ -62,7 +62,7 @@ typedef struct GCRef { /* Common GC header for all collectable objects. */ #define GCHeader uint8_t gcflags; uint8_t gct -/* This occupies 6 bytes, so use the next 2 bytes for non-32 bit fields. */ +/* This occupies 2 bytes, so use the next 2 bytes for non-32 bit fields. */ #if LJ_GC64 #define gcref(r) ((GCobj *)(r).gcptr64) @@ -108,6 +108,7 @@ typedef struct GCRef { ** sure nothing invokes the GC inbetween. ** - The target and the source are the same object (self-reference). ** - The target already contains the object (e.g. moving elements around). +** - The target is a fixed string (uncollectible). ** ** The most common case is a store to a stack slot. All other cases where ** a barrier has been omitted are annotated with a NOBARRIER comment. @@ -249,6 +250,7 @@ typedef const TValue cTValue; ** ------MSW------.------LSW------ ** primitive types |1..1|itype|1..................1| ** GC objects |1..1|itype|-------GCRef--------| +** 32bit GC objects |1..1|itype|ptyp|--GCRef ofs----| ** lightuserdata |1..1|itype|seg|------ofs-------| ** int (LJ_DUALNUM) |1..1|itype|0..0|-----int-------| ** number ------------double------------- @@ -307,13 +309,15 @@ typedef uint32_t StrID; /* String ID. */ typedef struct GCstr { GCHeader; uint8_t reserved; /* Used by lexer for fast lookup of reserved words. */ - uint8_t hashalg; /* Hash algorithm. */ - GCRef nextgc; + uint8_t unused; StrID sid; /* Interned string ID. */ - StrHash hash; /* Hash of string. */ + uint32_t hid; /* Location of hash table entry. */ MSize len; /* Size of string. */ } GCstr; +/* String payloads can be assumed to be 16-byte aligned. */ +LJ_STATIC_ASSERT(sizeof(GCstr) == 16); + #define strref(r) (&gcref((r))->str) #define strdata(s) ((const char *)((s)+1)) #define strdatawr(s) ((char *)((s)+1)) @@ -584,8 +588,6 @@ MMDEF(MMENUM) /* GC root IDs. */ typedef enum { - GCROOT_MMNAME, /* Metamethod names. */ - GCROOT_MMNAME_LAST = GCROOT_MMNAME + MM__MAX-1, GCROOT_BASEMT, /* Metatables for base types. */ GCROOT_BASEMT_NUM = GCROOT_BASEMT + ~LJ_TNUMX, GCROOT_IO_INPUT, /* Userdata for default I/O input file. */ @@ -593,9 +595,9 @@ typedef enum { GCROOT_MAX } GCRootID; -#define basemt_it(g, it) ((g)->gcroot[GCROOT_BASEMT+~(it)]) -#define basemt_obj(g, o) ((g)->gcroot[GCROOT_BASEMT+itypemap(o)]) -#define mmname_str(g, mm) (strref((g)->gcroot[GCROOT_MMNAME+(mm)])) +#define basemt_it(g, it) ((g)->gcroot[GCROOT_BASEMT+~(it)]) +#define basemt_obj(g, o) ((g)->gcroot[GCROOT_BASEMT+itypemap(o)]) +#define mmname_str(g, mm) (strref((g)->meta_root) + 2*(mm)) /* Garbage collector state. 
*/
 typedef struct GCState {
@@ -613,7 +615,6 @@ typedef struct GCState {
 #else
   uint8_t unused1;
 #endif
-  MSize sweepstr;	/* Sweep position in string table. */
   GCRef root;		/* List of all collectable objects. */
   MRef sweep;		/* Sweep position in root list. */
   GCRef gray;		/* List of gray objects. */
@@ -627,6 +628,8 @@ typedef struct GCState {
   GCSize estimate;	/* Estimate of memory actually in use. */
   GCSize accum;		/* Accumulated memory marked. */
   GCSize malloc;	/* Malloced memory. */
+  GCSize strings;	/* String memory. */
+  GCSize old_strings;	/* String memory at the last GC. */
   MSize stepmul;	/* Incremental GC step granularity. */
   MSize pause;		/* Pause between successive GC cycles. */
 #if LJ_64
@@ -641,6 +644,8 @@ typedef struct GCState {
   GCArenaHdr *uv;
   GCArenaHdr *func;
   GCArenaHdr *udata;
+  GCArenaHdr *str_small;
+  GCArenaHdr *str;
 
   /* This is the allocated-from blob arena. Never NULL */
   GCAblob *blob_generic;
@@ -663,18 +668,30 @@ typedef struct GCState {
   GCArenaHdr *free_uv;
   GCArenaHdr *free_func;
   GCArenaHdr *free_udata;
+  GCArenaHdr *free_str_small;
+  GCArenaHdr *free_str;
+
+  /* Huge string list. Chains with 'gray'. */
+  GCArenaHdr *str_huge;
 } GCState;
 
 /* String interning state. */
 typedef struct StrInternState {
-  GCRef *tab;		/* String hash table anchors. */
+  MRef tab;		/* String hash table. */
   MSize mask;		/* String hash mask (size of hash table - 1). */
   MSize num;		/* Number of strings in hash table. */
-  StrID id;		/* Next string ID. */
+  MSize num_small;	/* Number of small strings. */
+  MSize num_dead;	/* Dead entries pending lazy removal. */
+  StrID id;		/* Next string ID. */
   uint8_t idreseed;	/* String ID reseed counter. */
   uint8_t second;	/* String interning table uses secondary hashing. */
   uint8_t unused1;
   uint8_t unused2;
+  uint32_t secondary_list_capacity;
+  int32_t secondary_slot_free_head;
+  int32_t secondary_arena_free_head;
+  /* Each entry is either a GCAstrtab* or a freelist chain entry. */
+  MRef *secondary_list;
   LJ_ALIGN(8) uint64_t seed;	/* Random string seed. */
 } StrInternState;
 
@@ -683,8 +700,8 @@ typedef struct global_State {
   lua_Alloc allocf;	/* Memory allocator. */
   void *allocd;		/* Memory allocator data. */
   GCState gc;		/* Garbage collector. */
-  GCstr strempty;	/* Empty string. */
-  uint8_t stremptyz;	/* Zero terminator of empty string. */
+  GCstr *strempty;	/* Empty string. */
+  uint8_t unused;
   uint8_t hookmask;	/* Hook mask. */
   uint8_t dispatchmode;	/* Dispatch mode. */
   uint8_t vmevmask;	/* VM event mask. */
@@ -706,6 +723,7 @@ typedef struct global_State {
   MRef jit_base;	/* Current JIT code L->base or NULL. */
   MRef ctype_state;	/* Pointer to C type state. */
   PRNGState prng;	/* Global PRNG state. */
+  GCRef meta_root;	/* Anchor of the metamethod name strings. */
   GCRef gcroot[GCROOT_MAX];	/* GC roots. 
*/ } global_State; @@ -801,7 +819,6 @@ LJ_STATIC_ASSERT(offsetof(GChead, gclist) == offsetof(GCproto, gclist)); LJ_STATIC_ASSERT(offsetof(GChead, nextgc) == offsetof(lua_State, nextgc)); LJ_STATIC_ASSERT(offsetof(GChead, nextgc) == offsetof(GCproto, nextgc)); LJ_STATIC_ASSERT(offsetof(GChead, nextgc) == offsetof(GCcdata, nextgc)); -LJ_STATIC_ASSERT(offsetof(GChead, nextgc) == offsetof(GCstr, nextgc)); typedef union GCobj { GChead gch; diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index ce78505b..6e96643e 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -514,7 +514,7 @@ LJFOLD(XSNEW any KINT) LJFOLDF(kfold_snew_empty) { if (fright->i == 0) - return lj_ir_kstr(J, &J2G(J)->strempty); + return lj_ir_kstr(J, J2G(J)->strempty); return NEXTFOLD; } @@ -653,7 +653,7 @@ LJFOLDF(bufstr_kfold_cse) if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD)) { if (fleft->o == IR_BUFHDR) { /* No put operations? */ if (fleft->op2 == IRBUFHDR_RESET) /* Empty buffer? */ - return lj_ir_kstr(J, &J2G(J)->strempty); + return lj_ir_kstr(J, J2G(J)->strempty); fins->op1 = fleft->op1; fins->op2 = fleft->prev; /* Relies on checks in bufput_append. */ return CSEFOLD; diff --git a/src/lj_record.c b/src/lj_record.c index e3d94e9b..ad8d010e 100644 --- a/src/lj_record.c +++ b/src/lj_record.c @@ -2101,7 +2101,7 @@ static TRef rec_cat(jit_State *J, BCReg baseslot, BCReg topslot) topslot = J->maxslot--; *xbase = tr; top = xbase; - setstrV(J->L, &ix.keyv, &J2G(J)->strempty); /* Simulate string result. */ + setstrV(J->L, &ix.keyv, J2G(J)->strempty); /* Simulate string result. */ } else { J->maxslot = topslot-1; copyTV(J->L, &ix.keyv, &J->L->base[topslot]); diff --git a/src/lj_state.c b/src/lj_state.c index c4676cff..01b3438c 100644 --- a/src/lj_state.c +++ b/src/lj_state.c @@ -187,7 +187,6 @@ static void close_state(lua_State *L) lj_gc_freeall(g); lj_assertG(gcref(g->gc.root) == obj2gco(L), "main thread is not first GC object"); - lj_assertG(g->str.num == 0, "leaked %d strings", g->str.num); lj_trace_freestate(g); #if LJ_HASFFI lj_ctype_freestate(g); @@ -202,6 +201,8 @@ static void close_state(lua_State *L) } #endif lj_arena_cleanup(g); + lj_mem_freevec(g, g->str.secondary_list, g->str.secondary_list_capacity, + MRef); lj_assertG(g->gc.ctx.mem_commit == 0, "memory leak of %u arenas", g->gc.ctx.mem_commit); lj_assertG(g->gc.ctx.mem_huge == 0, "memory leak of %llu huge arena bytes", @@ -222,7 +223,7 @@ lua_State *lj_state_newstate(lua_Alloc allocf, void *allocd) #else LUA_API lua_State *lua_newstate(lua_Alloc allocf, void *allocd) { - return lj_newstate(allocf, allocd, NULL, NULL, NULL, NULL); + return lj_newstate(allocf, allocd, NULL, NULL, NULL, NULL, NULL); } #endif @@ -230,6 +231,7 @@ lua_State *lj_newstate(lua_Alloc allocf, void *allocd, luaJIT_allocpages allocp, luaJIT_freepages freep, luaJIT_reallochuge realloch, + luaJIT_reallocraw rawalloc, void *page_ud) { PRNGState prng; @@ -257,19 +259,17 @@ lua_State *lj_newstate(lua_Alloc allocf, void *allocd, g->allocf = allocf; g->allocd = allocd; g->gc.currentsweep = LJ_GC_SWEEP0; - if (!lj_arena_init(g, allocp, freep, realloch, page_ud)) { + if (!lj_arena_init(g, allocp, freep, realloch, rawalloc, page_ud)) { close_state(L); return NULL; } L->gct = ~LJ_TTHREAD; - L->gcflags = LJ_GC_FIXED | LJ_GC_SFIXED; /* Prevent free. */ + L->gcflags = LJ_GC_SFIXED; /* Prevent free. 
*/
   L->dummy_ffid = FF_C;
   setmref(L->glref, g);
   g->gc.currentblack = LJ_GC_BLACK0;
   g->gc.currentblackgray = LJ_GC_BLACK0 | LJ_GC_GRAY;
   g->gc.safecolor = 0;
-  g->strempty.gcflags = 0;
-  g->strempty.gct = ~LJ_TSTR;
   g->prng = prng;
 #ifndef LUAJIT_USE_SYSMALLOC
   if (allocf == lj_alloc_f) {
diff --git a/src/lj_state.h b/src/lj_state.h
index b0b4dc15..d3e31c86 100644
--- a/src/lj_state.h
+++ b/src/lj_state.h
@@ -39,6 +39,7 @@ LJ_FUNC lua_State *lj_newstate(lua_Alloc f, void *ud,
                                luaJIT_allocpages allocp,
                                luaJIT_freepages freep,
                                luaJIT_reallochuge realloch,
+                               luaJIT_reallocraw rawalloc,
                                void *page_ud);
 
diff --git a/src/lj_str.c b/src/lj_str.c
index f952850c..77759921 100644
--- a/src/lj_str.c
+++ b/src/lj_str.c
@@ -12,6 +12,7 @@
 #include "lj_str.h"
 #include "lj_char.h"
 #include "lj_prng.h"
+#include "lj_intrin.h"
 
 /* -- String helpers ------------------------------------------------------ */
 
@@ -123,95 +124,158 @@ static LJ_NOINLINE StrHash hash_dense(uint64_t seed, StrHash h,
 
 /* -- String interning ---------------------------------------------------- */
 
-#define LJ_STR_MAXCOLL 32
+#define LJ_STR_MAXCOLL 32
+#define LJ_STR_MAXCHAIN 32
+
+static uint32_t lj_str_get_free(StrTab *st)
+{
+#if LJ_64
+  I256 a, b, c, d, z;
+  uint32_t t1, t2;
+  I256_ZERO(z);
+  I256_LOADA(a, &st->strs[0]);
+  I256_LOADA(b, &st->strs[4]);
+  I256_LOADA(c, &st->strs[8]);
+  I256_LOADA(d, &st->strs[12]);
+  t1 = I256_EQ_64_MASK(a, z) | (I256_EQ_64_MASK(b, z) << 4);
+  t2 = (I256_EQ_64_MASK(c, z) << 8) | (I256_EQ_64_MASK(d, z) << 12);
+  return t1 | t2;
+#else
+  /* 32 bit: two loads cover all 4-byte ref slots, 8 lanes each. */
+  I256 x, z;
+  uint32_t ret;
+  I256_ZERO(z);
+  I256_LOADA(x, &st->strs[0]);
+  ret = I256_EQ_32_MASK(x, z);
+  I256_LOADA(x, &st->strs[8]);
+  return ret | (I256_EQ_32_MASK(x, z) << 8);
+#endif
+}
+
+static void lj_str_insert(lua_State *L, GCstr *s, StrHash hash, int hashalg)
+{
+  global_State *g = G(L);
+  uint32_t index = hash & g->str.mask;
+  uint32_t hid = (index << 4) | 0xFC000000;
+  StrTab *st = &mref(g->str.tab, StrTab)[index];
+#if LUAJIT_SECURITY_STRHASH
+  /* Check for algorithm mismatch, rehash sparse into the dense list. */
+  if ((st->prev_len & LJ_STR_SECONDARY) && !hashalg) {
+    hashalg = 1;
+    hash = hash_dense(g->str.seed, hash, strdata(s), s->len);
+    index = hash & g->str.mask;
+    hid = (index << 4) | 0xFC000000;
+    st = &mref(g->str.tab, StrTab)[index];
+  }
+#endif
+  while (1) {
+    if ((st->prev_len & 0xF) < 15) {
+      uint32_t i = tzcount32(lj_str_get_free(st));
+      lj_assertG(!gcrefu(st->strs[i]), "bad stringtable index, occupied");
+      lj_assertG(i == 0 || gcrefu(st->strs[i-1]), "bad stringtable index, nonsequential");
+      st->hashes[i] = hash;
+      /* NOBARRIER: string table is cleared on demand */
+      setgcrefp(st->strs[i], (uintptr_t)s | hashalg);
+      st->prev_len++;
+      if (!hid) {
+        /* There are three options here:
+         * 1. Directly compute it from the arena header.
+         * 2. Find an existing string and reuse its hid.
+         * 3. Use the prev_len value of ->next.
+         * Since next isn't guaranteed to be populated, and while we *should*
+         * have a valid entry we'd have to find it, it's best to just compute
+         * it directly.
+         */
+        GCAstrtab *a = gcat(st, GCAstrtab);
+        hid =
+            ((uint32_t)a->index << 13) | ((uint32_t)(st - &a->entries[0]) << 4);
+      }
+      s->hid = hid | i;
+      return;
+    }
+    if (!st->next) {
+      StrTab *next = lj_mem_allocstrtab(L, &s->hid);
+      st->next = next;
+      next->hashes[0] = hash;
+      /* NOBARRIER: string table is cleared on demand */
+      setgcrefp(next->strs[0], (uintptr_t)s | hashalg);
+      /* We know all strings in st are valid, but we don't easily know st's
+       * own ID going forwards; however, every string carries it in hid.
+       * String 1 is guaranteed to hold one with the correct value.
+       */
+      next->prev_len = st_ref(st->strs[1])->hid;
+      return;
+    }
+    st = st->next;
+    hid = 0;
+  }
+}
 
 /* Resize the string interning hash table (grow and shrink). */
 void lj_str_resize(lua_State *L, MSize newmask)
 {
   global_State *g = G(L);
-  GCRef *newtab, *oldtab = g->str.tab;
-  MSize i;
+  StrTab *newtab, *oldtab = mref(g->str.tab, StrTab);
+  MSize i, j;
+  MSize oldmask = g->str.mask;
+  StrTab *tab;
 
-  /* No resizing during GC traversal or if already too big. */
-  if (g->gc.state == GCSsweepstring || newmask >= LJ_MAX_STRTAB-1)
+  /* No resizing if already too big. */
+  if (newmask >= LJ_MAX_STRTAB-1)
     return;
-  newtab = lj_mem_newvec(L, newmask+1, GCRef);
-  memset(newtab, 0, (newmask+1)*sizeof(GCRef));
+  newtab = (StrTab *)lj_mem_newpages(g, (newmask + 1) * sizeof(StrTab));
+  /* Fresh pages are already zeroed. */
 
 #if LUAJIT_SECURITY_STRHASH
   /* Check which chains need secondary hashes. */
   if (g->str.second) {
-    int newsecond = 0;
+    uint32_t newsecond = 0;
     /* Compute primary chain lengths. */
-    for (i = g->str.mask; i != ~(MSize)0; i--) {
-      GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1);
-      while (o) {
-        GCstr *s = gco2str(o);
-        MSize hash = s->hashalg ? hash_sparse(g->str.seed, strdata(s), s->len) :
-                                  s->hash;
-        hash &= newmask;
-        setgcrefp(newtab[hash], gcrefu(newtab[hash]) + 1);
-        o = gcnext(o);
-      }
+    for (i = oldmask; i != ~(MSize)0; i--) {
+      StrTab *tab = (StrTab *)&oldtab[i];
+      do {
+        for (j = 0; j < 15; j++) {
+          GCstr *s = st_ref(tab->strs[j]);
+          MSize hash;
+          if (!s) continue;  /* Skip empty slots; their hashes are stale. */
+          hash = st_alg(tab->strs[j])
+                     ? hash_sparse(g->str.seed, strdata(s), s->len)
+                     : tab->hashes[j];
+          newtab[hash & newmask].prev_len++;
+        }
+        tab = tab->next;
+      } while (tab);
     }
     /* Mark secondary chains. */
     for (i = newmask; i != ~(MSize)0; i--) {
-      int secondary = gcrefu(newtab[i]) > LJ_STR_MAXCOLL;
-      newsecond |= secondary;
-      setgcrefp(newtab[i], secondary);
+      StrTab *tab = (StrTab *)&newtab[i];
+      tab->prev_len =
+          (tab->prev_len > LJ_STR_MAXCOLL * 15) ? LJ_STR_SECONDARY : 0;
+      newsecond |= tab->prev_len;
     }
     g->str.second = newsecond;
   }
 #endif
 
+  /* Install the new table. */
+  setmref(g->str.tab, newtab);
+  g->str.mask = newmask;
+
   /* Reinsert all strings from the old table into the new table. */
-  for (i = g->str.mask; i != ~(MSize)0; i--) {
-    GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1);
-    while (o) {
-      GCobj *next = gcnext(o);
-      GCstr *s = gco2str(o);
-      MSize hash = s->hash;
-#if LUAJIT_SECURITY_STRHASH
-      uintptr_t u;
-      if (LJ_LIKELY(!s->hashalg)) {  /* String hashed with primary hash. */
-        hash &= newmask;
-        u = gcrefu(newtab[hash]);
-        if (LJ_UNLIKELY(u & 1)) {  /* Switch string to secondary hash. */
-          s->hash = hash = hash_dense(g->str.seed, s->hash, strdata(s), s->len);
-          s->hashalg = 1;
-          hash &= newmask;
-          u = gcrefu(newtab[hash]);
-        }
-      } else {  /* String hashed with secondary hash. */
-        MSize shash = hash_sparse(g->str.seed, strdata(s), s->len);
-        u = gcrefu(newtab[shash & newmask]);
-        if (u & 1) {
-          hash &= newmask;
-          u = gcrefu(newtab[hash]);
-        } else {  /* Revert string back to primary hash. */
-          s->hash = shash;
-          s->hashalg = 0;
-          hash = (shash & newmask);
-        }
+  for (i = 0; i < oldmask+1; i++) {
+    tab = &oldtab[i];
+    do {
+      StrTab *old = tab;
+      for (j = 0; j < 15; j++) {
+        GCstr *s = st_ref(tab->strs[j]);
+        if (s) {
+          lj_str_insert(L, s, tab->hashes[j], st_alg(tab->strs[j]));
+        }
       }
-      /* NOBARRIER: The string table is a GC root. 
*/ - setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1)); - setgcrefp(newtab[hash], ((uintptr_t)o | (u & 1))); -#else - hash &= newmask; - /* NOBARRIER: The string table is a GC root. */ - setgcrefr(o->gch.nextgc, newtab[hash]); - setgcref(newtab[hash], o); -#endif - o = next; - } + tab = tab->next; + if (old != &oldtab[i]) + lj_mem_freestrtab(g, old); + } while (tab); } - /* Free old table and replace with new table. */ - lj_str_freetab(g); - g->str.tab = newtab; - g->str.mask = newmask; + /* Free old table. */ + lj_mem_freepages(g, oldtab, (oldmask + 1) * sizeof(StrTab)); } #if LUAJIT_SECURITY_STRHASH @@ -220,45 +284,33 @@ static LJ_NOINLINE GCstr *lj_str_rehash_chain(lua_State *L, StrHash hashc, const char *str, MSize len) { global_State *g = G(L); - int sweep = - g->gc.state == GCSsweepstring ? g->gc.safecolor : 0; /* Sweeping? */ - uint8_t mask = isminor(g) ? 0xFF : ~LJ_GC_COLORS; - GCRef *strtab = g->str.tab; MSize strmask = g->str.mask; - GCobj *o = gcref(strtab[hashc & strmask]); - setgcrefp(strtab[hashc & strmask], (void *)((uintptr_t)1)); + StrTab *tab = &mref(g->str.tab, StrTab)[hashc & strmask]; + StrTab *base = tab; + uint32_t i; + g->str.second = 1; - while (o) { - uintptr_t u; - GCobj *next = gcnext(o); - GCstr *s = gco2str(o); - StrHash hash; - if (sweep) { /* Must sweep while rechaining. */ - if (o->gch.gcflags & sweep) { /* String alive? */ - lj_assertG(!checkdead(g, o) || (o->gch.gcflags & LJ_GC_FIXED), - "sweep of undead string"); - o->gch.gcflags &= mask; - } else { /* Free dead string. */ - lj_assertG(checkdead(g, o) || (sweep & LJ_GC_SFIXED), - "sweep of unlive string"); - lj_str_free(g, s); - o = next; - continue; + tab->prev_len |= LJ_STR_SECONDARY; + do { + StrTab *old = tab; + for (i = 0; i < 15; i++) { + if (!st_alg(tab->strs[i])) { + GCstr *s = st_ref(tab->strs[i]); + if (s) { + setgcrefnull(tab->strs[i]); + tab->prev_len--; + lj_str_insert( + L, s, hash_dense(g->str.seed, tab->hashes[i], strdata(s), s->len), + 1); + } } } - hash = s->hash; - if (!s->hashalg) { /* Rehash with secondary hash. */ - hash = hash_dense(g->str.seed, hash, strdata(s), s->len); - s->hash = hash; - s->hashalg = 1; + tab = tab->next; + if (old != base && !(old->prev_len & 0xF)) { + lj_mem_freechainedstrtab(g, old); } - /* Rechain. */ - hash &= strmask; - u = gcrefu(strtab[hash]); - setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1)); - setgcrefp(strtab[hash], ((uintptr_t)o | (u & 1))); - o = next; - } + } while (tab); + /* Try to insert the pending string again. */ return lj_str_new(L, str, len); } @@ -279,11 +331,9 @@ static GCstr *lj_str_alloc(lua_State *L, const char *str, MSize len, { GCstr *s = lj_mem_allocstr(L, len); global_State *g = G(L); - uintptr_t u; newwhite(s); s->gct = ~LJ_TSTR; s->len = len; - s->hash = hash; #ifndef STRID_RESEED_INTERVAL s->sid = g->str.id++; #elif STRID_RESEED_INTERVAL @@ -297,19 +347,14 @@ static GCstr *lj_str_alloc(lua_State *L, const char *str, MSize len, s->sid = (StrID)lj_prng_u64(&g->prng); #endif s->reserved = 0; - s->hashalg = (uint8_t)hashalg; /* Clear last 4 bytes of allocated memory. Implies zero-termination, too. */ *(uint32_t *)(strdatawr(s)+(len & ~(MSize)3)) = 0; memcpy(strdatawr(s), str, len); /* Add to string hash table. */ - hash &= g->str.mask; - u = gcrefu(g->str.tab[hash]); - setgcrefp(s->nextgc, (u & ~(uintptr_t)1)); - /* NOBARRIER: The string table is a GC root. */ - setgcrefp(g->str.tab[hash], ((uintptr_t)s | (u & 1))); - if (g->str.num++ > g->str.mask) /* Allow a 100% load factor. 
*/
+    lj_str_resize(L, (g->str.mask << 1) + 1); /* Grow string table. */
+  return s; /* Return newly interned string. */
 }
 
 /* Intern a string and return string object. */
 GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
 {
   global_State *g = G(L);
   if (lenx-1 < LJ_MAX_STR-1) {
+    I256 h0, h1, cmp;
     MSize len = (MSize)lenx;
     StrHash hash = hash_sparse(g->str.seed, str, len);
     MSize coll = 0;
+    uint32_t chain = 0;
     int hashalg = 0;
     /* Check if the string has already been interned. */
-    GCobj *o = gcref(g->str.tab[hash & g->str.mask]);
+    StrTab *st = &mref(g->str.tab, StrTab)[hash & g->str.mask];
+    StrTab *root;
 #if LUAJIT_SECURITY_STRHASH
-    if (LJ_UNLIKELY((uintptr_t)o & 1)) {  /* Secondary hash for this chain? */
+    if (LJ_UNLIKELY(st->prev_len & LJ_STR_SECONDARY)) { /* Secondary hash for this chain? */
       hashalg = 1;
       hash = hash_dense(g->str.seed, hash, str, len);
-      o = (GCobj *)(gcrefu(g->str.tab[hash & g->str.mask]) & ~(uintptr_t)1);
+      st = &mref(g->str.tab, StrTab)[hash & g->str.mask];
     }
 #endif
-    while (o != NULL) {
-      GCstr *sx = gco2str(o);
-      if (sx->hash == hash && sx->len == len) {
-        if (memcmp(str, strdata(sx), len) == 0) {
-          maybe_resurrect_str(g, sx);
-          return sx;  /* Return existing string. */
-        }
-        coll++;
+    root = st;
+    I256_BCAST_32(cmp, hash);
+    do {
+      I256_LOADA(h0, &st->hashes[0]);
+      I256_LOADA(h1, &st->hashes[8]);
+      uint32_t eq = I256_EQ_32_MASK(h0, cmp) |
+                    ((I256_EQ_32_MASK(h1, cmp) & 0x7F) << 8);
+
+      while (eq != 0) {
+        GCstr *sx = st_ref(st->strs[tzcount32(eq)]);
+        eq = reset_lowest32(eq);
+        if (LJ_UNLIKELY(!sx))
+          continue;
+        if (len == sx->len && memcmp(str, strdata(sx), len) == 0) {
+          maybe_resurrect_str(g, sx);
+          return sx; /* Return existing string. */
        }
+        coll++;
       }
-      coll++;
-      o = gcnext(o);
-    }
+      chain++;
+      st = st->next;
+    } while (st != NULL);
+    if (LJ_UNLIKELY(chain > 0x3FFFFFF))
+      chain = 0x3FFFFFF;
+    root->prev_len = (root->prev_len & 0x1F) | (chain << 5);
 #if LUAJIT_SECURITY_STRHASH
     /* Rehash chain if there are too many collisions. */
     if (LJ_UNLIKELY(coll > LJ_STR_MAXCOLL) && !hashalg) {
@@ -349,24 +409,37 @@ GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
     }
 #endif
     /* Otherwise allocate a new string. 
*/
+    return lj_str_alloc(L, str, len, hash, hashalg);
   } else {
     if (lenx)
       lj_err_msg(L, LJ_ERR_STROV);
-    return &g->strempty;
+    return g->strempty;
   }
 }
 
-void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s)
-{
-  g->str.num--;
-  lj_mem_free(g, s, lj_str_size(s->len));
-}
-
 void LJ_FASTCALL lj_str_init(lua_State *L)
 {
   global_State *g = G(L);
   g->str.seed = lj_prng_u64(&g->prng);
-  lj_str_resize(L, LJ_MIN_STRTAB-1);
+  g->str.secondary_arena_free_head = -1;
+  g->str.secondary_list_capacity = 8;
+  g->str.secondary_slot_free_head = g->str.secondary_list_capacity - 1;
+  g->str.secondary_list = lj_mem_newvec(L, g->str.secondary_list_capacity, MRef);
+  /* Thread a freelist through the unused slots; slot 0 terminates it. */
+  for (uint32_t i = 1; i < g->str.secondary_list_capacity; i++)
+    setmrefu(g->str.secondary_list[i], i-1);
+  setmrefu(g->str.secondary_list[0], ~0ull);
+
+  g->strempty = lj_mem_allocstr(L, 0);
+  memset(g->strempty, 0, 32);  /* Header plus the minimal payload block. */
+  g->strempty->gct = ~LJ_TSTR;
+  fixstring(g->strempty);
+
+  g->str.mask = LJ_MIN_STRTAB - 1;
+  setmref(g->str.tab, lj_mem_newpages(g, LJ_MIN_STRTAB * sizeof(StrTab)));
 }
+
+void lj_str_freetab(global_State *g)
+{
+  lj_mem_freepages(g, mref(g->str.tab, void), (g->str.mask + 1) * sizeof(StrTab));
+}
diff --git a/src/lj_str.h b/src/lj_str.h
index 2a5a8190..f59a5e22 100644
--- a/src/lj_str.h
+++ b/src/lj_str.h
@@ -19,13 +19,11 @@ LJ_FUNC int lj_str_haspattern(GCstr *s);
 /* String interning. */
 LJ_FUNC void lj_str_resize(lua_State *L, MSize newmask);
 LJ_FUNCA GCstr *lj_str_new(lua_State *L, const char *str, size_t len);
-LJ_FUNC void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s);
 LJ_FUNC void LJ_FASTCALL lj_str_init(lua_State *L);
-#define lj_str_freetab(g) \
-  (lj_mem_freevec(g, g->str.tab, g->str.mask+1, GCRef))
+LJ_FUNC void lj_str_freetab(global_State *g);
+LJ_FUNC void lj_str_shrink(lua_State *L);
 
 #define lj_str_newz(L, s) (lj_str_new(L, s, strlen(s)))
 #define lj_str_newlit(L, s) (lj_str_new(L, "" s, sizeof(s)-1))
-#define lj_str_size(len) (sizeof(GCstr) + (((len)+4) & ~(MSize)3))
 
 #endif
diff --git a/src/luajit_rolling.h b/src/luajit_rolling.h
index c0179ee5..37b22172 100644
--- a/src/luajit_rolling.h
+++ b/src/luajit_rolling.h
@@ -77,6 +77,9 @@ typedef unsigned (*luaJIT_allocpages)(void *ud, void **pages, unsigned n);
 /* Free is called one last time with NULL, 0 to indicate any state should be released */
 typedef void (*luaJIT_freepages)(void *ud, void **pages, unsigned n);
 typedef void* (*luaJIT_reallochuge)(void *ud, void *p, size_t osz, size_t nsz);
+/* Raw page allocation, similar to huge allocation, but no header space is
+ * reserved and there are no extra alignment requirements. Resize behaviour
+ * is not required. */
+typedef void* (*luaJIT_reallocraw)(void *ud, void *p, size_t osz, size_t nsz);
 
 /* This many bytes are reserved at the start of each huge arena for the allocator's use */
 #define LUAJIT_HUGE_RESERVED_SPACE 20
@@ -100,6 +103,7 @@ LUA_API lua_State *luaJIT_newstate(lua_Alloc f, void *ud,
                                    luaJIT_allocpages allocp,
                                    luaJIT_freepages freep,
                                    luaJIT_reallochuge realloch,
+                                   luaJIT_reallocraw rawalloc,
                                    void *page_ud);
 
 /* As lua_createtable, but can be used with __gc */
diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc
index 010b6efc..6295770d 100644
--- a/src/vm_x64.dasc
+++ b/src/vm_x64.dasc
@@ -1243,7 +1243,8 @@ static void build_subroutines(BuildCtx *ctx)
   |  jz ->fff_res1
   |  settp TAB:RC, TAB:RB, LJ_TTAB
   |  mov [BASE-16], TAB:RC // Store metatable as default result. 
- | mov STR:RC, [DISPATCH+DISPATCH_GL(gcroot)+8*(GCROOT_MMNAME+MM_metatable)] + | mov STR:RC, [DISPATCH+DISPATCH_GL(meta_root)] + | add STR:RC, 32*MM_metatable | mov RAd, TAB:RB->hmask | and RAd, STR:RC->sid | settp STR:RC, LJ_TSTR
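
Notes on the hid encoding (not part of the applied diff). The encoding
documented in lj_gc.h above can be exercised with a small decoder. This is a
minimal sketch under the patch's own macros and GCstr layout; the helper name
hid_to_strtab is hypothetical:

static StrTab *hid_to_strtab(global_State *g, GCstr *s, uint32_t *entry)
{
  uint32_t hid = s->hid;
  *entry = hid & 0xF;  /* 4-bit entry index within the StrTab. */
  if ((hid & 0xFC000000) == 0xFC000000)  /* Top six bits set: primary. */
    return strtab_primary(g, hid);  /* 22-bit index into the hash table. */
  /* Secondary: 19-bit secondary_list slot, 9-bit entry within the arena. */
  return strtab_secondary(g, hid);
}

The 111111 prefix is unambiguous because secondary array indices are capped
at STRING_SECONDARY_MAXIMUM_SIZE (0x7DFFF), so (hid >> 13) can never reach
the all-ones range that primary ids occupy.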
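The free-slot scan in lj_str_get_free() returns a bitmask with bit i set when
slot i of a StrTab is empty; the AVX2 path computes it with 256-bit compares
against zero. A scalar model of the same contract, for illustration only
(strtab_free_mask_scalar is not in the patch):

static uint32_t strtab_free_mask_scalar(const StrTab *st)
{
  uint32_t mask = 0, i;
  for (i = 0; i < 15; i++) {
    if (!gcrefu(st->strs[i]))  /* Empty slots hold a zero GCRef. */
      mask |= 1u << i;
  }
  return mask;
}

Because slots fill from index 0 upward, tzcount32() of this mask is both the
lowest free slot and the append position, which is what the "nonsequential"
assertion in lj_str_insert() verifies.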
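lj_str_init() threads a freelist through the unused secondary_list slots:
each free entry stores the index of the next free slot, and the chain ends at
-1 (the ~0ull written into slot 0). A sketch of how a slot would be claimed
for a newly allocated GCAstrtab, assuming that layout; secondary_slot_alloc
is a hypothetical helper, not a function in this patch:

static int32_t secondary_slot_alloc(global_State *g, GCAstrtab *a)
{
  int32_t slot = g->str.secondary_slot_free_head;
  if (slot != -1) {
    /* Pop the head: the free slot's payload is the next free index. */
    g->str.secondary_slot_free_head = (int32_t)mrefu(g->str.secondary_list[slot]);
    /* The slot now holds a real arena pointer, as strtab_secondary() expects. */
    setmref(g->str.secondary_list[slot], a);
  }
  return slot;  /* -1 means secondary_list must be grown first. */
}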