From ac02a120ef249aac37b4847705a3099bd4b92967 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 7 Jun 2021 12:03:22 +0200 Subject: [PATCH] String buffers, part 2e: add serialization string dictionary. Sponsored by fmad.io. --- doc/ext_buffer.html | 70 ++++++++++++++++++++++++++++++++++++----- src/lib_buffer.c | 60 ++++++++++++++++++++++++----------- src/lj_buf.h | 16 +++++----- src/lj_errmsg.h | 2 ++ src/lj_gc.c | 5 +-- src/lj_obj.h | 2 +- src/lj_serialize.c | 77 ++++++++++++++++++++++++++++++++++++++++----- src/lj_serialize.h | 1 + src/lj_tab.c | 23 ++------------ src/lj_tab.h | 23 ++++++++++++-- 10 files changed, 214 insertions(+), 65 deletions(-) diff --git a/doc/ext_buffer.html b/doc/ext_buffer.html index 94af757d..2443fc90 100644 --- a/doc/ext_buffer.html +++ b/doc/ext_buffer.html @@ -175,14 +175,19 @@ object itself as a convenience. This allows method chaining, e.g.:

Buffer Creation and Management

-

local buf = buffer.new([size])

+

local buf = buffer.new([size [,options]])
+local buf = buffer.new([options])

Creates a new buffer object.

The optional size argument ensures a minimum initial buffer -size. This is strictly an optimization for cases where the required -buffer size is known beforehand. +size. This is strictly an optimization when the required buffer size is +known beforehand. The buffer space will grow as needed, in any case. +

+

+The optional table options sets various +serialization options.

buf = buf:reset()

@@ -205,7 +210,7 @@ immediately.

Buffer Writers

-

buf = buf:put([str|num|obj] [, ...])

+

buf = buf:put([str|num|obj] [,…])

Appends a string str, a number num or any object obj with a __tostring metamethod to the buffer. @@ -217,7 +222,7 @@ internally. But it still involves a copy. Better combine the buffer writes to use a single buffer.

-

buf = buf:putf(format, ...)

+

buf = buf:putf(format, …)

Appends the formatted arguments to the buffer. The format string supports the same options as string.format(). @@ -298,7 +303,7 @@ method, if nothing is added to the buffer (e.g. on error). Returns the current length of the buffer data in bytes.

-

res = str|num|buf .. str|num|buf [...]

+

res = str|num|buf .. str|num|buf […]

The Lua concatenation operator .. also accepts buffers, just like strings or numbers. It always returns a string and not a buffer. @@ -319,7 +324,7 @@ Skips (consumes) len bytes from the buffer up to the current length of the buffer data.

-

str, ... = buf:get([len|nil] [,...])

+

str, … = buf:get([len|nil] [,…])

Consumes the buffer data and returns one or more strings. If called without arguments, the whole buffer data is consumed. If called with a @@ -444,6 +449,56 @@ data after decoding a single top-level object. The buffer method leaves any left-over data in the buffer.

+

Serialization Options

+

+The options table passed to buffer.new() may contain +the following members (all optional): +

+ +

+dict needs to be an array of strings, starting at index 1 and +without holes (no nil in between). The table is anchored in the +buffer object and internally modified into a two-way index (don't do +this yourself, just pass a plain array). The table must not be modified +after it has been passed to buffer.new(). +

+

+The dict tables used by the encoder and decoder must be the +same. Put the most common entries at the front. Extend at the end to +ensure backwards-compatibility — older encodings can then still be +read. You may also set some indexes to false to explicitly drop +backwards-compatibility. Old encodings that use these indexes will throw +an error when decoded. +

+

+Note: parsing and preparation of the options table is somewhat +expensive. Create a buffer object only once and recycle it for multiple +uses. Avoid mixing encoder and decoder buffers, since the +buf:set() method frees the already allocated buffer space: +

+
+local options = {
+  dict = { "commonly", "used", "string", "keys" },
+}
+local buf_enc = buffer.new(options)
+local buf_dec = buffer.new(options)
+
+local function encode(obj)
+  return buf_enc:reset():encode(obj):get()
+end
+
+local function decode(str)
+  return buf_dec:set(str):decode()
+end
+
+

Streaming Serialization

In some contexts, it's desirable to do piecewise serialization of large @@ -536,6 +591,7 @@ uint64 → 0x11 uint.L // FFI uint64_t complex → 0x12 re.L im.L // FFI complex string → (0x20+len).U len*char.B + | 0x0f (index-1).U // Dict entry .B = 8 bit .I = 32 bit little-endian diff --git a/src/lib_buffer.c b/src/lib_buffer.c index 78c4eeb9..f13320c4 100644 --- a/src/lib_buffer.c +++ b/src/lib_buffer.c @@ -29,9 +29,7 @@ #include "lj_serialize.h" #include "lj_lib.h" -/* ------------------------------------------------------------------------ */ - -#define LJLIB_MODULE_buffer_method +/* -- Helper functions ---------------------------------------------------- */ /* Check that the first argument is a string buffer. */ static SBufExt *buffer_tobuf(lua_State *L) @@ -49,11 +47,16 @@ static LJ_AINLINE SBufExt *buffer_tobufw(lua_State *L) return sbx; } +#define buffer_toudata(sbx) ((GCudata *)(sbx)-1) + +/* -- Buffer methods ------------------------------------------------------ */ + +#define LJLIB_MODULE_buffer_method + LJLIB_CF(buffer_method_free) { SBufExt *sbx = buffer_tobuf(L); - lj_bufx_free(G(L), sbx); - lj_bufx_init(L, sbx); + lj_bufx_free(L, sbx); L->top = L->base+1; /* Chain buffer object. */ return 1; } @@ -83,6 +86,7 @@ LJLIB_CF(buffer_method_skip) LJLIB_CF(buffer_method_set) { SBufExt *sbx = buffer_tobuf(L); + GCobj *ref; const char *p; MSize len; #if LJ_HASFFI @@ -98,9 +102,11 @@ LJLIB_CF(buffer_method_set) p = strdata(str); len = str->len; } - lj_bufx_free(G(L), sbx); - lj_bufx_init_cow(L, sbx, p, len); - setgcref(sbx->cowref, gcV(L->base+1)); + lj_bufx_free(L, sbx); + lj_bufx_set_cow(L, sbx, p, len); + ref = gcV(L->base+1); + setgcref(sbx->cowref, ref); + lj_gc_objbarrier(L, buffer_toudata(sbx), ref); L->top = L->base+1; /* Chain buffer object. 
*/ return 1; } @@ -249,8 +255,7 @@ LJLIB_CF(buffer_method_decode) LJLIB_CF(buffer_method___gc) { SBufExt *sbx = buffer_tobuf(L); - lj_bufx_free(G(L), sbx); - lj_bufx_init(L, sbx); + lj_bufx_free(L, sbx); return 0; } @@ -272,7 +277,7 @@ LJLIB_CF(buffer_method___len) LJLIB_PUSH("buffer") LJLIB_SET(__metatable) LJLIB_PUSH(top-1) LJLIB_SET(__index) -/* ------------------------------------------------------------------------ */ +/* -- Buffer library functions -------------------------------------------- */ #define LJLIB_MODULE_buffer @@ -280,16 +285,33 @@ LJLIB_PUSH(top-2) LJLIB_SET(!) /* Set environment. */ LJLIB_CF(buffer_new) { - MSize sz = L->base == L->top ? 0u : - (MSize)lj_lib_checkintrange(L, 1, 0, LJ_MAX_BUF); - GCtab *env = tabref(curr_func(L)->c.env); - GCudata *ud = lj_udata_new(L, sizeof(SBufExt), env); - SBufExt *sbx = (SBufExt *)uddata(ud); + MSize sz = 0; + int targ = 1; + GCtab *env, *dict = NULL; + GCudata *ud; + SBufExt *sbx; + if (L->base < L->top && !tvistab(L->base)) { + targ = 2; + if (!tvisnil(L->base)) + sz = (MSize)lj_lib_checkintrange(L, 1, 0, LJ_MAX_BUF); + } + if (L->base+targ-1 < L->top) { + GCtab *options = lj_lib_checktab(L, targ); + cTValue *opt_dict = lj_tab_getstr(options, lj_str_newlit(L, "dict")); + if (opt_dict && tvistab(opt_dict)) { + dict = tabV(opt_dict); + lj_serialize_dict_prep(L, dict); + } + } + env = tabref(curr_func(L)->c.env); + ud = lj_udata_new(L, sizeof(SBufExt), env); ud->udtype = UDTYPE_BUFFER; /* NOBARRIER: The GCudata is new (marked white). 
*/ setgcref(ud->metatable, obj2gco(env)); setudataV(L, L->top++, ud); + sbx = (SBufExt *)uddata(ud); lj_bufx_init(L, sbx); + setgcref(sbx->dict, obj2gco(dict)); if (sz > 0) lj_buf_need2((SBuf *)sbx, sz); return 1; } @@ -298,7 +320,8 @@ LJLIB_CF(buffer_encode) { cTValue *o = lj_lib_checkany(L, 1); SBufExt sbx; - lj_bufx_init_borrow(L, &sbx, &G(L)->tmpbuf); + memset(&sbx, 0, sizeof(SBufExt)); + lj_bufx_set_borrow(L, &sbx, &G(L)->tmpbuf); lj_serialize_put(&sbx, o); setstrV(L, L->top++, lj_buf_str(L, (SBuf *)&sbx)); lj_gc_check(L); @@ -309,7 +332,8 @@ LJLIB_CF(buffer_decode) { GCstr *str = lj_lib_checkstrx(L, 1); SBufExt sbx; - lj_bufx_init_cow(L, &sbx, strdata(str), str->len); + memset(&sbx, 0, sizeof(SBufExt)); + lj_bufx_set_cow(L, &sbx, strdata(str), str->len); /* No need to set sbx.cowref here. */ setnilV(L->top++); lj_serialize_get(&sbx, L->top-1); diff --git a/src/lj_buf.h b/src/lj_buf.h index 02f0ac61..b97d55ef 100644 --- a/src/lj_buf.h +++ b/src/lj_buf.h @@ -27,6 +27,7 @@ typedef struct SBufExt { MRef bsb; /* Borrowed string buffer. */ }; char *r; /* Read pointer. */ + GCRef dict; /* Serialization string dictionary table. */ int depth; /* Remaining recursion depth. 
*/ } SBufExt; @@ -114,19 +115,17 @@ static LJ_AINLINE void lj_bufx_init(lua_State *L, SBufExt *sbx) setsbufXL(sbx, L, SBUF_FLAG_EXT); } -static LJ_AINLINE void lj_bufx_init_borrow(lua_State *L, SBufExt *sbx, SBuf *sb) +static LJ_AINLINE void lj_bufx_set_borrow(lua_State *L, SBufExt *sbx, SBuf *sb) { - memset(sbx, 0, sizeof(SBufExt)); setsbufXL(sbx, L, SBUF_FLAG_EXT | SBUF_FLAG_BORROW); setmref(sbx->bsb, sb); sbx->r = sbx->w = sbx->b = sb->b; sbx->e = sb->e; } -static LJ_AINLINE void lj_bufx_init_cow(lua_State *L, SBufExt *sbx, - const char *p, MSize len) +static LJ_AINLINE void lj_bufx_set_cow(lua_State *L, SBufExt *sbx, + const char *p, MSize len) { - memset(sbx, 0, sizeof(SBufExt)); setsbufXL(sbx, L, SBUF_FLAG_EXT | SBUF_FLAG_COW); sbx->r = sbx->b = (char *)p; sbx->w = sbx->e = (char *)p + len; @@ -142,9 +141,12 @@ static LJ_AINLINE void lj_bufx_reset(SBufExt *sbx) sbx->r = sbx->w = sbx->b; } -static LJ_AINLINE void lj_bufx_free(global_State *g, SBufExt *sbx) +static LJ_AINLINE void lj_bufx_free(lua_State *L, SBufExt *sbx) { - if (!sbufiscow(sbx)) lj_mem_free(g, sbx->b, sbufsz(sbx)); + if (!sbufiscow(sbx)) lj_mem_free(G(L), sbx->b, sbufsz(sbx)); + setsbufXL(sbx, L, SBUF_FLAG_EXT); + setgcrefnull(sbx->cowref); + sbx->r = sbx->w = sbx->b = sbx->e = NULL; } /* Low-level buffer put operations */ diff --git a/src/lj_errmsg.h b/src/lj_errmsg.h index af4a03dd..56be4bb9 100644 --- a/src/lj_errmsg.h +++ b/src/lj_errmsg.h @@ -182,8 +182,10 @@ ERRDEF(FFI_NYICALL, "NYI: cannot call this C function (yet)") #if LJ_HASBUFFER /* String buffer errors. 
*/ +ERRDEF(BUFFER_BADOPT, "bad options table") ERRDEF(BUFFER_BADENC, "cannot serialize " LUA_QS) ERRDEF(BUFFER_BADDEC, "cannot deserialize tag 0x%02x") +ERRDEF(BUFFER_BADDICTX, "cannot deserialize dictionary index %d") ERRDEF(BUFFER_DEPTH, "too deep to serialize") ERRDEF(BUFFER_DUPKEY, "duplicate table key") ERRDEF(BUFFER_EOB, "unexpected end of buffer") diff --git a/src/lj_gc.c b/src/lj_gc.c index 1f382ea0..646a27b2 100644 --- a/src/lj_gc.c +++ b/src/lj_gc.c @@ -67,9 +67,10 @@ static void gc_mark(global_State *g, GCobj *o) gc_markobj(g, tabref(gco2ud(o)->env)); if (LJ_HASBUFFER && gco2ud(o)->udtype == UDTYPE_BUFFER) { SBufExt *sbx = (SBufExt *)uddata(gco2ud(o)); - if (sbufiscow(sbx) && gcref(sbx->cowref) != NULL) { + if (sbufiscow(sbx) && gcref(sbx->cowref)) gc_markobj(g, gcref(sbx->cowref)); - } + if (gcref(sbx->dict)) + gc_markobj(g, gcref(sbx->dict)); } } else if (LJ_UNLIKELY(gct == ~LJ_TUPVAL)) { GCupval *uv = gco2uv(o); diff --git a/src/lj_obj.h b/src/lj_obj.h index 0dae5fec..5547a79b 100644 --- a/src/lj_obj.h +++ b/src/lj_obj.h @@ -923,7 +923,7 @@ static LJ_AINLINE void setgcV(lua_State *L, TValue *o, GCobj *v, uint32_t it) } #define define_setV(name, type, tag) \ -static LJ_AINLINE void name(lua_State *L, TValue *o, type *v) \ +static LJ_AINLINE void name(lua_State *L, TValue *o, const type *v) \ { \ setgcV(L, o, obj2gco(v), tag); \ } diff --git a/src/lj_serialize.c b/src/lj_serialize.c index 49a25a7c..d84ebcb8 100644 --- a/src/lj_serialize.c +++ b/src/lj_serialize.c @@ -32,7 +32,7 @@ enum { SER_TAG_NUM, SER_TAG_TAB, /* 0x08 */ SER_TAG_0x0e = SER_TAG_TAB+6, - SER_TAG_0x0f, + SER_TAG_DICT, SER_TAG_INT64, /* 0x10 */ SER_TAG_UINT64, SER_TAG_COMPLEX, @@ -120,6 +120,26 @@ static LJ_AINLINE char *serialize_ru124(char *r, char *w, uint32_t *pv) return NULL; } +/* Prepare string dictionary for use (once). */ +void LJ_FASTCALL lj_serialize_dict_prep(lua_State *L, GCtab *dict) +{ + if (!dict->hmask) { /* No hash part means not prepared, yet. 
*/ + MSize i, len = lj_tab_len(dict); + if (!len) return; + lj_tab_resize(L, dict, dict->asize, hsize2hbits(len)); + for (i = 1; i <= len && i < dict->asize; i++) { + cTValue *o = arrayslot(dict, i); + if (tvisstr(o)) { + if (!lj_tab_getstr(dict, strV(o))) { /* Ignore dups. */ + lj_tab_newkey(L, dict, o)->u64 = (uint64_t)(i-1); + } + } else if (!tvisfalse(o)) { + lj_err_caller(L, LJ_ERR_BUFFER_BADOPT); + } + } + } +} + /* -- Internal serializer ------------------------------------------------- */ /* Put serialized object into buffer. */ @@ -174,12 +194,45 @@ static char *serialize_put(char *w, SBufExt *sbx, cTValue *o) } if (nhash) { /* Write hash entries. */ const Node *node = noderef(t->node) + t->hmask; - for (;; node--) - if (!tvisnil(&node->val)) { - w = serialize_put(w, sbx, &node->key); - w = serialize_put(w, sbx, &node->val); - if (--nhash == 0) break; - } + GCtab *dict = tabref(sbx->dict); + if (LJ_UNLIKELY(dict)) { + for (;; node--) + if (!tvisnil(&node->val)) { + if (LJ_LIKELY(tvisstr(&node->key))) { + /* Inlined lj_tab_getstr is 30% faster. 
*/ + const GCstr *str = strV(&node->key); + Node *n = hashstr(dict, str); + do { + if (tvisstr(&n->key) && strV(&n->key) == str) { + uint32_t idx = n->val.u32.lo; + w = serialize_more(w, sbx, 1+5); + *w++ = SER_TAG_DICT; + w = serialize_wu124(w, idx); + break; + } + n = nextnode(n); + if (!n) { + MSize len = str->len; + w = serialize_more(w, sbx, 5+len); + w = serialize_wu124(w, SER_TAG_STR + len); + w = lj_buf_wmem(w, strdata(str), len); + break; + } + } while (1); + } else { + w = serialize_put(w, sbx, &node->key); + } + w = serialize_put(w, sbx, &node->val); + if (--nhash == 0) break; + } + } else { + for (;; node--) + if (!tvisnil(&node->val)) { + w = serialize_put(w, sbx, &node->key); + w = serialize_put(w, sbx, &node->val); + if (--nhash == 0) break; + } + } } sbx->depth++; #if LJ_HASFFI @@ -266,6 +319,16 @@ static char *serialize_get(char *r, SBufExt *sbx, TValue *o) if (!tvisnum(o)) setnanV(o); } else if (tp <= SER_TAG_TRUE) { setpriV(o, ~tp); + } else if (tp == SER_TAG_DICT) { + GCtab *dict; + uint32_t idx; + r = serialize_ru124(r, w, &idx); + idx++; + dict = tabref(sbx->dict); + if (dict && idx < dict->asize && tvisstr(arrayslot(dict, idx))) + copyTV(sbufL(sbx), o, arrayslot(dict, idx)); + else + lj_err_callerv(sbufL(sbx), LJ_ERR_BUFFER_BADDICTX, idx); } else if (tp >= SER_TAG_TAB && tp < SER_TAG_TAB+6) { uint32_t narray = 0, nhash = 0; GCtab *t; diff --git a/src/lj_serialize.h b/src/lj_serialize.h index f5617790..ccf1d63d 100644 --- a/src/lj_serialize.h +++ b/src/lj_serialize.h @@ -13,6 +13,7 @@ #define LJ_SERIALIZE_DEPTH 100 /* Default depth. 
*/ +LJ_FUNC void LJ_FASTCALL lj_serialize_dict_prep(lua_State *L, GCtab *dict); LJ_FUNC SBufExt * LJ_FASTCALL lj_serialize_put(SBufExt *sbx, cTValue *o); LJ_FUNC SBufExt * LJ_FASTCALL lj_serialize_get(SBufExt *sbx, TValue *o); diff --git a/src/lj_tab.c b/src/lj_tab.c index 27e58f0a..ed5fd2dd 100644 --- a/src/lj_tab.c +++ b/src/lj_tab.c @@ -16,25 +16,6 @@ /* -- Object hashing ------------------------------------------------------ */ -/* Hash values are masked with the table hash mask and used as an index. */ -static LJ_AINLINE Node *hashmask(const GCtab *t, uint32_t hash) -{ - Node *n = noderef(t->node); - return &n[hash & t->hmask]; -} - -/* String IDs are generated when a string is interned. */ -#define hashstr(t, s) hashmask(t, (s)->sid) - -#define hashlohi(t, lo, hi) hashmask((t), hashrot((lo), (hi))) -#define hashnum(t, o) hashlohi((t), (o)->u32.lo, ((o)->u32.hi << 1)) -#if LJ_GC64 -#define hashgcref(t, r) \ - hashlohi((t), (uint32_t)gcrefu(r), (uint32_t)(gcrefu(r) >> 32)) -#else -#define hashgcref(t, r) hashlohi((t), gcrefu(r), gcrefu(r) + HASH_BIAS) -#endif - /* Hash an arbitrary key and return its anchor position in the hash table. */ static Node *hashkey(const GCtab *t, cTValue *key) { @@ -413,7 +394,7 @@ cTValue * LJ_FASTCALL lj_tab_getinth(GCtab *t, int32_t key) return NULL; } -cTValue *lj_tab_getstr(GCtab *t, GCstr *key) +cTValue *lj_tab_getstr(GCtab *t, const GCstr *key) { Node *n = hashstr(t, key); do { @@ -546,7 +527,7 @@ TValue *lj_tab_setinth(lua_State *L, GCtab *t, int32_t key) return lj_tab_newkey(L, t, &k); } -TValue *lj_tab_setstr(lua_State *L, GCtab *t, GCstr *key) +TValue *lj_tab_setstr(lua_State *L, GCtab *t, const GCstr *key) { TValue k; Node *n = hashstr(t, key); diff --git a/src/lj_tab.h b/src/lj_tab.h index 97436cc0..1efa9506 100644 --- a/src/lj_tab.h +++ b/src/lj_tab.h @@ -31,6 +31,25 @@ static LJ_AINLINE uint32_t hashrot(uint32_t lo, uint32_t hi) return hi; } +/* Hash values are masked with the table hash mask and used as an index. 
*/ +static LJ_AINLINE Node *hashmask(const GCtab *t, uint32_t hash) +{ + Node *n = noderef(t->node); + return &n[hash & t->hmask]; +} + +/* String IDs are generated when a string is interned. */ +#define hashstr(t, s) hashmask(t, (s)->sid) + +#define hashlohi(t, lo, hi) hashmask((t), hashrot((lo), (hi))) +#define hashnum(t, o) hashlohi((t), (o)->u32.lo, ((o)->u32.hi << 1)) +#if LJ_GC64 +#define hashgcref(t, r) \ + hashlohi((t), (uint32_t)gcrefu(r), (uint32_t)(gcrefu(r) >> 32)) +#else +#define hashgcref(t, r) hashlohi((t), gcrefu(r), gcrefu(r) + HASH_BIAS) +#endif + #define hsize2hbits(s) ((s) ? ((s)==1 ? 1 : 1+lj_fls((uint32_t)((s)-1))) : 0) LJ_FUNCA GCtab *lj_tab_new(lua_State *L, uint32_t asize, uint32_t hbits); @@ -50,14 +69,14 @@ LJ_FUNCA void lj_tab_reasize(lua_State *L, GCtab *t, uint32_t nasize); /* Caveat: all getters except lj_tab_get() can return NULL! */ LJ_FUNCA cTValue * LJ_FASTCALL lj_tab_getinth(GCtab *t, int32_t key); -LJ_FUNC cTValue *lj_tab_getstr(GCtab *t, GCstr *key); +LJ_FUNC cTValue *lj_tab_getstr(GCtab *t, const GCstr *key); LJ_FUNCA cTValue *lj_tab_get(lua_State *L, GCtab *t, cTValue *key); /* Caveat: all setters require a write barrier for the stored value. */ LJ_FUNCA TValue *lj_tab_newkey(lua_State *L, GCtab *t, cTValue *key); LJ_FUNCA TValue *lj_tab_setinth(lua_State *L, GCtab *t, int32_t key); -LJ_FUNC TValue *lj_tab_setstr(lua_State *L, GCtab *t, GCstr *key); +LJ_FUNC TValue *lj_tab_setstr(lua_State *L, GCtab *t, const GCstr *key); LJ_FUNC TValue *lj_tab_set(lua_State *L, GCtab *t, cTValue *key); #define inarray(t, key) ((MSize)(key) < (MSize)(t)->asize)