String buffers, part 2e: add serialization string dictionary.

Sponsored by fmad.io.
This commit is contained in:
Mike Pall 2021-06-07 12:03:22 +02:00
parent 4216bdfb2a
commit ac02a120ef
10 changed files with 214 additions and 65 deletions

View File

@ -175,14 +175,19 @@ object itself as a convenience. This allows method chaining, e.g.:
<h2 id="create">Buffer Creation and Management</h2>
<h3 id="buffer_new"><tt>local buf = buffer.new([size])</tt></h3>
<h3 id="buffer_new"><tt>local buf = buffer.new([size [,options]])<br>
local buf = buffer.new([options])</tt></h3>
<p>
Creates a new buffer object.
</p>
<p>
The optional <tt>size</tt> argument ensures a minimum initial buffer
size. This is strictly an optimization for cases where the required
buffer size is known beforehand.
size. This is strictly an optimization when the required buffer size is
known beforehand. The buffer space will grow as needed, in any case.
</p>
<p>
The optional table <tt>options</tt> sets various
<a href="#serialize_options">serialization options</a>.
</p>
<h3 id="buffer_reset"><tt>buf = buf:reset()</tt></h3>
@ -205,7 +210,7 @@ immediately.
<h2 id="write">Buffer Writers</h2>
<h3 id="buffer_put"><tt>buf = buf:put([str|num|obj] [, ...])</tt></h3>
<h3 id="buffer_put"><tt>buf = buf:put([str|num|obj] [,…])</tt></h3>
<p>
Appends a string <tt>str</tt>, a number <tt>num</tt> or any object
<tt>obj</tt> with a <tt>__tostring</tt> metamethod to the buffer.
@ -217,7 +222,7 @@ internally. But it still involves a copy. Better combine the buffer
writes to use a single buffer.
</p>
<h3 id="buffer_putf"><tt>buf = buf:putf(format, ...)</tt></h3>
<h3 id="buffer_putf"><tt>buf = buf:putf(format, …)</tt></h3>
<p>
Appends the formatted arguments to the buffer. The <tt>format</tt>
string supports the same options as <tt>string.format()</tt>.
@ -298,7 +303,7 @@ method, if nothing is added to the buffer (e.g. on error).
Returns the current length of the buffer data in bytes.
</p>
<h3 id="buffer_concat"><tt>res = str|num|buf .. str|num|buf [...]</tt></h3>
<h3 id="buffer_concat"><tt>res = str|num|buf .. str|num|buf […]</tt></h3>
<p>
The Lua concatenation operator <tt>..</tt> also accepts buffers, just
like strings or numbers. It always returns a string and not a buffer.
@ -319,7 +324,7 @@ Skips (consumes) <tt>len</tt> bytes from the buffer up to the current
length of the buffer data.
</p>
<h3 id="buffer_get"><tt>str, ... = buf:get([len|nil] [,...])</tt></h3>
<h3 id="buffer_get"><tt>str, … = buf:get([len|nil] [,…])</tt></h3>
<p>
Consumes the buffer data and returns one or more strings. If called
without arguments, the whole buffer data is consumed. If called with a
@ -444,6 +449,56 @@ data after decoding a single top-level object. The buffer method leaves
any left-over data in the buffer.
</p>
<h3 id="serialize_options">Serialization Options</h3>
<p>
The <tt>options</tt> table passed to <tt>buffer.new()</tt> may contain
the following members (all optional):
</p>
<ul>
<li>
<tt>dict</tt> is a Lua table holding a <b>dictionary of strings</b> that
commonly occur as table keys of objects you are serializing. These keys
are compactly encoded as indexes during serialization. A well chosen
dictionary saves space and improves serialization performance.
</li>
</ul>
<p>
<tt>dict</tt> needs to be an array of strings, starting at index 1 and
without holes (no <tt>nil</tt> in between). The table is anchored in the
buffer object and internally modified into a two-way index (don't do
this yourself, just pass a plain array). The table must not be modified
after it has been passed to <tt>buffer.new()</tt>.
</p>
<p>
The <tt>dict</tt> tables used by the encoder and decoder must be the
same. Put the most common entries at the front. Extend at the end to
ensure backwards-compatibility &mdash; older encodings can then still be
read. You may also set some indexes to <tt>false</tt> to explicitly drop
backwards-compatibility. Old encodings that use these indexes will throw
an error when decoded.
</p>
<p>
Note: parsing and preparation of the options table is somewhat
expensive. Create a buffer object only once and recycle it for multiple
uses. Avoid mixing encoder and decoder buffers, since the
<tt>buf:set()</tt> method frees the already allocated buffer space:
</p>
<pre class="code">
local options = {
dict = { "commonly", "used", "string", "keys" },
}
local buf_enc = buffer.new(options)
local buf_dec = buffer.new(options)
local function encode(obj)
return buf_enc:reset():encode(obj):get()
end
local function decode(str)
return buf_dec:set(str):decode()
end
</pre>
<h3 id="serialize_stream">Streaming Serialization</h3>
<p>
In some contexts, it's desirable to do piecewise serialization of large
@ -536,6 +591,7 @@ uint64 → 0x11 uint.L // FFI uint64_t
complex → 0x12 re.L im.L // FFI complex
string → (0x20+len).U len*char.B
| 0x0f (index-1).U // Dict entry
.B = 8 bit
.I = 32 bit little-endian

View File

@ -29,9 +29,7 @@
#include "lj_serialize.h"
#include "lj_lib.h"
/* ------------------------------------------------------------------------ */
#define LJLIB_MODULE_buffer_method
/* -- Helper functions ---------------------------------------------------- */
/* Check that the first argument is a string buffer. */
static SBufExt *buffer_tobuf(lua_State *L)
@ -49,11 +47,16 @@ static LJ_AINLINE SBufExt *buffer_tobufw(lua_State *L)
return sbx;
}
#define buffer_toudata(sbx) ((GCudata *)(sbx)-1)
/* -- Buffer methods ------------------------------------------------------ */
#define LJLIB_MODULE_buffer_method
LJLIB_CF(buffer_method_free)
{
SBufExt *sbx = buffer_tobuf(L);
lj_bufx_free(G(L), sbx);
lj_bufx_init(L, sbx);
lj_bufx_free(L, sbx);
L->top = L->base+1; /* Chain buffer object. */
return 1;
}
@ -83,6 +86,7 @@ LJLIB_CF(buffer_method_skip)
LJLIB_CF(buffer_method_set)
{
SBufExt *sbx = buffer_tobuf(L);
GCobj *ref;
const char *p;
MSize len;
#if LJ_HASFFI
@ -98,9 +102,11 @@ LJLIB_CF(buffer_method_set)
p = strdata(str);
len = str->len;
}
lj_bufx_free(G(L), sbx);
lj_bufx_init_cow(L, sbx, p, len);
setgcref(sbx->cowref, gcV(L->base+1));
lj_bufx_free(L, sbx);
lj_bufx_set_cow(L, sbx, p, len);
ref = gcV(L->base+1);
setgcref(sbx->cowref, ref);
lj_gc_objbarrier(L, buffer_toudata(sbx), ref);
L->top = L->base+1; /* Chain buffer object. */
return 1;
}
@ -249,8 +255,7 @@ LJLIB_CF(buffer_method_decode)
LJLIB_CF(buffer_method___gc)
{
SBufExt *sbx = buffer_tobuf(L);
lj_bufx_free(G(L), sbx);
lj_bufx_init(L, sbx);
lj_bufx_free(L, sbx);
return 0;
}
@ -272,7 +277,7 @@ LJLIB_CF(buffer_method___len)
LJLIB_PUSH("buffer") LJLIB_SET(__metatable)
LJLIB_PUSH(top-1) LJLIB_SET(__index)
/* ------------------------------------------------------------------------ */
/* -- Buffer library functions -------------------------------------------- */
#define LJLIB_MODULE_buffer
@ -280,16 +285,33 @@ LJLIB_PUSH(top-2) LJLIB_SET(!) /* Set environment. */
LJLIB_CF(buffer_new)
{
MSize sz = L->base == L->top ? 0u :
(MSize)lj_lib_checkintrange(L, 1, 0, LJ_MAX_BUF);
GCtab *env = tabref(curr_func(L)->c.env);
GCudata *ud = lj_udata_new(L, sizeof(SBufExt), env);
SBufExt *sbx = (SBufExt *)uddata(ud);
MSize sz = 0;
int targ = 1;
GCtab *env, *dict = NULL;
GCudata *ud;
SBufExt *sbx;
if (L->base < L->top && !tvistab(L->base)) {
targ = 2;
if (!tvisnil(L->base))
sz = (MSize)lj_lib_checkintrange(L, 1, 0, LJ_MAX_BUF);
}
if (L->base+targ-1 < L->top) {
GCtab *options = lj_lib_checktab(L, targ);
cTValue *opt_dict = lj_tab_getstr(options, lj_str_newlit(L, "dict"));
if (opt_dict && tvistab(opt_dict)) {
dict = tabV(opt_dict);
lj_serialize_dict_prep(L, dict);
}
}
env = tabref(curr_func(L)->c.env);
ud = lj_udata_new(L, sizeof(SBufExt), env);
ud->udtype = UDTYPE_BUFFER;
/* NOBARRIER: The GCudata is new (marked white). */
setgcref(ud->metatable, obj2gco(env));
setudataV(L, L->top++, ud);
sbx = (SBufExt *)uddata(ud);
lj_bufx_init(L, sbx);
setgcref(sbx->dict, obj2gco(dict));
if (sz > 0) lj_buf_need2((SBuf *)sbx, sz);
return 1;
}
@ -298,7 +320,8 @@ LJLIB_CF(buffer_encode)
{
cTValue *o = lj_lib_checkany(L, 1);
SBufExt sbx;
lj_bufx_init_borrow(L, &sbx, &G(L)->tmpbuf);
memset(&sbx, 0, sizeof(SBufExt));
lj_bufx_set_borrow(L, &sbx, &G(L)->tmpbuf);
lj_serialize_put(&sbx, o);
setstrV(L, L->top++, lj_buf_str(L, (SBuf *)&sbx));
lj_gc_check(L);
@ -309,7 +332,8 @@ LJLIB_CF(buffer_decode)
{
GCstr *str = lj_lib_checkstrx(L, 1);
SBufExt sbx;
lj_bufx_init_cow(L, &sbx, strdata(str), str->len);
memset(&sbx, 0, sizeof(SBufExt));
lj_bufx_set_cow(L, &sbx, strdata(str), str->len);
/* No need to set sbx.cowref here. */
setnilV(L->top++);
lj_serialize_get(&sbx, L->top-1);

View File

@ -27,6 +27,7 @@ typedef struct SBufExt {
MRef bsb; /* Borrowed string buffer. */
};
char *r; /* Read pointer. */
GCRef dict; /* Serialization string dictionary table. */
int depth; /* Remaining recursion depth. */
} SBufExt;
@ -114,19 +115,17 @@ static LJ_AINLINE void lj_bufx_init(lua_State *L, SBufExt *sbx)
setsbufXL(sbx, L, SBUF_FLAG_EXT);
}
static LJ_AINLINE void lj_bufx_init_borrow(lua_State *L, SBufExt *sbx, SBuf *sb)
static LJ_AINLINE void lj_bufx_set_borrow(lua_State *L, SBufExt *sbx, SBuf *sb)
{
memset(sbx, 0, sizeof(SBufExt));
setsbufXL(sbx, L, SBUF_FLAG_EXT | SBUF_FLAG_BORROW);
setmref(sbx->bsb, sb);
sbx->r = sbx->w = sbx->b = sb->b;
sbx->e = sb->e;
}
static LJ_AINLINE void lj_bufx_init_cow(lua_State *L, SBufExt *sbx,
const char *p, MSize len)
static LJ_AINLINE void lj_bufx_set_cow(lua_State *L, SBufExt *sbx,
const char *p, MSize len)
{
memset(sbx, 0, sizeof(SBufExt));
setsbufXL(sbx, L, SBUF_FLAG_EXT | SBUF_FLAG_COW);
sbx->r = sbx->b = (char *)p;
sbx->w = sbx->e = (char *)p + len;
@ -142,9 +141,12 @@ static LJ_AINLINE void lj_bufx_reset(SBufExt *sbx)
sbx->r = sbx->w = sbx->b;
}
static LJ_AINLINE void lj_bufx_free(global_State *g, SBufExt *sbx)
static LJ_AINLINE void lj_bufx_free(lua_State *L, SBufExt *sbx)
{
if (!sbufiscow(sbx)) lj_mem_free(g, sbx->b, sbufsz(sbx));
if (!sbufiscow(sbx)) lj_mem_free(G(L), sbx->b, sbufsz(sbx));
setsbufXL(sbx, L, SBUF_FLAG_EXT);
setgcrefnull(sbx->cowref);
sbx->r = sbx->w = sbx->b = sbx->e = NULL;
}
/* Low-level buffer put operations */

View File

@ -182,8 +182,10 @@ ERRDEF(FFI_NYICALL, "NYI: cannot call this C function (yet)")
#if LJ_HASBUFFER
/* String buffer errors. */
ERRDEF(BUFFER_BADOPT, "bad options table")
ERRDEF(BUFFER_BADENC, "cannot serialize " LUA_QS)
ERRDEF(BUFFER_BADDEC, "cannot deserialize tag 0x%02x")
ERRDEF(BUFFER_BADDICTX, "cannot deserialize dictionary index %d")
ERRDEF(BUFFER_DEPTH, "too deep to serialize")
ERRDEF(BUFFER_DUPKEY, "duplicate table key")
ERRDEF(BUFFER_EOB, "unexpected end of buffer")

View File

@ -67,9 +67,10 @@ static void gc_mark(global_State *g, GCobj *o)
gc_markobj(g, tabref(gco2ud(o)->env));
if (LJ_HASBUFFER && gco2ud(o)->udtype == UDTYPE_BUFFER) {
SBufExt *sbx = (SBufExt *)uddata(gco2ud(o));
if (sbufiscow(sbx) && gcref(sbx->cowref) != NULL) {
if (sbufiscow(sbx) && gcref(sbx->cowref))
gc_markobj(g, gcref(sbx->cowref));
}
if (gcref(sbx->dict))
gc_markobj(g, gcref(sbx->dict));
}
} else if (LJ_UNLIKELY(gct == ~LJ_TUPVAL)) {
GCupval *uv = gco2uv(o);

View File

@ -923,7 +923,7 @@ static LJ_AINLINE void setgcV(lua_State *L, TValue *o, GCobj *v, uint32_t it)
}
#define define_setV(name, type, tag) \
static LJ_AINLINE void name(lua_State *L, TValue *o, type *v) \
static LJ_AINLINE void name(lua_State *L, TValue *o, const type *v) \
{ \
setgcV(L, o, obj2gco(v), tag); \
}

View File

@ -32,7 +32,7 @@ enum {
SER_TAG_NUM,
SER_TAG_TAB, /* 0x08 */
SER_TAG_0x0e = SER_TAG_TAB+6,
SER_TAG_0x0f,
SER_TAG_DICT,
SER_TAG_INT64, /* 0x10 */
SER_TAG_UINT64,
SER_TAG_COMPLEX,
@ -120,6 +120,26 @@ static LJ_AINLINE char *serialize_ru124(char *r, char *w, uint32_t *pv)
return NULL;
}
/*
** Prepare a string dictionary table for use by the serializer (done once).
**
** A table that already has a hash part is taken to be prepared and is left
** untouched. Otherwise the hash part is sized for the array length and each
** string found in array slots 1..len gets a hash entry mapping the string
** to its zero-based index. Duplicate strings keep their first index, slots
** holding false are skipped, and any other value raises
** LJ_ERR_BUFFER_BADOPT.
*/
void LJ_FASTCALL lj_serialize_dict_prep(lua_State *L, GCtab *dict)
{
  MSize idx, n;
  if (dict->hmask) return;  /* Hash part present: already prepared. */
  n = lj_tab_len(dict);
  if (n == 0) return;  /* Nothing to index. */
  /* Keep the array part as-is, add a hash part big enough for n keys. */
  lj_tab_resize(L, dict, dict->asize, hsize2hbits(n));
  for (idx = 1; idx <= n && idx < dict->asize; idx++) {
    cTValue *slot = arrayslot(dict, idx);
    if (!tvisstr(slot)) {
      if (!tvisfalse(slot))  /* Only strings and false are accepted. */
	lj_err_caller(L, LJ_ERR_BUFFER_BADOPT);
      continue;
    }
    if (lj_tab_getstr(dict, strV(slot)) == NULL)  /* Ignore dups. */
      lj_tab_newkey(L, dict, slot)->u64 = (uint64_t)(idx-1);
  }
}
/* -- Internal serializer ------------------------------------------------- */
/* Put serialized object into buffer. */
@ -174,12 +194,45 @@ static char *serialize_put(char *w, SBufExt *sbx, cTValue *o)
}
if (nhash) { /* Write hash entries. */
const Node *node = noderef(t->node) + t->hmask;
for (;; node--)
if (!tvisnil(&node->val)) {
w = serialize_put(w, sbx, &node->key);
w = serialize_put(w, sbx, &node->val);
if (--nhash == 0) break;
}
GCtab *dict = tabref(sbx->dict);
if (LJ_UNLIKELY(dict)) {
for (;; node--)
if (!tvisnil(&node->val)) {
if (LJ_LIKELY(tvisstr(&node->key))) {
/* Inlined lj_tab_getstr is 30% faster. */
const GCstr *str = strV(&node->key);
Node *n = hashstr(dict, str);
do {
if (tvisstr(&n->key) && strV(&n->key) == str) {
uint32_t idx = n->val.u32.lo;
w = serialize_more(w, sbx, 1+5);
*w++ = SER_TAG_DICT;
w = serialize_wu124(w, idx);
break;
}
n = nextnode(n);
if (!n) {
MSize len = str->len;
w = serialize_more(w, sbx, 5+len);
w = serialize_wu124(w, SER_TAG_STR + len);
w = lj_buf_wmem(w, strdata(str), len);
break;
}
} while (1);
} else {
w = serialize_put(w, sbx, &node->key);
}
w = serialize_put(w, sbx, &node->val);
if (--nhash == 0) break;
}
} else {
for (;; node--)
if (!tvisnil(&node->val)) {
w = serialize_put(w, sbx, &node->key);
w = serialize_put(w, sbx, &node->val);
if (--nhash == 0) break;
}
}
}
sbx->depth++;
#if LJ_HASFFI
@ -266,6 +319,16 @@ static char *serialize_get(char *r, SBufExt *sbx, TValue *o)
if (!tvisnum(o)) setnanV(o);
} else if (tp <= SER_TAG_TRUE) {
setpriV(o, ~tp);
} else if (tp == SER_TAG_DICT) {
GCtab *dict;
uint32_t idx;
r = serialize_ru124(r, w, &idx);
idx++;
dict = tabref(sbx->dict);
if (dict && idx < dict->asize && tvisstr(arrayslot(dict, idx)))
copyTV(sbufL(sbx), o, arrayslot(dict, idx));
else
lj_err_callerv(sbufL(sbx), LJ_ERR_BUFFER_BADDICTX, idx);
} else if (tp >= SER_TAG_TAB && tp < SER_TAG_TAB+6) {
uint32_t narray = 0, nhash = 0;
GCtab *t;

View File

@ -13,6 +13,7 @@
#define LJ_SERIALIZE_DEPTH 100 /* Default depth. */
LJ_FUNC void LJ_FASTCALL lj_serialize_dict_prep(lua_State *L, GCtab *dict);
LJ_FUNC SBufExt * LJ_FASTCALL lj_serialize_put(SBufExt *sbx, cTValue *o);
LJ_FUNC SBufExt * LJ_FASTCALL lj_serialize_get(SBufExt *sbx, TValue *o);

View File

@ -16,25 +16,6 @@
/* -- Object hashing ------------------------------------------------------ */
/* Mask a hash value with the table's hash mask and return the selected node. */
static LJ_AINLINE Node *hashmask(const GCtab *t, uint32_t hash)
{
  return noderef(t->node) + (hash & t->hmask);
}
/* String IDs are generated when a string is interned. */
#define hashstr(t, s) hashmask(t, (s)->sid)
#define hashlohi(t, lo, hi) hashmask((t), hashrot((lo), (hi)))
#define hashnum(t, o) hashlohi((t), (o)->u32.lo, ((o)->u32.hi << 1))
#if LJ_GC64
#define hashgcref(t, r) \
hashlohi((t), (uint32_t)gcrefu(r), (uint32_t)(gcrefu(r) >> 32))
#else
#define hashgcref(t, r) hashlohi((t), gcrefu(r), gcrefu(r) + HASH_BIAS)
#endif
/* Hash an arbitrary key and return its anchor position in the hash table. */
static Node *hashkey(const GCtab *t, cTValue *key)
{
@ -413,7 +394,7 @@ cTValue * LJ_FASTCALL lj_tab_getinth(GCtab *t, int32_t key)
return NULL;
}
cTValue *lj_tab_getstr(GCtab *t, GCstr *key)
cTValue *lj_tab_getstr(GCtab *t, const GCstr *key)
{
Node *n = hashstr(t, key);
do {
@ -546,7 +527,7 @@ TValue *lj_tab_setinth(lua_State *L, GCtab *t, int32_t key)
return lj_tab_newkey(L, t, &k);
}
TValue *lj_tab_setstr(lua_State *L, GCtab *t, GCstr *key)
TValue *lj_tab_setstr(lua_State *L, GCtab *t, const GCstr *key)
{
TValue k;
Node *n = hashstr(t, key);

View File

@ -31,6 +31,25 @@ static LJ_AINLINE uint32_t hashrot(uint32_t lo, uint32_t hi)
return hi;
}
/* Mask a hash value with the table's hash mask and return the selected node. */
static LJ_AINLINE Node *hashmask(const GCtab *t, uint32_t hash)
{
  return noderef(t->node) + (hash & t->hmask);
}
/* String IDs are generated when a string is interned. */
#define hashstr(t, s) hashmask(t, (s)->sid)
#define hashlohi(t, lo, hi) hashmask((t), hashrot((lo), (hi)))
#define hashnum(t, o) hashlohi((t), (o)->u32.lo, ((o)->u32.hi << 1))
#if LJ_GC64
#define hashgcref(t, r) \
hashlohi((t), (uint32_t)gcrefu(r), (uint32_t)(gcrefu(r) >> 32))
#else
#define hashgcref(t, r) hashlohi((t), gcrefu(r), gcrefu(r) + HASH_BIAS)
#endif
#define hsize2hbits(s) ((s) ? ((s)==1 ? 1 : 1+lj_fls((uint32_t)((s)-1))) : 0)
LJ_FUNCA GCtab *lj_tab_new(lua_State *L, uint32_t asize, uint32_t hbits);
@ -50,14 +69,14 @@ LJ_FUNCA void lj_tab_reasize(lua_State *L, GCtab *t, uint32_t nasize);
/* Caveat: all getters except lj_tab_get() can return NULL! */
LJ_FUNCA cTValue * LJ_FASTCALL lj_tab_getinth(GCtab *t, int32_t key);
LJ_FUNC cTValue *lj_tab_getstr(GCtab *t, GCstr *key);
LJ_FUNC cTValue *lj_tab_getstr(GCtab *t, const GCstr *key);
LJ_FUNCA cTValue *lj_tab_get(lua_State *L, GCtab *t, cTValue *key);
/* Caveat: all setters require a write barrier for the stored value. */
LJ_FUNCA TValue *lj_tab_newkey(lua_State *L, GCtab *t, cTValue *key);
LJ_FUNCA TValue *lj_tab_setinth(lua_State *L, GCtab *t, int32_t key);
LJ_FUNC TValue *lj_tab_setstr(lua_State *L, GCtab *t, GCstr *key);
LJ_FUNC TValue *lj_tab_setstr(lua_State *L, GCtab *t, const GCstr *key);
LJ_FUNC TValue *lj_tab_set(lua_State *L, GCtab *t, cTValue *key);
#define inarray(t, key) ((MSize)(key) < (MSize)(t)->asize)